| File: | src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp |
| Warning: | line 6130, column 33 Called C++ object pointer is null |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// | ||||
| 2 | // | ||||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
| 6 | // | ||||
| 7 | //===----------------------------------------------------------------------===// | ||||
| 8 | // | ||||
| 9 | // This pass implements the Bottom Up SLP vectorizer. It detects consecutive | ||||
| 10 | // stores that can be put together into vector-stores. Next, it attempts to | ||||
| 11 | // construct vectorizable tree using the use-def chains. If a profitable tree | ||||
| 12 | // was found, the SLP vectorizer performs vectorization on the tree. | ||||
| 13 | // | ||||
| 14 | // The pass is inspired by the work described in the paper: | ||||
| 15 | // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. | ||||
| 16 | // | ||||
| 17 | //===----------------------------------------------------------------------===// | ||||
| 18 | |||||
| 19 | #include "llvm/Transforms/Vectorize/SLPVectorizer.h" | ||||
| 20 | #include "llvm/ADT/DenseMap.h" | ||||
| 21 | #include "llvm/ADT/DenseSet.h" | ||||
| 22 | #include "llvm/ADT/Optional.h" | ||||
| 23 | #include "llvm/ADT/PostOrderIterator.h" | ||||
| 24 | #include "llvm/ADT/STLExtras.h" | ||||
| 25 | #include "llvm/ADT/SetOperations.h" | ||||
| 26 | #include "llvm/ADT/SetVector.h" | ||||
| 27 | #include "llvm/ADT/SmallBitVector.h" | ||||
| 28 | #include "llvm/ADT/SmallPtrSet.h" | ||||
| 29 | #include "llvm/ADT/SmallSet.h" | ||||
| 30 | #include "llvm/ADT/SmallString.h" | ||||
| 31 | #include "llvm/ADT/Statistic.h" | ||||
| 32 | #include "llvm/ADT/iterator.h" | ||||
| 33 | #include "llvm/ADT/iterator_range.h" | ||||
| 34 | #include "llvm/Analysis/AliasAnalysis.h" | ||||
| 35 | #include "llvm/Analysis/AssumptionCache.h" | ||||
| 36 | #include "llvm/Analysis/CodeMetrics.h" | ||||
| 37 | #include "llvm/Analysis/DemandedBits.h" | ||||
| 38 | #include "llvm/Analysis/GlobalsModRef.h" | ||||
| 39 | #include "llvm/Analysis/IVDescriptors.h" | ||||
| 40 | #include "llvm/Analysis/LoopAccessAnalysis.h" | ||||
| 41 | #include "llvm/Analysis/LoopInfo.h" | ||||
| 42 | #include "llvm/Analysis/MemoryLocation.h" | ||||
| 43 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" | ||||
| 44 | #include "llvm/Analysis/ScalarEvolution.h" | ||||
| 45 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" | ||||
| 46 | #include "llvm/Analysis/TargetLibraryInfo.h" | ||||
| 47 | #include "llvm/Analysis/TargetTransformInfo.h" | ||||
| 48 | #include "llvm/Analysis/ValueTracking.h" | ||||
| 49 | #include "llvm/Analysis/VectorUtils.h" | ||||
| 50 | #include "llvm/IR/Attributes.h" | ||||
| 51 | #include "llvm/IR/BasicBlock.h" | ||||
| 52 | #include "llvm/IR/Constant.h" | ||||
| 53 | #include "llvm/IR/Constants.h" | ||||
| 54 | #include "llvm/IR/DataLayout.h" | ||||
| 55 | #include "llvm/IR/DebugLoc.h" | ||||
| 56 | #include "llvm/IR/DerivedTypes.h" | ||||
| 57 | #include "llvm/IR/Dominators.h" | ||||
| 58 | #include "llvm/IR/Function.h" | ||||
| 59 | #include "llvm/IR/IRBuilder.h" | ||||
| 60 | #include "llvm/IR/InstrTypes.h" | ||||
| 61 | #include "llvm/IR/Instruction.h" | ||||
| 62 | #include "llvm/IR/Instructions.h" | ||||
| 63 | #include "llvm/IR/IntrinsicInst.h" | ||||
| 64 | #include "llvm/IR/Intrinsics.h" | ||||
| 65 | #include "llvm/IR/Module.h" | ||||
| 66 | #include "llvm/IR/NoFolder.h" | ||||
| 67 | #include "llvm/IR/Operator.h" | ||||
| 68 | #include "llvm/IR/PatternMatch.h" | ||||
| 69 | #include "llvm/IR/Type.h" | ||||
| 70 | #include "llvm/IR/Use.h" | ||||
| 71 | #include "llvm/IR/User.h" | ||||
| 72 | #include "llvm/IR/Value.h" | ||||
| 73 | #include "llvm/IR/ValueHandle.h" | ||||
| 74 | #include "llvm/IR/Verifier.h" | ||||
| 75 | #include "llvm/InitializePasses.h" | ||||
| 76 | #include "llvm/Pass.h" | ||||
| 77 | #include "llvm/Support/Casting.h" | ||||
| 78 | #include "llvm/Support/CommandLine.h" | ||||
| 79 | #include "llvm/Support/Compiler.h" | ||||
| 80 | #include "llvm/Support/DOTGraphTraits.h" | ||||
| 81 | #include "llvm/Support/Debug.h" | ||||
| 82 | #include "llvm/Support/ErrorHandling.h" | ||||
| 83 | #include "llvm/Support/GraphWriter.h" | ||||
| 84 | #include "llvm/Support/InstructionCost.h" | ||||
| 85 | #include "llvm/Support/KnownBits.h" | ||||
| 86 | #include "llvm/Support/MathExtras.h" | ||||
| 87 | #include "llvm/Support/raw_ostream.h" | ||||
| 88 | #include "llvm/Transforms/Utils/InjectTLIMappings.h" | ||||
| 89 | #include "llvm/Transforms/Utils/LoopUtils.h" | ||||
| 90 | #include "llvm/Transforms/Vectorize.h" | ||||
| 91 | #include <algorithm> | ||||
| 92 | #include <cassert> | ||||
| 93 | #include <cstdint> | ||||
| 94 | #include <iterator> | ||||
| 95 | #include <memory> | ||||
| 96 | #include <set> | ||||
| 97 | #include <string> | ||||
| 98 | #include <tuple> | ||||
| 99 | #include <utility> | ||||
| 100 | #include <vector> | ||||
| 101 | |||||
| 102 | using namespace llvm; | ||||
| 103 | using namespace llvm::PatternMatch; | ||||
| 104 | using namespace slpvectorizer; | ||||
| 105 | |||||
| 106 | #define SV_NAME"slp-vectorizer" "slp-vectorizer" | ||||
| 107 | #define DEBUG_TYPE"SLP" "SLP" | ||||
| 108 | |||||
| 109 | STATISTIC(NumVectorInstructions, "Number of vector instructions generated")static llvm::Statistic NumVectorInstructions = {"SLP", "NumVectorInstructions" , "Number of vector instructions generated"}; | ||||
| 110 | |||||
| 111 | cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, | ||||
| 112 | cl::desc("Run the SLP vectorization passes")); | ||||
| 113 | |||||
| 114 | static cl::opt<int> | ||||
| 115 | SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, | ||||
| 116 | cl::desc("Only vectorize if you gain more than this " | ||||
| 117 | "number ")); | ||||
| 118 | |||||
| 119 | static cl::opt<bool> | ||||
| 120 | ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, | ||||
| 121 | cl::desc("Attempt to vectorize horizontal reductions")); | ||||
| 122 | |||||
| 123 | static cl::opt<bool> ShouldStartVectorizeHorAtStore( | ||||
| 124 | "slp-vectorize-hor-store", cl::init(false), cl::Hidden, | ||||
| 125 | cl::desc( | ||||
| 126 | "Attempt to vectorize horizontal reductions feeding into a store")); | ||||
| 127 | |||||
| 128 | static cl::opt<int> | ||||
| 129 | MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, | ||||
| 130 | cl::desc("Attempt to vectorize for this register size in bits")); | ||||
| 131 | |||||
| 132 | static cl::opt<unsigned> | ||||
| 133 | MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, | ||||
| 134 | cl::desc("Maximum SLP vectorization factor (0=unlimited)")); | ||||
| 135 | |||||
| 136 | static cl::opt<int> | ||||
| 137 | MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, | ||||
| 138 | cl::desc("Maximum depth of the lookup for consecutive stores.")); | ||||
| 139 | |||||
| 140 | /// Limits the size of scheduling regions in a block. | ||||
| 141 | /// It avoid long compile times for _very_ large blocks where vector | ||||
| 142 | /// instructions are spread over a wide range. | ||||
| 143 | /// This limit is way higher than needed by real-world functions. | ||||
| 144 | static cl::opt<int> | ||||
| 145 | ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, | ||||
| 146 | cl::desc("Limit the size of the SLP scheduling region per block")); | ||||
| 147 | |||||
| 148 | static cl::opt<int> MinVectorRegSizeOption( | ||||
| 149 | "slp-min-reg-size", cl::init(128), cl::Hidden, | ||||
| 150 | cl::desc("Attempt to vectorize for this register size in bits")); | ||||
| 151 | |||||
| 152 | static cl::opt<unsigned> RecursionMaxDepth( | ||||
| 153 | "slp-recursion-max-depth", cl::init(12), cl::Hidden, | ||||
| 154 | cl::desc("Limit the recursion depth when building a vectorizable tree")); | ||||
| 155 | |||||
| 156 | static cl::opt<unsigned> MinTreeSize( | ||||
| 157 | "slp-min-tree-size", cl::init(3), cl::Hidden, | ||||
| 158 | cl::desc("Only vectorize small trees if they are fully vectorizable")); | ||||
| 159 | |||||
| 160 | // The maximum depth that the look-ahead score heuristic will explore. | ||||
| 161 | // The higher this value, the higher the compilation time overhead. | ||||
| 162 | static cl::opt<int> LookAheadMaxDepth( | ||||
| 163 | "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, | ||||
| 164 | cl::desc("The maximum look-ahead depth for operand reordering scores")); | ||||
| 165 | |||||
| 166 | // The Look-ahead heuristic goes through the users of the bundle to calculate | ||||
| 167 | // the users cost in getExternalUsesCost(). To avoid compilation time increase | ||||
| 168 | // we limit the number of users visited to this value. | ||||
| 169 | static cl::opt<unsigned> LookAheadUsersBudget( | ||||
| 170 | "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, | ||||
| 171 | cl::desc("The maximum number of users to visit while visiting the " | ||||
| 172 | "predecessors. This prevents compilation time increase.")); | ||||
| 173 | |||||
| 174 | static cl::opt<bool> | ||||
| 175 | ViewSLPTree("view-slp-tree", cl::Hidden, | ||||
| 176 | cl::desc("Display the SLP trees with Graphviz")); | ||||
| 177 | |||||
| 178 | // Limit the number of alias checks. The limit is chosen so that | ||||
| 179 | // it has no negative effect on the llvm benchmarks. | ||||
| 180 | static const unsigned AliasedCheckLimit = 10; | ||||
| 181 | |||||
| 182 | // Another limit for the alias checks: The maximum distance between load/store | ||||
| 183 | // instructions where alias checks are done. | ||||
| 184 | // This limit is useful for very large basic blocks. | ||||
| 185 | static const unsigned MaxMemDepDistance = 160; | ||||
| 186 | |||||
| 187 | /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling | ||||
| 188 | /// regions to be handled. | ||||
| 189 | static const int MinScheduleRegionSize = 16; | ||||
| 190 | |||||
| 191 | /// Predicate for the element types that the SLP vectorizer supports. | ||||
| 192 | /// | ||||
| 193 | /// The most important thing to filter here are types which are invalid in LLVM | ||||
| 194 | /// vectors. We also filter target specific types which have absolutely no | ||||
| 195 | /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just | ||||
| 196 | /// avoids spending time checking the cost model and realizing that they will | ||||
| 197 | /// be inevitably scalarized. | ||||
| 198 | static bool isValidElementType(Type *Ty) { | ||||
| 199 | return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && | ||||
| 200 | !Ty->isPPC_FP128Ty(); | ||||
| 201 | } | ||||
| 202 | |||||
| 203 | /// \returns true if all of the instructions in \p VL are in the same block or | ||||
| 204 | /// false otherwise. | ||||
| 205 | static bool allSameBlock(ArrayRef<Value *> VL) { | ||||
| 206 | Instruction *I0 = dyn_cast<Instruction>(VL[0]); | ||||
| 207 | if (!I0) | ||||
| 208 | return false; | ||||
| 209 | BasicBlock *BB = I0->getParent(); | ||||
| 210 | for (int I = 1, E = VL.size(); I < E; I++) { | ||||
| 211 | auto *II = dyn_cast<Instruction>(VL[I]); | ||||
| 212 | if (!II) | ||||
| 213 | return false; | ||||
| 214 | |||||
| 215 | if (BB != II->getParent()) | ||||
| 216 | return false; | ||||
| 217 | } | ||||
| 218 | return true; | ||||
| 219 | } | ||||
| 220 | |||||
| 221 | /// \returns True if the value is a constant (but not globals/constant | ||||
| 222 | /// expressions). | ||||
| 223 | static bool isConstant(Value *V) { | ||||
| 224 | return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V); | ||||
| 225 | } | ||||
| 226 | |||||
| 227 | /// \returns True if all of the values in \p VL are constants (but not | ||||
| 228 | /// globals/constant expressions). | ||||
| 229 | static bool allConstant(ArrayRef<Value *> VL) { | ||||
| 230 | // Constant expressions and globals can't be vectorized like normal integer/FP | ||||
| 231 | // constants. | ||||
| 232 | return all_of(VL, isConstant); | ||||
| 233 | } | ||||
| 234 | |||||
| 235 | /// \returns True if all of the values in \p VL are identical. | ||||
| 236 | static bool isSplat(ArrayRef<Value *> VL) { | ||||
| 237 | for (unsigned i = 1, e = VL.size(); i < e; ++i) | ||||
| 238 | if (VL[i] != VL[0]) | ||||
| 239 | return false; | ||||
| 240 | return true; | ||||
| 241 | } | ||||
| 242 | |||||
| 243 | /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. | ||||
| 244 | static bool isCommutative(Instruction *I) { | ||||
| 245 | if (auto *Cmp = dyn_cast<CmpInst>(I)) | ||||
| 246 | return Cmp->isCommutative(); | ||||
| 247 | if (auto *BO = dyn_cast<BinaryOperator>(I)) | ||||
| 248 | return BO->isCommutative(); | ||||
| 249 | // TODO: This should check for generic Instruction::isCommutative(), but | ||||
| 250 | // we need to confirm that the caller code correctly handles Intrinsics | ||||
| 251 | // for example (does not have 2 operands). | ||||
| 252 | return false; | ||||
| 253 | } | ||||
| 254 | |||||
| 255 | /// Checks if the vector of instructions can be represented as a shuffle, like: | ||||
| 256 | /// %x0 = extractelement <4 x i8> %x, i32 0 | ||||
| 257 | /// %x3 = extractelement <4 x i8> %x, i32 3 | ||||
| 258 | /// %y1 = extractelement <4 x i8> %y, i32 1 | ||||
| 259 | /// %y2 = extractelement <4 x i8> %y, i32 2 | ||||
| 260 | /// %x0x0 = mul i8 %x0, %x0 | ||||
| 261 | /// %x3x3 = mul i8 %x3, %x3 | ||||
| 262 | /// %y1y1 = mul i8 %y1, %y1 | ||||
| 263 | /// %y2y2 = mul i8 %y2, %y2 | ||||
| 264 | /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 | ||||
| 265 | /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 | ||||
| 266 | /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 | ||||
| 267 | /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 | ||||
| 268 | /// ret <4 x i8> %ins4 | ||||
| 269 | /// can be transformed into: | ||||
| 270 | /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, | ||||
| 271 | /// i32 6> | ||||
| 272 | /// %2 = mul <4 x i8> %1, %1 | ||||
| 273 | /// ret <4 x i8> %2 | ||||
| 274 | /// We convert this initially to something like: | ||||
| 275 | /// %x0 = extractelement <4 x i8> %x, i32 0 | ||||
| 276 | /// %x3 = extractelement <4 x i8> %x, i32 3 | ||||
| 277 | /// %y1 = extractelement <4 x i8> %y, i32 1 | ||||
| 278 | /// %y2 = extractelement <4 x i8> %y, i32 2 | ||||
| 279 | /// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 | ||||
| 280 | /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 | ||||
| 281 | /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 | ||||
| 282 | /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 | ||||
| 283 | /// %5 = mul <4 x i8> %4, %4 | ||||
| 284 | /// %6 = extractelement <4 x i8> %5, i32 0 | ||||
| 285 | /// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 | ||||
| 286 | /// %7 = extractelement <4 x i8> %5, i32 1 | ||||
| 287 | /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 | ||||
| 288 | /// %8 = extractelement <4 x i8> %5, i32 2 | ||||
| 289 | /// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 | ||||
| 290 | /// %9 = extractelement <4 x i8> %5, i32 3 | ||||
| 291 | /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 | ||||
| 292 | /// ret <4 x i8> %ins4 | ||||
| 293 | /// InstCombiner transforms this into a shuffle and vector mul | ||||
| 294 | /// Mask will return the Shuffle Mask equivalent to the extracted elements. | ||||
| 295 | /// TODO: Can we split off and reuse the shuffle mask detection from | ||||
| 296 | /// TargetTransformInfo::getInstructionThroughput? | ||||
| 297 | static Optional<TargetTransformInfo::ShuffleKind> | ||||
| 298 | isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { | ||||
| 299 | auto *EI0 = cast<ExtractElementInst>(VL[0]); | ||||
| 300 | unsigned Size = | ||||
| 301 | cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); | ||||
| 302 | Value *Vec1 = nullptr; | ||||
| 303 | Value *Vec2 = nullptr; | ||||
| 304 | enum ShuffleMode { Unknown, Select, Permute }; | ||||
| 305 | ShuffleMode CommonShuffleMode = Unknown; | ||||
| 306 | for (unsigned I = 0, E = VL.size(); I < E; ++I) { | ||||
| 307 | auto *EI = cast<ExtractElementInst>(VL[I]); | ||||
| 308 | auto *Vec = EI->getVectorOperand(); | ||||
| 309 | // All vector operands must have the same number of vector elements. | ||||
| 310 | if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) | ||||
| 311 | return None; | ||||
| 312 | auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); | ||||
| 313 | if (!Idx) | ||||
| 314 | return None; | ||||
| 315 | // Undefined behavior if Idx is negative or >= Size. | ||||
| 316 | if (Idx->getValue().uge(Size)) { | ||||
| 317 | Mask.push_back(UndefMaskElem); | ||||
| 318 | continue; | ||||
| 319 | } | ||||
| 320 | unsigned IntIdx = Idx->getValue().getZExtValue(); | ||||
| 321 | Mask.push_back(IntIdx); | ||||
| 322 | // We can extractelement from undef or poison vector. | ||||
| 323 | if (isa<UndefValue>(Vec)) | ||||
| 324 | continue; | ||||
| 325 | // For correct shuffling we have to have at most 2 different vector operands | ||||
| 326 | // in all extractelement instructions. | ||||
| 327 | if (!Vec1 || Vec1 == Vec) | ||||
| 328 | Vec1 = Vec; | ||||
| 329 | else if (!Vec2 || Vec2 == Vec) | ||||
| 330 | Vec2 = Vec; | ||||
| 331 | else | ||||
| 332 | return None; | ||||
| 333 | if (CommonShuffleMode == Permute) | ||||
| 334 | continue; | ||||
| 335 | // If the extract index is not the same as the operation number, it is a | ||||
| 336 | // permutation. | ||||
| 337 | if (IntIdx != I) { | ||||
| 338 | CommonShuffleMode = Permute; | ||||
| 339 | continue; | ||||
| 340 | } | ||||
| 341 | CommonShuffleMode = Select; | ||||
| 342 | } | ||||
| 343 | // If we're not crossing lanes in different vectors, consider it as blending. | ||||
| 344 | if (CommonShuffleMode == Select && Vec2) | ||||
| 345 | return TargetTransformInfo::SK_Select; | ||||
| 346 | // If Vec2 was never used, we have a permutation of a single vector, otherwise | ||||
| 347 | // we have permutation of 2 vectors. | ||||
| 348 | return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc | ||||
| 349 | : TargetTransformInfo::SK_PermuteSingleSrc; | ||||
| 350 | } | ||||
| 351 | |||||
| 352 | namespace { | ||||
| 353 | |||||
| 354 | /// Main data required for vectorization of instructions. | ||||
| 355 | struct InstructionsState { | ||||
| 356 | /// The very first instruction in the list with the main opcode. | ||||
| 357 | Value *OpValue = nullptr; | ||||
| 358 | |||||
| 359 | /// The main/alternate instruction. | ||||
| 360 | Instruction *MainOp = nullptr; | ||||
| 361 | Instruction *AltOp = nullptr; | ||||
| 362 | |||||
| 363 | /// The main/alternate opcodes for the list of instructions. | ||||
| 364 | unsigned getOpcode() const { | ||||
| 365 | return MainOp ? MainOp->getOpcode() : 0; | ||||
| 366 | } | ||||
| 367 | |||||
| 368 | unsigned getAltOpcode() const { | ||||
| 369 | return AltOp ? AltOp->getOpcode() : 0; | ||||
| 370 | } | ||||
| 371 | |||||
| 372 | /// Some of the instructions in the list have alternate opcodes. | ||||
| 373 | bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } | ||||
| 374 | |||||
| 375 | bool isOpcodeOrAlt(Instruction *I) const { | ||||
| 376 | unsigned CheckedOpcode = I->getOpcode(); | ||||
| 377 | return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; | ||||
| 378 | } | ||||
| 379 | |||||
| 380 | InstructionsState() = delete; | ||||
| 381 | InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) | ||||
| 382 | : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} | ||||
| 383 | }; | ||||
| 384 | |||||
| 385 | } // end anonymous namespace | ||||
| 386 | |||||
| 387 | /// Chooses the correct key for scheduling data. If \p Op has the same (or | ||||
| 388 | /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p | ||||
| 389 | /// OpValue. | ||||
| 390 | static Value *isOneOf(const InstructionsState &S, Value *Op) { | ||||
| 391 | auto *I = dyn_cast<Instruction>(Op); | ||||
| 392 | if (I && S.isOpcodeOrAlt(I)) | ||||
| 393 | return Op; | ||||
| 394 | return S.OpValue; | ||||
| 395 | } | ||||
| 396 | |||||
| 397 | /// \returns true if \p Opcode is allowed as part of of the main/alternate | ||||
| 398 | /// instruction for SLP vectorization. | ||||
| 399 | /// | ||||
| 400 | /// Example of unsupported opcode is SDIV that can potentially cause UB if the | ||||
| 401 | /// "shuffled out" lane would result in division by zero. | ||||
| 402 | static bool isValidForAlternation(unsigned Opcode) { | ||||
| 403 | if (Instruction::isIntDivRem(Opcode)) | ||||
| 404 | return false; | ||||
| 405 | |||||
| 406 | return true; | ||||
| 407 | } | ||||
| 408 | |||||
| 409 | /// \returns analysis of the Instructions in \p VL described in | ||||
| 410 | /// InstructionsState, the Opcode that we suppose the whole list | ||||
| 411 | /// could be vectorized even if its structure is diverse. | ||||
| 412 | static InstructionsState getSameOpcode(ArrayRef<Value *> VL, | ||||
| 413 | unsigned BaseIndex = 0) { | ||||
| 414 | // Make sure these are all Instructions. | ||||
| 415 | if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); })) | ||||
| 416 | return InstructionsState(VL[BaseIndex], nullptr, nullptr); | ||||
| 417 | |||||
| 418 | bool IsCastOp = isa<CastInst>(VL[BaseIndex]); | ||||
| 419 | bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]); | ||||
| 420 | unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode(); | ||||
| 421 | unsigned AltOpcode = Opcode; | ||||
| 422 | unsigned AltIndex = BaseIndex; | ||||
| 423 | |||||
| 424 | // Check for one alternate opcode from another BinaryOperator. | ||||
| 425 | // TODO - generalize to support all operators (types, calls etc.). | ||||
| 426 | for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { | ||||
| 427 | unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode(); | ||||
| 428 | if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) { | ||||
| 429 | if (InstOpcode == Opcode || InstOpcode == AltOpcode) | ||||
| 430 | continue; | ||||
| 431 | if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && | ||||
| 432 | isValidForAlternation(Opcode)) { | ||||
| 433 | AltOpcode = InstOpcode; | ||||
| 434 | AltIndex = Cnt; | ||||
| 435 | continue; | ||||
| 436 | } | ||||
| 437 | } else if (IsCastOp && isa<CastInst>(VL[Cnt])) { | ||||
| 438 | Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType(); | ||||
| 439 | Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType(); | ||||
| 440 | if (Ty0 == Ty1) { | ||||
| 441 | if (InstOpcode == Opcode || InstOpcode == AltOpcode) | ||||
| 442 | continue; | ||||
| 443 | if (Opcode == AltOpcode) { | ||||
| 444 | assert(isValidForAlternation(Opcode) &&((void)0) | ||||
| 445 | isValidForAlternation(InstOpcode) &&((void)0) | ||||
| 446 | "Cast isn't safe for alternation, logic needs to be updated!")((void)0); | ||||
| 447 | AltOpcode = InstOpcode; | ||||
| 448 | AltIndex = Cnt; | ||||
| 449 | continue; | ||||
| 450 | } | ||||
| 451 | } | ||||
| 452 | } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) | ||||
| 453 | continue; | ||||
| 454 | return InstructionsState(VL[BaseIndex], nullptr, nullptr); | ||||
| 455 | } | ||||
| 456 | |||||
| 457 | return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]), | ||||
| 458 | cast<Instruction>(VL[AltIndex])); | ||||
| 459 | } | ||||
| 460 | |||||
| 461 | /// \returns true if all of the values in \p VL have the same type or false | ||||
| 462 | /// otherwise. | ||||
| 463 | static bool allSameType(ArrayRef<Value *> VL) { | ||||
| 464 | Type *Ty = VL[0]->getType(); | ||||
| 465 | for (int i = 1, e = VL.size(); i < e; i++) | ||||
| 466 | if (VL[i]->getType() != Ty) | ||||
| 467 | return false; | ||||
| 468 | |||||
| 469 | return true; | ||||
| 470 | } | ||||
| 471 | |||||
| 472 | /// \returns True if Extract{Value,Element} instruction extracts element Idx. | ||||
| 473 | static Optional<unsigned> getExtractIndex(Instruction *E) { | ||||
| 474 | unsigned Opcode = E->getOpcode(); | ||||
| 475 | assert((Opcode == Instruction::ExtractElement ||((void)0) | ||||
| 476 | Opcode == Instruction::ExtractValue) &&((void)0) | ||||
| 477 | "Expected extractelement or extractvalue instruction.")((void)0); | ||||
| 478 | if (Opcode == Instruction::ExtractElement) { | ||||
| 479 | auto *CI = dyn_cast<ConstantInt>(E->getOperand(1)); | ||||
| 480 | if (!CI) | ||||
| 481 | return None; | ||||
| 482 | return CI->getZExtValue(); | ||||
| 483 | } | ||||
| 484 | ExtractValueInst *EI = cast<ExtractValueInst>(E); | ||||
| 485 | if (EI->getNumIndices() != 1) | ||||
| 486 | return None; | ||||
| 487 | return *EI->idx_begin(); | ||||
| 488 | } | ||||
| 489 | |||||
| 490 | /// \returns True if in-tree use also needs extract. This refers to | ||||
| 491 | /// possible scalar operand in vectorized instruction. | ||||
| 492 | static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, | ||||
| 493 | TargetLibraryInfo *TLI) { | ||||
| 494 | unsigned Opcode = UserInst->getOpcode(); | ||||
| 495 | switch (Opcode) { | ||||
| 496 | case Instruction::Load: { | ||||
| 497 | LoadInst *LI = cast<LoadInst>(UserInst); | ||||
| 498 | return (LI->getPointerOperand() == Scalar); | ||||
| 499 | } | ||||
| 500 | case Instruction::Store: { | ||||
| 501 | StoreInst *SI = cast<StoreInst>(UserInst); | ||||
| 502 | return (SI->getPointerOperand() == Scalar); | ||||
| 503 | } | ||||
| 504 | case Instruction::Call: { | ||||
| 505 | CallInst *CI = cast<CallInst>(UserInst); | ||||
| 506 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); | ||||
| 507 | for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { | ||||
| 508 | if (hasVectorInstrinsicScalarOpd(ID, i)) | ||||
| 509 | return (CI->getArgOperand(i) == Scalar); | ||||
| 510 | } | ||||
| 511 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | ||||
| 512 | } | ||||
| 513 | default: | ||||
| 514 | return false; | ||||
| 515 | } | ||||
| 516 | } | ||||
| 517 | |||||
| 518 | /// \returns the AA location that is being access by the instruction. | ||||
| 519 | static MemoryLocation getLocation(Instruction *I, AAResults *AA) { | ||||
| 520 | if (StoreInst *SI = dyn_cast<StoreInst>(I)) | ||||
| 521 | return MemoryLocation::get(SI); | ||||
| 522 | if (LoadInst *LI = dyn_cast<LoadInst>(I)) | ||||
| 523 | return MemoryLocation::get(LI); | ||||
| 524 | return MemoryLocation(); | ||||
| 525 | } | ||||
| 526 | |||||
| 527 | /// \returns True if the instruction is not a volatile or atomic load/store. | ||||
| 528 | static bool isSimple(Instruction *I) { | ||||
| 529 | if (LoadInst *LI = dyn_cast<LoadInst>(I)) | ||||
| 530 | return LI->isSimple(); | ||||
| 531 | if (StoreInst *SI = dyn_cast<StoreInst>(I)) | ||||
| 532 | return SI->isSimple(); | ||||
| 533 | if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) | ||||
| 534 | return !MI->isVolatile(); | ||||
| 535 | return true; | ||||
| 536 | } | ||||
| 537 | |||||
| 538 | namespace llvm { | ||||
| 539 | |||||
| 540 | static void inversePermutation(ArrayRef<unsigned> Indices, | ||||
| 541 | SmallVectorImpl<int> &Mask) { | ||||
| 542 | Mask.clear(); | ||||
| 543 | const unsigned E = Indices.size(); | ||||
| 544 | Mask.resize(E, E + 1); | ||||
| 545 | for (unsigned I = 0; I < E; ++I) | ||||
| 546 | Mask[Indices[I]] = I; | ||||
| 547 | } | ||||
| 548 | |||||
| 549 | /// \returns inserting index of InsertElement or InsertValue instruction, | ||||
| 550 | /// using Offset as base offset for index. | ||||
| 551 | static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) { | ||||
| 552 | int Index = Offset; | ||||
| 553 | if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { | ||||
| 554 | if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { | ||||
| 555 | auto *VT = cast<FixedVectorType>(IE->getType()); | ||||
| 556 | if (CI->getValue().uge(VT->getNumElements())) | ||||
| 557 | return UndefMaskElem; | ||||
| 558 | Index *= VT->getNumElements(); | ||||
| 559 | Index += CI->getZExtValue(); | ||||
| 560 | return Index; | ||||
| 561 | } | ||||
| 562 | if (isa<UndefValue>(IE->getOperand(2))) | ||||
| 563 | return UndefMaskElem; | ||||
| 564 | return None; | ||||
| 565 | } | ||||
| 566 | |||||
| 567 | auto *IV = cast<InsertValueInst>(InsertInst); | ||||
| 568 | Type *CurrentType = IV->getType(); | ||||
| 569 | for (unsigned I : IV->indices()) { | ||||
| 570 | if (auto *ST = dyn_cast<StructType>(CurrentType)) { | ||||
| 571 | Index *= ST->getNumElements(); | ||||
| 572 | CurrentType = ST->getElementType(I); | ||||
| 573 | } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { | ||||
| 574 | Index *= AT->getNumElements(); | ||||
| 575 | CurrentType = AT->getElementType(); | ||||
| 576 | } else { | ||||
| 577 | return None; | ||||
| 578 | } | ||||
| 579 | Index += I; | ||||
| 580 | } | ||||
| 581 | return Index; | ||||
| 582 | } | ||||
| 583 | |||||
| 584 | namespace slpvectorizer { | ||||
| 585 | |||||
| 586 | /// Bottom Up SLP Vectorizer. | ||||
| 587 | class BoUpSLP { | ||||
| 588 | struct TreeEntry; | ||||
| 589 | struct ScheduleData; | ||||
| 590 | |||||
| 591 | public: | ||||
| 592 | using ValueList = SmallVector<Value *, 8>; | ||||
| 593 | using InstrList = SmallVector<Instruction *, 16>; | ||||
| 594 | using ValueSet = SmallPtrSet<Value *, 16>; | ||||
| 595 | using StoreList = SmallVector<StoreInst *, 8>; | ||||
| 596 | using ExtraValueToDebugLocsMap = | ||||
| 597 | MapVector<Value *, SmallVector<Instruction *, 2>>; | ||||
| 598 | using OrdersType = SmallVector<unsigned, 4>; | ||||
| 599 | |||||
| 600 | BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, | ||||
| 601 | TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, | ||||
| 602 | DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, | ||||
| 603 | const DataLayout *DL, OptimizationRemarkEmitter *ORE) | ||||
| 604 | : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), | ||||
| 605 | DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { | ||||
| 606 | CodeMetrics::collectEphemeralValues(F, AC, EphValues); | ||||
| 607 | // Use the vector register size specified by the target unless overridden | ||||
| 608 | // by a command-line option. | ||||
| 609 | // TODO: It would be better to limit the vectorization factor based on | ||||
| 610 | // data type rather than just register size. For example, x86 AVX has | ||||
| 611 | // 256-bit registers, but it does not support integer operations | ||||
| 612 | // at that width (that requires AVX2). | ||||
| 613 | if (MaxVectorRegSizeOption.getNumOccurrences()) | ||||
| 614 | MaxVecRegSize = MaxVectorRegSizeOption; | ||||
| 615 | else | ||||
| 616 | MaxVecRegSize = | ||||
| 617 | TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) | ||||
| 618 | .getFixedSize(); | ||||
| 619 | |||||
| 620 | if (MinVectorRegSizeOption.getNumOccurrences()) | ||||
| 621 | MinVecRegSize = MinVectorRegSizeOption; | ||||
| 622 | else | ||||
| 623 | MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); | ||||
| 624 | } | ||||
| 625 | |||||
| 626 | /// Vectorize the tree that starts with the elements in \p VL. | ||||
| 627 | /// Returns the vectorized root. | ||||
| 628 | Value *vectorizeTree(); | ||||
| 629 | |||||
| 630 | /// Vectorize the tree but with the list of externally used values \p | ||||
| 631 | /// ExternallyUsedValues. Values in this MapVector can be replaced but the | ||||
| 632 | /// generated extractvalue instructions. | ||||
| 633 | Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues); | ||||
| 634 | |||||
| 635 | /// \returns the cost incurred by unwanted spills and fills, caused by | ||||
| 636 | /// holding live values over call sites. | ||||
| 637 | InstructionCost getSpillCost() const; | ||||
| 638 | |||||
| 639 | /// \returns the vectorization cost of the subtree that starts at \p VL. | ||||
| 640 | /// A negative number means that this is profitable. | ||||
| 641 | InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None); | ||||
| 642 | |||||
| 643 | /// Construct a vectorizable tree that starts at \p Roots, ignoring users for | ||||
| 644 | /// the purpose of scheduling and extraction in the \p UserIgnoreLst. | ||||
| 645 | void buildTree(ArrayRef<Value *> Roots, | ||||
| 646 | ArrayRef<Value *> UserIgnoreLst = None); | ||||
| 647 | |||||
| 648 | /// Construct a vectorizable tree that starts at \p Roots, ignoring users for | ||||
| 649 | /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking | ||||
| 650 | /// into account (and updating it, if required) list of externally used | ||||
| 651 | /// values stored in \p ExternallyUsedValues. | ||||
| 652 | void buildTree(ArrayRef<Value *> Roots, | ||||
| 653 | ExtraValueToDebugLocsMap &ExternallyUsedValues, | ||||
| 654 | ArrayRef<Value *> UserIgnoreLst = None); | ||||
| 655 | |||||
| 656 | /// Clear the internal data structures that are created by 'buildTree'. | ||||
| 657 | void deleteTree() { | ||||
| 658 | VectorizableTree.clear(); | ||||
| 659 | ScalarToTreeEntry.clear(); | ||||
| 660 | MustGather.clear(); | ||||
| 661 | ExternalUses.clear(); | ||||
| 662 | NumOpsWantToKeepOrder.clear(); | ||||
| 663 | NumOpsWantToKeepOriginalOrder = 0; | ||||
| 664 | for (auto &Iter : BlocksSchedules) { | ||||
| 665 | BlockScheduling *BS = Iter.second.get(); | ||||
| 666 | BS->clear(); | ||||
| 667 | } | ||||
| 668 | MinBWs.clear(); | ||||
| 669 | InstrElementSize.clear(); | ||||
| 670 | } | ||||
| 671 | |||||
| 672 | unsigned getTreeSize() const { return VectorizableTree.size(); } | ||||
| 673 | |||||
| 674 | /// Perform LICM and CSE on the newly generated gather sequences. | ||||
| 675 | void optimizeGatherSequence(); | ||||
| 676 | |||||
| 677 | /// \returns The best order of instructions for vectorization. | ||||
| 678 | Optional<ArrayRef<unsigned>> bestOrder() const { | ||||
| 679 | assert(llvm::all_of(((void)0) | ||||
| 680 | NumOpsWantToKeepOrder,((void)0) | ||||
| 681 | [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {((void)0) | ||||
| 682 | return D.getFirst().size() ==((void)0) | ||||
| 683 | VectorizableTree[0]->Scalars.size();((void)0) | ||||
| 684 | }) &&((void)0) | ||||
| 685 | "All orders must have the same size as number of instructions in "((void)0) | ||||
| 686 | "tree node.")((void)0); | ||||
| 687 | auto I = std::max_element( | ||||
| 688 | NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), | ||||
| 689 | [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, | ||||
| 690 | const decltype(NumOpsWantToKeepOrder)::value_type &D2) { | ||||
| 691 | return D1.second < D2.second; | ||||
| 692 | }); | ||||
| 693 | if (I == NumOpsWantToKeepOrder.end() || | ||||
| 694 | I->getSecond() <= NumOpsWantToKeepOriginalOrder) | ||||
| 695 | return None; | ||||
| 696 | |||||
| 697 | return makeArrayRef(I->getFirst()); | ||||
| 698 | } | ||||
| 699 | |||||
| 700 | /// Builds the correct order for root instructions. | ||||
| 701 | /// If some leaves have the same instructions to be vectorized, we may | ||||
| 702 | /// incorrectly evaluate the best order for the root node (it is built for the | ||||
| 703 | /// vector of instructions without repeated instructions and, thus, has less | ||||
| 704 | /// elements than the root node). This function builds the correct order for | ||||
| 705 | /// the root node. | ||||
| 706 | /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves | ||||
| 707 | /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first | ||||
| 708 | /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should | ||||
| 709 | /// be reordered, the best order will be \<1, 0\>. We need to extend this | ||||
| 710 | /// order for the root node. For the root node this order should look like | ||||
| 711 | /// \<3, 0, 1, 2\>. This function extends the order for the reused | ||||
| 712 | /// instructions. | ||||
| 713 | void findRootOrder(OrdersType &Order) { | ||||
| 714 | // If the leaf has the same number of instructions to vectorize as the root | ||||
| 715 | // - order must be set already. | ||||
| 716 | unsigned RootSize = VectorizableTree[0]->Scalars.size(); | ||||
| 717 | if (Order.size() == RootSize) | ||||
| 718 | return; | ||||
| 719 | SmallVector<unsigned, 4> RealOrder(Order.size()); | ||||
| 720 | std::swap(Order, RealOrder); | ||||
| 721 | SmallVector<int, 4> Mask; | ||||
| 722 | inversePermutation(RealOrder, Mask); | ||||
| 723 | Order.assign(Mask.begin(), Mask.end()); | ||||
| 724 | // The leaf has less number of instructions - need to find the true order of | ||||
| 725 | // the root. | ||||
| 726 | // Scan the nodes starting from the leaf back to the root. | ||||
| 727 | const TreeEntry *PNode = VectorizableTree.back().get(); | ||||
| 728 | SmallVector<const TreeEntry *, 4> Nodes(1, PNode); | ||||
| 729 | SmallPtrSet<const TreeEntry *, 4> Visited; | ||||
| 730 | while (!Nodes.empty() && Order.size() != RootSize) { | ||||
| 731 | const TreeEntry *PNode = Nodes.pop_back_val(); | ||||
| 732 | if (!Visited.insert(PNode).second) | ||||
| 733 | continue; | ||||
| 734 | const TreeEntry &Node = *PNode; | ||||
| 735 | for (const EdgeInfo &EI : Node.UserTreeIndices) | ||||
| 736 | if (EI.UserTE) | ||||
| 737 | Nodes.push_back(EI.UserTE); | ||||
| 738 | if (Node.ReuseShuffleIndices.empty()) | ||||
| 739 | continue; | ||||
| 740 | // Build the order for the parent node. | ||||
| 741 | OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); | ||||
| 742 | SmallVector<unsigned, 4> OrderCounter(Order.size(), 0); | ||||
| 743 | // The algorithm of the order extension is: | ||||
| 744 | // 1. Calculate the number of the same instructions for the order. | ||||
| 745 | // 2. Calculate the index of the new order: total number of instructions | ||||
| 746 | // with order less than the order of the current instruction + reuse | ||||
| 747 | // number of the current instruction. | ||||
| 748 | // 3. The new order is just the index of the instruction in the original | ||||
| 749 | // vector of the instructions. | ||||
| 750 | for (unsigned I : Node.ReuseShuffleIndices) | ||||
| 751 | ++OrderCounter[Order[I]]; | ||||
| 752 | SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0); | ||||
| 753 | for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { | ||||
| 754 | unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; | ||||
| 755 | unsigned OrderIdx = Order[ReusedIdx]; | ||||
| 756 | unsigned NewIdx = 0; | ||||
| 757 | for (unsigned J = 0; J < OrderIdx; ++J) | ||||
| 758 | NewIdx += OrderCounter[J]; | ||||
| 759 | NewIdx += CurrentCounter[OrderIdx]; | ||||
| 760 | ++CurrentCounter[OrderIdx]; | ||||
| 761 | assert(NewOrder[NewIdx] == RootSize &&((void)0) | ||||
| 762 | "The order index should not be written already.")((void)0); | ||||
| 763 | NewOrder[NewIdx] = I; | ||||
| 764 | } | ||||
| 765 | std::swap(Order, NewOrder); | ||||
| 766 | } | ||||
| 767 | assert(Order.size() == RootSize &&((void)0) | ||||
| 768 | "Root node is expected or the size of the order must be the same as "((void)0) | ||||
| 769 | "the number of elements in the root node.")((void)0); | ||||
| 770 | assert(llvm::all_of(Order,((void)0) | ||||
| 771 | [RootSize](unsigned Val) { return Val != RootSize; }) &&((void)0) | ||||
| 772 | "All indices must be initialized")((void)0); | ||||
| 773 | } | ||||
| 774 | |||||
| 775 | /// \return The vector element size in bits to use when vectorizing the | ||||
| 776 | /// expression tree ending at \p V. If V is a store, the size is the width of | ||||
| 777 | /// the stored value. Otherwise, the size is the width of the largest loaded | ||||
| 778 | /// value reaching V. This method is used by the vectorizer to calculate | ||||
| 779 | /// vectorization factors. | ||||
| 780 | unsigned getVectorElementSize(Value *V); | ||||
| 781 | |||||
| 782 | /// Compute the minimum type sizes required to represent the entries in a | ||||
| 783 | /// vectorizable tree. | ||||
| 784 | void computeMinimumValueSizes(); | ||||
| 785 | |||||
| 786 | // \returns maximum vector register size as set by TTI or overridden by cl::opt. | ||||
| 787 | unsigned getMaxVecRegSize() const { | ||||
| 788 | return MaxVecRegSize; | ||||
| 789 | } | ||||
| 790 | |||||
| 791 | // \returns minimum vector register size as set by cl::opt. | ||||
| 792 | unsigned getMinVecRegSize() const { | ||||
| 793 | return MinVecRegSize; | ||||
| 794 | } | ||||
| 795 | |||||
| 796 | unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { | ||||
| 797 | unsigned MaxVF = MaxVFOption.getNumOccurrences() ? | ||||
| 798 | MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); | ||||
| 799 | return MaxVF ? MaxVF : UINT_MAX(2147483647 *2U +1U); | ||||
| 800 | } | ||||
| 801 | |||||
| 802 | /// Check if homogeneous aggregate is isomorphic to some VectorType. | ||||
| 803 | /// Accepts homogeneous multidimensional aggregate of scalars/vectors like | ||||
| 804 | /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, | ||||
| 805 | /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. | ||||
| 806 | /// | ||||
| 807 | /// \returns number of elements in vector if isomorphism exists, 0 otherwise. | ||||
| 808 | unsigned canMapToVector(Type *T, const DataLayout &DL) const; | ||||
| 809 | |||||
| 810 | /// \returns True if the VectorizableTree is both tiny and not fully | ||||
| 811 | /// vectorizable. We do not vectorize such trees. | ||||
| 812 | bool isTreeTinyAndNotFullyVectorizable() const; | ||||
| 813 | |||||
| 814 | /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values | ||||
| 815 | /// can be load combined in the backend. Load combining may not be allowed in | ||||
| 816 | /// the IR optimizer, so we do not want to alter the pattern. For example, | ||||
| 817 | /// partially transforming a scalar bswap() pattern into vector code is | ||||
| 818 | /// effectively impossible for the backend to undo. | ||||
| 819 | /// TODO: If load combining is allowed in the IR optimizer, this analysis | ||||
| 820 | /// may not be necessary. | ||||
| 821 | bool isLoadCombineReductionCandidate(RecurKind RdxKind) const; | ||||
| 822 | |||||
| 823 | /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values | ||||
| 824 | /// can be load combined in the backend. Load combining may not be allowed in | ||||
| 825 | /// the IR optimizer, so we do not want to alter the pattern. For example, | ||||
| 826 | /// partially transforming a scalar bswap() pattern into vector code is | ||||
| 827 | /// effectively impossible for the backend to undo. | ||||
| 828 | /// TODO: If load combining is allowed in the IR optimizer, this analysis | ||||
| 829 | /// may not be necessary. | ||||
| 830 | bool isLoadCombineCandidate() const; | ||||
| 831 | |||||
| 832 | OptimizationRemarkEmitter *getORE() { return ORE; } | ||||
| 833 | |||||
| 834 | /// This structure holds any data we need about the edges being traversed | ||||
| 835 | /// during buildTree_rec(). We keep track of: | ||||
| 836 | /// (i) the user TreeEntry index, and | ||||
| 837 | /// (ii) the index of the edge. | ||||
| 838 | struct EdgeInfo { | ||||
| 839 | EdgeInfo() = default; | ||||
| 840 | EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx) | ||||
| 841 | : UserTE(UserTE), EdgeIdx(EdgeIdx) {} | ||||
| 842 | /// The user TreeEntry. | ||||
| 843 | TreeEntry *UserTE = nullptr; | ||||
| 844 | /// The operand index of the use. | ||||
| 845 | unsigned EdgeIdx = UINT_MAX(2147483647 *2U +1U); | ||||
| 846 | #ifndef NDEBUG1 | ||||
| 847 | friend inline raw_ostream &operator<<(raw_ostream &OS, | ||||
| 848 | const BoUpSLP::EdgeInfo &EI) { | ||||
| 849 | EI.dump(OS); | ||||
| 850 | return OS; | ||||
| 851 | } | ||||
| 852 | /// Debug print. | ||||
| 853 | void dump(raw_ostream &OS) const { | ||||
| 854 | OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null") | ||||
| 855 | << " EdgeIdx:" << EdgeIdx << "}"; | ||||
| 856 | } | ||||
| 857 | LLVM_DUMP_METHOD__attribute__((noinline)) void dump() const { dump(dbgs()); } | ||||
| 858 | #endif | ||||
| 859 | }; | ||||
| 860 | |||||
| 861 | /// A helper data structure to hold the operands of a vector of instructions. | ||||
| 862 | /// This supports a fixed vector length for all operand vectors. | ||||
| 863 | class VLOperands { | ||||
| 864 | /// For each operand we need (i) the value, and (ii) the opcode that it | ||||
| 865 | /// would be attached to if the expression was in a left-linearized form. | ||||
| 866 | /// This is required to avoid illegal operand reordering. | ||||
| 867 | /// For example: | ||||
| 868 | /// \verbatim | ||||
| 869 | /// 0 Op1 | ||||
| 870 | /// |/ | ||||
| 871 | /// Op1 Op2 Linearized + Op2 | ||||
| 872 | /// \ / ----------> |/ | ||||
| 873 | /// - - | ||||
| 874 | /// | ||||
| 875 | /// Op1 - Op2 (0 + Op1) - Op2 | ||||
| 876 | /// \endverbatim | ||||
| 877 | /// | ||||
| 878 | /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. | ||||
| 879 | /// | ||||
| 880 | /// Another way to think of this is to track all the operations across the | ||||
| 881 | /// path from the operand all the way to the root of the tree and to | ||||
| 882 | /// calculate the operation that corresponds to this path. For example, the | ||||
| 883 | /// path from Op2 to the root crosses the RHS of the '-', therefore the | ||||
| 884 | /// corresponding operation is a '-' (which matches the one in the | ||||
| 885 | /// linearized tree, as shown above). | ||||
| 886 | /// | ||||
| 887 | /// For lack of a better term, we refer to this operation as Accumulated | ||||
| 888 | /// Path Operation (APO). | ||||
| 889 | struct OperandData { | ||||
| 890 | OperandData() = default; | ||||
| 891 | OperandData(Value *V, bool APO, bool IsUsed) | ||||
| 892 | : V(V), APO(APO), IsUsed(IsUsed) {} | ||||
| 893 | /// The operand value. | ||||
| 894 | Value *V = nullptr; | ||||
| 895 | /// TreeEntries only allow a single opcode, or an alternate sequence of | ||||
| 896 | /// them (e.g, +, -). Therefore, we can safely use a boolean value for the | ||||
| 897 | /// APO. It is set to 'true' if 'V' is attached to an inverse operation | ||||
| 898 | /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise | ||||
| 899 | /// (e.g., Add/Mul) | ||||
| 900 | bool APO = false; | ||||
| 901 | /// Helper data for the reordering function. | ||||
| 902 | bool IsUsed = false; | ||||
| 903 | }; | ||||
| 904 | |||||
| 905 | /// During operand reordering, we are trying to select the operand at lane | ||||
| 906 | /// that matches best with the operand at the neighboring lane. Our | ||||
| 907 | /// selection is based on the type of value we are looking for. For example, | ||||
| 908 | /// if the neighboring lane has a load, we need to look for a load that is | ||||
| 909 | /// accessing a consecutive address. These strategies are summarized in the | ||||
| 910 | /// 'ReorderingMode' enumerator. | ||||
| 911 | enum class ReorderingMode { | ||||
| 912 | Load, ///< Matching loads to consecutive memory addresses | ||||
| 913 | Opcode, ///< Matching instructions based on opcode (same or alternate) | ||||
| 914 | Constant, ///< Matching constants | ||||
| 915 | Splat, ///< Matching the same instruction multiple times (broadcast) | ||||
| 916 | Failed, ///< We failed to create a vectorizable group | ||||
| 917 | }; | ||||
| 918 | |||||
| 919 | using OperandDataVec = SmallVector<OperandData, 2>; | ||||
| 920 | |||||
| 921 | /// A vector of operand vectors. | ||||
| 922 | SmallVector<OperandDataVec, 4> OpsVec; | ||||
| 923 | |||||
| 924 | const DataLayout &DL; | ||||
| 925 | ScalarEvolution &SE; | ||||
| 926 | const BoUpSLP &R; | ||||
| 927 | |||||
| 928 | /// \returns the operand data at \p OpIdx and \p Lane. | ||||
| 929 | OperandData &getData(unsigned OpIdx, unsigned Lane) { | ||||
| 930 | return OpsVec[OpIdx][Lane]; | ||||
| 931 | } | ||||
| 932 | |||||
| 933 | /// \returns the operand data at \p OpIdx and \p Lane. Const version. | ||||
| 934 | const OperandData &getData(unsigned OpIdx, unsigned Lane) const { | ||||
| 935 | return OpsVec[OpIdx][Lane]; | ||||
| 936 | } | ||||
| 937 | |||||
| 938 | /// Clears the used flag for all entries. | ||||
| 939 | void clearUsed() { | ||||
| 940 | for (unsigned OpIdx = 0, NumOperands = getNumOperands(); | ||||
| 941 | OpIdx != NumOperands; ++OpIdx) | ||||
| 942 | for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; | ||||
| 943 | ++Lane) | ||||
| 944 | OpsVec[OpIdx][Lane].IsUsed = false; | ||||
| 945 | } | ||||
| 946 | |||||
| 947 | /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. | ||||
| 948 | void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { | ||||
| 949 | std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); | ||||
| 950 | } | ||||
| 951 | |||||
| 952 | // The hard-coded scores listed here are not very important. When computing | ||||
| 953 | // the scores of matching one sub-tree with another, we are basically | ||||
| 954 | // counting the number of values that are matching. So even if all scores | ||||
| 955 | // are set to 1, we would still get a decent matching result. | ||||
| 956 | // However, sometimes we have to break ties. For example we may have to | ||||
| 957 | // choose between matching loads vs matching opcodes. This is what these | ||||
| 958 | // scores are helping us with: they provide the order of preference. | ||||
| 959 | |||||
| 960 | /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). | ||||
| 961 | static const int ScoreConsecutiveLoads = 3; | ||||
| 962 | /// ExtractElementInst from same vector and consecutive indexes. | ||||
| 963 | static const int ScoreConsecutiveExtracts = 3; | ||||
| 964 | /// Constants. | ||||
| 965 | static const int ScoreConstants = 2; | ||||
| 966 | /// Instructions with the same opcode. | ||||
| 967 | static const int ScoreSameOpcode = 2; | ||||
| 968 | /// Instructions with alt opcodes (e.g, add + sub). | ||||
| 969 | static const int ScoreAltOpcodes = 1; | ||||
| 970 | /// Identical instructions (a.k.a. splat or broadcast). | ||||
| 971 | static const int ScoreSplat = 1; | ||||
| 972 | /// Matching with an undef is preferable to failing. | ||||
| 973 | static const int ScoreUndef = 1; | ||||
| 974 | /// Score for failing to find a decent match. | ||||
| 975 | static const int ScoreFail = 0; | ||||
| 976 | /// User exteranl to the vectorized code. | ||||
| 977 | static const int ExternalUseCost = 1; | ||||
| 978 | /// The user is internal but in a different lane. | ||||
| 979 | static const int UserInDiffLaneCost = ExternalUseCost; | ||||
| 980 | |||||
| 981 | /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. | ||||
| 982 | static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, | ||||
| 983 | ScalarEvolution &SE) { | ||||
| 984 | auto *LI1 = dyn_cast<LoadInst>(V1); | ||||
| 985 | auto *LI2 = dyn_cast<LoadInst>(V2); | ||||
| 986 | if (LI1 && LI2) { | ||||
| 987 | if (LI1->getParent() != LI2->getParent()) | ||||
| 988 | return VLOperands::ScoreFail; | ||||
| 989 | |||||
| 990 | Optional<int> Dist = getPointersDiff( | ||||
| 991 | LI1->getType(), LI1->getPointerOperand(), LI2->getType(), | ||||
| 992 | LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); | ||||
| 993 | return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads | ||||
| 994 | : VLOperands::ScoreFail; | ||||
| 995 | } | ||||
| 996 | |||||
| 997 | auto *C1 = dyn_cast<Constant>(V1); | ||||
| 998 | auto *C2 = dyn_cast<Constant>(V2); | ||||
| 999 | if (C1 && C2) | ||||
| 1000 | return VLOperands::ScoreConstants; | ||||
| 1001 | |||||
| 1002 | // Extracts from consecutive indexes of the same vector better score as | ||||
| 1003 | // the extracts could be optimized away. | ||||
| 1004 | Value *EV; | ||||
| 1005 | ConstantInt *Ex1Idx, *Ex2Idx; | ||||
| 1006 | if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) && | ||||
| 1007 | match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) && | ||||
| 1008 | Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue()) | ||||
| 1009 | return VLOperands::ScoreConsecutiveExtracts; | ||||
| 1010 | |||||
| 1011 | auto *I1 = dyn_cast<Instruction>(V1); | ||||
| 1012 | auto *I2 = dyn_cast<Instruction>(V2); | ||||
| 1013 | if (I1 && I2) { | ||||
| 1014 | if (I1 == I2) | ||||
| 1015 | return VLOperands::ScoreSplat; | ||||
| 1016 | InstructionsState S = getSameOpcode({I1, I2}); | ||||
| 1017 | // Note: Only consider instructions with <= 2 operands to avoid | ||||
| 1018 | // complexity explosion. | ||||
| 1019 | if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) | ||||
| 1020 | return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes | ||||
| 1021 | : VLOperands::ScoreSameOpcode; | ||||
| 1022 | } | ||||
| 1023 | |||||
| 1024 | if (isa<UndefValue>(V2)) | ||||
| 1025 | return VLOperands::ScoreUndef; | ||||
| 1026 | |||||
| 1027 | return VLOperands::ScoreFail; | ||||
| 1028 | } | ||||
| 1029 | |||||
| 1030 | /// Holds the values and their lane that are taking part in the look-ahead | ||||
| 1031 | /// score calculation. This is used in the external uses cost calculation. | ||||
| 1032 | SmallDenseMap<Value *, int> InLookAheadValues; | ||||
| 1033 | |||||
| 1034 | /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are | ||||
| 1035 | /// either external to the vectorized code, or require shuffling. | ||||
| 1036 | int getExternalUsesCost(const std::pair<Value *, int> &LHS, | ||||
| 1037 | const std::pair<Value *, int> &RHS) { | ||||
| 1038 | int Cost = 0; | ||||
| 1039 | std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}}; | ||||
| 1040 | for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { | ||||
| 1041 | Value *V = Values[Idx].first; | ||||
| 1042 | if (isa<Constant>(V)) { | ||||
| 1043 | // Since this is a function pass, it doesn't make semantic sense to | ||||
| 1044 | // walk the users of a subclass of Constant. The users could be in | ||||
| 1045 | // another function, or even another module that happens to be in | ||||
| 1046 | // the same LLVMContext. | ||||
| 1047 | continue; | ||||
| 1048 | } | ||||
| 1049 | |||||
| 1050 | // Calculate the absolute lane, using the minimum relative lane of LHS | ||||
| 1051 | // and RHS as base and Idx as the offset. | ||||
| 1052 | int Ln = std::min(LHS.second, RHS.second) + Idx; | ||||
| 1053 | assert(Ln >= 0 && "Bad lane calculation")((void)0); | ||||
| 1054 | unsigned UsersBudget = LookAheadUsersBudget; | ||||
| 1055 | for (User *U : V->users()) { | ||||
| 1056 | if (const TreeEntry *UserTE = R.getTreeEntry(U)) { | ||||
| 1057 | // The user is in the VectorizableTree. Check if we need to insert. | ||||
| 1058 | auto It = llvm::find(UserTE->Scalars, U); | ||||
| 1059 | assert(It != UserTE->Scalars.end() && "U is in UserTE")((void)0); | ||||
| 1060 | int UserLn = std::distance(UserTE->Scalars.begin(), It); | ||||
| 1061 | assert(UserLn >= 0 && "Bad lane")((void)0); | ||||
| 1062 | if (UserLn != Ln) | ||||
| 1063 | Cost += UserInDiffLaneCost; | ||||
| 1064 | } else { | ||||
| 1065 | // Check if the user is in the look-ahead code. | ||||
| 1066 | auto It2 = InLookAheadValues.find(U); | ||||
| 1067 | if (It2 != InLookAheadValues.end()) { | ||||
| 1068 | // The user is in the look-ahead code. Check the lane. | ||||
| 1069 | if (It2->second != Ln) | ||||
| 1070 | Cost += UserInDiffLaneCost; | ||||
| 1071 | } else { | ||||
| 1072 | // The user is neither in SLP tree nor in the look-ahead code. | ||||
| 1073 | Cost += ExternalUseCost; | ||||
| 1074 | } | ||||
| 1075 | } | ||||
| 1076 | // Limit the number of visited uses to cap compilation time. | ||||
| 1077 | if (--UsersBudget == 0) | ||||
| 1078 | break; | ||||
| 1079 | } | ||||
| 1080 | } | ||||
| 1081 | return Cost; | ||||
| 1082 | } | ||||
| 1083 | |||||
| 1084 | /// Go through the operands of \p LHS and \p RHS recursively until \p | ||||
| 1085 | /// MaxLevel, and return the cummulative score. For example: | ||||
| 1086 | /// \verbatim | ||||
| 1087 | /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] | ||||
| 1088 | /// \ / \ / \ / \ / | ||||
| 1089 | /// + + + + | ||||
| 1090 | /// G1 G2 G3 G4 | ||||
| 1091 | /// \endverbatim | ||||
| 1092 | /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at | ||||
| 1093 | /// each level recursively, accumulating the score. It starts from matching | ||||
| 1094 | /// the additions at level 0, then moves on to the loads (level 1). The | ||||
| 1095 | /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and | ||||
| 1096 | /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while | ||||
| 1097 | /// {A[0],C[0]} has a score of VLOperands::ScoreFail. | ||||
| 1098 | /// Please note that the order of the operands does not matter, as we | ||||
| 1099 | /// evaluate the score of all profitable combinations of operands. In | ||||
| 1100 | /// other words the score of G1 and G4 is the same as G1 and G2. This | ||||
| 1101 | /// heuristic is based on ideas described in: | ||||
| 1102 | /// Look-ahead SLP: Auto-vectorization in the presence of commutative | ||||
| 1103 | /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, | ||||
| 1104 | /// LuÃs F. W. Góes | ||||
| 1105 | int getScoreAtLevelRec(const std::pair<Value *, int> &LHS, | ||||
| 1106 | const std::pair<Value *, int> &RHS, int CurrLevel, | ||||
| 1107 | int MaxLevel) { | ||||
| 1108 | |||||
| 1109 | Value *V1 = LHS.first; | ||||
| 1110 | Value *V2 = RHS.first; | ||||
| 1111 | // Get the shallow score of V1 and V2. | ||||
| 1112 | int ShallowScoreAtThisLevel = | ||||
| 1113 | std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - | ||||
| 1114 | getExternalUsesCost(LHS, RHS)); | ||||
| 1115 | int Lane1 = LHS.second; | ||||
| 1116 | int Lane2 = RHS.second; | ||||
| 1117 | |||||
| 1118 | // If reached MaxLevel, | ||||
| 1119 | // or if V1 and V2 are not instructions, | ||||
| 1120 | // or if they are SPLAT, | ||||
| 1121 | // or if they are not consecutive, early return the current cost. | ||||
| 1122 | auto *I1 = dyn_cast<Instruction>(V1); | ||||
| 1123 | auto *I2 = dyn_cast<Instruction>(V2); | ||||
| 1124 | if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || | ||||
| 1125 | ShallowScoreAtThisLevel == VLOperands::ScoreFail || | ||||
| 1126 | (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) | ||||
| 1127 | return ShallowScoreAtThisLevel; | ||||
| 1128 | assert(I1 && I2 && "Should have early exited.")((void)0); | ||||
| 1129 | |||||
| 1130 | // Keep track of in-tree values for determining the external-use cost. | ||||
| 1131 | InLookAheadValues[V1] = Lane1; | ||||
| 1132 | InLookAheadValues[V2] = Lane2; | ||||
| 1133 | |||||
| 1134 | // Contains the I2 operand indexes that got matched with I1 operands. | ||||
| 1135 | SmallSet<unsigned, 4> Op2Used; | ||||
| 1136 | |||||
| 1137 | // Recursion towards the operands of I1 and I2. We are trying all possbile | ||||
| 1138 | // operand pairs, and keeping track of the best score. | ||||
| 1139 | for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); | ||||
| 1140 | OpIdx1 != NumOperands1; ++OpIdx1) { | ||||
| 1141 | // Try to pair op1I with the best operand of I2. | ||||
| 1142 | int MaxTmpScore = 0; | ||||
| 1143 | unsigned MaxOpIdx2 = 0; | ||||
| 1144 | bool FoundBest = false; | ||||
| 1145 | // If I2 is commutative try all combinations. | ||||
| 1146 | unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1; | ||||
| 1147 | unsigned ToIdx = isCommutative(I2) | ||||
| 1148 | ? I2->getNumOperands() | ||||
| 1149 | : std::min(I2->getNumOperands(), OpIdx1 + 1); | ||||
| 1150 | assert(FromIdx <= ToIdx && "Bad index")((void)0); | ||||
| 1151 | for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { | ||||
| 1152 | // Skip operands already paired with OpIdx1. | ||||
| 1153 | if (Op2Used.count(OpIdx2)) | ||||
| 1154 | continue; | ||||
| 1155 | // Recursively calculate the cost at each level | ||||
| 1156 | int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, | ||||
| 1157 | {I2->getOperand(OpIdx2), Lane2}, | ||||
| 1158 | CurrLevel + 1, MaxLevel); | ||||
| 1159 | // Look for the best score. | ||||
| 1160 | if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { | ||||
| 1161 | MaxTmpScore = TmpScore; | ||||
| 1162 | MaxOpIdx2 = OpIdx2; | ||||
| 1163 | FoundBest = true; | ||||
| 1164 | } | ||||
| 1165 | } | ||||
| 1166 | if (FoundBest) { | ||||
| 1167 | // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. | ||||
| 1168 | Op2Used.insert(MaxOpIdx2); | ||||
| 1169 | ShallowScoreAtThisLevel += MaxTmpScore; | ||||
| 1170 | } | ||||
| 1171 | } | ||||
| 1172 | return ShallowScoreAtThisLevel; | ||||
| 1173 | } | ||||
| 1174 | |||||
| 1175 | /// \Returns the look-ahead score, which tells us how much the sub-trees | ||||
| 1176 | /// rooted at \p LHS and \p RHS match, the more they match the higher the | ||||
| 1177 | /// score. This helps break ties in an informed way when we cannot decide on | ||||
| 1178 | /// the order of the operands by just considering the immediate | ||||
| 1179 | /// predecessors. | ||||
| 1180 | int getLookAheadScore(const std::pair<Value *, int> &LHS, | ||||
| 1181 | const std::pair<Value *, int> &RHS) { | ||||
| 1182 | InLookAheadValues.clear(); | ||||
| 1183 | return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); | ||||
| 1184 | } | ||||
| 1185 | |||||
| 1186 | // Search all operands in Ops[*][Lane] for the one that matches best | ||||
| 1187 | // Ops[OpIdx][LastLane] and return its opreand index. | ||||
| 1188 | // If no good match can be found, return None. | ||||
| 1189 | Optional<unsigned> | ||||
| 1190 | getBestOperand(unsigned OpIdx, int Lane, int LastLane, | ||||
| 1191 | ArrayRef<ReorderingMode> ReorderingModes) { | ||||
| 1192 | unsigned NumOperands = getNumOperands(); | ||||
| 1193 | |||||
| 1194 | // The operand of the previous lane at OpIdx. | ||||
| 1195 | Value *OpLastLane = getData(OpIdx, LastLane).V; | ||||
| 1196 | |||||
| 1197 | // Our strategy mode for OpIdx. | ||||
| 1198 | ReorderingMode RMode = ReorderingModes[OpIdx]; | ||||
| 1199 | |||||
| 1200 | // The linearized opcode of the operand at OpIdx, Lane. | ||||
| 1201 | bool OpIdxAPO = getData(OpIdx, Lane).APO; | ||||
| 1202 | |||||
| 1203 | // The best operand index and its score. | ||||
| 1204 | // Sometimes we have more than one option (e.g., Opcode and Undefs), so we | ||||
| 1205 | // are using the score to differentiate between the two. | ||||
| 1206 | struct BestOpData { | ||||
| 1207 | Optional<unsigned> Idx = None; | ||||
| 1208 | unsigned Score = 0; | ||||
| 1209 | } BestOp; | ||||
| 1210 | |||||
| 1211 | // Iterate through all unused operands and look for the best. | ||||
| 1212 | for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { | ||||
| 1213 | // Get the operand at Idx and Lane. | ||||
| 1214 | OperandData &OpData = getData(Idx, Lane); | ||||
| 1215 | Value *Op = OpData.V; | ||||
| 1216 | bool OpAPO = OpData.APO; | ||||
| 1217 | |||||
| 1218 | // Skip already selected operands. | ||||
| 1219 | if (OpData.IsUsed) | ||||
| 1220 | continue; | ||||
| 1221 | |||||
| 1222 | // Skip if we are trying to move the operand to a position with a | ||||
| 1223 | // different opcode in the linearized tree form. This would break the | ||||
| 1224 | // semantics. | ||||
| 1225 | if (OpAPO != OpIdxAPO) | ||||
| 1226 | continue; | ||||
| 1227 | |||||
| 1228 | // Look for an operand that matches the current mode. | ||||
| 1229 | switch (RMode) { | ||||
| 1230 | case ReorderingMode::Load: | ||||
| 1231 | case ReorderingMode::Constant: | ||||
| 1232 | case ReorderingMode::Opcode: { | ||||
| 1233 | bool LeftToRight = Lane > LastLane; | ||||
| 1234 | Value *OpLeft = (LeftToRight) ? OpLastLane : Op; | ||||
| 1235 | Value *OpRight = (LeftToRight) ? Op : OpLastLane; | ||||
| 1236 | unsigned Score = | ||||
| 1237 | getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); | ||||
| 1238 | if (Score > BestOp.Score) { | ||||
| 1239 | BestOp.Idx = Idx; | ||||
| 1240 | BestOp.Score = Score; | ||||
| 1241 | } | ||||
| 1242 | break; | ||||
| 1243 | } | ||||
| 1244 | case ReorderingMode::Splat: | ||||
| 1245 | if (Op == OpLastLane) | ||||
| 1246 | BestOp.Idx = Idx; | ||||
| 1247 | break; | ||||
| 1248 | case ReorderingMode::Failed: | ||||
| 1249 | return None; | ||||
| 1250 | } | ||||
| 1251 | } | ||||
| 1252 | |||||
| 1253 | if (BestOp.Idx) { | ||||
| 1254 | getData(BestOp.Idx.getValue(), Lane).IsUsed = true; | ||||
| 1255 | return BestOp.Idx; | ||||
| 1256 | } | ||||
| 1257 | // If we could not find a good match return None. | ||||
| 1258 | return None; | ||||
| 1259 | } | ||||
| 1260 | |||||
| 1261 | /// Helper for reorderOperandVecs. \Returns the lane that we should start | ||||
| 1262 | /// reordering from. This is the one which has the least number of operands | ||||
| 1263 | /// that can freely move about. | ||||
| 1264 | unsigned getBestLaneToStartReordering() const { | ||||
| 1265 | unsigned BestLane = 0; | ||||
| 1266 | unsigned Min = UINT_MAX(2147483647 *2U +1U); | ||||
| 1267 | for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; | ||||
| 1268 | ++Lane) { | ||||
| 1269 | unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); | ||||
| 1270 | if (NumFreeOps < Min) { | ||||
| 1271 | Min = NumFreeOps; | ||||
| 1272 | BestLane = Lane; | ||||
| 1273 | } | ||||
| 1274 | } | ||||
| 1275 | return BestLane; | ||||
| 1276 | } | ||||
| 1277 | |||||
| 1278 | /// \Returns the maximum number of operands that are allowed to be reordered | ||||
| 1279 | /// for \p Lane. This is used as a heuristic for selecting the first lane to | ||||
| 1280 | /// start operand reordering. | ||||
| 1281 | unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { | ||||
| 1282 | unsigned CntTrue = 0; | ||||
| 1283 | unsigned NumOperands = getNumOperands(); | ||||
| 1284 | // Operands with the same APO can be reordered. We therefore need to count | ||||
| 1285 | // how many of them we have for each APO, like this: Cnt[APO] = x. | ||||
| 1286 | // Since we only have two APOs, namely true and false, we can avoid using | ||||
| 1287 | // a map. Instead we can simply count the number of operands that | ||||
| 1288 | // correspond to one of them (in this case the 'true' APO), and calculate | ||||
| 1289 | // the other by subtracting it from the total number of operands. | ||||
| 1290 | for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) | ||||
| 1291 | if (getData(OpIdx, Lane).APO) | ||||
| 1292 | ++CntTrue; | ||||
| 1293 | unsigned CntFalse = NumOperands - CntTrue; | ||||
| 1294 | return std::max(CntTrue, CntFalse); | ||||
| 1295 | } | ||||
| 1296 | |||||
| 1297 | /// Go through the instructions in VL and append their operands. | ||||
| 1298 | void appendOperandsOfVL(ArrayRef<Value *> VL) { | ||||
| 1299 | assert(!VL.empty() && "Bad VL")((void)0); | ||||
| 1300 | assert((empty() || VL.size() == getNumLanes()) &&((void)0) | ||||
| 1301 | "Expected same number of lanes")((void)0); | ||||
| 1302 | assert(isa<Instruction>(VL[0]) && "Expected instruction")((void)0); | ||||
| 1303 | unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands(); | ||||
| 1304 | OpsVec.resize(NumOperands); | ||||
| 1305 | unsigned NumLanes = VL.size(); | ||||
| 1306 | for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { | ||||
| 1307 | OpsVec[OpIdx].resize(NumLanes); | ||||
| 1308 | for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { | ||||
| 1309 | assert(isa<Instruction>(VL[Lane]) && "Expected instruction")((void)0); | ||||
| 1310 | // Our tree has just 3 nodes: the root and two operands. | ||||
| 1311 | // It is therefore trivial to get the APO. We only need to check the | ||||
| 1312 | // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or | ||||
| 1313 | // RHS operand. The LHS operand of both add and sub is never attached | ||||
| 1314 | // to an inversese operation in the linearized form, therefore its APO | ||||
| 1315 | // is false. The RHS is true only if VL[Lane] is an inverse operation. | ||||
| 1316 | |||||
| 1317 | // Since operand reordering is performed on groups of commutative | ||||
| 1318 | // operations or alternating sequences (e.g., +, -), we can safely | ||||
| 1319 | // tell the inverse operations by checking commutativity. | ||||
| 1320 | bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane])); | ||||
| 1321 | bool APO = (OpIdx == 0) ? false : IsInverseOperation; | ||||
| 1322 | OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx), | ||||
| 1323 | APO, false}; | ||||
| 1324 | } | ||||
| 1325 | } | ||||
| 1326 | } | ||||
| 1327 | |||||
| 1328 | /// \returns the number of operands. | ||||
| 1329 | unsigned getNumOperands() const { return OpsVec.size(); } | ||||
| 1330 | |||||
| 1331 | /// \returns the number of lanes. | ||||
| 1332 | unsigned getNumLanes() const { return OpsVec[0].size(); } | ||||
| 1333 | |||||
| 1334 | /// \returns the operand value at \p OpIdx and \p Lane. | ||||
| 1335 | Value *getValue(unsigned OpIdx, unsigned Lane) const { | ||||
| 1336 | return getData(OpIdx, Lane).V; | ||||
| 1337 | } | ||||
| 1338 | |||||
| 1339 | /// \returns true if the data structure is empty. | ||||
| 1340 | bool empty() const { return OpsVec.empty(); } | ||||
| 1341 | |||||
| 1342 | /// Clears the data. | ||||
| 1343 | void clear() { OpsVec.clear(); } | ||||
| 1344 | |||||
| 1345 | /// \Returns true if there are enough operands identical to \p Op to fill | ||||
| 1346 | /// the whole vector. | ||||
| 1347 | /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow. | ||||
| 1348 | bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { | ||||
| 1349 | bool OpAPO = getData(OpIdx, Lane).APO; | ||||
| 1350 | for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { | ||||
| 1351 | if (Ln == Lane) | ||||
| 1352 | continue; | ||||
| 1353 | // This is set to true if we found a candidate for broadcast at Lane. | ||||
| 1354 | bool FoundCandidate = false; | ||||
| 1355 | for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) { | ||||
| 1356 | OperandData &Data = getData(OpI, Ln); | ||||
| 1357 | if (Data.APO != OpAPO || Data.IsUsed) | ||||
| 1358 | continue; | ||||
| 1359 | if (Data.V == Op) { | ||||
| 1360 | FoundCandidate = true; | ||||
| 1361 | Data.IsUsed = true; | ||||
| 1362 | break; | ||||
| 1363 | } | ||||
| 1364 | } | ||||
| 1365 | if (!FoundCandidate) | ||||
| 1366 | return false; | ||||
| 1367 | } | ||||
| 1368 | return true; | ||||
| 1369 | } | ||||
| 1370 | |||||
| 1371 | public: | ||||
| 1372 | /// Initialize with all the operands of the instruction vector \p RootVL. | ||||
| 1373 | VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, | ||||
| 1374 | ScalarEvolution &SE, const BoUpSLP &R) | ||||
| 1375 | : DL(DL), SE(SE), R(R) { | ||||
| 1376 | // Append all the operands of RootVL. | ||||
| 1377 | appendOperandsOfVL(RootVL); | ||||
| 1378 | } | ||||
| 1379 | |||||
| 1380 | /// \Returns a value vector with the operands across all lanes for the | ||||
| 1381 | /// opearnd at \p OpIdx. | ||||
| 1382 | ValueList getVL(unsigned OpIdx) const { | ||||
| 1383 | ValueList OpVL(OpsVec[OpIdx].size()); | ||||
| 1384 | assert(OpsVec[OpIdx].size() == getNumLanes() &&((void)0) | ||||
| 1385 | "Expected same num of lanes across all operands")((void)0); | ||||
| 1386 | for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) | ||||
| 1387 | OpVL[Lane] = OpsVec[OpIdx][Lane].V; | ||||
| 1388 | return OpVL; | ||||
| 1389 | } | ||||
| 1390 | |||||
| 1391 | // Performs operand reordering for 2 or more operands. | ||||
| 1392 | // The original operands are in OrigOps[OpIdx][Lane]. | ||||
| 1393 | // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. | ||||
| 1394 | void reorder() { | ||||
| 1395 | unsigned NumOperands = getNumOperands(); | ||||
| 1396 | unsigned NumLanes = getNumLanes(); | ||||
| 1397 | // Each operand has its own mode. We are using this mode to help us select | ||||
| 1398 | // the instructions for each lane, so that they match best with the ones | ||||
| 1399 | // we have selected so far. | ||||
| 1400 | SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands); | ||||
| 1401 | |||||
| 1402 | // This is a greedy single-pass algorithm. We are going over each lane | ||||
| 1403 | // once and deciding on the best order right away with no back-tracking. | ||||
| 1404 | // However, in order to increase its effectiveness, we start with the lane | ||||
| 1405 | // that has operands that can move the least. For example, given the | ||||
| 1406 | // following lanes: | ||||
| 1407 | // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd | ||||
| 1408 | // Lane 1 : A[1] = C[1] - B[1] // Visited 1st | ||||
| 1409 | // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd | ||||
| 1410 | // Lane 3 : A[3] = C[3] - B[3] // Visited 4th | ||||
| 1411 | // we will start at Lane 1, since the operands of the subtraction cannot | ||||
| 1412 | // be reordered. Then we will visit the rest of the lanes in a circular | ||||
| 1413 | // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. | ||||
| 1414 | |||||
| 1415 | // Find the first lane that we will start our search from. | ||||
| 1416 | unsigned FirstLane = getBestLaneToStartReordering(); | ||||
| 1417 | |||||
| 1418 | // Initialize the modes. | ||||
| 1419 | for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { | ||||
| 1420 | Value *OpLane0 = getValue(OpIdx, FirstLane); | ||||
| 1421 | // Keep track if we have instructions with all the same opcode on one | ||||
| 1422 | // side. | ||||
| 1423 | if (isa<LoadInst>(OpLane0)) | ||||
| 1424 | ReorderingModes[OpIdx] = ReorderingMode::Load; | ||||
| 1425 | else if (isa<Instruction>(OpLane0)) { | ||||
| 1426 | // Check if OpLane0 should be broadcast. | ||||
| 1427 | if (shouldBroadcast(OpLane0, OpIdx, FirstLane)) | ||||
| 1428 | ReorderingModes[OpIdx] = ReorderingMode::Splat; | ||||
| 1429 | else | ||||
| 1430 | ReorderingModes[OpIdx] = ReorderingMode::Opcode; | ||||
| 1431 | } | ||||
| 1432 | else if (isa<Constant>(OpLane0)) | ||||
| 1433 | ReorderingModes[OpIdx] = ReorderingMode::Constant; | ||||
| 1434 | else if (isa<Argument>(OpLane0)) | ||||
| 1435 | // Our best hope is a Splat. It may save some cost in some cases. | ||||
| 1436 | ReorderingModes[OpIdx] = ReorderingMode::Splat; | ||||
| 1437 | else | ||||
| 1438 | // NOTE: This should be unreachable. | ||||
| 1439 | ReorderingModes[OpIdx] = ReorderingMode::Failed; | ||||
| 1440 | } | ||||
| 1441 | |||||
| 1442 | // If the initial strategy fails for any of the operand indexes, then we | ||||
| 1443 | // perform reordering again in a second pass. This helps avoid assigning | ||||
| 1444 | // high priority to the failed strategy, and should improve reordering for | ||||
| 1445 | // the non-failed operand indexes. | ||||
| 1446 | for (int Pass = 0; Pass != 2; ++Pass) { | ||||
| 1447 | // Skip the second pass if the first pass did not fail. | ||||
| 1448 | bool StrategyFailed = false; | ||||
| 1449 | // Mark all operand data as free to use. | ||||
| 1450 | clearUsed(); | ||||
| 1451 | // We keep the original operand order for the FirstLane, so reorder the | ||||
| 1452 | // rest of the lanes. We are visiting the nodes in a circular fashion, | ||||
| 1453 | // using FirstLane as the center point and increasing the radius | ||||
| 1454 | // distance. | ||||
| 1455 | for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { | ||||
| 1456 | // Visit the lane on the right and then the lane on the left. | ||||
| 1457 | for (int Direction : {+1, -1}) { | ||||
| 1458 | int Lane = FirstLane + Direction * Distance; | ||||
| 1459 | if (Lane < 0 || Lane >= (int)NumLanes) | ||||
| 1460 | continue; | ||||
| 1461 | int LastLane = Lane - Direction; | ||||
| 1462 | assert(LastLane >= 0 && LastLane < (int)NumLanes &&((void)0) | ||||
| 1463 | "Out of bounds")((void)0); | ||||
| 1464 | // Look for a good match for each operand. | ||||
| 1465 | for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { | ||||
| 1466 | // Search for the operand that matches SortedOps[OpIdx][Lane-1]. | ||||
| 1467 | Optional<unsigned> BestIdx = | ||||
| 1468 | getBestOperand(OpIdx, Lane, LastLane, ReorderingModes); | ||||
| 1469 | // By not selecting a value, we allow the operands that follow to | ||||
| 1470 | // select a better matching value. We will get a non-null value in | ||||
| 1471 | // the next run of getBestOperand(). | ||||
| 1472 | if (BestIdx) { | ||||
| 1473 | // Swap the current operand with the one returned by | ||||
| 1474 | // getBestOperand(). | ||||
| 1475 | swap(OpIdx, BestIdx.getValue(), Lane); | ||||
| 1476 | } else { | ||||
| 1477 | // We failed to find a best operand, set mode to 'Failed'. | ||||
| 1478 | ReorderingModes[OpIdx] = ReorderingMode::Failed; | ||||
| 1479 | // Enable the second pass. | ||||
| 1480 | StrategyFailed = true; | ||||
| 1481 | } | ||||
| 1482 | } | ||||
| 1483 | } | ||||
| 1484 | } | ||||
| 1485 | // Skip second pass if the strategy did not fail. | ||||
| 1486 | if (!StrategyFailed) | ||||
| 1487 | break; | ||||
| 1488 | } | ||||
| 1489 | } | ||||
| 1490 | |||||
| 1491 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | ||||
| 1492 | LLVM_DUMP_METHOD__attribute__((noinline)) static StringRef getModeStr(ReorderingMode RMode) { | ||||
| 1493 | switch (RMode) { | ||||
| 1494 | case ReorderingMode::Load: | ||||
| 1495 | return "Load"; | ||||
| 1496 | case ReorderingMode::Opcode: | ||||
| 1497 | return "Opcode"; | ||||
| 1498 | case ReorderingMode::Constant: | ||||
| 1499 | return "Constant"; | ||||
| 1500 | case ReorderingMode::Splat: | ||||
| 1501 | return "Splat"; | ||||
| 1502 | case ReorderingMode::Failed: | ||||
| 1503 | return "Failed"; | ||||
| 1504 | } | ||||
| 1505 | llvm_unreachable("Unimplemented Reordering Type")__builtin_unreachable(); | ||||
| 1506 | } | ||||
| 1507 | |||||
| 1508 | LLVM_DUMP_METHOD__attribute__((noinline)) static raw_ostream &printMode(ReorderingMode RMode, | ||||
| 1509 | raw_ostream &OS) { | ||||
| 1510 | return OS << getModeStr(RMode); | ||||
| 1511 | } | ||||
| 1512 | |||||
| 1513 | /// Debug print. | ||||
| 1514 | LLVM_DUMP_METHOD__attribute__((noinline)) static void dumpMode(ReorderingMode RMode) { | ||||
| 1515 | printMode(RMode, dbgs()); | ||||
| 1516 | } | ||||
| 1517 | |||||
| 1518 | friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { | ||||
| 1519 | return printMode(RMode, OS); | ||||
| 1520 | } | ||||
| 1521 | |||||
| 1522 | LLVM_DUMP_METHOD__attribute__((noinline)) raw_ostream &print(raw_ostream &OS) const { | ||||
| 1523 | const unsigned Indent = 2; | ||||
| 1524 | unsigned Cnt = 0; | ||||
| 1525 | for (const OperandDataVec &OpDataVec : OpsVec) { | ||||
| 1526 | OS << "Operand " << Cnt++ << "\n"; | ||||
| 1527 | for (const OperandData &OpData : OpDataVec) { | ||||
| 1528 | OS.indent(Indent) << "{"; | ||||
| 1529 | if (Value *V = OpData.V) | ||||
| 1530 | OS << *V; | ||||
| 1531 | else | ||||
| 1532 | OS << "null"; | ||||
| 1533 | OS << ", APO:" << OpData.APO << "}\n"; | ||||
| 1534 | } | ||||
| 1535 | OS << "\n"; | ||||
| 1536 | } | ||||
| 1537 | return OS; | ||||
| 1538 | } | ||||
| 1539 | |||||
| 1540 | /// Debug print. | ||||
| 1541 | LLVM_DUMP_METHOD__attribute__((noinline)) void dump() const { print(dbgs()); } | ||||
| 1542 | #endif | ||||
| 1543 | }; | ||||
| 1544 | |||||
| 1545 | /// Checks if the instruction is marked for deletion. | ||||
| 1546 | bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } | ||||
| 1547 | |||||
| 1548 | /// Marks values operands for later deletion by replacing them with Undefs. | ||||
| 1549 | void eraseInstructions(ArrayRef<Value *> AV); | ||||
| 1550 | |||||
| 1551 | ~BoUpSLP(); | ||||
| 1552 | |||||
| 1553 | private: | ||||
| 1554 | /// Checks if all users of \p I are the part of the vectorization tree. | ||||
| 1555 | bool areAllUsersVectorized(Instruction *I, | ||||
| 1556 | ArrayRef<Value *> VectorizedVals) const; | ||||
| 1557 | |||||
| 1558 | /// \returns the cost of the vectorizable entry. | ||||
| 1559 | InstructionCost getEntryCost(const TreeEntry *E, | ||||
| 1560 | ArrayRef<Value *> VectorizedVals); | ||||
| 1561 | |||||
| 1562 | /// This is the recursive part of buildTree. | ||||
| 1563 | void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, | ||||
| 1564 | const EdgeInfo &EI); | ||||
| 1565 | |||||
| 1566 | /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can | ||||
| 1567 | /// be vectorized to use the original vector (or aggregate "bitcast" to a | ||||
| 1568 | /// vector) and sets \p CurrentOrder to the identity permutation; otherwise | ||||
| 1569 | /// returns false, setting \p CurrentOrder to either an empty vector or a | ||||
| 1570 | /// non-identity permutation that allows to reuse extract instructions. | ||||
| 1571 | bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, | ||||
| 1572 | SmallVectorImpl<unsigned> &CurrentOrder) const; | ||||
| 1573 | |||||
| 1574 | /// Vectorize a single entry in the tree. | ||||
| 1575 | Value *vectorizeTree(TreeEntry *E); | ||||
| 1576 | |||||
| 1577 | /// Vectorize a single entry in the tree, starting in \p VL. | ||||
| 1578 | Value *vectorizeTree(ArrayRef<Value *> VL); | ||||
| 1579 | |||||
| 1580 | /// \returns the scalarization cost for this type. Scalarization in this | ||||
| 1581 | /// context means the creation of vectors from a group of scalars. | ||||
| 1582 | InstructionCost | ||||
| 1583 | getGatherCost(FixedVectorType *Ty, | ||||
| 1584 | const DenseSet<unsigned> &ShuffledIndices) const; | ||||
| 1585 | |||||
| 1586 | /// Checks if the gathered \p VL can be represented as shuffle(s) of previous | ||||
| 1587 | /// tree entries. | ||||
| 1588 | /// \returns ShuffleKind, if gathered values can be represented as shuffles of | ||||
| 1589 | /// previous tree entries. \p Mask is filled with the shuffle mask. | ||||
| 1590 | Optional<TargetTransformInfo::ShuffleKind> | ||||
| 1591 | isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, | ||||
| 1592 | SmallVectorImpl<const TreeEntry *> &Entries); | ||||
| 1593 | |||||
| 1594 | /// \returns the scalarization cost for this list of values. Assuming that | ||||
| 1595 | /// this subtree gets vectorized, we may need to extract the values from the | ||||
| 1596 | /// roots. This method calculates the cost of extracting the values. | ||||
| 1597 | InstructionCost getGatherCost(ArrayRef<Value *> VL) const; | ||||
| 1598 | |||||
| 1599 | /// Set the Builder insert point to one after the last instruction in | ||||
| 1600 | /// the bundle | ||||
| 1601 | void setInsertPointAfterBundle(const TreeEntry *E); | ||||
| 1602 | |||||
| 1603 | /// \returns a vector from a collection of scalars in \p VL. | ||||
| 1604 | Value *gather(ArrayRef<Value *> VL); | ||||
| 1605 | |||||
| 1606 | /// \returns whether the VectorizableTree is fully vectorizable and will | ||||
| 1607 | /// be beneficial even the tree height is tiny. | ||||
| 1608 | bool isFullyVectorizableTinyTree() const; | ||||
| 1609 | |||||
| 1610 | /// Reorder commutative or alt operands to get better probability of | ||||
| 1611 | /// generating vectorized code. | ||||
| 1612 | static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, | ||||
| 1613 | SmallVectorImpl<Value *> &Left, | ||||
| 1614 | SmallVectorImpl<Value *> &Right, | ||||
| 1615 | const DataLayout &DL, | ||||
| 1616 | ScalarEvolution &SE, | ||||
| 1617 | const BoUpSLP &R); | ||||
| 1618 | struct TreeEntry { | ||||
| 1619 | using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; | ||||
| 1620 | TreeEntry(VecTreeTy &Container) : Container(Container) {} | ||||
| 1621 | |||||
| 1622 | /// \returns true if the scalars in VL are equal to this entry. | ||||
| 1623 | bool isSame(ArrayRef<Value *> VL) const { | ||||
| 1624 | if (VL.size() == Scalars.size()) | ||||
| 1625 | return std::equal(VL.begin(), VL.end(), Scalars.begin()); | ||||
| 1626 | return VL.size() == ReuseShuffleIndices.size() && | ||||
| 1627 | std::equal( | ||||
| 1628 | VL.begin(), VL.end(), ReuseShuffleIndices.begin(), | ||||
| 1629 | [this](Value *V, int Idx) { return V == Scalars[Idx]; }); | ||||
| 1630 | } | ||||
| 1631 | |||||
| 1632 | /// A vector of scalars. | ||||
| 1633 | ValueList Scalars; | ||||
| 1634 | |||||
| 1635 | /// The Scalars are vectorized into this value. It is initialized to Null. | ||||
| 1636 | Value *VectorizedValue = nullptr; | ||||
| 1637 | |||||
| 1638 | /// Do we need to gather this sequence or vectorize it | ||||
| 1639 | /// (either with vector instruction or with scatter/gather | ||||
| 1640 | /// intrinsics for store/load)? | ||||
| 1641 | enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; | ||||
| 1642 | EntryState State; | ||||
| 1643 | |||||
| 1644 | /// Does this sequence require some shuffling? | ||||
| 1645 | SmallVector<int, 4> ReuseShuffleIndices; | ||||
| 1646 | |||||
| 1647 | /// Does this entry require reordering? | ||||
| 1648 | SmallVector<unsigned, 4> ReorderIndices; | ||||
| 1649 | |||||
| 1650 | /// Points back to the VectorizableTree. | ||||
| 1651 | /// | ||||
| 1652 | /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has | ||||
| 1653 | /// to be a pointer and needs to be able to initialize the child iterator. | ||||
| 1654 | /// Thus we need a reference back to the container to translate the indices | ||||
| 1655 | /// to entries. | ||||
| 1656 | VecTreeTy &Container; | ||||
| 1657 | |||||
| 1658 | /// The TreeEntry index containing the user of this entry. We can actually | ||||
| 1659 | /// have multiple users so the data structure is not truly a tree. | ||||
| 1660 | SmallVector<EdgeInfo, 1> UserTreeIndices; | ||||
| 1661 | |||||
| 1662 | /// The index of this treeEntry in VectorizableTree. | ||||
| 1663 | int Idx = -1; | ||||
| 1664 | |||||
| 1665 | private: | ||||
| 1666 | /// The operands of each instruction in each lane Operands[op_index][lane]. | ||||
| 1667 | /// Note: This helps avoid the replication of the code that performs the | ||||
| 1668 | /// reordering of operands during buildTree_rec() and vectorizeTree(). | ||||
| 1669 | SmallVector<ValueList, 2> Operands; | ||||
| 1670 | |||||
| 1671 | /// The main/alternate instruction. | ||||
| 1672 | Instruction *MainOp = nullptr; | ||||
| 1673 | Instruction *AltOp = nullptr; | ||||
| 1674 | |||||
| 1675 | public: | ||||
| 1676 | /// Set this bundle's \p OpIdx'th operand to \p OpVL. | ||||
| 1677 | void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { | ||||
| 1678 | if (Operands.size() < OpIdx + 1) | ||||
| 1679 | Operands.resize(OpIdx + 1); | ||||
| 1680 | assert(Operands[OpIdx].empty() && "Already resized?")((void)0); | ||||
| 1681 | Operands[OpIdx].resize(Scalars.size()); | ||||
| 1682 | for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) | ||||
| 1683 | Operands[OpIdx][Lane] = OpVL[Lane]; | ||||
| 1684 | } | ||||
| 1685 | |||||
| 1686 | /// Set the operands of this bundle in their original order. | ||||
| 1687 | void setOperandsInOrder() { | ||||
| 1688 | assert(Operands.empty() && "Already initialized?")((void)0); | ||||
| 1689 | auto *I0 = cast<Instruction>(Scalars[0]); | ||||
| 1690 | Operands.resize(I0->getNumOperands()); | ||||
| 1691 | unsigned NumLanes = Scalars.size(); | ||||
| 1692 | for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); | ||||
| 1693 | OpIdx != NumOperands; ++OpIdx) { | ||||
| 1694 | Operands[OpIdx].resize(NumLanes); | ||||
| 1695 | for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { | ||||
| 1696 | auto *I = cast<Instruction>(Scalars[Lane]); | ||||
| 1697 | assert(I->getNumOperands() == NumOperands &&((void)0) | ||||
| 1698 | "Expected same number of operands")((void)0); | ||||
| 1699 | Operands[OpIdx][Lane] = I->getOperand(OpIdx); | ||||
| 1700 | } | ||||
| 1701 | } | ||||
| 1702 | } | ||||
| 1703 | |||||
| 1704 | /// \returns the \p OpIdx operand of this TreeEntry. | ||||
| 1705 | ValueList &getOperand(unsigned OpIdx) { | ||||
| 1706 | assert(OpIdx < Operands.size() && "Off bounds")((void)0); | ||||
| 1707 | return Operands[OpIdx]; | ||||
| 1708 | } | ||||
| 1709 | |||||
| 1710 | /// \returns the number of operands. | ||||
| 1711 | unsigned getNumOperands() const { return Operands.size(); } | ||||
| 1712 | |||||
| 1713 | /// \return the single \p OpIdx operand. | ||||
| 1714 | Value *getSingleOperand(unsigned OpIdx) const { | ||||
| 1715 | assert(OpIdx < Operands.size() && "Off bounds")((void)0); | ||||
| 1716 | assert(!Operands[OpIdx].empty() && "No operand available")((void)0); | ||||
| 1717 | return Operands[OpIdx][0]; | ||||
| 1718 | } | ||||
| 1719 | |||||
| 1720 | /// Some of the instructions in the list have alternate opcodes. | ||||
| 1721 | bool isAltShuffle() const { | ||||
| 1722 | return getOpcode() != getAltOpcode(); | ||||
| 1723 | } | ||||
| 1724 | |||||
| 1725 | bool isOpcodeOrAlt(Instruction *I) const { | ||||
| 1726 | unsigned CheckedOpcode = I->getOpcode(); | ||||
| 1727 | return (getOpcode() == CheckedOpcode || | ||||
| 1728 | getAltOpcode() == CheckedOpcode); | ||||
| 1729 | } | ||||
| 1730 | |||||
| 1731 | /// Chooses the correct key for scheduling data. If \p Op has the same (or | ||||
| 1732 | /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is | ||||
| 1733 | /// \p OpValue. | ||||
| 1734 | Value *isOneOf(Value *Op) const { | ||||
| 1735 | auto *I = dyn_cast<Instruction>(Op); | ||||
| 1736 | if (I && isOpcodeOrAlt(I)) | ||||
| 1737 | return Op; | ||||
| 1738 | return MainOp; | ||||
| 1739 | } | ||||
| 1740 | |||||
| 1741 | void setOperations(const InstructionsState &S) { | ||||
| 1742 | MainOp = S.MainOp; | ||||
| 1743 | AltOp = S.AltOp; | ||||
| 1744 | } | ||||
| 1745 | |||||
| 1746 | Instruction *getMainOp() const { | ||||
| 1747 | return MainOp; | ||||
| 1748 | } | ||||
| 1749 | |||||
| 1750 | Instruction *getAltOp() const { | ||||
| 1751 | return AltOp; | ||||
| 1752 | } | ||||
| 1753 | |||||
| 1754 | /// The main/alternate opcodes for the list of instructions. | ||||
| 1755 | unsigned getOpcode() const { | ||||
| 1756 | return MainOp ? MainOp->getOpcode() : 0; | ||||
| 1757 | } | ||||
| 1758 | |||||
| 1759 | unsigned getAltOpcode() const { | ||||
| 1760 | return AltOp ? AltOp->getOpcode() : 0; | ||||
| 1761 | } | ||||
| 1762 | |||||
| 1763 | /// Update operations state of this entry if reorder occurred. | ||||
| 1764 | bool updateStateIfReorder() { | ||||
| 1765 | if (ReorderIndices.empty()) | ||||
| 1766 | return false; | ||||
| 1767 | InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); | ||||
| 1768 | setOperations(S); | ||||
| 1769 | return true; | ||||
| 1770 | } | ||||
| 1771 | /// When ReuseShuffleIndices is empty it just returns position of \p V | ||||
| 1772 | /// within vector of Scalars. Otherwise, try to remap on its reuse index. | ||||
| 1773 | int findLaneForValue(Value *V) const { | ||||
| 1774 | unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); | ||||
| 1775 | assert(FoundLane < Scalars.size() && "Couldn't find extract lane")((void)0); | ||||
| 1776 | if (!ReuseShuffleIndices.empty()) { | ||||
| 1777 | FoundLane = std::distance(ReuseShuffleIndices.begin(), | ||||
| 1778 | find(ReuseShuffleIndices, FoundLane)); | ||||
| 1779 | } | ||||
| 1780 | return FoundLane; | ||||
| 1781 | } | ||||
| 1782 | |||||
| 1783 | #ifndef NDEBUG1 | ||||
| 1784 | /// Debug printer. | ||||
| 1785 | LLVM_DUMP_METHOD__attribute__((noinline)) void dump() const { | ||||
| 1786 | dbgs() << Idx << ".\n"; | ||||
| 1787 | for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { | ||||
| 1788 | dbgs() << "Operand " << OpI << ":\n"; | ||||
| 1789 | for (const Value *V : Operands[OpI]) | ||||
| 1790 | dbgs().indent(2) << *V << "\n"; | ||||
| 1791 | } | ||||
| 1792 | dbgs() << "Scalars: \n"; | ||||
| 1793 | for (Value *V : Scalars) | ||||
| 1794 | dbgs().indent(2) << *V << "\n"; | ||||
| 1795 | dbgs() << "State: "; | ||||
| 1796 | switch (State) { | ||||
| 1797 | case Vectorize: | ||||
| 1798 | dbgs() << "Vectorize\n"; | ||||
| 1799 | break; | ||||
| 1800 | case ScatterVectorize: | ||||
| 1801 | dbgs() << "ScatterVectorize\n"; | ||||
| 1802 | break; | ||||
| 1803 | case NeedToGather: | ||||
| 1804 | dbgs() << "NeedToGather\n"; | ||||
| 1805 | break; | ||||
| 1806 | } | ||||
| 1807 | dbgs() << "MainOp: "; | ||||
| 1808 | if (MainOp) | ||||
| 1809 | dbgs() << *MainOp << "\n"; | ||||
| 1810 | else | ||||
| 1811 | dbgs() << "NULL\n"; | ||||
| 1812 | dbgs() << "AltOp: "; | ||||
| 1813 | if (AltOp) | ||||
| 1814 | dbgs() << *AltOp << "\n"; | ||||
| 1815 | else | ||||
| 1816 | dbgs() << "NULL\n"; | ||||
| 1817 | dbgs() << "VectorizedValue: "; | ||||
| 1818 | if (VectorizedValue) | ||||
| 1819 | dbgs() << *VectorizedValue << "\n"; | ||||
| 1820 | else | ||||
| 1821 | dbgs() << "NULL\n"; | ||||
| 1822 | dbgs() << "ReuseShuffleIndices: "; | ||||
| 1823 | if (ReuseShuffleIndices.empty()) | ||||
| 1824 | dbgs() << "Empty"; | ||||
| 1825 | else | ||||
| 1826 | for (unsigned ReuseIdx : ReuseShuffleIndices) | ||||
| 1827 | dbgs() << ReuseIdx << ", "; | ||||
| 1828 | dbgs() << "\n"; | ||||
| 1829 | dbgs() << "ReorderIndices: "; | ||||
| 1830 | for (unsigned ReorderIdx : ReorderIndices) | ||||
| 1831 | dbgs() << ReorderIdx << ", "; | ||||
| 1832 | dbgs() << "\n"; | ||||
| 1833 | dbgs() << "UserTreeIndices: "; | ||||
| 1834 | for (const auto &EInfo : UserTreeIndices) | ||||
| 1835 | dbgs() << EInfo << ", "; | ||||
| 1836 | dbgs() << "\n"; | ||||
| 1837 | } | ||||
| 1838 | #endif | ||||
| 1839 | }; | ||||
| 1840 | |||||
| 1841 | #ifndef NDEBUG1 | ||||
| 1842 | void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost, | ||||
| 1843 | InstructionCost VecCost, | ||||
| 1844 | InstructionCost ScalarCost) const { | ||||
| 1845 | dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); | ||||
| 1846 | dbgs() << "SLP: Costs:\n"; | ||||
| 1847 | dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n"; | ||||
| 1848 | dbgs() << "SLP: VectorCost = " << VecCost << "\n"; | ||||
| 1849 | dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n"; | ||||
| 1850 | dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " << | ||||
| 1851 | ReuseShuffleCost + VecCost - ScalarCost << "\n"; | ||||
| 1852 | } | ||||
| 1853 | #endif | ||||
| 1854 | |||||
| 1855 | /// Create a new VectorizableTree entry. | ||||
| 1856 | TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, | ||||
| 1857 | const InstructionsState &S, | ||||
| 1858 | const EdgeInfo &UserTreeIdx, | ||||
| 1859 | ArrayRef<unsigned> ReuseShuffleIndices = None, | ||||
| 1860 | ArrayRef<unsigned> ReorderIndices = None) { | ||||
| 1861 | TreeEntry::EntryState EntryState = | ||||
| 1862 | Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; | ||||
| 1863 | return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, | ||||
| 1864 | ReuseShuffleIndices, ReorderIndices); | ||||
| 1865 | } | ||||
| 1866 | |||||
| 1867 | TreeEntry *newTreeEntry(ArrayRef<Value *> VL, | ||||
| 1868 | TreeEntry::EntryState EntryState, | ||||
| 1869 | Optional<ScheduleData *> Bundle, | ||||
| 1870 | const InstructionsState &S, | ||||
| 1871 | const EdgeInfo &UserTreeIdx, | ||||
| 1872 | ArrayRef<unsigned> ReuseShuffleIndices = None, | ||||
| 1873 | ArrayRef<unsigned> ReorderIndices = None) { | ||||
| 1874 | assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||((void)0) | ||||
| 1875 | (Bundle && EntryState != TreeEntry::NeedToGather)) &&((void)0) | ||||
| 1876 | "Need to vectorize gather entry?")((void)0); | ||||
| 1877 | VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); | ||||
| 1878 | TreeEntry *Last = VectorizableTree.back().get(); | ||||
| 1879 | Last->Idx = VectorizableTree.size() - 1; | ||||
| 1880 | Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); | ||||
| 1881 | Last->State = EntryState; | ||||
| 1882 | Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), | ||||
| 1883 | ReuseShuffleIndices.end()); | ||||
| 1884 | Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); | ||||
| 1885 | Last->setOperations(S); | ||||
| 1886 | if (Last->State != TreeEntry::NeedToGather) { | ||||
| 1887 | for (Value *V : VL) { | ||||
| 1888 | assert(!getTreeEntry(V) && "Scalar already in tree!")((void)0); | ||||
| 1889 | ScalarToTreeEntry[V] = Last; | ||||
| 1890 | } | ||||
| 1891 | // Update the scheduler bundle to point to this TreeEntry. | ||||
| 1892 | unsigned Lane = 0; | ||||
| 1893 | for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; | ||||
| 1894 | BundleMember = BundleMember->NextInBundle) { | ||||
| 1895 | BundleMember->TE = Last; | ||||
| 1896 | BundleMember->Lane = Lane; | ||||
| 1897 | ++Lane; | ||||
| 1898 | } | ||||
| 1899 | assert((!Bundle.getValue() || Lane == VL.size()) &&((void)0) | ||||
| 1900 | "Bundle and VL out of sync")((void)0); | ||||
| 1901 | } else { | ||||
| 1902 | MustGather.insert(VL.begin(), VL.end()); | ||||
| 1903 | } | ||||
| 1904 | |||||
| 1905 | if (UserTreeIdx.UserTE) | ||||
| 1906 | Last->UserTreeIndices.push_back(UserTreeIdx); | ||||
| 1907 | |||||
| 1908 | return Last; | ||||
| 1909 | } | ||||
| 1910 | |||||
| 1911 | /// -- Vectorization State -- | ||||
| 1912 | /// Holds all of the tree entries. | ||||
| 1913 | TreeEntry::VecTreeTy VectorizableTree; | ||||
| 1914 | |||||
| 1915 | #ifndef NDEBUG1 | ||||
| 1916 | /// Debug printer. | ||||
| 1917 | LLVM_DUMP_METHOD__attribute__((noinline)) void dumpVectorizableTree() const { | ||||
| 1918 | for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { | ||||
| 1919 | VectorizableTree[Id]->dump(); | ||||
| 1920 | dbgs() << "\n"; | ||||
| 1921 | } | ||||
| 1922 | } | ||||
| 1923 | #endif | ||||
| 1924 | |||||
| 1925 | TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); } | ||||
| 1926 | |||||
| 1927 | const TreeEntry *getTreeEntry(Value *V) const { | ||||
| 1928 | return ScalarToTreeEntry.lookup(V); | ||||
| 1929 | } | ||||
| 1930 | |||||
| 1931 | /// Maps a specific scalar to its tree entry. | ||||
| 1932 | SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; | ||||
| 1933 | |||||
| 1934 | /// Maps a value to the proposed vectorizable size. | ||||
| 1935 | SmallDenseMap<Value *, unsigned> InstrElementSize; | ||||
| 1936 | |||||
| 1937 | /// A list of scalars that we found that we need to keep as scalars. | ||||
| 1938 | ValueSet MustGather; | ||||
| 1939 | |||||
| 1940 | /// This POD struct describes one external user in the vectorized tree. | ||||
| 1941 | struct ExternalUser { | ||||
| 1942 | ExternalUser(Value *S, llvm::User *U, int L) | ||||
| 1943 | : Scalar(S), User(U), Lane(L) {} | ||||
| 1944 | |||||
| 1945 | // Which scalar in our function. | ||||
| 1946 | Value *Scalar; | ||||
| 1947 | |||||
| 1948 | // Which user that uses the scalar. | ||||
| 1949 | llvm::User *User; | ||||
| 1950 | |||||
| 1951 | // Which lane does the scalar belong to. | ||||
| 1952 | int Lane; | ||||
| 1953 | }; | ||||
| 1954 | using UserList = SmallVector<ExternalUser, 16>; | ||||
| 1955 | |||||
| 1956 | /// Checks if two instructions may access the same memory. | ||||
| 1957 | /// | ||||
| 1958 | /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it | ||||
| 1959 | /// is invariant in the calling loop. | ||||
| 1960 | bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, | ||||
| 1961 | Instruction *Inst2) { | ||||
| 1962 | // First check if the result is already in the cache. | ||||
| 1963 | AliasCacheKey key = std::make_pair(Inst1, Inst2); | ||||
| 1964 | Optional<bool> &result = AliasCache[key]; | ||||
| 1965 | if (result.hasValue()) { | ||||
| 1966 | return result.getValue(); | ||||
| 1967 | } | ||||
| 1968 | MemoryLocation Loc2 = getLocation(Inst2, AA); | ||||
| 1969 | bool aliased = true; | ||||
| 1970 | if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { | ||||
| 1971 | // Do the alias check. | ||||
| 1972 | aliased = !AA->isNoAlias(Loc1, Loc2); | ||||
| 1973 | } | ||||
| 1974 | // Store the result in the cache. | ||||
| 1975 | result = aliased; | ||||
| 1976 | return aliased; | ||||
| 1977 | } | ||||
| 1978 | |||||
| 1979 | using AliasCacheKey = std::pair<Instruction *, Instruction *>; | ||||
| 1980 | |||||
| 1981 | /// Cache for alias results. | ||||
| 1982 | /// TODO: consider moving this to the AliasAnalysis itself. | ||||
| 1983 | DenseMap<AliasCacheKey, Optional<bool>> AliasCache; | ||||
| 1984 | |||||
| 1985 | /// Removes an instruction from its block and eventually deletes it. | ||||
| 1986 | /// It's like Instruction::eraseFromParent() except that the actual deletion | ||||
| 1987 | /// is delayed until BoUpSLP is destructed. | ||||
| 1988 | /// This is required to ensure that there are no incorrect collisions in the | ||||
| 1989 | /// AliasCache, which can happen if a new instruction is allocated at the | ||||
| 1990 | /// same address as a previously deleted instruction. | ||||
| 1991 | void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { | ||||
| 1992 | auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; | ||||
| 1993 | It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; | ||||
| 1994 | } | ||||
| 1995 | |||||
| 1996 | /// Temporary store for deleted instructions. Instructions will be deleted | ||||
| 1997 | /// eventually when the BoUpSLP is destructed. | ||||
| 1998 | DenseMap<Instruction *, bool> DeletedInstructions; | ||||
| 1999 | |||||
| 2000 | /// A list of values that need to extracted out of the tree. | ||||
| 2001 | /// This list holds pairs of (Internal Scalar : External User). External User | ||||
| 2002 | /// can be nullptr, it means that this Internal Scalar will be used later, | ||||
| 2003 | /// after vectorization. | ||||
| 2004 | UserList ExternalUses; | ||||
| 2005 | |||||
| 2006 | /// Values used only by @llvm.assume calls. | ||||
| 2007 | SmallPtrSet<const Value *, 32> EphValues; | ||||
| 2008 | |||||
| 2009 | /// Holds all of the instructions that we gathered. | ||||
| 2010 | SetVector<Instruction *> GatherSeq; | ||||
| 2011 | |||||
| 2012 | /// A list of blocks that we are going to CSE. | ||||
| 2013 | SetVector<BasicBlock *> CSEBlocks; | ||||
| 2014 | |||||
| 2015 | /// Contains all scheduling relevant data for an instruction. | ||||
| 2016 | /// A ScheduleData either represents a single instruction or a member of an | ||||
| 2017 | /// instruction bundle (= a group of instructions which is combined into a | ||||
| 2018 | /// vector instruction). | ||||
| 2019 | struct ScheduleData { | ||||
| 2020 | // The initial value for the dependency counters. It means that the | ||||
| 2021 | // dependencies are not calculated yet. | ||||
| 2022 | enum { InvalidDeps = -1 }; | ||||
| 2023 | |||||
| 2024 | ScheduleData() = default; | ||||
| 2025 | |||||
| 2026 | void init(int BlockSchedulingRegionID, Value *OpVal) { | ||||
| 2027 | FirstInBundle = this; | ||||
| 2028 | NextInBundle = nullptr; | ||||
| 2029 | NextLoadStore = nullptr; | ||||
| 2030 | IsScheduled = false; | ||||
| 2031 | SchedulingRegionID = BlockSchedulingRegionID; | ||||
| 2032 | UnscheduledDepsInBundle = UnscheduledDeps; | ||||
| 2033 | clearDependencies(); | ||||
| 2034 | OpValue = OpVal; | ||||
| 2035 | TE = nullptr; | ||||
| 2036 | Lane = -1; | ||||
| 2037 | } | ||||
| 2038 | |||||
| 2039 | /// Returns true if the dependency information has been calculated. | ||||
| 2040 | bool hasValidDependencies() const { return Dependencies != InvalidDeps; } | ||||
| 2041 | |||||
| 2042 | /// Returns true for single instructions and for bundle representatives | ||||
| 2043 | /// (= the head of a bundle). | ||||
| 2044 | bool isSchedulingEntity() const { return FirstInBundle == this; } | ||||
| 2045 | |||||
| 2046 | /// Returns true if it represents an instruction bundle and not only a | ||||
| 2047 | /// single instruction. | ||||
| 2048 | bool isPartOfBundle() const { | ||||
| 2049 | return NextInBundle != nullptr || FirstInBundle != this; | ||||
| 2050 | } | ||||
| 2051 | |||||
| 2052 | /// Returns true if it is ready for scheduling, i.e. it has no more | ||||
| 2053 | /// unscheduled depending instructions/bundles. | ||||
| 2054 | bool isReady() const { | ||||
| 2055 | assert(isSchedulingEntity() &&((void)0) | ||||
| 2056 | "can't consider non-scheduling entity for ready list")((void)0); | ||||
| 2057 | return UnscheduledDepsInBundle == 0 && !IsScheduled; | ||||
| 2058 | } | ||||
| 2059 | |||||
| 2060 | /// Modifies the number of unscheduled dependencies, also updating it for | ||||
| 2061 | /// the whole bundle. | ||||
| 2062 | int incrementUnscheduledDeps(int Incr) { | ||||
| 2063 | UnscheduledDeps += Incr; | ||||
| 2064 | return FirstInBundle->UnscheduledDepsInBundle += Incr; | ||||
| 2065 | } | ||||
| 2066 | |||||
| 2067 | /// Sets the number of unscheduled dependencies to the number of | ||||
| 2068 | /// dependencies. | ||||
| 2069 | void resetUnscheduledDeps() { | ||||
| 2070 | incrementUnscheduledDeps(Dependencies - UnscheduledDeps); | ||||
| 2071 | } | ||||
| 2072 | |||||
| 2073 | /// Clears all dependency information. | ||||
| 2074 | void clearDependencies() { | ||||
| 2075 | Dependencies = InvalidDeps; | ||||
| 2076 | resetUnscheduledDeps(); | ||||
| 2077 | MemoryDependencies.clear(); | ||||
| 2078 | } | ||||
| 2079 | |||||
| 2080 | void dump(raw_ostream &os) const { | ||||
| 2081 | if (!isSchedulingEntity()) { | ||||
| 2082 | os << "/ " << *Inst; | ||||
| 2083 | } else if (NextInBundle) { | ||||
| 2084 | os << '[' << *Inst; | ||||
| 2085 | ScheduleData *SD = NextInBundle; | ||||
| 2086 | while (SD) { | ||||
| 2087 | os << ';' << *SD->Inst; | ||||
| 2088 | SD = SD->NextInBundle; | ||||
| 2089 | } | ||||
| 2090 | os << ']'; | ||||
| 2091 | } else { | ||||
| 2092 | os << *Inst; | ||||
| 2093 | } | ||||
| 2094 | } | ||||
| 2095 | |||||
| 2096 | Instruction *Inst = nullptr; | ||||
| 2097 | |||||
| 2098 | /// Points to the head in an instruction bundle (and always to this for | ||||
| 2099 | /// single instructions). | ||||
| 2100 | ScheduleData *FirstInBundle = nullptr; | ||||
| 2101 | |||||
| 2102 | /// Single linked list of all instructions in a bundle. Null if it is a | ||||
| 2103 | /// single instruction. | ||||
| 2104 | ScheduleData *NextInBundle = nullptr; | ||||
| 2105 | |||||
| 2106 | /// Single linked list of all memory instructions (e.g. load, store, call) | ||||
| 2107 | /// in the block - until the end of the scheduling region. | ||||
| 2108 | ScheduleData *NextLoadStore = nullptr; | ||||
| 2109 | |||||
| 2110 | /// The dependent memory instructions. | ||||
| 2111 | /// This list is derived on demand in calculateDependencies(). | ||||
| 2112 | SmallVector<ScheduleData *, 4> MemoryDependencies; | ||||
| 2113 | |||||
| 2114 | /// This ScheduleData is in the current scheduling region if this matches | ||||
| 2115 | /// the current SchedulingRegionID of BlockScheduling. | ||||
| 2116 | int SchedulingRegionID = 0; | ||||
| 2117 | |||||
| 2118 | /// Used for getting a "good" final ordering of instructions. | ||||
| 2119 | int SchedulingPriority = 0; | ||||
| 2120 | |||||
| 2121 | /// The number of dependencies. Constitutes of the number of users of the | ||||
| 2122 | /// instruction plus the number of dependent memory instructions (if any). | ||||
| 2123 | /// This value is calculated on demand. | ||||
| 2124 | /// If InvalidDeps, the number of dependencies is not calculated yet. | ||||
| 2125 | int Dependencies = InvalidDeps; | ||||
| 2126 | |||||
| 2127 | /// The number of dependencies minus the number of dependencies of scheduled | ||||
| 2128 | /// instructions. As soon as this is zero, the instruction/bundle gets ready | ||||
| 2129 | /// for scheduling. | ||||
| 2130 | /// Note that this is negative as long as Dependencies is not calculated. | ||||
| 2131 | int UnscheduledDeps = InvalidDeps; | ||||
| 2132 | |||||
| 2133 | /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for | ||||
| 2134 | /// single instructions. | ||||
| 2135 | int UnscheduledDepsInBundle = InvalidDeps; | ||||
| 2136 | |||||
| 2137 | /// True if this instruction is scheduled (or considered as scheduled in the | ||||
| 2138 | /// dry-run). | ||||
| 2139 | bool IsScheduled = false; | ||||
| 2140 | |||||
| 2141 | /// Opcode of the current instruction in the schedule data. | ||||
| 2142 | Value *OpValue = nullptr; | ||||
| 2143 | |||||
| 2144 | /// The TreeEntry that this instruction corresponds to. | ||||
| 2145 | TreeEntry *TE = nullptr; | ||||
| 2146 | |||||
| 2147 | /// The lane of this node in the TreeEntry. | ||||
| 2148 | int Lane = -1; | ||||
| 2149 | }; | ||||
| 2150 | |||||
| 2151 | #ifndef NDEBUG1 | ||||
| 2152 | friend inline raw_ostream &operator<<(raw_ostream &os, | ||||
| 2153 | const BoUpSLP::ScheduleData &SD) { | ||||
| 2154 | SD.dump(os); | ||||
| 2155 | return os; | ||||
| 2156 | } | ||||
| 2157 | #endif | ||||
| 2158 | |||||
| 2159 | friend struct GraphTraits<BoUpSLP *>; | ||||
| 2160 | friend struct DOTGraphTraits<BoUpSLP *>; | ||||
| 2161 | |||||
| 2162 | /// Contains all scheduling data for a basic block. | ||||
| 2163 | struct BlockScheduling { | ||||
| 2164 | BlockScheduling(BasicBlock *BB) | ||||
| 2165 | : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} | ||||
| 2166 | |||||
| 2167 | void clear() { | ||||
| 2168 | ReadyInsts.clear(); | ||||
| 2169 | ScheduleStart = nullptr; | ||||
| 2170 | ScheduleEnd = nullptr; | ||||
| 2171 | FirstLoadStoreInRegion = nullptr; | ||||
| 2172 | LastLoadStoreInRegion = nullptr; | ||||
| 2173 | |||||
| 2174 | // Reduce the maximum schedule region size by the size of the | ||||
| 2175 | // previous scheduling run. | ||||
| 2176 | ScheduleRegionSizeLimit -= ScheduleRegionSize; | ||||
| 2177 | if (ScheduleRegionSizeLimit < MinScheduleRegionSize) | ||||
| 2178 | ScheduleRegionSizeLimit = MinScheduleRegionSize; | ||||
| 2179 | ScheduleRegionSize = 0; | ||||
| 2180 | |||||
| 2181 | // Make a new scheduling region, i.e. all existing ScheduleData is not | ||||
| 2182 | // in the new region yet. | ||||
| 2183 | ++SchedulingRegionID; | ||||
| 2184 | } | ||||
| 2185 | |||||
| 2186 | ScheduleData *getScheduleData(Value *V) { | ||||
| 2187 | ScheduleData *SD = ScheduleDataMap[V]; | ||||
| 2188 | if (SD && SD->SchedulingRegionID == SchedulingRegionID) | ||||
| 2189 | return SD; | ||||
| 2190 | return nullptr; | ||||
| 2191 | } | ||||
| 2192 | |||||
| 2193 | ScheduleData *getScheduleData(Value *V, Value *Key) { | ||||
| 2194 | if (V == Key) | ||||
| 2195 | return getScheduleData(V); | ||||
| 2196 | auto I = ExtraScheduleDataMap.find(V); | ||||
| 2197 | if (I != ExtraScheduleDataMap.end()) { | ||||
| 2198 | ScheduleData *SD = I->second[Key]; | ||||
| 2199 | if (SD && SD->SchedulingRegionID == SchedulingRegionID) | ||||
| 2200 | return SD; | ||||
| 2201 | } | ||||
| 2202 | return nullptr; | ||||
| 2203 | } | ||||
| 2204 | |||||
| 2205 | bool isInSchedulingRegion(ScheduleData *SD) const { | ||||
| 2206 | return SD->SchedulingRegionID == SchedulingRegionID; | ||||
| 2207 | } | ||||
| 2208 | |||||
| 2209 | /// Marks an instruction as scheduled and puts all dependent ready | ||||
| 2210 | /// instructions into the ready-list. | ||||
| 2211 | template <typename ReadyListType> | ||||
| 2212 | void schedule(ScheduleData *SD, ReadyListType &ReadyList) { | ||||
| 2213 | SD->IsScheduled = true; | ||||
| 2214 | LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n")do { } while (false); | ||||
| 2215 | |||||
| 2216 | ScheduleData *BundleMember = SD; | ||||
| 2217 | while (BundleMember) { | ||||
| 2218 | if (BundleMember->Inst != BundleMember->OpValue) { | ||||
| 2219 | BundleMember = BundleMember->NextInBundle; | ||||
| 2220 | continue; | ||||
| 2221 | } | ||||
| 2222 | // Handle the def-use chain dependencies. | ||||
| 2223 | |||||
| 2224 | // Decrement the unscheduled counter and insert to ready list if ready. | ||||
| 2225 | auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { | ||||
| 2226 | doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { | ||||
| 2227 | if (OpDef && OpDef->hasValidDependencies() && | ||||
| 2228 | OpDef->incrementUnscheduledDeps(-1) == 0) { | ||||
| 2229 | // There are no more unscheduled dependencies after | ||||
| 2230 | // decrementing, so we can put the dependent instruction | ||||
| 2231 | // into the ready list. | ||||
| 2232 | ScheduleData *DepBundle = OpDef->FirstInBundle; | ||||
| 2233 | assert(!DepBundle->IsScheduled &&((void)0) | ||||
| 2234 | "already scheduled bundle gets ready")((void)0); | ||||
| 2235 | ReadyList.insert(DepBundle); | ||||
| 2236 | LLVM_DEBUG(dbgs()do { } while (false) | ||||
| 2237 | << "SLP: gets ready (def): " << *DepBundle << "\n")do { } while (false); | ||||
| 2238 | } | ||||
| 2239 | }); | ||||
| 2240 | }; | ||||
| 2241 | |||||
| 2242 | // If BundleMember is a vector bundle, its operands may have been | ||||
| 2243 | // reordered duiring buildTree(). We therefore need to get its operands | ||||
| 2244 | // through the TreeEntry. | ||||
| 2245 | if (TreeEntry *TE = BundleMember->TE) { | ||||
| 2246 | int Lane = BundleMember->Lane; | ||||
| 2247 | assert(Lane >= 0 && "Lane not set")((void)0); | ||||
| 2248 | |||||
| 2249 | // Since vectorization tree is being built recursively this assertion | ||||
| 2250 | // ensures that the tree entry has all operands set before reaching | ||||
| 2251 | // this code. Couple of exceptions known at the moment are extracts | ||||
| 2252 | // where their second (immediate) operand is not added. Since | ||||
| 2253 | // immediates do not affect scheduler behavior this is considered | ||||
| 2254 | // okay. | ||||
| 2255 | auto *In = TE->getMainOp(); | ||||
| 2256 | assert(In &&((void)0) | ||||
| 2257 | (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||((void)0) | ||||
| 2258 | In->getNumOperands() == TE->getNumOperands()) &&((void)0) | ||||
| 2259 | "Missed TreeEntry operands?")((void)0); | ||||
| 2260 | (void)In; // fake use to avoid build failure when assertions disabled | ||||
| 2261 | |||||
| 2262 | for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); | ||||
| 2263 | OpIdx != NumOperands; ++OpIdx) | ||||
| 2264 | if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) | ||||
| 2265 | DecrUnsched(I); | ||||
| 2266 | } else { | ||||
| 2267 | // If BundleMember is a stand-alone instruction, no operand reordering | ||||
| 2268 | // has taken place, so we directly access its operands. | ||||
| 2269 | for (Use &U : BundleMember->Inst->operands()) | ||||
| 2270 | if (auto *I = dyn_cast<Instruction>(U.get())) | ||||
| 2271 | DecrUnsched(I); | ||||
| 2272 | } | ||||
| 2273 | // Handle the memory dependencies. | ||||
| 2274 | for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { | ||||
| 2275 | if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { | ||||
| 2276 | // There are no more unscheduled dependencies after decrementing, | ||||
| 2277 | // so we can put the dependent instruction into the ready list. | ||||
| 2278 | ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; | ||||
| 2279 | assert(!DepBundle->IsScheduled &&((void)0) | ||||
| 2280 | "already scheduled bundle gets ready")((void)0); | ||||
| 2281 | ReadyList.insert(DepBundle); | ||||
| 2282 | LLVM_DEBUG(dbgs()do { } while (false) | ||||
| 2283 | << "SLP: gets ready (mem): " << *DepBundle << "\n")do { } while (false); | ||||
| 2284 | } | ||||
| 2285 | } | ||||
| 2286 | BundleMember = BundleMember->NextInBundle; | ||||
| 2287 | } | ||||
| 2288 | } | ||||
| 2289 | |||||
| 2290 | void doForAllOpcodes(Value *V, | ||||
| 2291 | function_ref<void(ScheduleData *SD)> Action) { | ||||
| 2292 | if (ScheduleData *SD = getScheduleData(V)) | ||||
| 2293 | Action(SD); | ||||
| 2294 | auto I = ExtraScheduleDataMap.find(V); | ||||
| 2295 | if (I != ExtraScheduleDataMap.end()) | ||||
| 2296 | for (auto &P : I->second) | ||||
| 2297 | if (P.second->SchedulingRegionID == SchedulingRegionID) | ||||
| 2298 | Action(P.second); | ||||
| 2299 | } | ||||
| 2300 | |||||
| 2301 | /// Put all instructions into the ReadyList which are ready for scheduling. | ||||
| 2302 | template <typename ReadyListType> | ||||
| 2303 | void initialFillReadyList(ReadyListType &ReadyList) { | ||||
| 2304 | for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { | ||||
| 2305 | doForAllOpcodes(I, [&](ScheduleData *SD) { | ||||
| 2306 | if (SD->isSchedulingEntity() && SD->isReady()) { | ||||
| 2307 | ReadyList.insert(SD); | ||||
| 2308 | LLVM_DEBUG(dbgs()do { } while (false) | ||||
| 2309 | << "SLP: initially in ready list: " << *I << "\n")do { } while (false); | ||||
| 2310 | } | ||||
| 2311 | }); | ||||
| 2312 | } | ||||
| 2313 | } | ||||
| 2314 | |||||
| 2315 | /// Checks if a bundle of instructions can be scheduled, i.e. has no | ||||
| 2316 | /// cyclic dependencies. This is only a dry-run, no instructions are | ||||
| 2317 | /// actually moved at this stage. | ||||
| 2318 | /// \returns the scheduling bundle. The returned Optional value is non-None | ||||
| 2319 | /// if \p VL is allowed to be scheduled. | ||||
| 2320 | Optional<ScheduleData *> | ||||
| 2321 | tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, | ||||
| 2322 | const InstructionsState &S); | ||||
| 2323 | |||||
| 2324 | /// Un-bundles a group of instructions. | ||||
| 2325 | void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); | ||||
| 2326 | |||||
| 2327 | /// Allocates schedule data chunk. | ||||
| 2328 | ScheduleData *allocateScheduleDataChunks(); | ||||
| 2329 | |||||
| 2330 | /// Extends the scheduling region so that V is inside the region. | ||||
| 2331 | /// \returns true if the region size is within the limit. | ||||
| 2332 | bool extendSchedulingRegion(Value *V, const InstructionsState &S); | ||||
| 2333 | |||||
| 2334 | /// Initialize the ScheduleData structures for new instructions in the | ||||
| 2335 | /// scheduling region. | ||||
| 2336 | void initScheduleData(Instruction *FromI, Instruction *ToI, | ||||
| 2337 | ScheduleData *PrevLoadStore, | ||||
| 2338 | ScheduleData *NextLoadStore); | ||||
| 2339 | |||||
| 2340 | /// Updates the dependency information of a bundle and of all instructions/ | ||||
| 2341 | /// bundles which depend on the original bundle. | ||||
| 2342 | void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, | ||||
| 2343 | BoUpSLP *SLP); | ||||
| 2344 | |||||
| 2345 | /// Sets all instruction in the scheduling region to un-scheduled. | ||||
| 2346 | void resetSchedule(); | ||||
| 2347 | |||||
| 2348 | BasicBlock *BB; | ||||
| 2349 | |||||
| 2350 | /// Simple memory allocation for ScheduleData. | ||||
| 2351 | std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; | ||||
| 2352 | |||||
| 2353 | /// The size of a ScheduleData array in ScheduleDataChunks. | ||||
| 2354 | int ChunkSize; | ||||
| 2355 | |||||
| 2356 | /// The allocator position in the current chunk, which is the last entry | ||||
| 2357 | /// of ScheduleDataChunks. | ||||
| 2358 | int ChunkPos; | ||||
| 2359 | |||||
| 2360 | /// Attaches ScheduleData to Instruction. | ||||
| 2361 | /// Note that the mapping survives during all vectorization iterations, i.e. | ||||
| 2362 | /// ScheduleData structures are recycled. | ||||
| 2363 | DenseMap<Value *, ScheduleData *> ScheduleDataMap; | ||||
| 2364 | |||||
| 2365 | /// Attaches ScheduleData to Instruction with the leading key. | ||||
| 2366 | DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> | ||||
| 2367 | ExtraScheduleDataMap; | ||||
| 2368 | |||||
| 2369 | struct ReadyList : SmallVector<ScheduleData *, 8> { | ||||
| 2370 | void insert(ScheduleData *SD) { push_back(SD); } | ||||
| 2371 | }; | ||||
| 2372 | |||||
| 2373 | /// The ready-list for scheduling (only used for the dry-run). | ||||
| 2374 | ReadyList ReadyInsts; | ||||
| 2375 | |||||
| 2376 | /// The first instruction of the scheduling region. | ||||
| 2377 | Instruction *ScheduleStart = nullptr; | ||||
| 2378 | |||||
| 2379 | /// The first instruction _after_ the scheduling region. | ||||
| 2380 | Instruction *ScheduleEnd = nullptr; | ||||
| 2381 | |||||
| 2382 | /// The first memory accessing instruction in the scheduling region | ||||
| 2383 | /// (can be null). | ||||
| 2384 | ScheduleData *FirstLoadStoreInRegion = nullptr; | ||||
| 2385 | |||||
| 2386 | /// The last memory accessing instruction in the scheduling region | ||||
| 2387 | /// (can be null). | ||||
| 2388 | ScheduleData *LastLoadStoreInRegion = nullptr; | ||||
| 2389 | |||||
| 2390 | /// The current size of the scheduling region. | ||||
| 2391 | int ScheduleRegionSize = 0; | ||||
| 2392 | |||||
| 2393 | /// The maximum size allowed for the scheduling region. | ||||
| 2394 | int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; | ||||
| 2395 | |||||
| 2396 | /// The ID of the scheduling region. For a new vectorization iteration this | ||||
| 2397 | /// is incremented which "removes" all ScheduleData from the region. | ||||
| 2398 | // Make sure that the initial SchedulingRegionID is greater than the | ||||
| 2399 | // initial SchedulingRegionID in ScheduleData (which is 0). | ||||
| 2400 | int SchedulingRegionID = 1; | ||||
| 2401 | }; | ||||
| 2402 | |||||
| 2403 | /// Attaches the BlockScheduling structures to basic blocks. | ||||
| 2404 | MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; | ||||
| 2405 | |||||
| 2406 | /// Performs the "real" scheduling. Done before vectorization is actually | ||||
| 2407 | /// performed in a basic block. | ||||
| 2408 | void scheduleBlock(BlockScheduling *BS); | ||||
| 2409 | |||||
| 2410 | /// List of users to ignore during scheduling and that don't need extracting. | ||||
| 2411 | ArrayRef<Value *> UserIgnoreList; | ||||
| 2412 | |||||
| 2413 | /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of | ||||
| 2414 | /// sorted SmallVectors of unsigned. | ||||
| 2415 | struct OrdersTypeDenseMapInfo { | ||||
| 2416 | static OrdersType getEmptyKey() { | ||||
| 2417 | OrdersType V; | ||||
| 2418 | V.push_back(~1U); | ||||
| 2419 | return V; | ||||
| 2420 | } | ||||
| 2421 | |||||
| 2422 | static OrdersType getTombstoneKey() { | ||||
| 2423 | OrdersType V; | ||||
| 2424 | V.push_back(~2U); | ||||
| 2425 | return V; | ||||
| 2426 | } | ||||
| 2427 | |||||
| 2428 | static unsigned getHashValue(const OrdersType &V) { | ||||
| 2429 | return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); | ||||
| 2430 | } | ||||
| 2431 | |||||
| 2432 | static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { | ||||
| 2433 | return LHS == RHS; | ||||
| 2434 | } | ||||
| 2435 | }; | ||||
| 2436 | |||||
| 2437 | /// Contains orders of operations along with the number of bundles that have | ||||
| 2438 | /// operations in this order. It stores only those orders that require | ||||
| 2439 | /// reordering, if reordering is not required it is counted using \a | ||||
| 2440 | /// NumOpsWantToKeepOriginalOrder. | ||||
| 2441 | DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder; | ||||
| 2442 | /// Number of bundles that do not require reordering. | ||||
| 2443 | unsigned NumOpsWantToKeepOriginalOrder = 0; | ||||
| 2444 | |||||
| 2445 | // Analysis and block reference. | ||||
| 2446 | Function *F; | ||||
| 2447 | ScalarEvolution *SE; | ||||
| 2448 | TargetTransformInfo *TTI; | ||||
| 2449 | TargetLibraryInfo *TLI; | ||||
| 2450 | AAResults *AA; | ||||
| 2451 | LoopInfo *LI; | ||||
| 2452 | DominatorTree *DT; | ||||
| 2453 | AssumptionCache *AC; | ||||
| 2454 | DemandedBits *DB; | ||||
| 2455 | const DataLayout *DL; | ||||
| 2456 | OptimizationRemarkEmitter *ORE; | ||||
| 2457 | |||||
| 2458 | unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. | ||||
| 2459 | unsigned MinVecRegSize; // Set by cl::opt (default: 128). | ||||
| 2460 | |||||
| 2461 | /// Instruction builder to construct the vectorized tree. | ||||
| 2462 | IRBuilder<> Builder; | ||||
| 2463 | |||||
| 2464 | /// A map of scalar integer values to the smallest bit width with which they | ||||
| 2465 | /// can legally be represented. The values map to (width, signed) pairs, | ||||
| 2466 | /// where "width" indicates the minimum bit width and "signed" is True if the | ||||
| 2467 | /// value must be signed-extended, rather than zero-extended, back to its | ||||
| 2468 | /// original width. | ||||
| 2469 | MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; | ||||
| 2470 | }; | ||||
| 2471 | |||||
| 2472 | } // end namespace slpvectorizer | ||||
| 2473 | |||||
| 2474 | template <> struct GraphTraits<BoUpSLP *> { | ||||
| 2475 | using TreeEntry = BoUpSLP::TreeEntry; | ||||
| 2476 | |||||
| 2477 | /// NodeRef has to be a pointer per the GraphWriter. | ||||
| 2478 | using NodeRef = TreeEntry *; | ||||
| 2479 | |||||
| 2480 | using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; | ||||
| 2481 | |||||
| 2482 | /// Add the VectorizableTree to the index iterator to be able to return | ||||
| 2483 | /// TreeEntry pointers. | ||||
| 2484 | struct ChildIteratorType | ||||
| 2485 | : public iterator_adaptor_base< | ||||
| 2486 | ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { | ||||
| 2487 | ContainerTy &VectorizableTree; | ||||
| 2488 | |||||
| 2489 | ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, | ||||
| 2490 | ContainerTy &VT) | ||||
| 2491 | : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} | ||||
| 2492 | |||||
| 2493 | NodeRef operator*() { return I->UserTE; } | ||||
| 2494 | }; | ||||
| 2495 | |||||
| 2496 | static NodeRef getEntryNode(BoUpSLP &R) { | ||||
| 2497 | return R.VectorizableTree[0].get(); | ||||
| 2498 | } | ||||
| 2499 | |||||
| 2500 | static ChildIteratorType child_begin(NodeRef N) { | ||||
| 2501 | return {N->UserTreeIndices.begin(), N->Container}; | ||||
| 2502 | } | ||||
| 2503 | |||||
| 2504 | static ChildIteratorType child_end(NodeRef N) { | ||||
| 2505 | return {N->UserTreeIndices.end(), N->Container}; | ||||
| 2506 | } | ||||
| 2507 | |||||
| 2508 | /// For the node iterator we just need to turn the TreeEntry iterator into a | ||||
| 2509 | /// TreeEntry* iterator so that it dereferences to NodeRef. | ||||
| 2510 | class nodes_iterator { | ||||
| 2511 | using ItTy = ContainerTy::iterator; | ||||
| 2512 | ItTy It; | ||||
| 2513 | |||||
| 2514 | public: | ||||
| 2515 | nodes_iterator(const ItTy &It2) : It(It2) {} | ||||
| 2516 | NodeRef operator*() { return It->get(); } | ||||
| 2517 | nodes_iterator operator++() { | ||||
| 2518 | ++It; | ||||
| 2519 | return *this; | ||||
| 2520 | } | ||||
| 2521 | bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } | ||||
| 2522 | }; | ||||
| 2523 | |||||
| 2524 | static nodes_iterator nodes_begin(BoUpSLP *R) { | ||||
| 2525 | return nodes_iterator(R->VectorizableTree.begin()); | ||||
| 2526 | } | ||||
| 2527 | |||||
| 2528 | static nodes_iterator nodes_end(BoUpSLP *R) { | ||||
| 2529 | return nodes_iterator(R->VectorizableTree.end()); | ||||
| 2530 | } | ||||
| 2531 | |||||
| 2532 | static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } | ||||
| 2533 | }; | ||||
| 2534 | |||||
| 2535 | template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { | ||||
| 2536 | using TreeEntry = BoUpSLP::TreeEntry; | ||||
| 2537 | |||||
| 2538 | DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} | ||||
| 2539 | |||||
| 2540 | std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { | ||||
| 2541 | std::string Str; | ||||
| 2542 | raw_string_ostream OS(Str); | ||||
| 2543 | if (isSplat(Entry->Scalars)) { | ||||
| 2544 | OS << "<splat> " << *Entry->Scalars[0]; | ||||
| 2545 | return Str; | ||||
| 2546 | } | ||||
| 2547 | for (auto V : Entry->Scalars) { | ||||
| 2548 | OS << *V; | ||||
| 2549 | if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) { | ||||
| 2550 | return EU.Scalar == V; | ||||
| 2551 | })) | ||||
| 2552 | OS << " <extract>"; | ||||
| 2553 | OS << "\n"; | ||||
| 2554 | } | ||||
| 2555 | return Str; | ||||
| 2556 | } | ||||
| 2557 | |||||
| 2558 | static std::string getNodeAttributes(const TreeEntry *Entry, | ||||
| 2559 | const BoUpSLP *) { | ||||
| 2560 | if (Entry->State == TreeEntry::NeedToGather) | ||||
| 2561 | return "color=red"; | ||||
| 2562 | return ""; | ||||
| 2563 | } | ||||
| 2564 | }; | ||||
| 2565 | |||||
| 2566 | } // end namespace llvm | ||||
| 2567 | |||||
| 2568 | BoUpSLP::~BoUpSLP() { | ||||
| 2569 | for (const auto &Pair : DeletedInstructions) { | ||||
| 2570 | // Replace operands of ignored instructions with Undefs in case if they were | ||||
| 2571 | // marked for deletion. | ||||
| 2572 | if (Pair.getSecond()) { | ||||
| 2573 | Value *Undef = UndefValue::get(Pair.getFirst()->getType()); | ||||
| 2574 | Pair.getFirst()->replaceAllUsesWith(Undef); | ||||
| 2575 | } | ||||
| 2576 | Pair.getFirst()->dropAllReferences(); | ||||
| 2577 | } | ||||
| 2578 | for (const auto &Pair : DeletedInstructions) { | ||||
| 2579 | assert(Pair.getFirst()->use_empty() &&((void)0) | ||||
| 2580 | "trying to erase instruction with users.")((void)0); | ||||
| 2581 | Pair.getFirst()->eraseFromParent(); | ||||
| 2582 | } | ||||
| 2583 | #ifdef EXPENSIVE_CHECKS | ||||
| 2584 | // If we could guarantee that this call is not extremely slow, we could | ||||
| 2585 | // remove the ifdef limitation (see PR47712). | ||||
| 2586 | assert(!verifyFunction(*F, &dbgs()))((void)0); | ||||
| 2587 | #endif | ||||
| 2588 | } | ||||
| 2589 | |||||
| 2590 | void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { | ||||
| 2591 | for (auto *V : AV) { | ||||
| 2592 | if (auto *I = dyn_cast<Instruction>(V)) | ||||
| 2593 | eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); | ||||
| 2594 | }; | ||||
| 2595 | } | ||||
| 2596 | |||||
| 2597 | void BoUpSLP::buildTree(ArrayRef<Value *> Roots, | ||||
| 2598 | ArrayRef<Value *> UserIgnoreLst) { | ||||
| 2599 | ExtraValueToDebugLocsMap ExternallyUsedValues; | ||||
| 2600 | buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); | ||||
| 2601 | } | ||||
| 2602 | |||||
| 2603 | void BoUpSLP::buildTree(ArrayRef<Value *> Roots, | ||||
| 2604 | ExtraValueToDebugLocsMap &ExternallyUsedValues, | ||||
| 2605 | ArrayRef<Value *> UserIgnoreLst) { | ||||
| 2606 | deleteTree(); | ||||
| 2607 | UserIgnoreList = UserIgnoreLst; | ||||
| 2608 | if (!allSameType(Roots)) | ||||
| 2609 | return; | ||||
| 2610 | buildTree_rec(Roots, 0, EdgeInfo()); | ||||
| 2611 | |||||
| 2612 | // Collect the values that we need to extract from the tree. | ||||
| 2613 | for (auto &TEPtr : VectorizableTree) { | ||||
| 2614 | TreeEntry *Entry = TEPtr.get(); | ||||
| 2615 | |||||
| 2616 | // No need to handle users of gathered values. | ||||
| 2617 | if (Entry->State == TreeEntry::NeedToGather) | ||||
| 2618 | continue; | ||||
| 2619 | |||||
| 2620 | // For each lane: | ||||
| 2621 | for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { | ||||
| 2622 | Value *Scalar = Entry->Scalars[Lane]; | ||||
| 2623 | int FoundLane = Entry->findLaneForValue(Scalar); | ||||
| 2624 | |||||
| 2625 | // Check if the scalar is externally used as an extra arg. | ||||
| 2626 | auto ExtI = ExternallyUsedValues.find(Scalar); | ||||
| 2627 | if (ExtI != ExternallyUsedValues.end()) { | ||||
| 2628 | LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "do { } while (false) | ||||
| 2629 | << Lane << " from " << *Scalar << ".\n")do { } while (false); | ||||
| 2630 | ExternalUses.emplace_back(Scalar, nullptr, FoundLane); | ||||
| 2631 | } | ||||
| 2632 | for (User *U : Scalar->users()) { | ||||
| 2633 | LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n")do { } while (false); | ||||
| 2634 | |||||
| 2635 | Instruction *UserInst = dyn_cast<Instruction>(U); | ||||
| 2636 | if (!UserInst) | ||||
| 2637 | continue; | ||||
| 2638 | |||||
| 2639 | // Skip in-tree scalars that become vectors | ||||
| 2640 | if (TreeEntry *UseEntry = getTreeEntry(U)) { | ||||
| 2641 | Value *UseScalar = UseEntry->Scalars[0]; | ||||
| 2642 | // Some in-tree scalars will remain as scalar in vectorized | ||||
| 2643 | // instructions. If that is the case, the one in Lane 0 will | ||||
| 2644 | // be used. | ||||
| 2645 | if (UseScalar != U || | ||||
| 2646 | UseEntry->State == TreeEntry::ScatterVectorize || | ||||
| 2647 | !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { | ||||
| 2648 | LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *Udo { } while (false) | ||||
| 2649 | << ".\n")do { } while (false); | ||||
| 2650 | assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state")((void)0); | ||||
| 2651 | continue; | ||||
| 2652 | } | ||||
| 2653 | } | ||||
| 2654 | |||||
| 2655 | // Ignore users in the user ignore list. | ||||
| 2656 | if (is_contained(UserIgnoreList, UserInst)) | ||||
| 2657 | continue; | ||||
| 2658 | |||||
| 2659 | LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "do { } while (false) | ||||
| 2660 | << Lane << " from " << *Scalar << ".\n")do { } while (false); | ||||
| 2661 | ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); | ||||
| 2662 | } | ||||
| 2663 | } | ||||
| 2664 | } | ||||
| 2665 | } | ||||
| 2666 | |||||
| 2667 | void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, | ||||
| 2668 | const EdgeInfo &UserTreeIdx) { | ||||
| 2669 | assert((allConstant(VL) || allSameType(VL)) && "Invalid types!")((void)0); | ||||
| 2670 | |||||
| 2671 | InstructionsState S = getSameOpcode(VL); | ||||
| 2672 | if (Depth == RecursionMaxDepth) { | ||||
| 2673 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n")do { } while (false); | ||||
| 2674 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2675 | return; | ||||
| 2676 | } | ||||
| 2677 | |||||
| 2678 | // Don't handle scalable vectors | ||||
| 2679 | if (S.getOpcode() == Instruction::ExtractElement && | ||||
| 2680 | isa<ScalableVectorType>( | ||||
| 2681 | cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) { | ||||
| 2682 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n")do { } while (false); | ||||
| 2683 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2684 | return; | ||||
| 2685 | } | ||||
| 2686 | |||||
| 2687 | // Don't handle vectors. | ||||
| 2688 | if (S.OpValue->getType()->isVectorTy() && | ||||
| 2689 | !isa<InsertElementInst>(S.OpValue)) { | ||||
| 2690 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n")do { } while (false); | ||||
| 2691 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2692 | return; | ||||
| 2693 | } | ||||
| 2694 | |||||
| 2695 | if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) | ||||
| 2696 | if (SI->getValueOperand()->getType()->isVectorTy()) { | ||||
| 2697 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n")do { } while (false); | ||||
| 2698 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2699 | return; | ||||
| 2700 | } | ||||
| 2701 | |||||
| 2702 | // If all of the operands are identical or constant we have a simple solution. | ||||
| 2703 | if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { | ||||
| 2704 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n")do { } while (false); | ||||
| 2705 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2706 | return; | ||||
| 2707 | } | ||||
| 2708 | |||||
| 2709 | // We now know that this is a vector of instructions of the same type from | ||||
| 2710 | // the same block. | ||||
| 2711 | |||||
| 2712 | // Don't vectorize ephemeral values. | ||||
| 2713 | for (Value *V : VL) { | ||||
| 2714 | if (EphValues.count(V)) { | ||||
| 2715 | LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *Vdo { } while (false) | ||||
| 2716 | << ") is ephemeral.\n")do { } while (false); | ||||
| 2717 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2718 | return; | ||||
| 2719 | } | ||||
| 2720 | } | ||||
| 2721 | |||||
| 2722 | // Check if this is a duplicate of another entry. | ||||
| 2723 | if (TreeEntry *E = getTreeEntry(S.OpValue)) { | ||||
| 2724 | LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n")do { } while (false); | ||||
| 2725 | if (!E->isSame(VL)) { | ||||
| 2726 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n")do { } while (false); | ||||
| 2727 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2728 | return; | ||||
| 2729 | } | ||||
| 2730 | // Record the reuse of the tree node. FIXME, currently this is only used to | ||||
| 2731 | // properly draw the graph rather than for the actual vectorization. | ||||
| 2732 | E->UserTreeIndices.push_back(UserTreeIdx); | ||||
| 2733 | LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValuedo { } while (false) | ||||
| 2734 | << ".\n")do { } while (false); | ||||
| 2735 | return; | ||||
| 2736 | } | ||||
| 2737 | |||||
| 2738 | // Check that none of the instructions in the bundle are already in the tree. | ||||
| 2739 | for (Value *V : VL) { | ||||
| 2740 | auto *I = dyn_cast<Instruction>(V); | ||||
| 2741 | if (!I) | ||||
| 2742 | continue; | ||||
| 2743 | if (getTreeEntry(I)) { | ||||
| 2744 | LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *Vdo { } while (false) | ||||
| 2745 | << ") is already in tree.\n")do { } while (false); | ||||
| 2746 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2747 | return; | ||||
| 2748 | } | ||||
| 2749 | } | ||||
| 2750 | |||||
| 2751 | // If any of the scalars is marked as a value that needs to stay scalar, then | ||||
| 2752 | // we need to gather the scalars. | ||||
| 2753 | // The reduction nodes (stored in UserIgnoreList) also should stay scalar. | ||||
| 2754 | for (Value *V : VL) { | ||||
| 2755 | if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { | ||||
| 2756 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n")do { } while (false); | ||||
| 2757 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2758 | return; | ||||
| 2759 | } | ||||
| 2760 | } | ||||
| 2761 | |||||
| 2762 | // Check that all of the users of the scalars that we want to vectorize are | ||||
| 2763 | // schedulable. | ||||
| 2764 | auto *VL0 = cast<Instruction>(S.OpValue); | ||||
| 2765 | BasicBlock *BB = VL0->getParent(); | ||||
| 2766 | |||||
| 2767 | if (!DT->isReachableFromEntry(BB)) { | ||||
| 2768 | // Don't go into unreachable blocks. They may contain instructions with | ||||
| 2769 | // dependency cycles which confuse the final scheduling. | ||||
| 2770 | LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n")do { } while (false); | ||||
| 2771 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2772 | return; | ||||
| 2773 | } | ||||
| 2774 | |||||
| 2775 | // Check that every instruction appears once in this bundle. | ||||
| 2776 | SmallVector<unsigned, 4> ReuseShuffleIndicies; | ||||
| 2777 | SmallVector<Value *, 4> UniqueValues; | ||||
| 2778 | DenseMap<Value *, unsigned> UniquePositions; | ||||
| 2779 | for (Value *V : VL) { | ||||
| 2780 | auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); | ||||
| 2781 | ReuseShuffleIndicies.emplace_back(Res.first->second); | ||||
| 2782 | if (Res.second) | ||||
| 2783 | UniqueValues.emplace_back(V); | ||||
| 2784 | } | ||||
| 2785 | size_t NumUniqueScalarValues = UniqueValues.size(); | ||||
| 2786 | if (NumUniqueScalarValues == VL.size()) { | ||||
| 2787 | ReuseShuffleIndicies.clear(); | ||||
| 2788 | } else { | ||||
| 2789 | LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n")do { } while (false); | ||||
| 2790 | if (NumUniqueScalarValues <= 1 || | ||||
| 2791 | !llvm::isPowerOf2_32(NumUniqueScalarValues)) { | ||||
| 2792 | LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n")do { } while (false); | ||||
| 2793 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); | ||||
| 2794 | return; | ||||
| 2795 | } | ||||
| 2796 | VL = UniqueValues; | ||||
| 2797 | } | ||||
| 2798 | |||||
| 2799 | auto &BSRef = BlocksSchedules[BB]; | ||||
| 2800 | if (!BSRef) | ||||
| 2801 | BSRef = std::make_unique<BlockScheduling>(BB); | ||||
| 2802 | |||||
| 2803 | BlockScheduling &BS = *BSRef.get(); | ||||
| 2804 | |||||
| 2805 | Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); | ||||
| 2806 | if (!Bundle) { | ||||
| 2807 | LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n")do { } while (false); | ||||
| 2808 | assert((!BS.getScheduleData(VL0) ||((void)0) | ||||
| 2809 | !BS.getScheduleData(VL0)->isPartOfBundle()) &&((void)0) | ||||
| 2810 | "tryScheduleBundle should cancelScheduling on failure")((void)0); | ||||
| 2811 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 2812 | ReuseShuffleIndicies); | ||||
| 2813 | return; | ||||
| 2814 | } | ||||
| 2815 | LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n")do { } while (false); | ||||
| 2816 | |||||
| 2817 | unsigned ShuffleOrOp = S.isAltShuffle() ? | ||||
| 2818 | (unsigned) Instruction::ShuffleVector : S.getOpcode(); | ||||
| 2819 | switch (ShuffleOrOp) { | ||||
| 2820 | case Instruction::PHI: { | ||||
| 2821 | auto *PH = cast<PHINode>(VL0); | ||||
| 2822 | |||||
| 2823 | // Check for terminator values (e.g. invoke). | ||||
| 2824 | for (Value *V : VL) | ||||
| 2825 | for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { | ||||
| 2826 | Instruction *Term = dyn_cast<Instruction>( | ||||
| 2827 | cast<PHINode>(V)->getIncomingValueForBlock( | ||||
| 2828 | PH->getIncomingBlock(I))); | ||||
| 2829 | if (Term && Term->isTerminator()) { | ||||
| 2830 | LLVM_DEBUG(dbgs()do { } while (false) | ||||
| 2831 | << "SLP: Need to swizzle PHINodes (terminator use).\n")do { } while (false); | ||||
| 2832 | BS.cancelScheduling(VL, VL0); | ||||
| 2833 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 2834 | ReuseShuffleIndicies); | ||||
| 2835 | return; | ||||
| 2836 | } | ||||
| 2837 | } | ||||
| 2838 | |||||
| 2839 | TreeEntry *TE = | ||||
| 2840 | newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); | ||||
| 2841 | LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n")do { } while (false); | ||||
| 2842 | |||||
| 2843 | // Keeps the reordered operands to avoid code duplication. | ||||
| 2844 | SmallVector<ValueList, 2> OperandsVec; | ||||
| 2845 | for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { | ||||
| 2846 | if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) { | ||||
| 2847 | ValueList Operands(VL.size(), PoisonValue::get(PH->getType())); | ||||
| 2848 | TE->setOperand(I, Operands); | ||||
| 2849 | OperandsVec.push_back(Operands); | ||||
| 2850 | continue; | ||||
| 2851 | } | ||||
| 2852 | ValueList Operands; | ||||
| 2853 | // Prepare the operand vector. | ||||
| 2854 | for (Value *V : VL) | ||||
| 2855 | Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock( | ||||
| 2856 | PH->getIncomingBlock(I))); | ||||
| 2857 | TE->setOperand(I, Operands); | ||||
| 2858 | OperandsVec.push_back(Operands); | ||||
| 2859 | } | ||||
| 2860 | for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) | ||||
| 2861 | buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); | ||||
| 2862 | return; | ||||
| 2863 | } | ||||
| 2864 | case Instruction::ExtractValue: | ||||
| 2865 | case Instruction::ExtractElement: { | ||||
| 2866 | OrdersType CurrentOrder; | ||||
| 2867 | bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); | ||||
| 2868 | if (Reuse) { | ||||
| 2869 | LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n")do { } while (false); | ||||
| 2870 | ++NumOpsWantToKeepOriginalOrder; | ||||
| 2871 | newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 2872 | ReuseShuffleIndicies); | ||||
| 2873 | // This is a special case, as it does not gather, but at the same time | ||||
| 2874 | // we are not extending buildTree_rec() towards the operands. | ||||
| 2875 | ValueList Op0; | ||||
| 2876 | Op0.assign(VL.size(), VL0->getOperand(0)); | ||||
| 2877 | VectorizableTree.back()->setOperand(0, Op0); | ||||
| 2878 | return; | ||||
| 2879 | } | ||||
| 2880 | if (!CurrentOrder.empty()) { | ||||
| 2881 | LLVM_DEBUG({do { } while (false) | ||||
| 2882 | dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "do { } while (false) | ||||
| 2883 | "with order";do { } while (false) | ||||
| 2884 | for (unsigned Idx : CurrentOrder)do { } while (false) | ||||
| 2885 | dbgs() << " " << Idx;do { } while (false) | ||||
| 2886 | dbgs() << "\n";do { } while (false) | ||||
| 2887 | })do { } while (false); | ||||
| 2888 | // Insert new order with initial value 0, if it does not exist, | ||||
| 2889 | // otherwise return the iterator to the existing one. | ||||
| 2890 | newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 2891 | ReuseShuffleIndicies, CurrentOrder); | ||||
| 2892 | findRootOrder(CurrentOrder); | ||||
| 2893 | ++NumOpsWantToKeepOrder[CurrentOrder]; | ||||
| 2894 | // This is a special case, as it does not gather, but at the same time | ||||
| 2895 | // we are not extending buildTree_rec() towards the operands. | ||||
| 2896 | ValueList Op0; | ||||
| 2897 | Op0.assign(VL.size(), VL0->getOperand(0)); | ||||
| 2898 | VectorizableTree.back()->setOperand(0, Op0); | ||||
| 2899 | return; | ||||
| 2900 | } | ||||
| 2901 | LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n")do { } while (false); | ||||
| 2902 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 2903 | ReuseShuffleIndicies); | ||||
| 2904 | BS.cancelScheduling(VL, VL0); | ||||
| 2905 | return; | ||||
| 2906 | } | ||||
| 2907 | case Instruction::InsertElement: { | ||||
| 2908 | assert(ReuseShuffleIndicies.empty() && "All inserts should be unique")((void)0); | ||||
| 2909 | |||||
| 2910 | // Check that we have a buildvector and not a shuffle of 2 or more | ||||
| 2911 | // different vectors. | ||||
| 2912 | ValueSet SourceVectors; | ||||
| 2913 | for (Value *V : VL) | ||||
| 2914 | SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); | ||||
| 2915 | |||||
| 2916 | if (count_if(VL, [&SourceVectors](Value *V) { | ||||
| 2917 | return !SourceVectors.contains(V); | ||||
| 2918 | }) >= 2) { | ||||
| 2919 | // Found 2nd source vector - cancel. | ||||
| 2920 | LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "do { } while (false) | ||||
| 2921 | "different source vectors.\n")do { } while (false); | ||||
| 2922 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 2923 | ReuseShuffleIndicies); | ||||
| 2924 | BS.cancelScheduling(VL, VL0); | ||||
| 2925 | return; | ||||
| 2926 | } | ||||
| 2927 | |||||
| 2928 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx); | ||||
| 2929 | LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n")do { } while (false); | ||||
| 2930 | |||||
| 2931 | constexpr int NumOps = 2; | ||||
| 2932 | ValueList VectorOperands[NumOps]; | ||||
| 2933 | for (int I = 0; I < NumOps; ++I) { | ||||
| 2934 | for (Value *V : VL) | ||||
| 2935 | VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I)); | ||||
| 2936 | |||||
| 2937 | TE->setOperand(I, VectorOperands[I]); | ||||
| 2938 | } | ||||
| 2939 | buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0}); | ||||
| 2940 | return; | ||||
| 2941 | } | ||||
| 2942 | case Instruction::Load: { | ||||
| 2943 | // Check that a vectorized load would load the same memory as a scalar | ||||
| 2944 | // load. For example, we don't want to vectorize loads that are smaller | ||||
| 2945 | // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM | ||||
| 2946 | // treats loading/storing it as an i8 struct. If we vectorize loads/stores | ||||
| 2947 | // from such a struct, we read/write packed bits disagreeing with the | ||||
| 2948 | // unvectorized version. | ||||
| 2949 | Type *ScalarTy = VL0->getType(); | ||||
| 2950 | |||||
| 2951 | if (DL->getTypeSizeInBits(ScalarTy) != | ||||
| 2952 | DL->getTypeAllocSizeInBits(ScalarTy)) { | ||||
| 2953 | BS.cancelScheduling(VL, VL0); | ||||
| 2954 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 2955 | ReuseShuffleIndicies); | ||||
| 2956 | LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n")do { } while (false); | ||||
| 2957 | return; | ||||
| 2958 | } | ||||
| 2959 | |||||
| 2960 | // Make sure all loads in the bundle are simple - we can't vectorize | ||||
| 2961 | // atomic or volatile loads. | ||||
| 2962 | SmallVector<Value *, 4> PointerOps(VL.size()); | ||||
| 2963 | auto POIter = PointerOps.begin(); | ||||
| 2964 | for (Value *V : VL) { | ||||
| 2965 | auto *L = cast<LoadInst>(V); | ||||
| 2966 | if (!L->isSimple()) { | ||||
| 2967 | BS.cancelScheduling(VL, VL0); | ||||
| 2968 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 2969 | ReuseShuffleIndicies); | ||||
| 2970 | LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n")do { } while (false); | ||||
| 2971 | return; | ||||
| 2972 | } | ||||
| 2973 | *POIter = L->getPointerOperand(); | ||||
| 2974 | ++POIter; | ||||
| 2975 | } | ||||
| 2976 | |||||
| 2977 | OrdersType CurrentOrder; | ||||
| 2978 | // Check the order of pointer operands. | ||||
| 2979 | if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { | ||||
| 2980 | Value *Ptr0; | ||||
| 2981 | Value *PtrN; | ||||
| 2982 | if (CurrentOrder.empty()) { | ||||
| 2983 | Ptr0 = PointerOps.front(); | ||||
| 2984 | PtrN = PointerOps.back(); | ||||
| 2985 | } else { | ||||
| 2986 | Ptr0 = PointerOps[CurrentOrder.front()]; | ||||
| 2987 | PtrN = PointerOps[CurrentOrder.back()]; | ||||
| 2988 | } | ||||
| 2989 | Optional<int> Diff = getPointersDiff( | ||||
| 2990 | ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); | ||||
| 2991 | // Check that the sorted loads are consecutive. | ||||
| 2992 | if (static_cast<unsigned>(*Diff) == VL.size() - 1) { | ||||
| 2993 | if (CurrentOrder.empty()) { | ||||
| 2994 | // Original loads are consecutive and does not require reordering. | ||||
| 2995 | ++NumOpsWantToKeepOriginalOrder; | ||||
| 2996 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, | ||||
| 2997 | UserTreeIdx, ReuseShuffleIndicies); | ||||
| 2998 | TE->setOperandsInOrder(); | ||||
| 2999 | LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n")do { } while (false); | ||||
| 3000 | } else { | ||||
| 3001 | // Need to reorder. | ||||
| 3002 | TreeEntry *TE = | ||||
| 3003 | newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 3004 | ReuseShuffleIndicies, CurrentOrder); | ||||
| 3005 | TE->setOperandsInOrder(); | ||||
| 3006 | LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n")do { } while (false); | ||||
| 3007 | findRootOrder(CurrentOrder); | ||||
| 3008 | ++NumOpsWantToKeepOrder[CurrentOrder]; | ||||
| 3009 | } | ||||
| 3010 | return; | ||||
| 3011 | } | ||||
| 3012 | Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); | ||||
| 3013 | for (Value *V : VL) | ||||
| 3014 | CommonAlignment = | ||||
| 3015 | commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); | ||||
| 3016 | if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), | ||||
| 3017 | CommonAlignment)) { | ||||
| 3018 | // Vectorizing non-consecutive loads with `llvm.masked.gather`. | ||||
| 3019 | TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, | ||||
| 3020 | S, UserTreeIdx, ReuseShuffleIndicies); | ||||
| 3021 | TE->setOperandsInOrder(); | ||||
| 3022 | buildTree_rec(PointerOps, Depth + 1, {TE, 0}); | ||||
| 3023 | LLVM_DEBUG(dbgs()do { } while (false) | ||||
| 3024 | << "SLP: added a vector of non-consecutive loads.\n")do { } while (false); | ||||
| 3025 | return; | ||||
| 3026 | } | ||||
| 3027 | } | ||||
| 3028 | |||||
| 3029 | LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n")do { } while (false); | ||||
| 3030 | BS.cancelScheduling(VL, VL0); | ||||
| 3031 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3032 | ReuseShuffleIndicies); | ||||
| 3033 | return; | ||||
| 3034 | } | ||||
| 3035 | case Instruction::ZExt: | ||||
| 3036 | case Instruction::SExt: | ||||
| 3037 | case Instruction::FPToUI: | ||||
| 3038 | case Instruction::FPToSI: | ||||
| 3039 | case Instruction::FPExt: | ||||
| 3040 | case Instruction::PtrToInt: | ||||
| 3041 | case Instruction::IntToPtr: | ||||
| 3042 | case Instruction::SIToFP: | ||||
| 3043 | case Instruction::UIToFP: | ||||
| 3044 | case Instruction::Trunc: | ||||
| 3045 | case Instruction::FPTrunc: | ||||
| 3046 | case Instruction::BitCast: { | ||||
| 3047 | Type *SrcTy = VL0->getOperand(0)->getType(); | ||||
| 3048 | for (Value *V : VL) { | ||||
| 3049 | Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); | ||||
| 3050 | if (Ty != SrcTy || !isValidElementType(Ty)) { | ||||
| 3051 | BS.cancelScheduling(VL, VL0); | ||||
| 3052 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3053 | ReuseShuffleIndicies); | ||||
| 3054 | LLVM_DEBUG(dbgs()do { } while (false) | ||||
| 3055 | << "SLP: Gathering casts with different src types.\n")do { } while (false); | ||||
| 3056 | return; | ||||
| 3057 | } | ||||
| 3058 | } | ||||
| 3059 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 3060 | ReuseShuffleIndicies); | ||||
| 3061 | LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n")do { } while (false); | ||||
| 3062 | |||||
| 3063 | TE->setOperandsInOrder(); | ||||
| 3064 | for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { | ||||
| 3065 | ValueList Operands; | ||||
| 3066 | // Prepare the operand vector. | ||||
| 3067 | for (Value *V : VL) | ||||
| 3068 | Operands.push_back(cast<Instruction>(V)->getOperand(i)); | ||||
| 3069 | |||||
| 3070 | buildTree_rec(Operands, Depth + 1, {TE, i}); | ||||
| 3071 | } | ||||
| 3072 | return; | ||||
| 3073 | } | ||||
| 3074 | case Instruction::ICmp: | ||||
| 3075 | case Instruction::FCmp: { | ||||
| 3076 | // Check that all of the compares have the same predicate. | ||||
| 3077 | CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); | ||||
| 3078 | CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); | ||||
| 3079 | Type *ComparedTy = VL0->getOperand(0)->getType(); | ||||
| 3080 | for (Value *V : VL) { | ||||
| 3081 | CmpInst *Cmp = cast<CmpInst>(V); | ||||
| 3082 | if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || | ||||
| 3083 | Cmp->getOperand(0)->getType() != ComparedTy) { | ||||
| 3084 | BS.cancelScheduling(VL, VL0); | ||||
| 3085 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3086 | ReuseShuffleIndicies); | ||||
| 3087 | LLVM_DEBUG(dbgs()do { } while (false) | ||||
| 3088 | << "SLP: Gathering cmp with different predicate.\n")do { } while (false); | ||||
| 3089 | return; | ||||
| 3090 | } | ||||
| 3091 | } | ||||
| 3092 | |||||
| 3093 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 3094 | ReuseShuffleIndicies); | ||||
| 3095 | LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n")do { } while (false); | ||||
| 3096 | |||||
| 3097 | ValueList Left, Right; | ||||
| 3098 | if (cast<CmpInst>(VL0)->isCommutative()) { | ||||
| 3099 | // Commutative predicate - collect + sort operands of the instructions | ||||
| 3100 | // so that each side is more likely to have the same opcode. | ||||
| 3101 | assert(P0 == SwapP0 && "Commutative Predicate mismatch")((void)0); | ||||
| 3102 | reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); | ||||
| 3103 | } else { | ||||
| 3104 | // Collect operands - commute if it uses the swapped predicate. | ||||
| 3105 | for (Value *V : VL) { | ||||
| 3106 | auto *Cmp = cast<CmpInst>(V); | ||||
| 3107 | Value *LHS = Cmp->getOperand(0); | ||||
| 3108 | Value *RHS = Cmp->getOperand(1); | ||||
| 3109 | if (Cmp->getPredicate() != P0) | ||||
| 3110 | std::swap(LHS, RHS); | ||||
| 3111 | Left.push_back(LHS); | ||||
| 3112 | Right.push_back(RHS); | ||||
| 3113 | } | ||||
| 3114 | } | ||||
| 3115 | TE->setOperand(0, Left); | ||||
| 3116 | TE->setOperand(1, Right); | ||||
| 3117 | buildTree_rec(Left, Depth + 1, {TE, 0}); | ||||
| 3118 | buildTree_rec(Right, Depth + 1, {TE, 1}); | ||||
| 3119 | return; | ||||
| 3120 | } | ||||
| 3121 | case Instruction::Select: | ||||
| 3122 | case Instruction::FNeg: | ||||
| 3123 | case Instruction::Add: | ||||
| 3124 | case Instruction::FAdd: | ||||
| 3125 | case Instruction::Sub: | ||||
| 3126 | case Instruction::FSub: | ||||
| 3127 | case Instruction::Mul: | ||||
| 3128 | case Instruction::FMul: | ||||
| 3129 | case Instruction::UDiv: | ||||
| 3130 | case Instruction::SDiv: | ||||
| 3131 | case Instruction::FDiv: | ||||
| 3132 | case Instruction::URem: | ||||
| 3133 | case Instruction::SRem: | ||||
| 3134 | case Instruction::FRem: | ||||
| 3135 | case Instruction::Shl: | ||||
| 3136 | case Instruction::LShr: | ||||
| 3137 | case Instruction::AShr: | ||||
| 3138 | case Instruction::And: | ||||
| 3139 | case Instruction::Or: | ||||
| 3140 | case Instruction::Xor: { | ||||
| 3141 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 3142 | ReuseShuffleIndicies); | ||||
| 3143 | LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n")do { } while (false); | ||||
| 3144 | |||||
| 3145 | // Sort operands of the instructions so that each side is more likely to | ||||
| 3146 | // have the same opcode. | ||||
| 3147 | if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { | ||||
| 3148 | ValueList Left, Right; | ||||
| 3149 | reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); | ||||
| 3150 | TE->setOperand(0, Left); | ||||
| 3151 | TE->setOperand(1, Right); | ||||
| 3152 | buildTree_rec(Left, Depth + 1, {TE, 0}); | ||||
| 3153 | buildTree_rec(Right, Depth + 1, {TE, 1}); | ||||
| 3154 | return; | ||||
| 3155 | } | ||||
| 3156 | |||||
| 3157 | TE->setOperandsInOrder(); | ||||
| 3158 | for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { | ||||
| 3159 | ValueList Operands; | ||||
| 3160 | // Prepare the operand vector. | ||||
| 3161 | for (Value *V : VL) | ||||
| 3162 | Operands.push_back(cast<Instruction>(V)->getOperand(i)); | ||||
| 3163 | |||||
| 3164 | buildTree_rec(Operands, Depth + 1, {TE, i}); | ||||
| 3165 | } | ||||
| 3166 | return; | ||||
| 3167 | } | ||||
| 3168 | case Instruction::GetElementPtr: { | ||||
| 3169 | // We don't combine GEPs with complicated (nested) indexing. | ||||
| 3170 | for (Value *V : VL) { | ||||
| 3171 | if (cast<Instruction>(V)->getNumOperands() != 2) { | ||||
| 3172 | LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n")do { } while (false); | ||||
| 3173 | BS.cancelScheduling(VL, VL0); | ||||
| 3174 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3175 | ReuseShuffleIndicies); | ||||
| 3176 | return; | ||||
| 3177 | } | ||||
| 3178 | } | ||||
| 3179 | |||||
| 3180 | // We can't combine several GEPs into one vector if they operate on | ||||
| 3181 | // different types. | ||||
| 3182 | Type *Ty0 = VL0->getOperand(0)->getType(); | ||||
| 3183 | for (Value *V : VL) { | ||||
| 3184 | Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType(); | ||||
| 3185 | if (Ty0 != CurTy) { | ||||
| 3186 | LLVM_DEBUG(dbgs()do { } while (false) | ||||
| 3187 | << "SLP: not-vectorizable GEP (different types).\n")do { } while (false); | ||||
| 3188 | BS.cancelScheduling(VL, VL0); | ||||
| 3189 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3190 | ReuseShuffleIndicies); | ||||
| 3191 | return; | ||||
| 3192 | } | ||||
| 3193 | } | ||||
| 3194 | |||||
| 3195 | // We don't combine GEPs with non-constant indexes. | ||||
| 3196 | Type *Ty1 = VL0->getOperand(1)->getType(); | ||||
| 3197 | for (Value *V : VL) { | ||||
| 3198 | auto Op = cast<Instruction>(V)->getOperand(1); | ||||
| 3199 | if (!isa<ConstantInt>(Op) || | ||||
| 3200 | (Op->getType() != Ty1 && | ||||
| 3201 | Op->getType()->getScalarSizeInBits() > | ||||
| 3202 | DL->getIndexSizeInBits( | ||||
| 3203 | V->getType()->getPointerAddressSpace()))) { | ||||
| 3204 | LLVM_DEBUG(dbgs()do { } while (false) | ||||
| 3205 | << "SLP: not-vectorizable GEP (non-constant indexes).\n")do { } while (false); | ||||
| 3206 | BS.cancelScheduling(VL, VL0); | ||||
| 3207 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3208 | ReuseShuffleIndicies); | ||||
| 3209 | return; | ||||
| 3210 | } | ||||
| 3211 | } | ||||
| 3212 | |||||
| 3213 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 3214 | ReuseShuffleIndicies); | ||||
| 3215 | LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n")do { } while (false); | ||||
| 3216 | TE->setOperandsInOrder(); | ||||
| 3217 | for (unsigned i = 0, e = 2; i < e; ++i) { | ||||
| 3218 | ValueList Operands; | ||||
| 3219 | // Prepare the operand vector. | ||||
| 3220 | for (Value *V : VL) | ||||
| 3221 | Operands.push_back(cast<Instruction>(V)->getOperand(i)); | ||||
| 3222 | |||||
| 3223 | buildTree_rec(Operands, Depth + 1, {TE, i}); | ||||
| 3224 | } | ||||
| 3225 | return; | ||||
| 3226 | } | ||||
| 3227 | case Instruction::Store: { | ||||
| 3228 | // Check if the stores are consecutive or if we need to swizzle them. | ||||
| 3229 | llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); | ||||
| 3230 | // Avoid types that are padded when being allocated as scalars, while | ||||
| 3231 | // being packed together in a vector (such as i1). | ||||
| 3232 | if (DL->getTypeSizeInBits(ScalarTy) != | ||||
| 3233 | DL->getTypeAllocSizeInBits(ScalarTy)) { | ||||
| 3234 | BS.cancelScheduling(VL, VL0); | ||||
| 3235 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3236 | ReuseShuffleIndicies); | ||||
| 3237 | LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n")do { } while (false); | ||||
| 3238 | return; | ||||
| 3239 | } | ||||
| 3240 | // Make sure all stores in the bundle are simple - we can't vectorize | ||||
| 3241 | // atomic or volatile stores. | ||||
| 3242 | SmallVector<Value *, 4> PointerOps(VL.size()); | ||||
| 3243 | ValueList Operands(VL.size()); | ||||
| 3244 | auto POIter = PointerOps.begin(); | ||||
| 3245 | auto OIter = Operands.begin(); | ||||
| 3246 | for (Value *V : VL) { | ||||
| 3247 | auto *SI = cast<StoreInst>(V); | ||||
| 3248 | if (!SI->isSimple()) { | ||||
| 3249 | BS.cancelScheduling(VL, VL0); | ||||
| 3250 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3251 | ReuseShuffleIndicies); | ||||
| 3252 | LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n")do { } while (false); | ||||
| 3253 | return; | ||||
| 3254 | } | ||||
| 3255 | *POIter = SI->getPointerOperand(); | ||||
| 3256 | *OIter = SI->getValueOperand(); | ||||
| 3257 | ++POIter; | ||||
| 3258 | ++OIter; | ||||
| 3259 | } | ||||
| 3260 | |||||
| 3261 | OrdersType CurrentOrder; | ||||
| 3262 | // Check the order of pointer operands. | ||||
| 3263 | if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { | ||||
| 3264 | Value *Ptr0; | ||||
| 3265 | Value *PtrN; | ||||
| 3266 | if (CurrentOrder.empty()) { | ||||
| 3267 | Ptr0 = PointerOps.front(); | ||||
| 3268 | PtrN = PointerOps.back(); | ||||
| 3269 | } else { | ||||
| 3270 | Ptr0 = PointerOps[CurrentOrder.front()]; | ||||
| 3271 | PtrN = PointerOps[CurrentOrder.back()]; | ||||
| 3272 | } | ||||
| 3273 | Optional<int> Dist = | ||||
| 3274 | getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); | ||||
| 3275 | // Check that the sorted pointer operands are consecutive. | ||||
| 3276 | if (static_cast<unsigned>(*Dist) == VL.size() - 1) { | ||||
| 3277 | if (CurrentOrder.empty()) { | ||||
| 3278 | // Original stores are consecutive and does not require reordering. | ||||
| 3279 | ++NumOpsWantToKeepOriginalOrder; | ||||
| 3280 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, | ||||
| 3281 | UserTreeIdx, ReuseShuffleIndicies); | ||||
| 3282 | TE->setOperandsInOrder(); | ||||
| 3283 | buildTree_rec(Operands, Depth + 1, {TE, 0}); | ||||
| 3284 | LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n")do { } while (false); | ||||
| 3285 | } else { | ||||
| 3286 | TreeEntry *TE = | ||||
| 3287 | newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 3288 | ReuseShuffleIndicies, CurrentOrder); | ||||
| 3289 | TE->setOperandsInOrder(); | ||||
| 3290 | buildTree_rec(Operands, Depth + 1, {TE, 0}); | ||||
| 3291 | LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n")do { } while (false); | ||||
| 3292 | findRootOrder(CurrentOrder); | ||||
| 3293 | ++NumOpsWantToKeepOrder[CurrentOrder]; | ||||
| 3294 | } | ||||
| 3295 | return; | ||||
| 3296 | } | ||||
| 3297 | } | ||||
| 3298 | |||||
| 3299 | BS.cancelScheduling(VL, VL0); | ||||
| 3300 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3301 | ReuseShuffleIndicies); | ||||
| 3302 | LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n")do { } while (false); | ||||
| 3303 | return; | ||||
| 3304 | } | ||||
| 3305 | case Instruction::Call: { | ||||
| 3306 | // Check if the calls are all to the same vectorizable intrinsic or | ||||
| 3307 | // library function. | ||||
| 3308 | CallInst *CI = cast<CallInst>(VL0); | ||||
| 3309 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); | ||||
| 3310 | |||||
| 3311 | VFShape Shape = VFShape::get( | ||||
| 3312 | *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())), | ||||
| 3313 | false /*HasGlobalPred*/); | ||||
| 3314 | Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); | ||||
| 3315 | |||||
| 3316 | if (!VecFunc && !isTriviallyVectorizable(ID)) { | ||||
| 3317 | BS.cancelScheduling(VL, VL0); | ||||
| 3318 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3319 | ReuseShuffleIndicies); | ||||
| 3320 | LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n")do { } while (false); | ||||
| 3321 | return; | ||||
| 3322 | } | ||||
| 3323 | Function *F = CI->getCalledFunction(); | ||||
| 3324 | unsigned NumArgs = CI->getNumArgOperands(); | ||||
| 3325 | SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); | ||||
| 3326 | for (unsigned j = 0; j != NumArgs; ++j) | ||||
| 3327 | if (hasVectorInstrinsicScalarOpd(ID, j)) | ||||
| 3328 | ScalarArgs[j] = CI->getArgOperand(j); | ||||
| 3329 | for (Value *V : VL) { | ||||
| 3330 | CallInst *CI2 = dyn_cast<CallInst>(V); | ||||
| 3331 | if (!CI2 || CI2->getCalledFunction() != F || | ||||
| 3332 | getVectorIntrinsicIDForCall(CI2, TLI) != ID || | ||||
| 3333 | (VecFunc && | ||||
| 3334 | VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || | ||||
| 3335 | !CI->hasIdenticalOperandBundleSchema(*CI2)) { | ||||
| 3336 | BS.cancelScheduling(VL, VL0); | ||||
| 3337 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3338 | ReuseShuffleIndicies); | ||||
| 3339 | LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *Vdo { } while (false) | ||||
| 3340 | << "\n")do { } while (false); | ||||
| 3341 | return; | ||||
| 3342 | } | ||||
| 3343 | // Some intrinsics have scalar arguments and should be same in order for | ||||
| 3344 | // them to be vectorized. | ||||
| 3345 | for (unsigned j = 0; j != NumArgs; ++j) { | ||||
| 3346 | if (hasVectorInstrinsicScalarOpd(ID, j)) { | ||||
| 3347 | Value *A1J = CI2->getArgOperand(j); | ||||
| 3348 | if (ScalarArgs[j] != A1J) { | ||||
| 3349 | BS.cancelScheduling(VL, VL0); | ||||
| 3350 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3351 | ReuseShuffleIndicies); | ||||
| 3352 | LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CIdo { } while (false) | ||||
| 3353 | << " argument " << ScalarArgs[j] << "!=" << A1Jdo { } while (false) | ||||
| 3354 | << "\n")do { } while (false); | ||||
| 3355 | return; | ||||
| 3356 | } | ||||
| 3357 | } | ||||
| 3358 | } | ||||
| 3359 | // Verify that the bundle operands are identical between the two calls. | ||||
| 3360 | if (CI->hasOperandBundles() && | ||||
| 3361 | !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), | ||||
| 3362 | CI->op_begin() + CI->getBundleOperandsEndIndex(), | ||||
| 3363 | CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { | ||||
| 3364 | BS.cancelScheduling(VL, VL0); | ||||
| 3365 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3366 | ReuseShuffleIndicies); | ||||
| 3367 | LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"do { } while (false) | ||||
| 3368 | << *CI << "!=" << *V << '\n')do { } while (false); | ||||
| 3369 | return; | ||||
| 3370 | } | ||||
| 3371 | } | ||||
| 3372 | |||||
| 3373 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 3374 | ReuseShuffleIndicies); | ||||
| 3375 | TE->setOperandsInOrder(); | ||||
| 3376 | for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { | ||||
| 3377 | ValueList Operands; | ||||
| 3378 | // Prepare the operand vector. | ||||
| 3379 | for (Value *V : VL) { | ||||
| 3380 | auto *CI2 = cast<CallInst>(V); | ||||
| 3381 | Operands.push_back(CI2->getArgOperand(i)); | ||||
| 3382 | } | ||||
| 3383 | buildTree_rec(Operands, Depth + 1, {TE, i}); | ||||
| 3384 | } | ||||
| 3385 | return; | ||||
| 3386 | } | ||||
| 3387 | case Instruction::ShuffleVector: { | ||||
| 3388 | // If this is not an alternate sequence of opcode like add-sub | ||||
| 3389 | // then do not vectorize this instruction. | ||||
| 3390 | if (!S.isAltShuffle()) { | ||||
| 3391 | BS.cancelScheduling(VL, VL0); | ||||
| 3392 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3393 | ReuseShuffleIndicies); | ||||
| 3394 | LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n")do { } while (false); | ||||
| 3395 | return; | ||||
| 3396 | } | ||||
| 3397 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, | ||||
| 3398 | ReuseShuffleIndicies); | ||||
| 3399 | LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n")do { } while (false); | ||||
| 3400 | |||||
| 3401 | // Reorder operands if reordering would enable vectorization. | ||||
| 3402 | if (isa<BinaryOperator>(VL0)) { | ||||
| 3403 | ValueList Left, Right; | ||||
| 3404 | reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); | ||||
| 3405 | TE->setOperand(0, Left); | ||||
| 3406 | TE->setOperand(1, Right); | ||||
| 3407 | buildTree_rec(Left, Depth + 1, {TE, 0}); | ||||
| 3408 | buildTree_rec(Right, Depth + 1, {TE, 1}); | ||||
| 3409 | return; | ||||
| 3410 | } | ||||
| 3411 | |||||
| 3412 | TE->setOperandsInOrder(); | ||||
| 3413 | for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { | ||||
| 3414 | ValueList Operands; | ||||
| 3415 | // Prepare the operand vector. | ||||
| 3416 | for (Value *V : VL) | ||||
| 3417 | Operands.push_back(cast<Instruction>(V)->getOperand(i)); | ||||
| 3418 | |||||
| 3419 | buildTree_rec(Operands, Depth + 1, {TE, i}); | ||||
| 3420 | } | ||||
| 3421 | return; | ||||
| 3422 | } | ||||
| 3423 | default: | ||||
| 3424 | BS.cancelScheduling(VL, VL0); | ||||
| 3425 | newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, | ||||
| 3426 | ReuseShuffleIndicies); | ||||
| 3427 | LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n")do { } while (false); | ||||
| 3428 | return; | ||||
| 3429 | } | ||||
| 3430 | } | ||||
| 3431 | |||||
| 3432 | unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { | ||||
| 3433 | unsigned N = 1; | ||||
| 3434 | Type *EltTy = T; | ||||
| 3435 | |||||
| 3436 | while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) || | ||||
| 3437 | isa<VectorType>(EltTy)) { | ||||
| 3438 | if (auto *ST = dyn_cast<StructType>(EltTy)) { | ||||
| 3439 | // Check that struct is homogeneous. | ||||
| 3440 | for (const auto *Ty : ST->elements()) | ||||
| 3441 | if (Ty != *ST->element_begin()) | ||||
| 3442 | return 0; | ||||
| 3443 | N *= ST->getNumElements(); | ||||
| 3444 | EltTy = *ST->element_begin(); | ||||
| 3445 | } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) { | ||||
| 3446 | N *= AT->getNumElements(); | ||||
| 3447 | EltTy = AT->getElementType(); | ||||
| 3448 | } else { | ||||
| 3449 | auto *VT = cast<FixedVectorType>(EltTy); | ||||
| 3450 | N *= VT->getNumElements(); | ||||
| 3451 | EltTy = VT->getElementType(); | ||||
| 3452 | } | ||||
| 3453 | } | ||||
| 3454 | |||||
| 3455 | if (!isValidElementType(EltTy)) | ||||
| 3456 | return 0; | ||||
| 3457 | uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N)); | ||||
| 3458 | if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) | ||||
| 3459 | return 0; | ||||
| 3460 | return N; | ||||
| 3461 | } | ||||
| 3462 | |||||
| 3463 | bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, | ||||
| 3464 | SmallVectorImpl<unsigned> &CurrentOrder) const { | ||||
| 3465 | Instruction *E0 = cast<Instruction>(OpValue); | ||||
| 3466 | assert(E0->getOpcode() == Instruction::ExtractElement ||((void)0) | ||||
| 3467 | E0->getOpcode() == Instruction::ExtractValue)((void)0); | ||||
| 3468 | assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode")((void)0); | ||||
| 3469 | // Check if all of the extracts come from the same vector and from the | ||||
| 3470 | // correct offset. | ||||
| 3471 | Value *Vec = E0->getOperand(0); | ||||
| 3472 | |||||
| 3473 | CurrentOrder.clear(); | ||||
| 3474 | |||||
| 3475 | // We have to extract from a vector/aggregate with the same number of elements. | ||||
| 3476 | unsigned NElts; | ||||
| 3477 | if (E0->getOpcode() == Instruction::ExtractValue) { | ||||
| 3478 | const DataLayout &DL = E0->getModule()->getDataLayout(); | ||||
| 3479 | NElts = canMapToVector(Vec->getType(), DL); | ||||
| 3480 | if (!NElts) | ||||
| 3481 | return false; | ||||
| 3482 | // Check if load can be rewritten as load of vector. | ||||
| 3483 | LoadInst *LI = dyn_cast<LoadInst>(Vec); | ||||
| 3484 | if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) | ||||
| 3485 | return false; | ||||
| 3486 | } else { | ||||
| 3487 | NElts = cast<FixedVectorType>(Vec->getType())->getNumElements(); | ||||
| 3488 | } | ||||
| 3489 | |||||
| 3490 | if (NElts != VL.size()) | ||||
| 3491 | return false; | ||||
| 3492 | |||||
| 3493 | // Check that all of the indices extract from the correct offset. | ||||
| 3494 | bool ShouldKeepOrder = true; | ||||
| 3495 | unsigned E = VL.size(); | ||||
| 3496 | // Assign to all items the initial value E + 1 so we can check if the extract | ||||
| 3497 | // instruction index was used already. | ||||
| 3498 | // Also, later we can check that all the indices are used and we have a | ||||
| 3499 | // consecutive access in the extract instructions, by checking that no | ||||
| 3500 | // element of CurrentOrder still has value E + 1. | ||||
| 3501 | CurrentOrder.assign(E, E + 1); | ||||
| 3502 | unsigned I = 0; | ||||
| 3503 | for (; I < E; ++I) { | ||||
| 3504 | auto *Inst = cast<Instruction>(VL[I]); | ||||
| 3505 | if (Inst->getOperand(0) != Vec) | ||||
| 3506 | break; | ||||
| 3507 | Optional<unsigned> Idx = getExtractIndex(Inst); | ||||
| 3508 | if (!Idx) | ||||
| 3509 | break; | ||||
| 3510 | const unsigned ExtIdx = *Idx; | ||||
| 3511 | if (ExtIdx != I) { | ||||
| 3512 | if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) | ||||
| 3513 | break; | ||||
| 3514 | ShouldKeepOrder = false; | ||||
| 3515 | CurrentOrder[ExtIdx] = I; | ||||
| 3516 | } else { | ||||
| 3517 | if (CurrentOrder[I] != E + 1) | ||||
| 3518 | break; | ||||
| 3519 | CurrentOrder[I] = I; | ||||
| 3520 | } | ||||
| 3521 | } | ||||
| 3522 | if (I < E) { | ||||
| 3523 | CurrentOrder.clear(); | ||||
| 3524 | return false; | ||||
| 3525 | } | ||||
| 3526 | |||||
| 3527 | return ShouldKeepOrder; | ||||
| 3528 | } | ||||
| 3529 | |||||
| 3530 | bool BoUpSLP::areAllUsersVectorized(Instruction *I, | ||||
| 3531 | ArrayRef<Value *> VectorizedVals) const { | ||||
| 3532 | return (I->hasOneUse() && is_contained(VectorizedVals, I)) || | ||||
| 3533 | llvm::all_of(I->users(), [this](User *U) { | ||||
| 3534 | return ScalarToTreeEntry.count(U) > 0; | ||||
| 3535 | }); | ||||
| 3536 | } | ||||
| 3537 | |||||
| 3538 | static std::pair<InstructionCost, InstructionCost> | ||||
| 3539 | getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, | ||||
| 3540 | TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { | ||||
| 3541 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); | ||||
| 3542 | |||||
| 3543 | // Calculate the cost of the scalar and vector calls. | ||||
| 3544 | SmallVector<Type *, 4> VecTys; | ||||
| 3545 | for (Use &Arg : CI->args()) | ||||
| 3546 | VecTys.push_back( | ||||
| 3547 | FixedVectorType::get(Arg->getType(), VecTy->getNumElements())); | ||||
| 3548 | FastMathFlags FMF; | ||||
| 3549 | if (auto *FPCI = dyn_cast<FPMathOperator>(CI)) | ||||
| 3550 | FMF = FPCI->getFastMathFlags(); | ||||
| 3551 | SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); | ||||
| 3552 | IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF, | ||||
| 3553 | dyn_cast<IntrinsicInst>(CI)); | ||||
| 3554 | auto IntrinsicCost = | ||||
| 3555 | TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); | ||||
| 3556 | |||||
| 3557 | auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( | ||||
| 3558 | VecTy->getNumElements())), | ||||
| 3559 | false /*HasGlobalPred*/); | ||||
| 3560 | Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); | ||||
| 3561 | auto LibCost = IntrinsicCost; | ||||
| 3562 | if (!CI->isNoBuiltin() && VecFunc) { | ||||
| 3563 | // Calculate the cost of the vector library call. | ||||
| 3564 | // If the corresponding vector call is cheaper, return its cost. | ||||
| 3565 | LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys, | ||||
| 3566 | TTI::TCK_RecipThroughput); | ||||
| 3567 | } | ||||
| 3568 | return {IntrinsicCost, LibCost}; | ||||
| 3569 | } | ||||
| 3570 | |||||
| 3571 | /// Compute the cost of creating a vector of type \p VecTy containing the | ||||
| 3572 | /// extracted values from \p VL. | ||||
| 3573 | static InstructionCost | ||||
| 3574 | computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, | ||||
| 3575 | TargetTransformInfo::ShuffleKind ShuffleKind, | ||||
| 3576 | ArrayRef<int> Mask, TargetTransformInfo &TTI) { | ||||
| 3577 | unsigned NumOfParts = TTI.getNumberOfParts(VecTy); | ||||
| 3578 | |||||
| 3579 | if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts || | ||||
| 3580 | VecTy->getNumElements() < NumOfParts) | ||||
| 3581 | return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); | ||||
| 3582 | |||||
| 3583 | bool AllConsecutive = true; | ||||
| 3584 | unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; | ||||
| 3585 | unsigned Idx = -1; | ||||
| 3586 | InstructionCost Cost = 0; | ||||
| 3587 | |||||
| 3588 | // Process extracts in blocks of EltsPerVector to check if the source vector | ||||
| 3589 | // operand can be re-used directly. If not, add the cost of creating a shuffle | ||||
| 3590 | // to extract the values into a vector register. | ||||
| 3591 | for (auto *V : VL) { | ||||
| 3592 | ++Idx; | ||||
| 3593 | |||||
| 3594 | // Reached the start of a new vector registers. | ||||
| 3595 | if (Idx % EltsPerVector == 0) { | ||||
| 3596 | AllConsecutive = true; | ||||
| 3597 | continue; | ||||
| 3598 | } | ||||
| 3599 | |||||
| 3600 | // Check all extracts for a vector register on the target directly | ||||
| 3601 | // extract values in order. | ||||
| 3602 | unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); | ||||
| 3603 | unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); | ||||
| 3604 | AllConsecutive &= PrevIdx + 1 == CurrentIdx && | ||||
| 3605 | CurrentIdx % EltsPerVector == Idx % EltsPerVector; | ||||
| 3606 | |||||
| 3607 | if (AllConsecutive) | ||||
| 3608 | continue; | ||||
| 3609 | |||||
| 3610 | // Skip all indices, except for the last index per vector block. | ||||
| 3611 | if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) | ||||
| 3612 | continue; | ||||
| 3613 | |||||
| 3614 | // If we have a series of extracts which are not consecutive and hence | ||||
| 3615 | // cannot re-use the source vector register directly, compute the shuffle | ||||
| 3616 | // cost to extract the a vector with EltsPerVector elements. | ||||
| 3617 | Cost += TTI.getShuffleCost( | ||||
| 3618 | TargetTransformInfo::SK_PermuteSingleSrc, | ||||
| 3619 | FixedVectorType::get(VecTy->getElementType(), EltsPerVector)); | ||||
| 3620 | } | ||||
| 3621 | return Cost; | ||||
| 3622 | } | ||||
| 3623 | |||||
| 3624 | /// Shuffles \p Mask in accordance with the given \p SubMask. | ||||
| 3625 | static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { | ||||
| 3626 | if (SubMask.empty()) | ||||
| 3627 | return; | ||||
| 3628 | if (Mask.empty()) { | ||||
| 3629 | Mask.append(SubMask.begin(), SubMask.end()); | ||||
| 3630 | return; | ||||
| 3631 | } | ||||
| 3632 | SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size()); | ||||
| 3633 | int TermValue = std::min(Mask.size(), SubMask.size()); | ||||
| 3634 | for (int I = 0, E = SubMask.size(); I < E; ++I) { | ||||
| 3635 | if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || | ||||
| 3636 | Mask[SubMask[I]] >= TermValue) { | ||||
| 3637 | NewMask[I] = UndefMaskElem; | ||||
| 3638 | continue; | ||||
| 3639 | } | ||||
| 3640 | NewMask[I] = Mask[SubMask[I]]; | ||||
| 3641 | } | ||||
| 3642 | Mask.swap(NewMask); | ||||
| 3643 | } | ||||
| 3644 | |||||
| 3645 | InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, | ||||
| 3646 | ArrayRef<Value *> VectorizedVals) { | ||||
| 3647 | ArrayRef<Value*> VL = E->Scalars; | ||||
| 3648 | |||||
| 3649 | Type *ScalarTy = VL[0]->getType(); | ||||
| 3650 | if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) | ||||
| 3651 | ScalarTy = SI->getValueOperand()->getType(); | ||||
| 3652 | else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0])) | ||||
| 3653 | ScalarTy = CI->getOperand(0)->getType(); | ||||
| 3654 | else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) | ||||
| 3655 | ScalarTy = IE->getOperand(1)->getType(); | ||||
| 3656 | auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); | ||||
| 3657 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | ||||
| 3658 | |||||
| 3659 | // If we have computed a smaller type for the expression, update VecTy so | ||||
| 3660 | // that the costs will be accurate. | ||||
| 3661 | if (MinBWs.count(VL[0])) | ||||
| 3662 | VecTy = FixedVectorType::get( | ||||
| 3663 | IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); | ||||
| 3664 | auto *FinalVecTy = VecTy; | ||||
| 3665 | |||||
| 3666 | unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); | ||||
| 3667 | bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); | ||||
| 3668 | if (NeedToShuffleReuses) | ||||
| 3669 | FinalVecTy = | ||||
| 3670 | FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers); | ||||
| 3671 | // FIXME: it tries to fix a problem with MSVC buildbots. | ||||
| 3672 | TargetTransformInfo &TTIRef = *TTI; | ||||
| 3673 | auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, | ||||
| 3674 | VectorizedVals](InstructionCost &Cost, | ||||
| 3675 | bool IsGather) { | ||||
| 3676 | DenseMap<Value *, int> ExtractVectorsTys; | ||||
| 3677 | for (auto *V : VL) { | ||||
| 3678 | // If all users of instruction are going to be vectorized and this | ||||
| 3679 | // instruction itself is not going to be vectorized, consider this | ||||
| 3680 | // instruction as dead and remove its cost from the final cost of the | ||||
| 3681 | // vectorized tree. | ||||
| 3682 | if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || | ||||
| 3683 | (IsGather && ScalarToTreeEntry.count(V))) | ||||
| 3684 | continue; | ||||
| 3685 | auto *EE = cast<ExtractElementInst>(V); | ||||
| 3686 | unsigned Idx = *getExtractIndex(EE); | ||||
| 3687 | if (TTIRef.getNumberOfParts(VecTy) != | ||||
| 3688 | TTIRef.getNumberOfParts(EE->getVectorOperandType())) { | ||||
| 3689 | auto It = | ||||
| 3690 | ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; | ||||
| 3691 | It->getSecond() = std::min<int>(It->second, Idx); | ||||
| 3692 | } | ||||
| 3693 | // Take credit for instruction that will become dead. | ||||
| 3694 | if (EE->hasOneUse()) { | ||||
| 3695 | Instruction *Ext = EE->user_back(); | ||||
| 3696 | if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && | ||||
| 3697 | all_of(Ext->users(), | ||||
| 3698 | [](User *U) { return isa<GetElementPtrInst>(U); })) { | ||||
| 3699 | // Use getExtractWithExtendCost() to calculate the cost of | ||||
| 3700 | // extractelement/ext pair. | ||||
| 3701 | Cost -= | ||||
| 3702 | TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), | ||||
| 3703 | EE->getVectorOperandType(), Idx); | ||||
| 3704 | // Add back the cost of s|zext which is subtracted separately. | ||||
| 3705 | Cost += TTIRef.getCastInstrCost( | ||||
| 3706 | Ext->getOpcode(), Ext->getType(), EE->getType(), | ||||
| 3707 | TTI::getCastContextHint(Ext), CostKind, Ext); | ||||
| 3708 | continue; | ||||
| 3709 | } | ||||
| 3710 | } | ||||
| 3711 | Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement, | ||||
| 3712 | EE->getVectorOperandType(), Idx); | ||||
| 3713 | } | ||||
| 3714 | // Add a cost for subvector extracts/inserts if required. | ||||
| 3715 | for (const auto &Data : ExtractVectorsTys) { | ||||
| 3716 | auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); | ||||
| 3717 | unsigned NumElts = VecTy->getNumElements(); | ||||
| 3718 | if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { | ||||
| 3719 | unsigned Idx = (Data.second / NumElts) * NumElts; | ||||
| 3720 | unsigned EENumElts = EEVTy->getNumElements(); | ||||
| 3721 | if (Idx + NumElts <= EENumElts) { | ||||
| 3722 | Cost += | ||||
| 3723 | TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, | ||||
| 3724 | EEVTy, None, Idx, VecTy); | ||||
| 3725 | } else { | ||||
| 3726 | // Need to round up the subvector type vectorization factor to avoid a | ||||
| 3727 | // crash in cost model functions. Make SubVT so that Idx + VF of SubVT | ||||
| 3728 | // <= EENumElts. | ||||
| 3729 | auto *SubVT = | ||||
| 3730 | FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); | ||||
| 3731 | Cost += | ||||
| 3732 | TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, | ||||
| 3733 | EEVTy, None, Idx, SubVT); | ||||
| 3734 | } | ||||
| 3735 | } else { | ||||
| 3736 | Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, | ||||
| 3737 | VecTy, None, 0, EEVTy); | ||||
| 3738 | } | ||||
| 3739 | } | ||||
| 3740 | }; | ||||
| 3741 | if (E->State == TreeEntry::NeedToGather) { | ||||
| 3742 | if (allConstant(VL)) | ||||
| 3743 | return 0; | ||||
| 3744 | if (isa<InsertElementInst>(VL[0])) | ||||
| 3745 | return InstructionCost::getInvalid(); | ||||
| 3746 | SmallVector<int> Mask; | ||||
| 3747 | SmallVector<const TreeEntry *> Entries; | ||||
| 3748 | Optional<TargetTransformInfo::ShuffleKind> Shuffle = | ||||
| 3749 | isGatherShuffledEntry(E, Mask, Entries); | ||||
| 3750 | if (Shuffle.hasValue()) { | ||||
| 3751 | InstructionCost GatherCost = 0; | ||||
| 3752 | if (ShuffleVectorInst::isIdentityMask(Mask)) { | ||||
| 3753 | // Perfect match in the graph, will reuse the previously vectorized | ||||
| 3754 | // node. Cost is 0. | ||||
| 3755 | LLVM_DEBUG(do { } while (false) | ||||
| 3756 | dbgs()do { } while (false) | ||||
| 3757 | << "SLP: perfect diamond match for gather bundle that starts with "do { } while (false) | ||||
| 3758 | << *VL.front() << ".\n")do { } while (false); | ||||
| 3759 | if (NeedToShuffleReuses) | ||||
| 3760 | GatherCost = | ||||
| 3761 | TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, | ||||
| 3762 | FinalVecTy, E->ReuseShuffleIndices); | ||||
| 3763 | } else { | ||||
| 3764 | LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()do { } while (false) | ||||
| 3765 | << " entries for bundle that starts with "do { } while (false) | ||||
| 3766 | << *VL.front() << ".\n")do { } while (false); | ||||
| 3767 | // Detected that instead of gather we can emit a shuffle of single/two | ||||
| 3768 | // previously vectorized nodes. Add the cost of the permutation rather | ||||
| 3769 | // than gather. | ||||
| 3770 | ::addMask(Mask, E->ReuseShuffleIndices); | ||||
| 3771 | GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask); | ||||
| 3772 | } | ||||
| 3773 | return GatherCost; | ||||
| 3774 | } | ||||
| 3775 | if (isSplat(VL)) { | ||||
| 3776 | // Found the broadcasting of the single scalar, calculate the cost as the | ||||
| 3777 | // broadcast. | ||||
| 3778 | return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); | ||||
| 3779 | } | ||||
| 3780 | if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && | ||||
| 3781 | allSameBlock(VL) && | ||||
| 3782 | !isa<ScalableVectorType>( | ||||
| 3783 | cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) { | ||||
| 3784 | // Check that gather of extractelements can be represented as just a | ||||
| 3785 | // shuffle of a single/two vectors the scalars are extracted from. | ||||
| 3786 | SmallVector<int> Mask; | ||||
| 3787 | Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = | ||||
| 3788 | isShuffle(VL, Mask); | ||||
| 3789 | if (ShuffleKind.hasValue()) { | ||||
| 3790 | // Found the bunch of extractelement instructions that must be gathered | ||||
| 3791 | // into a vector and can be represented as a permutation elements in a | ||||
| 3792 | // single input vector or of 2 input vectors. | ||||
| 3793 | InstructionCost Cost = | ||||
| 3794 | computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); | ||||
| 3795 | AdjustExtractsCost(Cost, /*IsGather=*/true); | ||||
| 3796 | if (NeedToShuffleReuses) | ||||
| 3797 | Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, | ||||
| 3798 | FinalVecTy, E->ReuseShuffleIndices); | ||||
| 3799 | return Cost; | ||||
| 3800 | } | ||||
| 3801 | } | ||||
| 3802 | InstructionCost ReuseShuffleCost = 0; | ||||
| 3803 | if (NeedToShuffleReuses) | ||||
| 3804 | ReuseShuffleCost = TTI->getShuffleCost( | ||||
| 3805 | TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); | ||||
| 3806 | return ReuseShuffleCost + getGatherCost(VL); | ||||
| 3807 | } | ||||
| 3808 | InstructionCost CommonCost = 0; | ||||
| 3809 | SmallVector<int> Mask; | ||||
| 3810 | if (!E->ReorderIndices.empty()) { | ||||
| 3811 | SmallVector<int> NewMask; | ||||
| 3812 | if (E->getOpcode() == Instruction::Store) { | ||||
| 3813 | // For stores the order is actually a mask. | ||||
| 3814 | NewMask.resize(E->ReorderIndices.size()); | ||||
| 3815 | copy(E->ReorderIndices, NewMask.begin()); | ||||
| 3816 | } else { | ||||
| 3817 | inversePermutation(E->ReorderIndices, NewMask); | ||||
| 3818 | } | ||||
| 3819 | ::addMask(Mask, NewMask); | ||||
| 3820 | } | ||||
| 3821 | if (NeedToShuffleReuses) | ||||
| 3822 | ::addMask(Mask, E->ReuseShuffleIndices); | ||||
| 3823 | if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask)) | ||||
| 3824 | CommonCost = | ||||
| 3825 | TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); | ||||
| 3826 | assert((E->State == TreeEntry::Vectorize ||((void)0) | ||||
| 3827 | E->State == TreeEntry::ScatterVectorize) &&((void)0) | ||||
| 3828 | "Unhandled state")((void)0); | ||||
| 3829 | assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL")((void)0); | ||||
| 3830 | Instruction *VL0 = E->getMainOp(); | ||||
| 3831 | unsigned ShuffleOrOp = | ||||
| 3832 | E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); | ||||
| 3833 | switch (ShuffleOrOp) { | ||||
| 3834 | case Instruction::PHI: | ||||
| 3835 | return 0; | ||||
| 3836 | |||||
| 3837 | case Instruction::ExtractValue: | ||||
| 3838 | case Instruction::ExtractElement: { | ||||
| 3839 | // The common cost of removal ExtractElement/ExtractValue instructions + | ||||
| 3840 | // the cost of shuffles, if required to resuffle the original vector. | ||||
| 3841 | if (NeedToShuffleReuses) { | ||||
| 3842 | unsigned Idx = 0; | ||||
| 3843 | for (unsigned I : E->ReuseShuffleIndices) { | ||||
| 3844 | if (ShuffleOrOp == Instruction::ExtractElement) { | ||||
| 3845 | auto *EE = cast<ExtractElementInst>(VL[I]); | ||||
| 3846 | CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, | ||||
| 3847 | EE->getVectorOperandType(), | ||||
| 3848 | *getExtractIndex(EE)); | ||||
| 3849 | } else { | ||||
| 3850 | CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, | ||||
| 3851 | VecTy, Idx); | ||||
| 3852 | ++Idx; | ||||
| 3853 | } | ||||
| 3854 | } | ||||
| 3855 | Idx = ReuseShuffleNumbers; | ||||
| 3856 | for (Value *V : VL) { | ||||
| 3857 | if (ShuffleOrOp == Instruction::ExtractElement) { | ||||
| 3858 | auto *EE = cast<ExtractElementInst>(V); | ||||
| 3859 | CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, | ||||
| 3860 | EE->getVectorOperandType(), | ||||
| 3861 | *getExtractIndex(EE)); | ||||
| 3862 | } else { | ||||
| 3863 | --Idx; | ||||
| 3864 | CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, | ||||
| 3865 | VecTy, Idx); | ||||
| 3866 | } | ||||
| 3867 | } | ||||
| 3868 | } | ||||
| 3869 | if (ShuffleOrOp == Instruction::ExtractValue) { | ||||
| 3870 | for (unsigned I = 0, E = VL.size(); I < E; ++I) { | ||||
| 3871 | auto *EI = cast<Instruction>(VL[I]); | ||||
| 3872 | // Take credit for instruction that will become dead. | ||||
| 3873 | if (EI->hasOneUse()) { | ||||
| 3874 | Instruction *Ext = EI->user_back(); | ||||
| 3875 | if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && | ||||
| 3876 | all_of(Ext->users(), | ||||
| 3877 | [](User *U) { return isa<GetElementPtrInst>(U); })) { | ||||
| 3878 | // Use getExtractWithExtendCost() to calculate the cost of | ||||
| 3879 | // extractelement/ext pair. | ||||
| 3880 | CommonCost -= TTI->getExtractWithExtendCost( | ||||
| 3881 | Ext->getOpcode(), Ext->getType(), VecTy, I); | ||||
| 3882 | // Add back the cost of s|zext which is subtracted separately. | ||||
| 3883 | CommonCost += TTI->getCastInstrCost( | ||||
| 3884 | Ext->getOpcode(), Ext->getType(), EI->getType(), | ||||
| 3885 | TTI::getCastContextHint(Ext), CostKind, Ext); | ||||
| 3886 | continue; | ||||
| 3887 | } | ||||
| 3888 | } | ||||
| 3889 | CommonCost -= | ||||
| 3890 | TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); | ||||
| 3891 | } | ||||
| 3892 | } else { | ||||
| 3893 | AdjustExtractsCost(CommonCost, /*IsGather=*/false); | ||||
| 3894 | } | ||||
| 3895 | return CommonCost; | ||||
| 3896 | } | ||||
| 3897 | case Instruction::InsertElement: { | ||||
| 3898 | auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); | ||||
| 3899 | |||||
| 3900 | unsigned const NumElts = SrcVecTy->getNumElements(); | ||||
| 3901 | unsigned const NumScalars = VL.size(); | ||||
| 3902 | APInt DemandedElts = APInt::getNullValue(NumElts); | ||||
| 3903 | // TODO: Add support for Instruction::InsertValue. | ||||
| 3904 | unsigned Offset = UINT_MAX(2147483647 *2U +1U); | ||||
| 3905 | bool IsIdentity = true; | ||||
| 3906 | SmallVector<int> ShuffleMask(NumElts, UndefMaskElem); | ||||
| 3907 | for (unsigned I = 0; I < NumScalars; ++I) { | ||||
| 3908 | Optional<int> InsertIdx = getInsertIndex(VL[I], 0); | ||||
| 3909 | if (!InsertIdx || *InsertIdx == UndefMaskElem) | ||||
| 3910 | continue; | ||||
| 3911 | unsigned Idx = *InsertIdx; | ||||
| 3912 | DemandedElts.setBit(Idx); | ||||
| 3913 | if (Idx < Offset) { | ||||
| 3914 | Offset = Idx; | ||||
| 3915 | IsIdentity &= I == 0; | ||||
| 3916 | } else { | ||||
| 3917 | assert(Idx >= Offset && "Failed to find vector index offset")((void)0); | ||||
| 3918 | IsIdentity &= Idx - Offset == I; | ||||
| 3919 | } | ||||
| 3920 | ShuffleMask[Idx] = I; | ||||
| 3921 | } | ||||
| 3922 | assert(Offset < NumElts && "Failed to find vector index offset")((void)0); | ||||
| 3923 | |||||
| 3924 | InstructionCost Cost = 0; | ||||
| 3925 | Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, | ||||
| 3926 | /*Insert*/ true, /*Extract*/ false); | ||||
| 3927 | |||||
| 3928 | if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) { | ||||
| 3929 | // FIXME: Replace with SK_InsertSubvector once it is properly supported. | ||||
| 3930 | unsigned Sz = PowerOf2Ceil(Offset + NumScalars); | ||||
| 3931 | Cost += TTI->getShuffleCost( | ||||
| 3932 | TargetTransformInfo::SK_PermuteSingleSrc, | ||||
| 3933 | FixedVectorType::get(SrcVecTy->getElementType(), Sz)); | ||||
| 3934 | } else if (!IsIdentity) { | ||||
| 3935 | Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, | ||||
| 3936 | ShuffleMask); | ||||
| 3937 | } | ||||
| 3938 | |||||
| 3939 | return Cost; | ||||
| 3940 | } | ||||
| 3941 | case Instruction::ZExt: | ||||
| 3942 | case Instruction::SExt: | ||||
| 3943 | case Instruction::FPToUI: | ||||
| 3944 | case Instruction::FPToSI: | ||||
| 3945 | case Instruction::FPExt: | ||||
| 3946 | case Instruction::PtrToInt: | ||||
| 3947 | case Instruction::IntToPtr: | ||||
| 3948 | case Instruction::SIToFP: | ||||
| 3949 | case Instruction::UIToFP: | ||||
| 3950 | case Instruction::Trunc: | ||||
| 3951 | case Instruction::FPTrunc: | ||||
| 3952 | case Instruction::BitCast: { | ||||
| 3953 | Type *SrcTy = VL0->getOperand(0)->getType(); | ||||
| 3954 | InstructionCost ScalarEltCost = | ||||
| 3955 | TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, | ||||
| 3956 | TTI::getCastContextHint(VL0), CostKind, VL0); | ||||
| 3957 | if (NeedToShuffleReuses) { | ||||
| 3958 | CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; | ||||
| 3959 | } | ||||
| 3960 | |||||
| 3961 | // Calculate the cost of this instruction. | ||||
| 3962 | InstructionCost ScalarCost = VL.size() * ScalarEltCost; | ||||
| 3963 | |||||
| 3964 | auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); | ||||
| 3965 | InstructionCost VecCost = 0; | ||||
| 3966 | // Check if the values are candidates to demote. | ||||
| 3967 | if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { | ||||
| 3968 | VecCost = CommonCost + TTI->getCastInstrCost( | ||||
| 3969 | E->getOpcode(), VecTy, SrcVecTy, | ||||
| 3970 | TTI::getCastContextHint(VL0), CostKind, VL0); | ||||
| 3971 | } | ||||
| 3972 | LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { } while (false); | ||||
| 3973 | return VecCost - ScalarCost; | ||||
| 3974 | } | ||||
| 3975 | case Instruction::FCmp: | ||||
| 3976 | case Instruction::ICmp: | ||||
| 3977 | case Instruction::Select: { | ||||
| 3978 | // Calculate the cost of this instruction. | ||||
| 3979 | InstructionCost ScalarEltCost = | ||||
| 3980 | TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), | ||||
| 3981 | CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); | ||||
| 3982 | if (NeedToShuffleReuses) { | ||||
| 3983 | CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; | ||||
| 3984 | } | ||||
| 3985 | auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); | ||||
| 3986 | InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; | ||||
| 3987 | |||||
| 3988 | // Check if all entries in VL are either compares or selects with compares | ||||
| 3989 | // as condition that have the same predicates. | ||||
| 3990 | CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE; | ||||
| 3991 | bool First = true; | ||||
| 3992 | for (auto *V : VL) { | ||||
| 3993 | CmpInst::Predicate CurrentPred; | ||||
| 3994 | auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value()); | ||||
| 3995 | if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) && | ||||
| 3996 | !match(V, MatchCmp)) || | ||||
| 3997 | (!First && VecPred != CurrentPred)) { | ||||
| 3998 | VecPred = CmpInst::BAD_ICMP_PREDICATE; | ||||
| 3999 | break; | ||||
| 4000 | } | ||||
| 4001 | First = false; | ||||
| 4002 | VecPred = CurrentPred; | ||||
| 4003 | } | ||||
| 4004 | |||||
| 4005 | InstructionCost VecCost = TTI->getCmpSelInstrCost( | ||||
| 4006 | E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); | ||||
| 4007 | // Check if it is possible and profitable to use min/max for selects in | ||||
| 4008 | // VL. | ||||
| 4009 | // | ||||
| 4010 | auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); | ||||
| 4011 | if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { | ||||
| 4012 | IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, | ||||
| 4013 | {VecTy, VecTy}); | ||||
| 4014 | InstructionCost IntrinsicCost = | ||||
| 4015 | TTI->getIntrinsicInstrCost(CostAttrs, CostKind); | ||||
| 4016 | // If the selects are the only uses of the compares, they will be dead | ||||
| 4017 | // and we can adjust the cost by removing their cost. | ||||
| 4018 | if (IntrinsicAndUse.second) | ||||
| 4019 | IntrinsicCost -= | ||||
| 4020 | TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy, | ||||
| 4021 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||
| 4022 | VecCost = std::min(VecCost, IntrinsicCost); | ||||
| 4023 | } | ||||
| 4024 | LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { } while (false); | ||||
| 4025 | return CommonCost + VecCost - ScalarCost; | ||||
| 4026 | } | ||||
| 4027 | case Instruction::FNeg: | ||||
| 4028 | case Instruction::Add: | ||||
| 4029 | case Instruction::FAdd: | ||||
| 4030 | case Instruction::Sub: | ||||
| 4031 | case Instruction::FSub: | ||||
| 4032 | case Instruction::Mul: | ||||
| 4033 | case Instruction::FMul: | ||||
| 4034 | case Instruction::UDiv: | ||||
| 4035 | case Instruction::SDiv: | ||||
| 4036 | case Instruction::FDiv: | ||||
| 4037 | case Instruction::URem: | ||||
| 4038 | case Instruction::SRem: | ||||
| 4039 | case Instruction::FRem: | ||||
| 4040 | case Instruction::Shl: | ||||
| 4041 | case Instruction::LShr: | ||||
| 4042 | case Instruction::AShr: | ||||
| 4043 | case Instruction::And: | ||||
| 4044 | case Instruction::Or: | ||||
| 4045 | case Instruction::Xor: { | ||||
| 4046 | // Certain instructions can be cheaper to vectorize if they have a | ||||
| 4047 | // constant second vector operand. | ||||
| 4048 | TargetTransformInfo::OperandValueKind Op1VK = | ||||
| 4049 | TargetTransformInfo::OK_AnyValue; | ||||
| 4050 | TargetTransformInfo::OperandValueKind Op2VK = | ||||
| 4051 | TargetTransformInfo::OK_UniformConstantValue; | ||||
| 4052 | TargetTransformInfo::OperandValueProperties Op1VP = | ||||
| 4053 | TargetTransformInfo::OP_None; | ||||
| 4054 | TargetTransformInfo::OperandValueProperties Op2VP = | ||||
| 4055 | TargetTransformInfo::OP_PowerOf2; | ||||
| 4056 | |||||
| 4057 | // If all operands are exactly the same ConstantInt then set the | ||||
| 4058 | // operand kind to OK_UniformConstantValue. | ||||
| 4059 | // If instead not all operands are constants, then set the operand kind | ||||
| 4060 | // to OK_AnyValue. If all operands are constants but not the same, | ||||
| 4061 | // then set the operand kind to OK_NonUniformConstantValue. | ||||
| 4062 | ConstantInt *CInt0 = nullptr; | ||||
| 4063 | for (unsigned i = 0, e = VL.size(); i < e; ++i) { | ||||
| 4064 | const Instruction *I = cast<Instruction>(VL[i]); | ||||
| 4065 | unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0; | ||||
| 4066 | ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx)); | ||||
| 4067 | if (!CInt) { | ||||
| 4068 | Op2VK = TargetTransformInfo::OK_AnyValue; | ||||
| 4069 | Op2VP = TargetTransformInfo::OP_None; | ||||
| 4070 | break; | ||||
| 4071 | } | ||||
| 4072 | if (Op2VP == TargetTransformInfo::OP_PowerOf2 && | ||||
| 4073 | !CInt->getValue().isPowerOf2()) | ||||
| 4074 | Op2VP = TargetTransformInfo::OP_None; | ||||
| 4075 | if (i == 0) { | ||||
| 4076 | CInt0 = CInt; | ||||
| 4077 | continue; | ||||
| 4078 | } | ||||
| 4079 | if (CInt0 != CInt) | ||||
| 4080 | Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; | ||||
| 4081 | } | ||||
| 4082 | |||||
| 4083 | SmallVector<const Value *, 4> Operands(VL0->operand_values()); | ||||
| 4084 | InstructionCost ScalarEltCost = | ||||
| 4085 | TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, | ||||
| 4086 | Op2VK, Op1VP, Op2VP, Operands, VL0); | ||||
| 4087 | if (NeedToShuffleReuses) { | ||||
| 4088 | CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; | ||||
| 4089 | } | ||||
| 4090 | InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; | ||||
| 4091 | InstructionCost VecCost = | ||||
| 4092 | TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, | ||||
| 4093 | Op2VK, Op1VP, Op2VP, Operands, VL0); | ||||
| 4094 | LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { } while (false); | ||||
| 4095 | return CommonCost + VecCost - ScalarCost; | ||||
| 4096 | } | ||||
| 4097 | case Instruction::GetElementPtr: { | ||||
| 4098 | TargetTransformInfo::OperandValueKind Op1VK = | ||||
| 4099 | TargetTransformInfo::OK_AnyValue; | ||||
| 4100 | TargetTransformInfo::OperandValueKind Op2VK = | ||||
| 4101 | TargetTransformInfo::OK_UniformConstantValue; | ||||
| 4102 | |||||
| 4103 | InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( | ||||
| 4104 | Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); | ||||
| 4105 | if (NeedToShuffleReuses) { | ||||
| 4106 | CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; | ||||
| 4107 | } | ||||
| 4108 | InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; | ||||
| 4109 | InstructionCost VecCost = TTI->getArithmeticInstrCost( | ||||
| 4110 | Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); | ||||
| 4111 | LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { } while (false); | ||||
| 4112 | return CommonCost + VecCost - ScalarCost; | ||||
| 4113 | } | ||||
| 4114 | case Instruction::Load: { | ||||
| 4115 | // Cost of wide load - cost of scalar loads. | ||||
| 4116 | Align Alignment = cast<LoadInst>(VL0)->getAlign(); | ||||
| 4117 | InstructionCost ScalarEltCost = TTI->getMemoryOpCost( | ||||
| 4118 | Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0); | ||||
| 4119 | if (NeedToShuffleReuses) { | ||||
| 4120 | CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; | ||||
| 4121 | } | ||||
| 4122 | InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; | ||||
| 4123 | InstructionCost VecLdCost; | ||||
| 4124 | if (E->State == TreeEntry::Vectorize) { | ||||
| 4125 | VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0, | ||||
| 4126 | CostKind, VL0); | ||||
| 4127 | } else { | ||||
| 4128 | assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState")((void)0); | ||||
| 4129 | Align CommonAlignment = Alignment; | ||||
| 4130 | for (Value *V : VL) | ||||
| 4131 | CommonAlignment = | ||||
| 4132 | commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); | ||||
| 4133 | VecLdCost = TTI->getGatherScatterOpCost( | ||||
| 4134 | Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), | ||||
| 4135 | /*VariableMask=*/false, CommonAlignment, CostKind, VL0); | ||||
| 4136 | } | ||||
| 4137 | LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost))do { } while (false); | ||||
| 4138 | return CommonCost + VecLdCost - ScalarLdCost; | ||||
| 4139 | } | ||||
| 4140 | case Instruction::Store: { | ||||
| 4141 | // We know that we can merge the stores. Calculate the cost. | ||||
| 4142 | bool IsReorder = !E->ReorderIndices.empty(); | ||||
| 4143 | auto *SI = | ||||
| 4144 | cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); | ||||
| 4145 | Align Alignment = SI->getAlign(); | ||||
| 4146 | InstructionCost ScalarEltCost = TTI->getMemoryOpCost( | ||||
| 4147 | Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); | ||||
| 4148 | InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; | ||||
| 4149 | InstructionCost VecStCost = TTI->getMemoryOpCost( | ||||
| 4150 | Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); | ||||
| 4151 | LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost))do { } while (false); | ||||
| 4152 | return CommonCost + VecStCost - ScalarStCost; | ||||
| 4153 | } | ||||
| 4154 | case Instruction::Call: { | ||||
| 4155 | CallInst *CI = cast<CallInst>(VL0); | ||||
| 4156 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); | ||||
| 4157 | |||||
| 4158 | // Calculate the cost of the scalar and vector calls. | ||||
| 4159 | IntrinsicCostAttributes CostAttrs(ID, *CI, 1); | ||||
| 4160 | InstructionCost ScalarEltCost = | ||||
| 4161 | TTI->getIntrinsicInstrCost(CostAttrs, CostKind); | ||||
| 4162 | if (NeedToShuffleReuses) { | ||||
| 4163 | CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; | ||||
| 4164 | } | ||||
| 4165 | InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; | ||||
| 4166 | |||||
| 4167 | auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); | ||||
| 4168 | InstructionCost VecCallCost = | ||||
| 4169 | std::min(VecCallCosts.first, VecCallCosts.second); | ||||
| 4170 | |||||
| 4171 | LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCostdo { } while (false) | ||||
| 4172 | << " (" << VecCallCost << "-" << ScalarCallCost << ")"do { } while (false) | ||||
| 4173 | << " for " << *CI << "\n")do { } while (false); | ||||
| 4174 | |||||
| 4175 | return CommonCost + VecCallCost - ScalarCallCost; | ||||
| 4176 | } | ||||
| 4177 | case Instruction::ShuffleVector: { | ||||
| 4178 | assert(E->isAltShuffle() &&((void)0) | ||||
| 4179 | ((Instruction::isBinaryOp(E->getOpcode()) &&((void)0) | ||||
| 4180 | Instruction::isBinaryOp(E->getAltOpcode())) ||((void)0) | ||||
| 4181 | (Instruction::isCast(E->getOpcode()) &&((void)0) | ||||
| 4182 | Instruction::isCast(E->getAltOpcode()))) &&((void)0) | ||||
| 4183 | "Invalid Shuffle Vector Operand")((void)0); | ||||
| 4184 | InstructionCost ScalarCost = 0; | ||||
| 4185 | if (NeedToShuffleReuses) { | ||||
| 4186 | for (unsigned Idx : E->ReuseShuffleIndices) { | ||||
| 4187 | Instruction *I = cast<Instruction>(VL[Idx]); | ||||
| 4188 | CommonCost -= TTI->getInstructionCost(I, CostKind); | ||||
| 4189 | } | ||||
| 4190 | for (Value *V : VL) { | ||||
| 4191 | Instruction *I = cast<Instruction>(V); | ||||
| 4192 | CommonCost += TTI->getInstructionCost(I, CostKind); | ||||
| 4193 | } | ||||
| 4194 | } | ||||
| 4195 | for (Value *V : VL) { | ||||
| 4196 | Instruction *I = cast<Instruction>(V); | ||||
| 4197 | assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode")((void)0); | ||||
| 4198 | ScalarCost += TTI->getInstructionCost(I, CostKind); | ||||
| 4199 | } | ||||
| 4200 | // VecCost is equal to sum of the cost of creating 2 vectors | ||||
| 4201 | // and the cost of creating shuffle. | ||||
| 4202 | InstructionCost VecCost = 0; | ||||
| 4203 | if (Instruction::isBinaryOp(E->getOpcode())) { | ||||
| 4204 | VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind); | ||||
| 4205 | VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, | ||||
| 4206 | CostKind); | ||||
| 4207 | } else { | ||||
| 4208 | Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); | ||||
| 4209 | Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); | ||||
| 4210 | auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size()); | ||||
| 4211 | auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size()); | ||||
| 4212 | VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty, | ||||
| 4213 | TTI::CastContextHint::None, CostKind); | ||||
| 4214 | VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, | ||||
| 4215 | TTI::CastContextHint::None, CostKind); | ||||
| 4216 | } | ||||
| 4217 | |||||
| 4218 | SmallVector<int> Mask(E->Scalars.size()); | ||||
| 4219 | for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) { | ||||
| 4220 | auto *OpInst = cast<Instruction>(E->Scalars[I]); | ||||
| 4221 | assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode")((void)0); | ||||
| 4222 | Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0); | ||||
| 4223 | } | ||||
| 4224 | VecCost += | ||||
| 4225 | TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0); | ||||
| 4226 | LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { } while (false); | ||||
| 4227 | return CommonCost + VecCost - ScalarCost; | ||||
| 4228 | } | ||||
| 4229 | default: | ||||
| 4230 | llvm_unreachable("Unknown instruction")__builtin_unreachable(); | ||||
| 4231 | } | ||||
| 4232 | } | ||||
| 4233 | |||||
| 4234 | bool BoUpSLP::isFullyVectorizableTinyTree() const { | ||||
| 4235 | LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "do { } while (false) | ||||
| 4236 | << VectorizableTree.size() << " is fully vectorizable .\n")do { } while (false); | ||||
| 4237 | |||||
| 4238 | // We only handle trees of heights 1 and 2. | ||||
| 4239 | if (VectorizableTree.size() == 1 && | ||||
| 4240 | VectorizableTree[0]->State == TreeEntry::Vectorize) | ||||
| 4241 | return true; | ||||
| 4242 | |||||
| 4243 | if (VectorizableTree.size() != 2) | ||||
| 4244 | return false; | ||||
| 4245 | |||||
| 4246 | // Handle splat and all-constants stores. Also try to vectorize tiny trees | ||||
| 4247 | // with the second gather nodes if they have less scalar operands rather than | ||||
| 4248 | // the initial tree element (may be profitable to shuffle the second gather) | ||||
| 4249 | // or they are extractelements, which form shuffle. | ||||
| 4250 | SmallVector<int> Mask; | ||||
| 4251 | if (VectorizableTree[0]->State == TreeEntry::Vectorize && | ||||
| 4252 | (allConstant(VectorizableTree[1]->Scalars) || | ||||
| 4253 | isSplat(VectorizableTree[1]->Scalars) || | ||||
| 4254 | (VectorizableTree[1]->State == TreeEntry::NeedToGather && | ||||
| 4255 | VectorizableTree[1]->Scalars.size() < | ||||
| 4256 | VectorizableTree[0]->Scalars.size()) || | ||||
| 4257 | (VectorizableTree[1]->State == TreeEntry::NeedToGather && | ||||
| 4258 | VectorizableTree[1]->getOpcode() == Instruction::ExtractElement && | ||||
| 4259 | isShuffle(VectorizableTree[1]->Scalars, Mask)))) | ||||
| 4260 | return true; | ||||
| 4261 | |||||
| 4262 | // Gathering cost would be too much for tiny trees. | ||||
| 4263 | if (VectorizableTree[0]->State == TreeEntry::NeedToGather || | ||||
| 4264 | VectorizableTree[1]->State == TreeEntry::NeedToGather) | ||||
| 4265 | return false; | ||||
| 4266 | |||||
| 4267 | return true; | ||||
| 4268 | } | ||||
| 4269 | |||||
| 4270 | static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, | ||||
| 4271 | TargetTransformInfo *TTI, | ||||
| 4272 | bool MustMatchOrInst) { | ||||
| 4273 | // Look past the root to find a source value. Arbitrarily follow the | ||||
| 4274 | // path through operand 0 of any 'or'. Also, peek through optional | ||||
| 4275 | // shift-left-by-multiple-of-8-bits. | ||||
| 4276 | Value *ZextLoad = Root; | ||||
| 4277 | const APInt *ShAmtC; | ||||
| 4278 | bool FoundOr = false; | ||||
| 4279 | while (!isa<ConstantExpr>(ZextLoad) && | ||||
| 4280 | (match(ZextLoad, m_Or(m_Value(), m_Value())) || | ||||
| 4281 | (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && | ||||
| 4282 | ShAmtC->urem(8) == 0))) { | ||||
| 4283 | auto *BinOp = cast<BinaryOperator>(ZextLoad); | ||||
| 4284 | ZextLoad = BinOp->getOperand(0); | ||||
| 4285 | if (BinOp->getOpcode() == Instruction::Or) | ||||
| 4286 | FoundOr = true; | ||||
| 4287 | } | ||||
| 4288 | // Check if the input is an extended load of the required or/shift expression. | ||||
| 4289 | Value *LoadPtr; | ||||
| 4290 | if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || | ||||
| 4291 | !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) | ||||
| 4292 | return false; | ||||
| 4293 | |||||
| 4294 | // Require that the total load bit width is a legal integer type. | ||||
| 4295 | // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. | ||||
| 4296 | // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. | ||||
| 4297 | Type *SrcTy = LoadPtr->getType()->getPointerElementType(); | ||||
| 4298 | unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; | ||||
| 4299 | if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) | ||||
| 4300 | return false; | ||||
| 4301 | |||||
| 4302 | // Everything matched - assume that we can fold the whole sequence using | ||||
| 4303 | // load combining. | ||||
| 4304 | LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "do { } while (false) | ||||
| 4305 | << *(cast<Instruction>(Root)) << "\n")do { } while (false); | ||||
| 4306 | |||||
| 4307 | return true; | ||||
| 4308 | } | ||||
| 4309 | |||||
| 4310 | bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { | ||||
| 4311 | if (RdxKind != RecurKind::Or) | ||||
| 4312 | return false; | ||||
| 4313 | |||||
| 4314 | unsigned NumElts = VectorizableTree[0]->Scalars.size(); | ||||
| 4315 | Value *FirstReduced = VectorizableTree[0]->Scalars[0]; | ||||
| 4316 | return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI, | ||||
| 4317 | /* MatchOr */ false); | ||||
| 4318 | } | ||||
| 4319 | |||||
| 4320 | bool BoUpSLP::isLoadCombineCandidate() const { | ||||
| 4321 | // Peek through a final sequence of stores and check if all operations are | ||||
| 4322 | // likely to be load-combined. | ||||
| 4323 | unsigned NumElts = VectorizableTree[0]->Scalars.size(); | ||||
| 4324 | for (Value *Scalar : VectorizableTree[0]->Scalars) { | ||||
| 4325 | Value *X; | ||||
| 4326 | if (!match(Scalar, m_Store(m_Value(X), m_Value())) || | ||||
| 4327 | !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true)) | ||||
| 4328 | return false; | ||||
| 4329 | } | ||||
| 4330 | return true; | ||||
| 4331 | } | ||||
| 4332 | |||||
| 4333 | bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { | ||||
| 4334 | // No need to vectorize inserts of gathered values. | ||||
| 4335 | if (VectorizableTree.size() == 2 && | ||||
| 4336 | isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && | ||||
| 4337 | VectorizableTree[1]->State == TreeEntry::NeedToGather) | ||||
| 4338 | return true; | ||||
| 4339 | |||||
| 4340 | // We can vectorize the tree if its size is greater than or equal to the | ||||
| 4341 | // minimum size specified by the MinTreeSize command line option. | ||||
| 4342 | if (VectorizableTree.size() >= MinTreeSize) | ||||
| 4343 | return false; | ||||
| 4344 | |||||
| 4345 | // If we have a tiny tree (a tree whose size is less than MinTreeSize), we | ||||
| 4346 | // can vectorize it if we can prove it fully vectorizable. | ||||
| 4347 | if (isFullyVectorizableTinyTree()) | ||||
| 4348 | return false; | ||||
| 4349 | |||||
| 4350 | assert(VectorizableTree.empty()((void)0) | ||||
| 4351 | ? ExternalUses.empty()((void)0) | ||||
| 4352 | : true && "We shouldn't have any external users")((void)0); | ||||
| 4353 | |||||
| 4354 | // Otherwise, we can't vectorize the tree. It is both tiny and not fully | ||||
| 4355 | // vectorizable. | ||||
| 4356 | return true; | ||||
| 4357 | } | ||||
| 4358 | |||||
| 4359 | InstructionCost BoUpSLP::getSpillCost() const { | ||||
| 4360 | // Walk from the bottom of the tree to the top, tracking which values are | ||||
| 4361 | // live. When we see a call instruction that is not part of our tree, | ||||
| 4362 | // query TTI to see if there is a cost to keeping values live over it | ||||
| 4363 | // (for example, if spills and fills are required). | ||||
| 4364 | unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); | ||||
| 4365 | InstructionCost Cost = 0; | ||||
| 4366 | |||||
| 4367 | SmallPtrSet<Instruction*, 4> LiveValues; | ||||
| 4368 | Instruction *PrevInst = nullptr; | ||||
| 4369 | |||||
| 4370 | // The entries in VectorizableTree are not necessarily ordered by their | ||||
| 4371 | // position in basic blocks. Collect them and order them by dominance so later | ||||
| 4372 | // instructions are guaranteed to be visited first. For instructions in | ||||
| 4373 | // different basic blocks, we only scan to the beginning of the block, so | ||||
| 4374 | // their order does not matter, as long as all instructions in a basic block | ||||
| 4375 | // are grouped together. Using dominance ensures a deterministic order. | ||||
| 4376 | SmallVector<Instruction *, 16> OrderedScalars; | ||||
| 4377 | for (const auto &TEPtr : VectorizableTree) { | ||||
| 4378 | Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); | ||||
| 4379 | if (!Inst) | ||||
| 4380 | continue; | ||||
| 4381 | OrderedScalars.push_back(Inst); | ||||
| 4382 | } | ||||
| 4383 | llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) { | ||||
| 4384 | auto *NodeA = DT->getNode(A->getParent()); | ||||
| 4385 | auto *NodeB = DT->getNode(B->getParent()); | ||||
| 4386 | assert(NodeA && "Should only process reachable instructions")((void)0); | ||||
| 4387 | assert(NodeB && "Should only process reachable instructions")((void)0); | ||||
| 4388 | assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&((void)0) | ||||
| 4389 | "Different nodes should have different DFS numbers")((void)0); | ||||
| 4390 | if (NodeA != NodeB) | ||||
| 4391 | return NodeA->getDFSNumIn() < NodeB->getDFSNumIn(); | ||||
| 4392 | return B->comesBefore(A); | ||||
| 4393 | }); | ||||
| 4394 | |||||
| 4395 | for (Instruction *Inst : OrderedScalars) { | ||||
| 4396 | if (!PrevInst) { | ||||
| 4397 | PrevInst = Inst; | ||||
| 4398 | continue; | ||||
| 4399 | } | ||||
| 4400 | |||||
| 4401 | // Update LiveValues. | ||||
| 4402 | LiveValues.erase(PrevInst); | ||||
| 4403 | for (auto &J : PrevInst->operands()) { | ||||
| 4404 | if (isa<Instruction>(&*J) && getTreeEntry(&*J)) | ||||
| 4405 | LiveValues.insert(cast<Instruction>(&*J)); | ||||
| 4406 | } | ||||
| 4407 | |||||
| 4408 | LLVM_DEBUG({do { } while (false) | ||||
| 4409 | dbgs() << "SLP: #LV: " << LiveValues.size();do { } while (false) | ||||
| 4410 | for (auto *X : LiveValues)do { } while (false) | ||||
| 4411 | dbgs() << " " << X->getName();do { } while (false) | ||||
| 4412 | dbgs() << ", Looking at ";do { } while (false) | ||||
| 4413 | Inst->dump();do { } while (false) | ||||
| 4414 | })do { } while (false); | ||||
| 4415 | |||||
| 4416 | // Now find the sequence of instructions between PrevInst and Inst. | ||||
| 4417 | unsigned NumCalls = 0; | ||||
| 4418 | BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), | ||||
| 4419 | PrevInstIt = | ||||
| 4420 | PrevInst->getIterator().getReverse(); | ||||
| 4421 | while (InstIt != PrevInstIt) { | ||||
| 4422 | if (PrevInstIt == PrevInst->getParent()->rend()) { | ||||
| 4423 | PrevInstIt = Inst->getParent()->rbegin(); | ||||
| 4424 | continue; | ||||
| 4425 | } | ||||
| 4426 | |||||
| 4427 | // Debug information does not impact spill cost. | ||||
| 4428 | if ((isa<CallInst>(&*PrevInstIt) && | ||||
| 4429 | !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && | ||||
| 4430 | &*PrevInstIt != PrevInst) | ||||
| 4431 | NumCalls++; | ||||
| 4432 | |||||
| 4433 | ++PrevInstIt; | ||||
| 4434 | } | ||||
| 4435 | |||||
| 4436 | if (NumCalls) { | ||||
| 4437 | SmallVector<Type*, 4> V; | ||||
| 4438 | for (auto *II : LiveValues) { | ||||
| 4439 | auto *ScalarTy = II->getType(); | ||||
| 4440 | if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy)) | ||||
| 4441 | ScalarTy = VectorTy->getElementType(); | ||||
| 4442 | V.push_back(FixedVectorType::get(ScalarTy, BundleWidth)); | ||||
| 4443 | } | ||||
| 4444 | Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); | ||||
| 4445 | } | ||||
| 4446 | |||||
| 4447 | PrevInst = Inst; | ||||
| 4448 | } | ||||
| 4449 | |||||
| 4450 | return Cost; | ||||
| 4451 | } | ||||
| 4452 | |||||
| 4453 | InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { | ||||
| 4454 | InstructionCost Cost = 0; | ||||
| 4455 | LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "do { } while (false) | ||||
| 4456 | << VectorizableTree.size() << ".\n")do { } while (false); | ||||
| 4457 | |||||
| 4458 | unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); | ||||
| 4459 | |||||
| 4460 | for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { | ||||
| 4461 | TreeEntry &TE = *VectorizableTree[I].get(); | ||||
| 4462 | |||||
| 4463 | InstructionCost C = getEntryCost(&TE, VectorizedVals); | ||||
| 4464 | Cost += C; | ||||
| 4465 | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << Cdo { } while (false) | ||||
| 4466 | << " for bundle that starts with " << *TE.Scalars[0]do { } while (false) | ||||
| 4467 | << ".\n"do { } while (false) | ||||
| 4468 | << "SLP: Current total cost = " << Cost << "\n")do { } while (false); | ||||
| 4469 | } | ||||
| 4470 | |||||
| 4471 | SmallPtrSet<Value *, 16> ExtractCostCalculated; | ||||
| 4472 | InstructionCost ExtractCost = 0; | ||||
| 4473 | SmallVector<unsigned> VF; | ||||
| 4474 | SmallVector<SmallVector<int>> ShuffleMask; | ||||
| 4475 | SmallVector<Value *> FirstUsers; | ||||
| 4476 | SmallVector<APInt> DemandedElts; | ||||
| 4477 | for (ExternalUser &EU : ExternalUses) { | ||||
| 4478 | // We only add extract cost once for the same scalar. | ||||
| 4479 | if (!ExtractCostCalculated.insert(EU.Scalar).second) | ||||
| 4480 | continue; | ||||
| 4481 | |||||
| 4482 | // Uses by ephemeral values are free (because the ephemeral value will be | ||||
| 4483 | // removed prior to code generation, and so the extraction will be | ||||
| 4484 | // removed as well). | ||||
| 4485 | if (EphValues.count(EU.User)) | ||||
| 4486 | continue; | ||||
| 4487 | |||||
| 4488 | // No extract cost for vector "scalar" | ||||
| 4489 | if (isa<FixedVectorType>(EU.Scalar->getType())) | ||||
| 4490 | continue; | ||||
| 4491 | |||||
| 4492 | // Already counted the cost for external uses when tried to adjust the cost | ||||
| 4493 | // for extractelements, no need to add it again. | ||||
| 4494 | if (isa<ExtractElementInst>(EU.Scalar)) | ||||
| 4495 | continue; | ||||
| 4496 | |||||
| 4497 | // If found user is an insertelement, do not calculate extract cost but try | ||||
| 4498 | // to detect it as a final shuffled/identity match. | ||||
| 4499 | if (EU.User && isa<InsertElementInst>(EU.User)) { | ||||
| 4500 | if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { | ||||
| 4501 | Optional<int> InsertIdx = getInsertIndex(EU.User, 0); | ||||
| 4502 | if (!InsertIdx || *InsertIdx == UndefMaskElem) | ||||
| 4503 | continue; | ||||
| 4504 | Value *VU = EU.User; | ||||
| 4505 | auto *It = find_if(FirstUsers, [VU](Value *V) { | ||||
| 4506 | // Checks if 2 insertelements are from the same buildvector. | ||||
| 4507 | if (VU->getType() != V->getType()) | ||||
| 4508 | return false; | ||||
| 4509 | auto *IE1 = cast<InsertElementInst>(VU); | ||||
| 4510 | auto *IE2 = cast<InsertElementInst>(V); | ||||
| 4511 | // Go though of insertelement instructions trying to find either VU as | ||||
| 4512 | // the original vector for IE2 or V as the original vector for IE1. | ||||
| 4513 | do { | ||||
| 4514 | if (IE1 == VU || IE2 == V) | ||||
| 4515 | return true; | ||||
| 4516 | if (IE1) | ||||
| 4517 | IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); | ||||
| 4518 | if (IE2) | ||||
| 4519 | IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); | ||||
| 4520 | } while (IE1 || IE2); | ||||
| 4521 | return false; | ||||
| 4522 | }); | ||||
| 4523 | int VecId = -1; | ||||
| 4524 | if (It == FirstUsers.end()) { | ||||
| 4525 | VF.push_back(FTy->getNumElements()); | ||||
| 4526 | ShuffleMask.emplace_back(VF.back(), UndefMaskElem); | ||||
| 4527 | FirstUsers.push_back(EU.User); | ||||
| 4528 | DemandedElts.push_back(APInt::getNullValue(VF.back())); | ||||
| 4529 | VecId = FirstUsers.size() - 1; | ||||
| 4530 | } else { | ||||
| 4531 | VecId = std::distance(FirstUsers.begin(), It); | ||||
| 4532 | } | ||||
| 4533 | int Idx = *InsertIdx; | ||||
| 4534 | ShuffleMask[VecId][Idx] = EU.Lane; | ||||
| 4535 | DemandedElts[VecId].setBit(Idx); | ||||
| 4536 | } | ||||
| 4537 | } | ||||
| 4538 | |||||
| 4539 | // If we plan to rewrite the tree in a smaller type, we will need to sign | ||||
| 4540 | // extend the extracted value back to the original type. Here, we account | ||||
| 4541 | // for the extract and the added cost of the sign extend if needed. | ||||
| 4542 | auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); | ||||
| 4543 | auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; | ||||
| 4544 | if (MinBWs.count(ScalarRoot)) { | ||||
| 4545 | auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); | ||||
| 4546 | auto Extend = | ||||
| 4547 | MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; | ||||
| 4548 | VecTy = FixedVectorType::get(MinTy, BundleWidth); | ||||
| 4549 | ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), | ||||
| 4550 | VecTy, EU.Lane); | ||||
| 4551 | } else { | ||||
| 4552 | ExtractCost += | ||||
| 4553 | TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); | ||||
| 4554 | } | ||||
| 4555 | } | ||||
| 4556 | |||||
| 4557 | InstructionCost SpillCost = getSpillCost(); | ||||
| 4558 | Cost += SpillCost + ExtractCost; | ||||
| 4559 | for (int I = 0, E = FirstUsers.size(); I < E; ++I) { | ||||
| 4560 | // For the very first element - simple shuffle of the source vector. | ||||
| 4561 | int Limit = ShuffleMask[I].size() * 2; | ||||
| 4562 | if (I == 0 && | ||||
| 4563 | all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && | ||||
| 4564 | !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { | ||||
| 4565 | InstructionCost C = TTI->getShuffleCost( | ||||
| 4566 | TTI::SK_PermuteSingleSrc, | ||||
| 4567 | cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); | ||||
| 4568 | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << Cdo { } while (false) | ||||
| 4569 | << " for final shuffle of insertelement external users "do { } while (false) | ||||
| 4570 | << *VectorizableTree.front()->Scalars.front() << ".\n"do { } while (false) | ||||
| 4571 | << "SLP: Current total cost = " << Cost << "\n")do { } while (false); | ||||
| 4572 | Cost += C; | ||||
| 4573 | continue; | ||||
| 4574 | } | ||||
| 4575 | // Other elements - permutation of 2 vectors (the initial one and the next | ||||
| 4576 | // Ith incoming vector). | ||||
| 4577 | unsigned VF = ShuffleMask[I].size(); | ||||
| 4578 | for (unsigned Idx = 0; Idx < VF; ++Idx) { | ||||
| 4579 | int &Mask = ShuffleMask[I][Idx]; | ||||
| 4580 | Mask = Mask == UndefMaskElem ? Idx : VF + Mask; | ||||
| 4581 | } | ||||
| 4582 | InstructionCost C = TTI->getShuffleCost( | ||||
| 4583 | TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), | ||||
| 4584 | ShuffleMask[I]); | ||||
| 4585 | LLVM_DEBUG(do { } while (false) | ||||
| 4586 | dbgs()do { } while (false) | ||||
| 4587 | << "SLP: Adding cost " << Cdo { } while (false) | ||||
| 4588 | << " for final shuffle of vector node and external insertelement users "do { } while (false) | ||||
| 4589 | << *VectorizableTree.front()->Scalars.front() << ".\n"do { } while (false) | ||||
| 4590 | << "SLP: Current total cost = " << Cost << "\n")do { } while (false); | ||||
| 4591 | Cost += C; | ||||
| 4592 | InstructionCost InsertCost = TTI->getScalarizationOverhead( | ||||
| 4593 | cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], | ||||
| 4594 | /*Insert*/ true, | ||||
| 4595 | /*Extract*/ false); | ||||
| 4596 | Cost -= InsertCost; | ||||
| 4597 | LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCostdo { } while (false) | ||||
| 4598 | << " for insertelements gather.\n"do { } while (false) | ||||
| 4599 | << "SLP: Current total cost = " << Cost << "\n")do { } while (false); | ||||
| 4600 | } | ||||
| 4601 | |||||
| 4602 | #ifndef NDEBUG1 | ||||
| 4603 | SmallString<256> Str; | ||||
| 4604 | { | ||||
| 4605 | raw_svector_ostream OS(Str); | ||||
| 4606 | OS << "SLP: Spill Cost = " << SpillCost << ".\n" | ||||
| 4607 | << "SLP: Extract Cost = " << ExtractCost << ".\n" | ||||
| 4608 | << "SLP: Total Cost = " << Cost << ".\n"; | ||||
| 4609 | } | ||||
| 4610 | LLVM_DEBUG(dbgs() << Str)do { } while (false); | ||||
| 4611 | if (ViewSLPTree) | ||||
| 4612 | ViewGraph(this, "SLP" + F->getName(), false, Str); | ||||
| 4613 | #endif | ||||
| 4614 | |||||
| 4615 | return Cost; | ||||
| 4616 | } | ||||
| 4617 | |||||
| 4618 | Optional<TargetTransformInfo::ShuffleKind> | ||||
| 4619 | BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, | ||||
| 4620 | SmallVectorImpl<const TreeEntry *> &Entries) { | ||||
| 4621 | // TODO: currently checking only for Scalars in the tree entry, need to count | ||||
| 4622 | // reused elements too for better cost estimation. | ||||
| 4623 | Mask.assign(TE->Scalars.size(), UndefMaskElem); | ||||
| 4624 | Entries.clear(); | ||||
| 4625 | // Build a lists of values to tree entries. | ||||
| 4626 | DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs; | ||||
| 4627 | for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { | ||||
| 4628 | if (EntryPtr.get() == TE) | ||||
| 4629 | break; | ||||
| 4630 | if (EntryPtr->State != TreeEntry::NeedToGather) | ||||
| 4631 | continue; | ||||
| 4632 | for (Value *V : EntryPtr->Scalars) | ||||
| 4633 | ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get()); | ||||
| 4634 | } | ||||
| 4635 | // Find all tree entries used by the gathered values. If no common entries | ||||
| 4636 | // found - not a shuffle. | ||||
| 4637 | // Here we build a set of tree nodes for each gathered value and trying to | ||||
| 4638 | // find the intersection between these sets. If we have at least one common | ||||
| 4639 | // tree node for each gathered value - we have just a permutation of the | ||||
| 4640 | // single vector. If we have 2 different sets, we're in situation where we | ||||
| 4641 | // have a permutation of 2 input vectors. | ||||
| 4642 | SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs; | ||||
| 4643 | DenseMap<Value *, int> UsedValuesEntry; | ||||
| 4644 | for (Value *V : TE->Scalars) { | ||||
| 4645 | if (isa<UndefValue>(V)) | ||||
| 4646 | continue; | ||||
| 4647 | // Build a list of tree entries where V is used. | ||||
| 4648 | SmallPtrSet<const TreeEntry *, 4> VToTEs; | ||||
| 4649 | auto It = ValueToTEs.find(V); | ||||
| 4650 | if (It != ValueToTEs.end()) | ||||
| 4651 | VToTEs = It->second; | ||||
| 4652 | if (const TreeEntry *VTE = getTreeEntry(V)) | ||||
| 4653 | VToTEs.insert(VTE); | ||||
| 4654 | if (VToTEs.empty()) | ||||
| 4655 | return None; | ||||
| 4656 | if (UsedTEs.empty()) { | ||||
| 4657 | // The first iteration, just insert the list of nodes to vector. | ||||
| 4658 | UsedTEs.push_back(VToTEs); | ||||
| 4659 | } else { | ||||
| 4660 | // Need to check if there are any previously used tree nodes which use V. | ||||
| 4661 | // If there are no such nodes, consider that we have another one input | ||||
| 4662 | // vector. | ||||
| 4663 | SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs); | ||||
| 4664 | unsigned Idx = 0; | ||||
| 4665 | for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) { | ||||
| 4666 | // Do we have a non-empty intersection of previously listed tree entries | ||||
| 4667 | // and tree entries using current V? | ||||
| 4668 | set_intersect(VToTEs, Set); | ||||
| 4669 | if (!VToTEs.empty()) { | ||||
| 4670 | // Yes, write the new subset and continue analysis for the next | ||||
| 4671 | // scalar. | ||||
| 4672 | Set.swap(VToTEs); | ||||
| 4673 | break; | ||||
| 4674 | } | ||||
| 4675 | VToTEs = SavedVToTEs; | ||||
| 4676 | ++Idx; | ||||
| 4677 | } | ||||
| 4678 | // No non-empty intersection found - need to add a second set of possible | ||||
| 4679 | // source vectors. | ||||
| 4680 | if (Idx == UsedTEs.size()) { | ||||
| 4681 | // If the number of input vectors is greater than 2 - not a permutation, | ||||
| 4682 | // fallback to the regular gather. | ||||
| 4683 | if (UsedTEs.size() == 2) | ||||
| 4684 | return None; | ||||
| 4685 | UsedTEs.push_back(SavedVToTEs); | ||||
| 4686 | Idx = UsedTEs.size() - 1; | ||||
| 4687 | } | ||||
| 4688 | UsedValuesEntry.try_emplace(V, Idx); | ||||
| 4689 | } | ||||
| 4690 | } | ||||
| 4691 | |||||
| 4692 | unsigned VF = 0; | ||||
| 4693 | if (UsedTEs.size() == 1) { | ||||
| 4694 | // Try to find the perfect match in another gather node at first. | ||||
| 4695 | auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) { | ||||
| 4696 | return EntryPtr->isSame(TE->Scalars); | ||||
| 4697 | }); | ||||
| 4698 | if (It != UsedTEs.front().end()) { | ||||
| 4699 | Entries.push_back(*It); | ||||
| 4700 | std::iota(Mask.begin(), Mask.end(), 0); | ||||
| 4701 | return TargetTransformInfo::SK_PermuteSingleSrc; | ||||
| 4702 | } | ||||
| 4703 | // No perfect match, just shuffle, so choose the first tree node. | ||||
| 4704 | Entries.push_back(*UsedTEs.front().begin()); | ||||
| 4705 | } else { | ||||
| 4706 | // Try to find nodes with the same vector factor. | ||||
| 4707 | assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.")((void)0); | ||||
| 4708 | // FIXME: Shall be replaced by GetVF function once non-power-2 patch is | ||||
| 4709 | // landed. | ||||
| 4710 | auto &&GetVF = [](const TreeEntry *TE) { | ||||
| 4711 | if (!TE->ReuseShuffleIndices.empty()) | ||||
| 4712 | return TE->ReuseShuffleIndices.size(); | ||||
| 4713 | return TE->Scalars.size(); | ||||
| 4714 | }; | ||||
| 4715 | DenseMap<int, const TreeEntry *> VFToTE; | ||||
| 4716 | for (const TreeEntry *TE : UsedTEs.front()) | ||||
| 4717 | VFToTE.try_emplace(GetVF(TE), TE); | ||||
| 4718 | for (const TreeEntry *TE : UsedTEs.back()) { | ||||
| 4719 | auto It = VFToTE.find(GetVF(TE)); | ||||
| 4720 | if (It != VFToTE.end()) { | ||||
| 4721 | VF = It->first; | ||||
| 4722 | Entries.push_back(It->second); | ||||
| 4723 | Entries.push_back(TE); | ||||
| 4724 | break; | ||||
| 4725 | } | ||||
| 4726 | } | ||||
| 4727 | // No 2 source vectors with the same vector factor - give up and do regular | ||||
| 4728 | // gather. | ||||
| 4729 | if (Entries.empty()) | ||||
| 4730 | return None; | ||||
| 4731 | } | ||||
| 4732 | |||||
| 4733 | // Build a shuffle mask for better cost estimation and vector emission. | ||||
| 4734 | for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { | ||||
| 4735 | Value *V = TE->Scalars[I]; | ||||
| 4736 | if (isa<UndefValue>(V)) | ||||
| 4737 | continue; | ||||
| 4738 | unsigned Idx = UsedValuesEntry.lookup(V); | ||||
| 4739 | const TreeEntry *VTE = Entries[Idx]; | ||||
| 4740 | int FoundLane = VTE->findLaneForValue(V); | ||||
| 4741 | Mask[I] = Idx * VF + FoundLane; | ||||
| 4742 | // Extra check required by isSingleSourceMaskImpl function (called by | ||||
| 4743 | // ShuffleVectorInst::isSingleSourceMask). | ||||
| 4744 | if (Mask[I] >= 2 * E) | ||||
| 4745 | return None; | ||||
| 4746 | } | ||||
| 4747 | switch (Entries.size()) { | ||||
| 4748 | case 1: | ||||
| 4749 | return TargetTransformInfo::SK_PermuteSingleSrc; | ||||
| 4750 | case 2: | ||||
| 4751 | return TargetTransformInfo::SK_PermuteTwoSrc; | ||||
| 4752 | default: | ||||
| 4753 | break; | ||||
| 4754 | } | ||||
| 4755 | return None; | ||||
| 4756 | } | ||||
| 4757 | |||||
| 4758 | InstructionCost | ||||
| 4759 | BoUpSLP::getGatherCost(FixedVectorType *Ty, | ||||
| 4760 | const DenseSet<unsigned> &ShuffledIndices) const { | ||||
| 4761 | unsigned NumElts = Ty->getNumElements(); | ||||
| 4762 | APInt DemandedElts = APInt::getNullValue(NumElts); | ||||
| 4763 | for (unsigned I = 0; I < NumElts; ++I) | ||||
| 4764 | if (!ShuffledIndices.count(I)) | ||||
| 4765 | DemandedElts.setBit(I); | ||||
| 4766 | InstructionCost Cost = | ||||
| 4767 | TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, | ||||
| 4768 | /*Extract*/ false); | ||||
| 4769 | if (!ShuffledIndices.empty()) | ||||
| 4770 | Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); | ||||
| 4771 | return Cost; | ||||
| 4772 | } | ||||
| 4773 | |||||
| 4774 | InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { | ||||
| 4775 | // Find the type of the operands in VL. | ||||
| 4776 | Type *ScalarTy = VL[0]->getType(); | ||||
| 4777 | if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) | ||||
| 4778 | ScalarTy = SI->getValueOperand()->getType(); | ||||
| 4779 | auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); | ||||
| 4780 | // Find the cost of inserting/extracting values from the vector. | ||||
| 4781 | // Check if the same elements are inserted several times and count them as | ||||
| 4782 | // shuffle candidates. | ||||
| 4783 | DenseSet<unsigned> ShuffledElements; | ||||
| 4784 | DenseSet<Value *> UniqueElements; | ||||
| 4785 | // Iterate in reverse order to consider insert elements with the high cost. | ||||
| 4786 | for (unsigned I = VL.size(); I > 0; --I) { | ||||
| 4787 | unsigned Idx = I - 1; | ||||
| 4788 | if (isConstant(VL[Idx])) | ||||
| 4789 | continue; | ||||
| 4790 | if (!UniqueElements.insert(VL[Idx]).second) | ||||
| 4791 | ShuffledElements.insert(Idx); | ||||
| 4792 | } | ||||
| 4793 | return getGatherCost(VecTy, ShuffledElements); | ||||
| 4794 | } | ||||
| 4795 | |||||
| 4796 | // Perform operand reordering on the instructions in VL and return the reordered | ||||
| 4797 | // operands in Left and Right. | ||||
| 4798 | void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, | ||||
| 4799 | SmallVectorImpl<Value *> &Left, | ||||
| 4800 | SmallVectorImpl<Value *> &Right, | ||||
| 4801 | const DataLayout &DL, | ||||
| 4802 | ScalarEvolution &SE, | ||||
| 4803 | const BoUpSLP &R) { | ||||
| 4804 | if (VL.empty()) | ||||
| 4805 | return; | ||||
| 4806 | VLOperands Ops(VL, DL, SE, R); | ||||
| 4807 | // Reorder the operands in place. | ||||
| 4808 | Ops.reorder(); | ||||
| 4809 | Left = Ops.getVL(0); | ||||
| 4810 | Right = Ops.getVL(1); | ||||
| 4811 | } | ||||
| 4812 | |||||
| 4813 | void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { | ||||
| 4814 | // Get the basic block this bundle is in. All instructions in the bundle | ||||
| 4815 | // should be in this block. | ||||
| 4816 | auto *Front = E->getMainOp(); | ||||
| 4817 | auto *BB = Front->getParent(); | ||||
| 4818 | assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {((void)0) | ||||
| 4819 | auto *I = cast<Instruction>(V);((void)0) | ||||
| 4820 | return !E->isOpcodeOrAlt(I) || I->getParent() == BB;((void)0) | ||||
| 4821 | }))((void)0); | ||||
| 4822 | |||||
| 4823 | // The last instruction in the bundle in program order. | ||||
| 4824 | Instruction *LastInst = nullptr; | ||||
| 4825 | |||||
| 4826 | // Find the last instruction. The common case should be that BB has been | ||||
| 4827 | // scheduled, and the last instruction is VL.back(). So we start with | ||||
| 4828 | // VL.back() and iterate over schedule data until we reach the end of the | ||||
| 4829 | // bundle. The end of the bundle is marked by null ScheduleData. | ||||
| 4830 | if (BlocksSchedules.count(BB)) { | ||||
| 4831 | auto *Bundle = | ||||
| 4832 | BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); | ||||
| 4833 | if (Bundle && Bundle->isPartOfBundle()) | ||||
| 4834 | for (; Bundle; Bundle = Bundle->NextInBundle) | ||||
| 4835 | if (Bundle->OpValue == Bundle->Inst) | ||||
| 4836 | LastInst = Bundle->Inst; | ||||
| 4837 | } | ||||
| 4838 | |||||
| 4839 | // LastInst can still be null at this point if there's either not an entry | ||||
| 4840 | // for BB in BlocksSchedules or there's no ScheduleData available for | ||||
| 4841 | // VL.back(). This can be the case if buildTree_rec aborts for various | ||||
| 4842 | // reasons (e.g., the maximum recursion depth is reached, the maximum region | ||||
| 4843 | // size is reached, etc.). ScheduleData is initialized in the scheduling | ||||
| 4844 | // "dry-run". | ||||
| 4845 | // | ||||
| 4846 | // If this happens, we can still find the last instruction by brute force. We | ||||
| 4847 | // iterate forwards from Front (inclusive) until we either see all | ||||
| 4848 | // instructions in the bundle or reach the end of the block. If Front is the | ||||
| 4849 | // last instruction in program order, LastInst will be set to Front, and we | ||||
| 4850 | // will visit all the remaining instructions in the block. | ||||
| 4851 | // | ||||
| 4852 | // One of the reasons we exit early from buildTree_rec is to place an upper | ||||
| 4853 | // bound on compile-time. Thus, taking an additional compile-time hit here is | ||||
| 4854 | // not ideal. However, this should be exceedingly rare since it requires that | ||||
| 4855 | // we both exit early from buildTree_rec and that the bundle be out-of-order | ||||
| 4856 | // (causing us to iterate all the way to the end of the block). | ||||
| 4857 | if (!LastInst) { | ||||
| 4858 | SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end()); | ||||
| 4859 | for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { | ||||
| 4860 | if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) | ||||
| 4861 | LastInst = &I; | ||||
| 4862 | if (Bundle.empty()) | ||||
| 4863 | break; | ||||
| 4864 | } | ||||
| 4865 | } | ||||
| 4866 | assert(LastInst && "Failed to find last instruction in bundle")((void)0); | ||||
| 4867 | |||||
| 4868 | // Set the insertion point after the last instruction in the bundle. Set the | ||||
| 4869 | // debug location to Front. | ||||
| 4870 | Builder.SetInsertPoint(BB, ++LastInst->getIterator()); | ||||
| 4871 | Builder.SetCurrentDebugLocation(Front->getDebugLoc()); | ||||
| 4872 | } | ||||
| 4873 | |||||
| 4874 | Value *BoUpSLP::gather(ArrayRef<Value *> VL) { | ||||
| 4875 | // List of instructions/lanes from current block and/or the blocks which are | ||||
| 4876 | // part of the current loop. These instructions will be inserted at the end to | ||||
| 4877 | // make it possible to optimize loops and hoist invariant instructions out of | ||||
| 4878 | // the loops body with better chances for success. | ||||
| 4879 | SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts; | ||||
| 4880 | SmallSet<int, 4> PostponedIndices; | ||||
| 4881 | Loop *L = LI->getLoopFor(Builder.GetInsertBlock()); | ||||
| 4882 | auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) { | ||||
| 4883 | SmallPtrSet<BasicBlock *, 4> Visited; | ||||
| 4884 | while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second) | ||||
| 4885 | InsertBB = InsertBB->getSinglePredecessor(); | ||||
| 4886 | return InsertBB && InsertBB == InstBB; | ||||
| 4887 | }; | ||||
| 4888 | for (int I = 0, E = VL.size(); I < E; ++I) { | ||||
| 4889 | if (auto *Inst = dyn_cast<Instruction>(VL[I])) | ||||
| 4890 | if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) || | ||||
| 4891 | getTreeEntry(Inst) || (L && (L->contains(Inst)))) && | ||||
| 4892 | PostponedIndices.insert(I).second) | ||||
| 4893 | PostponedInsts.emplace_back(Inst, I); | ||||
| 4894 | } | ||||
| 4895 | |||||
| 4896 | auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) { | ||||
| 4897 | Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos)); | ||||
| 4898 | auto *InsElt = dyn_cast<InsertElementInst>(Vec); | ||||
| 4899 | if (!InsElt) | ||||
| 4900 | return Vec; | ||||
| 4901 | GatherSeq.insert(InsElt); | ||||
| 4902 | CSEBlocks.insert(InsElt->getParent()); | ||||
| 4903 | // Add to our 'need-to-extract' list. | ||||
| 4904 | if (TreeEntry *Entry = getTreeEntry(V)) { | ||||
| 4905 | // Find which lane we need to extract. | ||||
| 4906 | unsigned FoundLane = Entry->findLaneForValue(V); | ||||
| 4907 | ExternalUses.emplace_back(V, InsElt, FoundLane); | ||||
| 4908 | } | ||||
| 4909 | return Vec; | ||||
| 4910 | }; | ||||
| 4911 | Value *Val0 = | ||||
| 4912 | isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; | ||||
| 4913 | FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); | ||||
| 4914 | Value *Vec = PoisonValue::get(VecTy); | ||||
| 4915 | SmallVector<int> NonConsts; | ||||
| 4916 | // Insert constant values at first. | ||||
| 4917 | for (int I = 0, E = VL.size(); I < E; ++I) { | ||||
| 4918 | if (PostponedIndices.contains(I)) | ||||
| 4919 | continue; | ||||
| 4920 | if (!isConstant(VL[I])) { | ||||
| 4921 | NonConsts.push_back(I); | ||||
| 4922 | continue; | ||||
| 4923 | } | ||||
| 4924 | Vec = CreateInsertElement(Vec, VL[I], I); | ||||
| 4925 | } | ||||
| 4926 | // Insert non-constant values. | ||||
| 4927 | for (int I : NonConsts) | ||||
| 4928 | Vec = CreateInsertElement(Vec, VL[I], I); | ||||
| 4929 | // Append instructions, which are/may be part of the loop, in the end to make | ||||
| 4930 | // it possible to hoist non-loop-based instructions. | ||||
| 4931 | for (const std::pair<Value *, unsigned> &Pair : PostponedInsts) | ||||
| 4932 | Vec = CreateInsertElement(Vec, Pair.first, Pair.second); | ||||
| 4933 | |||||
| 4934 | return Vec; | ||||
| 4935 | } | ||||
| 4936 | |||||
| 4937 | namespace { | ||||
| 4938 | /// Merges shuffle masks and emits final shuffle instruction, if required. | ||||
| 4939 | class ShuffleInstructionBuilder { | ||||
| 4940 | IRBuilderBase &Builder; | ||||
| 4941 | const unsigned VF = 0; | ||||
| 4942 | bool IsFinalized = false; | ||||
| 4943 | SmallVector<int, 4> Mask; | ||||
| 4944 | |||||
| 4945 | public: | ||||
| 4946 | ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF) | ||||
| 4947 | : Builder(Builder), VF(VF) {} | ||||
| 4948 | |||||
| 4949 | /// Adds a mask, inverting it before applying. | ||||
| 4950 | void addInversedMask(ArrayRef<unsigned> SubMask) { | ||||
| 4951 | if (SubMask.empty()) | ||||
| 4952 | return; | ||||
| 4953 | SmallVector<int, 4> NewMask; | ||||
| 4954 | inversePermutation(SubMask, NewMask); | ||||
| 4955 | addMask(NewMask); | ||||
| 4956 | } | ||||
| 4957 | |||||
| 4958 | /// Functions adds masks, merging them into single one. | ||||
| 4959 | void addMask(ArrayRef<unsigned> SubMask) { | ||||
| 4960 | SmallVector<int, 4> NewMask(SubMask.begin(), SubMask.end()); | ||||
| 4961 | addMask(NewMask); | ||||
| 4962 | } | ||||
| 4963 | |||||
| 4964 | void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); } | ||||
| 4965 | |||||
| 4966 | Value *finalize(Value *V) { | ||||
| 4967 | IsFinalized = true; | ||||
| 4968 | unsigned ValueVF = cast<FixedVectorType>(V->getType())->getNumElements(); | ||||
| 4969 | if (VF == ValueVF && Mask.empty()) | ||||
| 4970 | return V; | ||||
| 4971 | SmallVector<int, 4> NormalizedMask(VF, UndefMaskElem); | ||||
| 4972 | std::iota(NormalizedMask.begin(), NormalizedMask.end(), 0); | ||||
| 4973 | addMask(NormalizedMask); | ||||
| 4974 | |||||
| 4975 | if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) | ||||
| 4976 | return V; | ||||
| 4977 | return Builder.CreateShuffleVector(V, Mask, "shuffle"); | ||||
| 4978 | } | ||||
| 4979 | |||||
| 4980 | ~ShuffleInstructionBuilder() { | ||||
| 4981 | assert((IsFinalized || Mask.empty()) &&((void)0) | ||||
| 4982 | "Shuffle construction must be finalized.")((void)0); | ||||
| 4983 | } | ||||
| 4984 | }; | ||||
| 4985 | } // namespace | ||||
| 4986 | |||||
| 4987 | Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { | ||||
| 4988 | unsigned VF = VL.size(); | ||||
| 4989 | InstructionsState S = getSameOpcode(VL); | ||||
| 4990 | if (S.getOpcode()) { | ||||
| 4991 | if (TreeEntry *E = getTreeEntry(S.OpValue)) | ||||
| 4992 | if (E->isSame(VL)) { | ||||
| 4993 | Value *V = vectorizeTree(E); | ||||
| 4994 | if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) { | ||||
| 4995 | if (!E->ReuseShuffleIndices.empty()) { | ||||
| 4996 | // Reshuffle to get only unique values. | ||||
| 4997 | // If some of the scalars are duplicated in the vectorization tree | ||||
| 4998 | // entry, we do not vectorize them but instead generate a mask for | ||||
| 4999 | // the reuses. But if there are several users of the same entry, | ||||
| 5000 | // they may have different vectorization factors. This is especially | ||||
| 5001 | // important for PHI nodes. In this case, we need to adapt the | ||||
| 5002 | // resulting instruction for the user vectorization factor and have | ||||
| 5003 | // to reshuffle it again to take only unique elements of the vector. | ||||
| 5004 | // Without this code the function incorrectly returns reduced vector | ||||
| 5005 | // instruction with the same elements, not with the unique ones. | ||||
| 5006 | |||||
| 5007 | // block: | ||||
| 5008 | // %phi = phi <2 x > { .., %entry} {%shuffle, %block} | ||||
| 5009 | // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1> | ||||
| 5010 | // ... (use %2) | ||||
| 5011 | // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2} | ||||
| 5012 | // br %block | ||||
| 5013 | SmallVector<int> UniqueIdxs; | ||||
| 5014 | SmallSet<int, 4> UsedIdxs; | ||||
| 5015 | int Pos = 0; | ||||
| 5016 | int Sz = VL.size(); | ||||
| 5017 | for (int Idx : E->ReuseShuffleIndices) { | ||||
| 5018 | if (Idx != Sz && UsedIdxs.insert(Idx).second) | ||||
| 5019 | UniqueIdxs.emplace_back(Pos); | ||||
| 5020 | ++Pos; | ||||
| 5021 | } | ||||
| 5022 | assert(VF >= UsedIdxs.size() && "Expected vectorization factor "((void)0) | ||||
| 5023 | "less than original vector size.")((void)0); | ||||
| 5024 | UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); | ||||
| 5025 | V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle"); | ||||
| 5026 | } else { | ||||
| 5027 | assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&((void)0) | ||||
| 5028 | "Expected vectorization factor less "((void)0) | ||||
| 5029 | "than original vector size.")((void)0); | ||||
| 5030 | SmallVector<int> UniformMask(VF, 0); | ||||
| 5031 | std::iota(UniformMask.begin(), UniformMask.end(), 0); | ||||
| 5032 | V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); | ||||
| 5033 | } | ||||
| 5034 | } | ||||
| 5035 | return V; | ||||
| 5036 | } | ||||
| 5037 | } | ||||
| 5038 | |||||
| 5039 | // Check that every instruction appears once in this bundle. | ||||
| 5040 | SmallVector<int> ReuseShuffleIndicies; | ||||
| 5041 | SmallVector<Value *> UniqueValues; | ||||
| 5042 | if (VL.size() > 2) { | ||||
| 5043 | DenseMap<Value *, unsigned> UniquePositions; | ||||
| 5044 | unsigned NumValues = | ||||
| 5045 | std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) { | ||||
| 5046 | return !isa<UndefValue>(V); | ||||
| 5047 | }).base()); | ||||
| 5048 | VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues)); | ||||
| 5049 | int UniqueVals = 0; | ||||
| 5050 | bool HasUndefs = false; | ||||
| 5051 | for (Value *V : VL.drop_back(VL.size() - VF)) { | ||||
| 5052 | if (isa<UndefValue>(V)) { | ||||
| 5053 | ReuseShuffleIndicies.emplace_back(UndefMaskElem); | ||||
| 5054 | HasUndefs = true; | ||||
| 5055 | continue; | ||||
| 5056 | } | ||||
| 5057 | if (isConstant(V)) { | ||||
| 5058 | ReuseShuffleIndicies.emplace_back(UniqueValues.size()); | ||||
| 5059 | UniqueValues.emplace_back(V); | ||||
| 5060 | continue; | ||||
| 5061 | } | ||||
| 5062 | auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); | ||||
| 5063 | ReuseShuffleIndicies.emplace_back(Res.first->second); | ||||
| 5064 | if (Res.second) { | ||||
| 5065 | UniqueValues.emplace_back(V); | ||||
| 5066 | ++UniqueVals; | ||||
| 5067 | } | ||||
| 5068 | } | ||||
| 5069 | if (HasUndefs && UniqueVals == 1 && UniqueValues.size() == 1) { | ||||
| 5070 | // Emit pure splat vector. | ||||
| 5071 | // FIXME: why it is not identified as an identity. | ||||
| 5072 | unsigned NumUndefs = count(ReuseShuffleIndicies, UndefMaskElem); | ||||
| 5073 | if (NumUndefs == ReuseShuffleIndicies.size() - 1) | ||||
| 5074 | ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), | ||||
| 5075 | UndefMaskElem); | ||||
| 5076 | else | ||||
| 5077 | ReuseShuffleIndicies.assign(VF, 0); | ||||
| 5078 | } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { | ||||
| 5079 | ReuseShuffleIndicies.clear(); | ||||
| 5080 | UniqueValues.clear(); | ||||
| 5081 | UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues)); | ||||
| 5082 | } | ||||
| 5083 | UniqueValues.append(VF - UniqueValues.size(), | ||||
| 5084 | PoisonValue::get(VL[0]->getType())); | ||||
| 5085 | VL = UniqueValues; | ||||
| 5086 | } | ||||
| 5087 | |||||
| 5088 | ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); | ||||
| 5089 | Value *Vec = gather(VL); | ||||
| 5090 | if (!ReuseShuffleIndicies.empty()) { | ||||
| 5091 | ShuffleBuilder.addMask(ReuseShuffleIndicies); | ||||
| 5092 | Vec = ShuffleBuilder.finalize(Vec); | ||||
| 5093 | if (auto *I = dyn_cast<Instruction>(Vec)) { | ||||
| 5094 | GatherSeq.insert(I); | ||||
| 5095 | CSEBlocks.insert(I->getParent()); | ||||
| 5096 | } | ||||
| 5097 | } | ||||
| 5098 | return Vec; | ||||
| 5099 | } | ||||
| 5100 | |||||
| 5101 | Value *BoUpSLP::vectorizeTree(TreeEntry *E) { | ||||
| 5102 | IRBuilder<>::InsertPointGuard Guard(Builder); | ||||
| 5103 | |||||
| 5104 | if (E->VectorizedValue) { | ||||
| 5105 | LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n")do { } while (false); | ||||
| 5106 | return E->VectorizedValue; | ||||
| 5107 | } | ||||
| 5108 | |||||
| 5109 | bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); | ||||
| 5110 | unsigned VF = E->Scalars.size(); | ||||
| 5111 | if (NeedToShuffleReuses) | ||||
| 5112 | VF = E->ReuseShuffleIndices.size(); | ||||
| 5113 | ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); | ||||
| 5114 | if (E->State == TreeEntry::NeedToGather) { | ||||
| 5115 | setInsertPointAfterBundle(E); | ||||
| 5116 | Value *Vec; | ||||
| 5117 | SmallVector<int> Mask; | ||||
| 5118 | SmallVector<const TreeEntry *> Entries; | ||||
| 5119 | Optional<TargetTransformInfo::ShuffleKind> Shuffle = | ||||
| 5120 | isGatherShuffledEntry(E, Mask, Entries); | ||||
| 5121 | if (Shuffle.hasValue()) { | ||||
| 5122 | assert((Entries.size() == 1 || Entries.size() == 2) &&((void)0) | ||||
| 5123 | "Expected shuffle of 1 or 2 entries.")((void)0); | ||||
| 5124 | Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, | ||||
| 5125 | Entries.back()->VectorizedValue, Mask); | ||||
| 5126 | } else { | ||||
| 5127 | Vec = gather(E->Scalars); | ||||
| 5128 | } | ||||
| 5129 | if (NeedToShuffleReuses) { | ||||
| 5130 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5131 | Vec = ShuffleBuilder.finalize(Vec); | ||||
| 5132 | if (auto *I = dyn_cast<Instruction>(Vec)) { | ||||
| 5133 | GatherSeq.insert(I); | ||||
| 5134 | CSEBlocks.insert(I->getParent()); | ||||
| 5135 | } | ||||
| 5136 | } | ||||
| 5137 | E->VectorizedValue = Vec; | ||||
| 5138 | return Vec; | ||||
| 5139 | } | ||||
| 5140 | |||||
| 5141 | assert((E->State == TreeEntry::Vectorize ||((void)0) | ||||
| 5142 | E->State == TreeEntry::ScatterVectorize) &&((void)0) | ||||
| 5143 | "Unhandled state")((void)0); | ||||
| 5144 | unsigned ShuffleOrOp = | ||||
| 5145 | E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); | ||||
| 5146 | Instruction *VL0 = E->getMainOp(); | ||||
| 5147 | Type *ScalarTy = VL0->getType(); | ||||
| 5148 | if (auto *Store = dyn_cast<StoreInst>(VL0)) | ||||
| 5149 | ScalarTy = Store->getValueOperand()->getType(); | ||||
| 5150 | else if (auto *IE = dyn_cast<InsertElementInst>(VL0)) | ||||
| 5151 | ScalarTy = IE->getOperand(1)->getType(); | ||||
| 5152 | auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); | ||||
| 5153 | switch (ShuffleOrOp) { | ||||
| 5154 | case Instruction::PHI: { | ||||
| 5155 | auto *PH = cast<PHINode>(VL0); | ||||
| 5156 | Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); | ||||
| 5157 | Builder.SetCurrentDebugLocation(PH->getDebugLoc()); | ||||
| 5158 | PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); | ||||
| 5159 | Value *V = NewPhi; | ||||
| 5160 | if (NeedToShuffleReuses) | ||||
| 5161 | V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); | ||||
| 5162 | |||||
| 5163 | E->VectorizedValue = V; | ||||
| 5164 | |||||
| 5165 | // PHINodes may have multiple entries from the same block. We want to | ||||
| 5166 | // visit every block once. | ||||
| 5167 | SmallPtrSet<BasicBlock*, 4> VisitedBBs; | ||||
| 5168 | |||||
| 5169 | for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { | ||||
| 5170 | ValueList Operands; | ||||
| 5171 | BasicBlock *IBB = PH->getIncomingBlock(i); | ||||
| 5172 | |||||
| 5173 | if (!VisitedBBs.insert(IBB).second) { | ||||
| 5174 | NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); | ||||
| 5175 | continue; | ||||
| 5176 | } | ||||
| 5177 | |||||
| 5178 | Builder.SetInsertPoint(IBB->getTerminator()); | ||||
| 5179 | Builder.SetCurrentDebugLocation(PH->getDebugLoc()); | ||||
| 5180 | Value *Vec = vectorizeTree(E->getOperand(i)); | ||||
| 5181 | NewPhi->addIncoming(Vec, IBB); | ||||
| 5182 | } | ||||
| 5183 | |||||
| 5184 | assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&((void)0) | ||||
| 5185 | "Invalid number of incoming values")((void)0); | ||||
| 5186 | return V; | ||||
| 5187 | } | ||||
| 5188 | |||||
| 5189 | case Instruction::ExtractElement: { | ||||
| 5190 | Value *V = E->getSingleOperand(0); | ||||
| 5191 | Builder.SetInsertPoint(VL0); | ||||
| 5192 | ShuffleBuilder.addInversedMask(E->ReorderIndices); | ||||
| 5193 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5194 | V = ShuffleBuilder.finalize(V); | ||||
| 5195 | E->VectorizedValue = V; | ||||
| 5196 | return V; | ||||
| 5197 | } | ||||
| 5198 | case Instruction::ExtractValue: { | ||||
| 5199 | auto *LI = cast<LoadInst>(E->getSingleOperand(0)); | ||||
| 5200 | Builder.SetInsertPoint(LI); | ||||
| 5201 | auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); | ||||
| 5202 | Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); | ||||
| 5203 | LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); | ||||
| 5204 | Value *NewV = propagateMetadata(V, E->Scalars); | ||||
| 5205 | ShuffleBuilder.addInversedMask(E->ReorderIndices); | ||||
| 5206 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5207 | NewV = ShuffleBuilder.finalize(NewV); | ||||
| 5208 | E->VectorizedValue = NewV; | ||||
| 5209 | return NewV; | ||||
| 5210 | } | ||||
| 5211 | case Instruction::InsertElement: { | ||||
| 5212 | Builder.SetInsertPoint(VL0); | ||||
| 5213 | Value *V = vectorizeTree(E->getOperand(1)); | ||||
| 5214 | |||||
| 5215 | const unsigned NumElts = | ||||
| 5216 | cast<FixedVectorType>(VL0->getType())->getNumElements(); | ||||
| 5217 | const unsigned NumScalars = E->Scalars.size(); | ||||
| 5218 | |||||
| 5219 | // Create InsertVector shuffle if necessary | ||||
| 5220 | Instruction *FirstInsert = nullptr; | ||||
| 5221 | bool IsIdentity = true; | ||||
| 5222 | unsigned Offset = UINT_MAX(2147483647 *2U +1U); | ||||
| 5223 | for (unsigned I = 0; I < NumScalars; ++I) { | ||||
| 5224 | Value *Scalar = E->Scalars[I]; | ||||
| 5225 | if (!FirstInsert && | ||||
| 5226 | !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0))) | ||||
| 5227 | FirstInsert = cast<Instruction>(Scalar); | ||||
| 5228 | Optional<int> InsertIdx = getInsertIndex(Scalar, 0); | ||||
| 5229 | if (!InsertIdx || *InsertIdx == UndefMaskElem) | ||||
| 5230 | continue; | ||||
| 5231 | unsigned Idx = *InsertIdx; | ||||
| 5232 | if (Idx < Offset) { | ||||
| 5233 | Offset = Idx; | ||||
| 5234 | IsIdentity &= I == 0; | ||||
| 5235 | } else { | ||||
| 5236 | assert(Idx >= Offset && "Failed to find vector index offset")((void)0); | ||||
| 5237 | IsIdentity &= Idx - Offset == I; | ||||
| 5238 | } | ||||
| 5239 | } | ||||
| 5240 | assert(Offset < NumElts && "Failed to find vector index offset")((void)0); | ||||
| 5241 | |||||
| 5242 | // Create shuffle to resize vector | ||||
| 5243 | SmallVector<int> Mask(NumElts, UndefMaskElem); | ||||
| 5244 | if (!IsIdentity) { | ||||
| 5245 | for (unsigned I = 0; I < NumScalars; ++I) { | ||||
| 5246 | Value *Scalar = E->Scalars[I]; | ||||
| 5247 | Optional<int> InsertIdx = getInsertIndex(Scalar, 0); | ||||
| 5248 | if (!InsertIdx || *InsertIdx == UndefMaskElem) | ||||
| 5249 | continue; | ||||
| 5250 | Mask[*InsertIdx - Offset] = I; | ||||
| 5251 | } | ||||
| 5252 | } else { | ||||
| 5253 | std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); | ||||
| 5254 | } | ||||
| 5255 | if (!IsIdentity || NumElts != NumScalars) | ||||
| 5256 | V = Builder.CreateShuffleVector(V, Mask); | ||||
| 5257 | |||||
| 5258 | if (NumElts != NumScalars) { | ||||
| 5259 | SmallVector<int> InsertMask(NumElts); | ||||
| 5260 | std::iota(InsertMask.begin(), InsertMask.end(), 0); | ||||
| 5261 | for (unsigned I = 0; I < NumElts; I++) { | ||||
| 5262 | if (Mask[I] != UndefMaskElem) | ||||
| 5263 | InsertMask[Offset + I] = NumElts + I; | ||||
| 5264 | } | ||||
| 5265 | |||||
| 5266 | V = Builder.CreateShuffleVector( | ||||
| 5267 | FirstInsert->getOperand(0), V, InsertMask, | ||||
| 5268 | cast<Instruction>(E->Scalars.back())->getName()); | ||||
| 5269 | } | ||||
| 5270 | |||||
| 5271 | ++NumVectorInstructions; | ||||
| 5272 | E->VectorizedValue = V; | ||||
| 5273 | return V; | ||||
| 5274 | } | ||||
| 5275 | case Instruction::ZExt: | ||||
| 5276 | case Instruction::SExt: | ||||
| 5277 | case Instruction::FPToUI: | ||||
| 5278 | case Instruction::FPToSI: | ||||
| 5279 | case Instruction::FPExt: | ||||
| 5280 | case Instruction::PtrToInt: | ||||
| 5281 | case Instruction::IntToPtr: | ||||
| 5282 | case Instruction::SIToFP: | ||||
| 5283 | case Instruction::UIToFP: | ||||
| 5284 | case Instruction::Trunc: | ||||
| 5285 | case Instruction::FPTrunc: | ||||
| 5286 | case Instruction::BitCast: { | ||||
| 5287 | setInsertPointAfterBundle(E); | ||||
| 5288 | |||||
| 5289 | Value *InVec = vectorizeTree(E->getOperand(0)); | ||||
| 5290 | |||||
| 5291 | if (E->VectorizedValue) { | ||||
| 5292 | LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { } while (false); | ||||
| 5293 | return E->VectorizedValue; | ||||
| 5294 | } | ||||
| 5295 | |||||
| 5296 | auto *CI = cast<CastInst>(VL0); | ||||
| 5297 | Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); | ||||
| 5298 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5299 | V = ShuffleBuilder.finalize(V); | ||||
| 5300 | |||||
| 5301 | E->VectorizedValue = V; | ||||
| 5302 | ++NumVectorInstructions; | ||||
| 5303 | return V; | ||||
| 5304 | } | ||||
| 5305 | case Instruction::FCmp: | ||||
| 5306 | case Instruction::ICmp: { | ||||
| 5307 | setInsertPointAfterBundle(E); | ||||
| 5308 | |||||
| 5309 | Value *L = vectorizeTree(E->getOperand(0)); | ||||
| 5310 | Value *R = vectorizeTree(E->getOperand(1)); | ||||
| 5311 | |||||
| 5312 | if (E->VectorizedValue) { | ||||
| 5313 | LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { } while (false); | ||||
| 5314 | return E->VectorizedValue; | ||||
| 5315 | } | ||||
| 5316 | |||||
| 5317 | CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); | ||||
| 5318 | Value *V = Builder.CreateCmp(P0, L, R); | ||||
| 5319 | propagateIRFlags(V, E->Scalars, VL0); | ||||
| 5320 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5321 | V = ShuffleBuilder.finalize(V); | ||||
| 5322 | |||||
| 5323 | E->VectorizedValue = V; | ||||
| 5324 | ++NumVectorInstructions; | ||||
| 5325 | return V; | ||||
| 5326 | } | ||||
| 5327 | case Instruction::Select: { | ||||
| 5328 | setInsertPointAfterBundle(E); | ||||
| 5329 | |||||
| 5330 | Value *Cond = vectorizeTree(E->getOperand(0)); | ||||
| 5331 | Value *True = vectorizeTree(E->getOperand(1)); | ||||
| 5332 | Value *False = vectorizeTree(E->getOperand(2)); | ||||
| 5333 | |||||
| 5334 | if (E->VectorizedValue) { | ||||
| 5335 | LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { } while (false); | ||||
| 5336 | return E->VectorizedValue; | ||||
| 5337 | } | ||||
| 5338 | |||||
| 5339 | Value *V = Builder.CreateSelect(Cond, True, False); | ||||
| 5340 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5341 | V = ShuffleBuilder.finalize(V); | ||||
| 5342 | |||||
| 5343 | E->VectorizedValue = V; | ||||
| 5344 | ++NumVectorInstructions; | ||||
| 5345 | return V; | ||||
| 5346 | } | ||||
| 5347 | case Instruction::FNeg: { | ||||
| 5348 | setInsertPointAfterBundle(E); | ||||
| 5349 | |||||
| 5350 | Value *Op = vectorizeTree(E->getOperand(0)); | ||||
| 5351 | |||||
| 5352 | if (E->VectorizedValue) { | ||||
| 5353 | LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { } while (false); | ||||
| 5354 | return E->VectorizedValue; | ||||
| 5355 | } | ||||
| 5356 | |||||
| 5357 | Value *V = Builder.CreateUnOp( | ||||
| 5358 | static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); | ||||
| 5359 | propagateIRFlags(V, E->Scalars, VL0); | ||||
| 5360 | if (auto *I = dyn_cast<Instruction>(V)) | ||||
| 5361 | V = propagateMetadata(I, E->Scalars); | ||||
| 5362 | |||||
| 5363 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5364 | V = ShuffleBuilder.finalize(V); | ||||
| 5365 | |||||
| 5366 | E->VectorizedValue = V; | ||||
| 5367 | ++NumVectorInstructions; | ||||
| 5368 | |||||
| 5369 | return V; | ||||
| 5370 | } | ||||
| 5371 | case Instruction::Add: | ||||
| 5372 | case Instruction::FAdd: | ||||
| 5373 | case Instruction::Sub: | ||||
| 5374 | case Instruction::FSub: | ||||
| 5375 | case Instruction::Mul: | ||||
| 5376 | case Instruction::FMul: | ||||
| 5377 | case Instruction::UDiv: | ||||
| 5378 | case Instruction::SDiv: | ||||
| 5379 | case Instruction::FDiv: | ||||
| 5380 | case Instruction::URem: | ||||
| 5381 | case Instruction::SRem: | ||||
| 5382 | case Instruction::FRem: | ||||
| 5383 | case Instruction::Shl: | ||||
| 5384 | case Instruction::LShr: | ||||
| 5385 | case Instruction::AShr: | ||||
| 5386 | case Instruction::And: | ||||
| 5387 | case Instruction::Or: | ||||
| 5388 | case Instruction::Xor: { | ||||
| 5389 | setInsertPointAfterBundle(E); | ||||
| 5390 | |||||
| 5391 | Value *LHS = vectorizeTree(E->getOperand(0)); | ||||
| 5392 | Value *RHS = vectorizeTree(E->getOperand(1)); | ||||
| 5393 | |||||
| 5394 | if (E->VectorizedValue) { | ||||
| 5395 | LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { } while (false); | ||||
| 5396 | return E->VectorizedValue; | ||||
| 5397 | } | ||||
| 5398 | |||||
| 5399 | Value *V = Builder.CreateBinOp( | ||||
| 5400 | static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, | ||||
| 5401 | RHS); | ||||
| 5402 | propagateIRFlags(V, E->Scalars, VL0); | ||||
| 5403 | if (auto *I = dyn_cast<Instruction>(V)) | ||||
| 5404 | V = propagateMetadata(I, E->Scalars); | ||||
| 5405 | |||||
| 5406 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5407 | V = ShuffleBuilder.finalize(V); | ||||
| 5408 | |||||
| 5409 | E->VectorizedValue = V; | ||||
| 5410 | ++NumVectorInstructions; | ||||
| 5411 | |||||
| 5412 | return V; | ||||
| 5413 | } | ||||
| 5414 | case Instruction::Load: { | ||||
| 5415 | // Loads are inserted at the head of the tree because we don't want to | ||||
| 5416 | // sink them all the way down past store instructions. | ||||
| 5417 | bool IsReorder = E->updateStateIfReorder(); | ||||
| 5418 | if (IsReorder) | ||||
| 5419 | VL0 = E->getMainOp(); | ||||
| 5420 | setInsertPointAfterBundle(E); | ||||
| 5421 | |||||
| 5422 | LoadInst *LI = cast<LoadInst>(VL0); | ||||
| 5423 | Instruction *NewLI; | ||||
| 5424 | unsigned AS = LI->getPointerAddressSpace(); | ||||
| 5425 | Value *PO = LI->getPointerOperand(); | ||||
| 5426 | if (E->State == TreeEntry::Vectorize) { | ||||
| 5427 | |||||
| 5428 | Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); | ||||
| 5429 | |||||
| 5430 | // The pointer operand uses an in-tree scalar so we add the new BitCast | ||||
| 5431 | // to ExternalUses list to make sure that an extract will be generated | ||||
| 5432 | // in the future. | ||||
| 5433 | if (getTreeEntry(PO)) | ||||
| 5434 | ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0); | ||||
| 5435 | |||||
| 5436 | NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); | ||||
| 5437 | } else { | ||||
| 5438 | assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state")((void)0); | ||||
| 5439 | Value *VecPtr = vectorizeTree(E->getOperand(0)); | ||||
| 5440 | // Use the minimum alignment of the gathered loads. | ||||
| 5441 | Align CommonAlignment = LI->getAlign(); | ||||
| 5442 | for (Value *V : E->Scalars) | ||||
| 5443 | CommonAlignment = | ||||
| 5444 | commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); | ||||
| 5445 | NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); | ||||
| 5446 | } | ||||
| 5447 | Value *V = propagateMetadata(NewLI, E->Scalars); | ||||
| 5448 | |||||
| 5449 | ShuffleBuilder.addInversedMask(E->ReorderIndices); | ||||
| 5450 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5451 | V = ShuffleBuilder.finalize(V); | ||||
| 5452 | E->VectorizedValue = V; | ||||
| 5453 | ++NumVectorInstructions; | ||||
| 5454 | return V; | ||||
| 5455 | } | ||||
| 5456 | case Instruction::Store: { | ||||
| 5457 | bool IsReorder = !E->ReorderIndices.empty(); | ||||
| 5458 | auto *SI = cast<StoreInst>( | ||||
| 5459 | IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0); | ||||
| 5460 | unsigned AS = SI->getPointerAddressSpace(); | ||||
| 5461 | |||||
| 5462 | setInsertPointAfterBundle(E); | ||||
| 5463 | |||||
| 5464 | Value *VecValue = vectorizeTree(E->getOperand(0)); | ||||
| 5465 | ShuffleBuilder.addMask(E->ReorderIndices); | ||||
| 5466 | VecValue = ShuffleBuilder.finalize(VecValue); | ||||
| 5467 | |||||
| 5468 | Value *ScalarPtr = SI->getPointerOperand(); | ||||
| 5469 | Value *VecPtr = Builder.CreateBitCast( | ||||
| 5470 | ScalarPtr, VecValue->getType()->getPointerTo(AS)); | ||||
| 5471 | StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr, | ||||
| 5472 | SI->getAlign()); | ||||
| 5473 | |||||
| 5474 | // The pointer operand uses an in-tree scalar, so add the new BitCast to | ||||
| 5475 | // ExternalUses to make sure that an extract will be generated in the | ||||
| 5476 | // future. | ||||
| 5477 | if (getTreeEntry(ScalarPtr)) | ||||
| 5478 | ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); | ||||
| 5479 | |||||
| 5480 | Value *V = propagateMetadata(ST, E->Scalars); | ||||
| 5481 | |||||
| 5482 | E->VectorizedValue = V; | ||||
| 5483 | ++NumVectorInstructions; | ||||
| 5484 | return V; | ||||
| 5485 | } | ||||
| 5486 | case Instruction::GetElementPtr: { | ||||
| 5487 | setInsertPointAfterBundle(E); | ||||
| 5488 | |||||
| 5489 | Value *Op0 = vectorizeTree(E->getOperand(0)); | ||||
| 5490 | |||||
| 5491 | std::vector<Value *> OpVecs; | ||||
| 5492 | for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; | ||||
| 5493 | ++j) { | ||||
| 5494 | ValueList &VL = E->getOperand(j); | ||||
| 5495 | // Need to cast all elements to the same type before vectorization to | ||||
| 5496 | // avoid crash. | ||||
| 5497 | Type *VL0Ty = VL0->getOperand(j)->getType(); | ||||
| 5498 | Type *Ty = llvm::all_of( | ||||
| 5499 | VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) | ||||
| 5500 | ? VL0Ty | ||||
| 5501 | : DL->getIndexType(cast<GetElementPtrInst>(VL0) | ||||
| 5502 | ->getPointerOperandType() | ||||
| 5503 | ->getScalarType()); | ||||
| 5504 | for (Value *&V : VL) { | ||||
| 5505 | auto *CI = cast<ConstantInt>(V); | ||||
| 5506 | V = ConstantExpr::getIntegerCast(CI, Ty, | ||||
| 5507 | CI->getValue().isSignBitSet()); | ||||
| 5508 | } | ||||
| 5509 | Value *OpVec = vectorizeTree(VL); | ||||
| 5510 | OpVecs.push_back(OpVec); | ||||
| 5511 | } | ||||
| 5512 | |||||
| 5513 | Value *V = Builder.CreateGEP( | ||||
| 5514 | cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); | ||||
| 5515 | if (Instruction *I = dyn_cast<Instruction>(V)) | ||||
| 5516 | V = propagateMetadata(I, E->Scalars); | ||||
| 5517 | |||||
| 5518 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5519 | V = ShuffleBuilder.finalize(V); | ||||
| 5520 | |||||
| 5521 | E->VectorizedValue = V; | ||||
| 5522 | ++NumVectorInstructions; | ||||
| 5523 | |||||
| 5524 | return V; | ||||
| 5525 | } | ||||
| 5526 | case Instruction::Call: { | ||||
| 5527 | CallInst *CI = cast<CallInst>(VL0); | ||||
| 5528 | setInsertPointAfterBundle(E); | ||||
| 5529 | |||||
| 5530 | Intrinsic::ID IID = Intrinsic::not_intrinsic; | ||||
| 5531 | if (Function *FI = CI->getCalledFunction()) | ||||
| 5532 | IID = FI->getIntrinsicID(); | ||||
| 5533 | |||||
| 5534 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); | ||||
| 5535 | |||||
| 5536 | auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); | ||||
| 5537 | bool UseIntrinsic = ID != Intrinsic::not_intrinsic && | ||||
| 5538 | VecCallCosts.first <= VecCallCosts.second; | ||||
| 5539 | |||||
| 5540 | Value *ScalarArg = nullptr; | ||||
| 5541 | std::vector<Value *> OpVecs; | ||||
| 5542 | SmallVector<Type *, 2> TysForDecl = | ||||
| 5543 | {FixedVectorType::get(CI->getType(), E->Scalars.size())}; | ||||
| 5544 | for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { | ||||
| 5545 | ValueList OpVL; | ||||
| 5546 | // Some intrinsics have scalar arguments. This argument should not be | ||||
| 5547 | // vectorized. | ||||
| 5548 | if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) { | ||||
| 5549 | CallInst *CEI = cast<CallInst>(VL0); | ||||
| 5550 | ScalarArg = CEI->getArgOperand(j); | ||||
| 5551 | OpVecs.push_back(CEI->getArgOperand(j)); | ||||
| 5552 | if (hasVectorInstrinsicOverloadedScalarOpd(IID, j)) | ||||
| 5553 | TysForDecl.push_back(ScalarArg->getType()); | ||||
| 5554 | continue; | ||||
| 5555 | } | ||||
| 5556 | |||||
| 5557 | Value *OpVec = vectorizeTree(E->getOperand(j)); | ||||
| 5558 | LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n")do { } while (false); | ||||
| 5559 | OpVecs.push_back(OpVec); | ||||
| 5560 | } | ||||
| 5561 | |||||
| 5562 | Function *CF; | ||||
| 5563 | if (!UseIntrinsic) { | ||||
| 5564 | VFShape Shape = | ||||
| 5565 | VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>( | ||||
| 5566 | VecTy->getNumElements())), | ||||
| 5567 | false /*HasGlobalPred*/); | ||||
| 5568 | CF = VFDatabase(*CI).getVectorizedFunction(Shape); | ||||
| 5569 | } else { | ||||
| 5570 | CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl); | ||||
| 5571 | } | ||||
| 5572 | |||||
| 5573 | SmallVector<OperandBundleDef, 1> OpBundles; | ||||
| 5574 | CI->getOperandBundlesAsDefs(OpBundles); | ||||
| 5575 | Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); | ||||
| 5576 | |||||
| 5577 | // The scalar argument uses an in-tree scalar so we add the new vectorized | ||||
| 5578 | // call to ExternalUses list to make sure that an extract will be | ||||
| 5579 | // generated in the future. | ||||
| 5580 | if (ScalarArg && getTreeEntry(ScalarArg)) | ||||
| 5581 | ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); | ||||
| 5582 | |||||
| 5583 | propagateIRFlags(V, E->Scalars, VL0); | ||||
| 5584 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5585 | V = ShuffleBuilder.finalize(V); | ||||
| 5586 | |||||
| 5587 | E->VectorizedValue = V; | ||||
| 5588 | ++NumVectorInstructions; | ||||
| 5589 | return V; | ||||
| 5590 | } | ||||
| 5591 | case Instruction::ShuffleVector: { | ||||
| 5592 | assert(E->isAltShuffle() &&((void)0) | ||||
| 5593 | ((Instruction::isBinaryOp(E->getOpcode()) &&((void)0) | ||||
| 5594 | Instruction::isBinaryOp(E->getAltOpcode())) ||((void)0) | ||||
| 5595 | (Instruction::isCast(E->getOpcode()) &&((void)0) | ||||
| 5596 | Instruction::isCast(E->getAltOpcode()))) &&((void)0) | ||||
| 5597 | "Invalid Shuffle Vector Operand")((void)0); | ||||
| 5598 | |||||
| 5599 | Value *LHS = nullptr, *RHS = nullptr; | ||||
| 5600 | if (Instruction::isBinaryOp(E->getOpcode())) { | ||||
| 5601 | setInsertPointAfterBundle(E); | ||||
| 5602 | LHS = vectorizeTree(E->getOperand(0)); | ||||
| 5603 | RHS = vectorizeTree(E->getOperand(1)); | ||||
| 5604 | } else { | ||||
| 5605 | setInsertPointAfterBundle(E); | ||||
| 5606 | LHS = vectorizeTree(E->getOperand(0)); | ||||
| 5607 | } | ||||
| 5608 | |||||
| 5609 | if (E->VectorizedValue) { | ||||
| 5610 | LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { } while (false); | ||||
| 5611 | return E->VectorizedValue; | ||||
| 5612 | } | ||||
| 5613 | |||||
| 5614 | Value *V0, *V1; | ||||
| 5615 | if (Instruction::isBinaryOp(E->getOpcode())) { | ||||
| 5616 | V0 = Builder.CreateBinOp( | ||||
| 5617 | static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); | ||||
| 5618 | V1 = Builder.CreateBinOp( | ||||
| 5619 | static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); | ||||
| 5620 | } else { | ||||
| 5621 | V0 = Builder.CreateCast( | ||||
| 5622 | static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); | ||||
| 5623 | V1 = Builder.CreateCast( | ||||
| 5624 | static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); | ||||
| 5625 | } | ||||
| 5626 | |||||
| 5627 | // Create shuffle to take alternate operations from the vector. | ||||
| 5628 | // Also, gather up main and alt scalar ops to propagate IR flags to | ||||
| 5629 | // each vector operation. | ||||
| 5630 | ValueList OpScalars, AltScalars; | ||||
| 5631 | unsigned Sz = E->Scalars.size(); | ||||
| 5632 | SmallVector<int> Mask(Sz); | ||||
| 5633 | for (unsigned I = 0; I < Sz; ++I) { | ||||
| 5634 | auto *OpInst = cast<Instruction>(E->Scalars[I]); | ||||
| 5635 | assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode")((void)0); | ||||
| 5636 | if (OpInst->getOpcode() == E->getAltOpcode()) { | ||||
| 5637 | Mask[I] = Sz + I; | ||||
| 5638 | AltScalars.push_back(E->Scalars[I]); | ||||
| 5639 | } else { | ||||
| 5640 | Mask[I] = I; | ||||
| 5641 | OpScalars.push_back(E->Scalars[I]); | ||||
| 5642 | } | ||||
| 5643 | } | ||||
| 5644 | |||||
| 5645 | propagateIRFlags(V0, OpScalars); | ||||
| 5646 | propagateIRFlags(V1, AltScalars); | ||||
| 5647 | |||||
| 5648 | Value *V = Builder.CreateShuffleVector(V0, V1, Mask); | ||||
| 5649 | if (Instruction *I = dyn_cast<Instruction>(V)) | ||||
| 5650 | V = propagateMetadata(I, E->Scalars); | ||||
| 5651 | ShuffleBuilder.addMask(E->ReuseShuffleIndices); | ||||
| 5652 | V = ShuffleBuilder.finalize(V); | ||||
| 5653 | |||||
| 5654 | E->VectorizedValue = V; | ||||
| 5655 | ++NumVectorInstructions; | ||||
| 5656 | |||||
| 5657 | return V; | ||||
| 5658 | } | ||||
| 5659 | default: | ||||
| 5660 | llvm_unreachable("unknown inst")__builtin_unreachable(); | ||||
| 5661 | } | ||||
| 5662 | return nullptr; | ||||
| 5663 | } | ||||
| 5664 | |||||
| 5665 | Value *BoUpSLP::vectorizeTree() { | ||||
| 5666 | ExtraValueToDebugLocsMap ExternallyUsedValues; | ||||
| 5667 | return vectorizeTree(ExternallyUsedValues); | ||||
| 5668 | } | ||||
| 5669 | |||||
| 5670 | Value * | ||||
| 5671 | BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { | ||||
| 5672 | // All blocks must be scheduled before any instructions are inserted. | ||||
| 5673 | for (auto &BSIter : BlocksSchedules) { | ||||
| 5674 | scheduleBlock(BSIter.second.get()); | ||||
| 5675 | } | ||||
| 5676 | |||||
| 5677 | Builder.SetInsertPoint(&F->getEntryBlock().front()); | ||||
| 5678 | auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); | ||||
| 5679 | |||||
| 5680 | // If the vectorized tree can be rewritten in a smaller type, we truncate the | ||||
| 5681 | // vectorized root. InstCombine will then rewrite the entire expression. We | ||||
| 5682 | // sign extend the extracted values below. | ||||
| 5683 | auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; | ||||
| 5684 | if (MinBWs.count(ScalarRoot)) { | ||||
| 5685 | if (auto *I = dyn_cast<Instruction>(VectorRoot)) { | ||||
| 5686 | // If current instr is a phi and not the last phi, insert it after the | ||||
| 5687 | // last phi node. | ||||
| 5688 | if (isa<PHINode>(I)) | ||||
| 5689 | Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt()); | ||||
| 5690 | else | ||||
| 5691 | Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); | ||||
| 5692 | } | ||||
| 5693 | auto BundleWidth = VectorizableTree[0]->Scalars.size(); | ||||
| 5694 | auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); | ||||
| 5695 | auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); | ||||
| 5696 | auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); | ||||
| 5697 | VectorizableTree[0]->VectorizedValue = Trunc; | ||||
| 5698 | } | ||||
| 5699 | |||||
| 5700 | LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()do { } while (false) | ||||
| 5701 | << " values .\n")do { } while (false); | ||||
| 5702 | |||||
| 5703 | // Extract all of the elements with the external uses. | ||||
| 5704 | for (const auto &ExternalUse : ExternalUses) { | ||||
| 5705 | Value *Scalar = ExternalUse.Scalar; | ||||
| 5706 | llvm::User *User = ExternalUse.User; | ||||
| 5707 | |||||
| 5708 | // Skip users that we already RAUW. This happens when one instruction | ||||
| 5709 | // has multiple uses of the same value. | ||||
| 5710 | if (User && !is_contained(Scalar->users(), User)) | ||||
| 5711 | continue; | ||||
| 5712 | TreeEntry *E = getTreeEntry(Scalar); | ||||
| 5713 | assert(E && "Invalid scalar")((void)0); | ||||
| 5714 | assert(E->State != TreeEntry::NeedToGather &&((void)0) | ||||
| 5715 | "Extracting from a gather list")((void)0); | ||||
| 5716 | |||||
| 5717 | Value *Vec = E->VectorizedValue; | ||||
| 5718 | assert(Vec && "Can't find vectorizable value")((void)0); | ||||
| 5719 | |||||
| 5720 | Value *Lane = Builder.getInt32(ExternalUse.Lane); | ||||
| 5721 | auto ExtractAndExtendIfNeeded = [&](Value *Vec) { | ||||
| 5722 | if (Scalar->getType() != Vec->getType()) { | ||||
| 5723 | Value *Ex; | ||||
| 5724 | // "Reuse" the existing extract to improve final codegen. | ||||
| 5725 | if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) { | ||||
| 5726 | Ex = Builder.CreateExtractElement(ES->getOperand(0), | ||||
| 5727 | ES->getOperand(1)); | ||||
| 5728 | } else { | ||||
| 5729 | Ex = Builder.CreateExtractElement(Vec, Lane); | ||||
| 5730 | } | ||||
| 5731 | // If necessary, sign-extend or zero-extend ScalarRoot | ||||
| 5732 | // to the larger type. | ||||
| 5733 | if (!MinBWs.count(ScalarRoot)) | ||||
| 5734 | return Ex; | ||||
| 5735 | if (MinBWs[ScalarRoot].second) | ||||
| 5736 | return Builder.CreateSExt(Ex, Scalar->getType()); | ||||
| 5737 | return Builder.CreateZExt(Ex, Scalar->getType()); | ||||
| 5738 | } | ||||
| 5739 | assert(isa<FixedVectorType>(Scalar->getType()) &&((void)0) | ||||
| 5740 | isa<InsertElementInst>(Scalar) &&((void)0) | ||||
| 5741 | "In-tree scalar of vector type is not insertelement?")((void)0); | ||||
| 5742 | return Vec; | ||||
| 5743 | }; | ||||
| 5744 | // If User == nullptr, the Scalar is used as extra arg. Generate | ||||
| 5745 | // ExtractElement instruction and update the record for this scalar in | ||||
| 5746 | // ExternallyUsedValues. | ||||
| 5747 | if (!User) { | ||||
| 5748 | assert(ExternallyUsedValues.count(Scalar) &&((void)0) | ||||
| 5749 | "Scalar with nullptr as an external user must be registered in "((void)0) | ||||
| 5750 | "ExternallyUsedValues map")((void)0); | ||||
| 5751 | if (auto *VecI = dyn_cast<Instruction>(Vec)) { | ||||
| 5752 | Builder.SetInsertPoint(VecI->getParent(), | ||||
| 5753 | std::next(VecI->getIterator())); | ||||
| 5754 | } else { | ||||
| 5755 | Builder.SetInsertPoint(&F->getEntryBlock().front()); | ||||
| 5756 | } | ||||
| 5757 | Value *NewInst = ExtractAndExtendIfNeeded(Vec); | ||||
| 5758 | CSEBlocks.insert(cast<Instruction>(Scalar)->getParent()); | ||||
| 5759 | auto &NewInstLocs = ExternallyUsedValues[NewInst]; | ||||
| 5760 | auto It = ExternallyUsedValues.find(Scalar); | ||||
| 5761 | assert(It != ExternallyUsedValues.end() &&((void)0) | ||||
| 5762 | "Externally used scalar is not found in ExternallyUsedValues")((void)0); | ||||
| 5763 | NewInstLocs.append(It->second); | ||||
| 5764 | ExternallyUsedValues.erase(Scalar); | ||||
| 5765 | // Required to update internally referenced instructions. | ||||
| 5766 | Scalar->replaceAllUsesWith(NewInst); | ||||
| 5767 | continue; | ||||
| 5768 | } | ||||
| 5769 | |||||
| 5770 | // Generate extracts for out-of-tree users. | ||||
| 5771 | // Find the insertion point for the extractelement lane. | ||||
| 5772 | if (auto *VecI = dyn_cast<Instruction>(Vec)) { | ||||
| 5773 | if (PHINode *PH = dyn_cast<PHINode>(User)) { | ||||
| 5774 | for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { | ||||
| 5775 | if (PH->getIncomingValue(i) == Scalar) { | ||||
| 5776 | Instruction *IncomingTerminator = | ||||
| 5777 | PH->getIncomingBlock(i)->getTerminator(); | ||||
| 5778 | if (isa<CatchSwitchInst>(IncomingTerminator)) { | ||||
| 5779 | Builder.SetInsertPoint(VecI->getParent(), | ||||
| 5780 | std::next(VecI->getIterator())); | ||||
| 5781 | } else { | ||||
| 5782 | Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); | ||||
| 5783 | } | ||||
| 5784 | Value *NewInst = ExtractAndExtendIfNeeded(Vec); | ||||
| 5785 | CSEBlocks.insert(PH->getIncomingBlock(i)); | ||||
| 5786 | PH->setOperand(i, NewInst); | ||||
| 5787 | } | ||||
| 5788 | } | ||||
| 5789 | } else { | ||||
| 5790 | Builder.SetInsertPoint(cast<Instruction>(User)); | ||||
| 5791 | Value *NewInst = ExtractAndExtendIfNeeded(Vec); | ||||
| 5792 | CSEBlocks.insert(cast<Instruction>(User)->getParent()); | ||||
| 5793 | User->replaceUsesOfWith(Scalar, NewInst); | ||||
| 5794 | } | ||||
| 5795 | } else { | ||||
| 5796 | Builder.SetInsertPoint(&F->getEntryBlock().front()); | ||||
| 5797 | Value *NewInst = ExtractAndExtendIfNeeded(Vec); | ||||
| 5798 | CSEBlocks.insert(&F->getEntryBlock()); | ||||
| 5799 | User->replaceUsesOfWith(Scalar, NewInst); | ||||
| 5800 | } | ||||
| 5801 | |||||
| 5802 | LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n")do { } while (false); | ||||
| 5803 | } | ||||
| 5804 | |||||
| 5805 | // For each vectorized value: | ||||
| 5806 | for (auto &TEPtr : VectorizableTree) { | ||||
| 5807 | TreeEntry *Entry = TEPtr.get(); | ||||
| 5808 | |||||
| 5809 | // No need to handle users of gathered values. | ||||
| 5810 | if (Entry->State == TreeEntry::NeedToGather) | ||||
| 5811 | continue; | ||||
| 5812 | |||||
| 5813 | assert(Entry->VectorizedValue && "Can't find vectorizable value")((void)0); | ||||
| 5814 | |||||
| 5815 | // For each lane: | ||||
| 5816 | for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { | ||||
| 5817 | Value *Scalar = Entry->Scalars[Lane]; | ||||
| 5818 | |||||
| 5819 | #ifndef NDEBUG1 | ||||
| 5820 | Type *Ty = Scalar->getType(); | ||||
| 5821 | if (!Ty->isVoidTy()) { | ||||
| 5822 | for (User *U : Scalar->users()) { | ||||
| 5823 | LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n")do { } while (false); | ||||
| 5824 | |||||
| 5825 | // It is legal to delete users in the ignorelist. | ||||
| 5826 | assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&((void)0) | ||||
| 5827 | "Deleting out-of-tree value")((void)0); | ||||
| 5828 | } | ||||
| 5829 | } | ||||
| 5830 | #endif | ||||
| 5831 | LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n")do { } while (false); | ||||
| 5832 | eraseInstruction(cast<Instruction>(Scalar)); | ||||
| 5833 | } | ||||
| 5834 | } | ||||
| 5835 | |||||
| 5836 | Builder.ClearInsertionPoint(); | ||||
| 5837 | InstrElementSize.clear(); | ||||
| 5838 | |||||
| 5839 | return VectorizableTree[0]->VectorizedValue; | ||||
| 5840 | } | ||||
| 5841 | |||||
| 5842 | void BoUpSLP::optimizeGatherSequence() { | ||||
| 5843 | LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()do { } while (false) | ||||
| 5844 | << " gather sequences instructions.\n")do { } while (false); | ||||
| 5845 | // LICM InsertElementInst sequences. | ||||
| 5846 | for (Instruction *I : GatherSeq) { | ||||
| 5847 | if (isDeleted(I)) | ||||
| 5848 | continue; | ||||
| 5849 | |||||
| 5850 | // Check if this block is inside a loop. | ||||
| 5851 | Loop *L = LI->getLoopFor(I->getParent()); | ||||
| 5852 | if (!L) | ||||
| 5853 | continue; | ||||
| 5854 | |||||
| 5855 | // Check if it has a preheader. | ||||
| 5856 | BasicBlock *PreHeader = L->getLoopPreheader(); | ||||
| 5857 | if (!PreHeader) | ||||
| 5858 | continue; | ||||
| 5859 | |||||
| 5860 | // If the vector or the element that we insert into it are | ||||
| 5861 | // instructions that are defined in this basic block then we can't | ||||
| 5862 | // hoist this instruction. | ||||
| 5863 | auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); | ||||
| 5864 | auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); | ||||
| 5865 | if (Op0 && L->contains(Op0)) | ||||
| 5866 | continue; | ||||
| 5867 | if (Op1 && L->contains(Op1)) | ||||
| 5868 | continue; | ||||
| 5869 | |||||
| 5870 | // We can hoist this instruction. Move it to the pre-header. | ||||
| 5871 | I->moveBefore(PreHeader->getTerminator()); | ||||
| 5872 | } | ||||
| 5873 | |||||
| 5874 | // Make a list of all reachable blocks in our CSE queue. | ||||
| 5875 | SmallVector<const DomTreeNode *, 8> CSEWorkList; | ||||
| 5876 | CSEWorkList.reserve(CSEBlocks.size()); | ||||
| 5877 | for (BasicBlock *BB : CSEBlocks) | ||||
| 5878 | if (DomTreeNode *N = DT->getNode(BB)) { | ||||
| 5879 | assert(DT->isReachableFromEntry(N))((void)0); | ||||
| 5880 | CSEWorkList.push_back(N); | ||||
| 5881 | } | ||||
| 5882 | |||||
| 5883 | // Sort blocks by domination. This ensures we visit a block after all blocks | ||||
| 5884 | // dominating it are visited. | ||||
| 5885 | llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) { | ||||
| 5886 | assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&((void)0) | ||||
| 5887 | "Different nodes should have different DFS numbers")((void)0); | ||||
| 5888 | return A->getDFSNumIn() < B->getDFSNumIn(); | ||||
| 5889 | }); | ||||
| 5890 | |||||
| 5891 | // Perform O(N^2) search over the gather sequences and merge identical | ||||
| 5892 | // instructions. TODO: We can further optimize this scan if we split the | ||||
| 5893 | // instructions into different buckets based on the insert lane. | ||||
| 5894 | SmallVector<Instruction *, 16> Visited; | ||||
| 5895 | for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { | ||||
| 5896 | assert(*I &&((void)0) | ||||
| 5897 | (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&((void)0) | ||||
| 5898 | "Worklist not sorted properly!")((void)0); | ||||
| 5899 | BasicBlock *BB = (*I)->getBlock(); | ||||
| 5900 | // For all instructions in blocks containing gather sequences: | ||||
| 5901 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { | ||||
| 5902 | Instruction *In = &*it++; | ||||
| 5903 | if (isDeleted(In)) | ||||
| 5904 | continue; | ||||
| 5905 | if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) | ||||
| 5906 | continue; | ||||
| 5907 | |||||
| 5908 | // Check if we can replace this instruction with any of the | ||||
| 5909 | // visited instructions. | ||||
| 5910 | for (Instruction *v : Visited) { | ||||
| 5911 | if (In->isIdenticalTo(v) && | ||||
| 5912 | DT->dominates(v->getParent(), In->getParent())) { | ||||
| 5913 | In->replaceAllUsesWith(v); | ||||
| 5914 | eraseInstruction(In); | ||||
| 5915 | In = nullptr; | ||||
| 5916 | break; | ||||
| 5917 | } | ||||
| 5918 | } | ||||
| 5919 | if (In) { | ||||
| 5920 | assert(!is_contained(Visited, In))((void)0); | ||||
| 5921 | Visited.push_back(In); | ||||
| 5922 | } | ||||
| 5923 | } | ||||
| 5924 | } | ||||
| 5925 | CSEBlocks.clear(); | ||||
| 5926 | GatherSeq.clear(); | ||||
| 5927 | } | ||||
| 5928 | |||||
| 5929 | // Groups the instructions to a bundle (which is then a single scheduling entity) | ||||
| 5930 | // and schedules instructions until the bundle gets ready. | ||||
| 5931 | Optional<BoUpSLP::ScheduleData *> | ||||
| 5932 | BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, | ||||
| 5933 | const InstructionsState &S) { | ||||
| 5934 | if (isa<PHINode>(S.OpValue) || isa<InsertElementInst>(S.OpValue)) | ||||
| 5935 | return nullptr; | ||||
| 5936 | |||||
| 5937 | // Initialize the instruction bundle. | ||||
| 5938 | Instruction *OldScheduleEnd = ScheduleEnd; | ||||
| 5939 | ScheduleData *PrevInBundle = nullptr; | ||||
| 5940 | ScheduleData *Bundle = nullptr; | ||||
| 5941 | bool ReSchedule = false; | ||||
| 5942 | LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n")do { } while (false); | ||||
| 5943 | |||||
| 5944 | auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule, | ||||
| 5945 | ScheduleData *Bundle) { | ||||
| 5946 | // The scheduling region got new instructions at the lower end (or it is a | ||||
| 5947 | // new region for the first bundle). This makes it necessary to | ||||
| 5948 | // recalculate all dependencies. | ||||
| 5949 | // It is seldom that this needs to be done a second time after adding the | ||||
| 5950 | // initial bundle to the region. | ||||
| 5951 | if (ScheduleEnd != OldScheduleEnd) { | ||||
| 5952 | for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) | ||||
| 5953 | doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); | ||||
| 5954 | ReSchedule = true; | ||||
| 5955 | } | ||||
| 5956 | if (ReSchedule) { | ||||
| 5957 | resetSchedule(); | ||||
| 5958 | initialFillReadyList(ReadyInsts); | ||||
| 5959 | } | ||||
| 5960 | if (Bundle) { | ||||
| 5961 | LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundledo { } while (false) | ||||
| 5962 | << " in block " << BB->getName() << "\n")do { } while (false); | ||||
| 5963 | calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP); | ||||
| 5964 | } | ||||
| 5965 | |||||
| 5966 | // Now try to schedule the new bundle or (if no bundle) just calculate | ||||
| 5967 | // dependencies. As soon as the bundle is "ready" it means that there are no | ||||
| 5968 | // cyclic dependencies and we can schedule it. Note that's important that we | ||||
| 5969 | // don't "schedule" the bundle yet (see cancelScheduling). | ||||
| 5970 | while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) && | ||||
| 5971 | !ReadyInsts.empty()) { | ||||
| 5972 | ScheduleData *Picked = ReadyInsts.pop_back_val(); | ||||
| 5973 | if (Picked->isSchedulingEntity() && Picked->isReady()) | ||||
| 5974 | schedule(Picked, ReadyInsts); | ||||
| 5975 | } | ||||
| 5976 | }; | ||||
| 5977 | |||||
| 5978 | // Make sure that the scheduling region contains all | ||||
| 5979 | // instructions of the bundle. | ||||
| 5980 | for (Value *V : VL) { | ||||
| 5981 | if (!extendSchedulingRegion(V, S)) { | ||||
| 5982 | // If the scheduling region got new instructions at the lower end (or it | ||||
| 5983 | // is a new region for the first bundle). This makes it necessary to | ||||
| 5984 | // recalculate all dependencies. | ||||
| 5985 | // Otherwise the compiler may crash trying to incorrectly calculate | ||||
| 5986 | // dependencies and emit instruction in the wrong order at the actual | ||||
| 5987 | // scheduling. | ||||
| 5988 | TryScheduleBundle(/*ReSchedule=*/false, nullptr); | ||||
| 5989 | return None; | ||||
| 5990 | } | ||||
| 5991 | } | ||||
| 5992 | |||||
| 5993 | for (Value *V : VL) { | ||||
| 5994 | ScheduleData *BundleMember = getScheduleData(V); | ||||
| 5995 | assert(BundleMember &&((void)0) | ||||
| 5996 | "no ScheduleData for bundle member (maybe not in same basic block)")((void)0); | ||||
| 5997 | if (BundleMember->IsScheduled) { | ||||
| 5998 | // A bundle member was scheduled as single instruction before and now | ||||
| 5999 | // needs to be scheduled as part of the bundle. We just get rid of the | ||||
| 6000 | // existing schedule. | ||||
| 6001 | LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMemberdo { } while (false) | ||||
| 6002 | << " was already scheduled\n")do { } while (false); | ||||
| 6003 | ReSchedule = true; | ||||
| 6004 | } | ||||
| 6005 | assert(BundleMember->isSchedulingEntity() &&((void)0) | ||||
| 6006 | "bundle member already part of other bundle")((void)0); | ||||
| 6007 | if (PrevInBundle) { | ||||
| 6008 | PrevInBundle->NextInBundle = BundleMember; | ||||
| 6009 | } else { | ||||
| 6010 | Bundle = BundleMember; | ||||
| 6011 | } | ||||
| 6012 | BundleMember->UnscheduledDepsInBundle = 0; | ||||
| 6013 | Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; | ||||
| 6014 | |||||
| 6015 | // Group the instructions to a bundle. | ||||
| 6016 | BundleMember->FirstInBundle = Bundle; | ||||
| 6017 | PrevInBundle = BundleMember; | ||||
| 6018 | } | ||||
| 6019 | assert(Bundle && "Failed to find schedule bundle")((void)0); | ||||
| 6020 | TryScheduleBundle(ReSchedule, Bundle); | ||||
| 6021 | if (!Bundle->isReady()) { | ||||
| 6022 | cancelScheduling(VL, S.OpValue); | ||||
| 6023 | return None; | ||||
| 6024 | } | ||||
| 6025 | return Bundle; | ||||
| 6026 | } | ||||
| 6027 | |||||
| 6028 | void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, | ||||
| 6029 | Value *OpValue) { | ||||
| 6030 | if (isa<PHINode>(OpValue) || isa<InsertElementInst>(OpValue)) | ||||
| 6031 | return; | ||||
| 6032 | |||||
| 6033 | ScheduleData *Bundle = getScheduleData(OpValue); | ||||
| 6034 | LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n")do { } while (false); | ||||
| 6035 | assert(!Bundle->IsScheduled &&((void)0) | ||||
| 6036 | "Can't cancel bundle which is already scheduled")((void)0); | ||||
| 6037 | assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&((void)0) | ||||
| 6038 | "tried to unbundle something which is not a bundle")((void)0); | ||||
| 6039 | |||||
| 6040 | // Un-bundle: make single instructions out of the bundle. | ||||
| 6041 | ScheduleData *BundleMember = Bundle; | ||||
| 6042 | while (BundleMember) { | ||||
| 6043 | assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links")((void)0); | ||||
| 6044 | BundleMember->FirstInBundle = BundleMember; | ||||
| 6045 | ScheduleData *Next = BundleMember->NextInBundle; | ||||
| 6046 | BundleMember->NextInBundle = nullptr; | ||||
| 6047 | BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; | ||||
| 6048 | if (BundleMember->UnscheduledDepsInBundle == 0) { | ||||
| 6049 | ReadyInsts.insert(BundleMember); | ||||
| 6050 | } | ||||
| 6051 | BundleMember = Next; | ||||
| 6052 | } | ||||
| 6053 | } | ||||
| 6054 | |||||
| 6055 | BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { | ||||
| 6056 | // Allocate a new ScheduleData for the instruction. | ||||
| 6057 | if (ChunkPos >= ChunkSize) { | ||||
| 6058 | ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize)); | ||||
| 6059 | ChunkPos = 0; | ||||
| 6060 | } | ||||
| 6061 | return &(ScheduleDataChunks.back()[ChunkPos++]); | ||||
| 6062 | } | ||||
| 6063 | |||||
| 6064 | bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, | ||||
| 6065 | const InstructionsState &S) { | ||||
| 6066 | if (getScheduleData(V, isOneOf(S, V))) | ||||
| |||||
| 6067 | return true; | ||||
| 6068 | Instruction *I = dyn_cast<Instruction>(V); | ||||
| 6069 | assert(I && "bundle member must be an instruction")((void)0); | ||||
| 6070 | assert(!isa<PHINode>(I) && !isa<InsertElementInst>(I) &&((void)0) | ||||
| 6071 | "phi nodes/insertelements don't need to be scheduled")((void)0); | ||||
| 6072 | auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { | ||||
| 6073 | ScheduleData *ISD = getScheduleData(I); | ||||
| 6074 | if (!ISD
| ||||
| 6075 | return false; | ||||
| 6076 | assert(isInSchedulingRegion(ISD) &&((void)0) | ||||
| 6077 | "ScheduleData not in scheduling region")((void)0); | ||||
| 6078 | ScheduleData *SD = allocateScheduleDataChunks(); | ||||
| 6079 | SD->Inst = I; | ||||
| 6080 | SD->init(SchedulingRegionID, S.OpValue); | ||||
| 6081 | ExtraScheduleDataMap[I][S.OpValue] = SD; | ||||
| 6082 | return true; | ||||
| 6083 | }; | ||||
| 6084 | if (CheckSheduleForI(I)) | ||||
| 6085 | return true; | ||||
| 6086 | if (!ScheduleStart) { | ||||
| 6087 | // It's the first instruction in the new region. | ||||
| 6088 | initScheduleData(I, I->getNextNode(), nullptr, nullptr); | ||||
| 6089 | ScheduleStart = I; | ||||
| 6090 | ScheduleEnd = I->getNextNode(); | ||||
| 6091 | if (isOneOf(S, I) != I) | ||||
| 6092 | CheckSheduleForI(I); | ||||
| 6093 | assert(ScheduleEnd && "tried to vectorize a terminator?")((void)0); | ||||
| 6094 | LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n")do { } while (false); | ||||
| 6095 | return true; | ||||
| 6096 | } | ||||
| 6097 | // Search up and down at the same time, because we don't know if the new | ||||
| 6098 | // instruction is above or below the existing scheduling region. | ||||
| 6099 | BasicBlock::reverse_iterator UpIter = | ||||
| 6100 | ++ScheduleStart->getIterator().getReverse(); | ||||
| 6101 | BasicBlock::reverse_iterator UpperEnd = BB->rend(); | ||||
| 6102 | BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); | ||||
| 6103 | BasicBlock::iterator LowerEnd = BB->end(); | ||||
| 6104 | while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I && | ||||
| 6105 | &*DownIter != I) { | ||||
| 6106 | if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { | ||||
| 6107 | LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n")do { } while (false); | ||||
| 6108 | return false; | ||||
| 6109 | } | ||||
| 6110 | |||||
| 6111 | ++UpIter; | ||||
| 6112 | ++DownIter; | ||||
| 6113 | } | ||||
| 6114 | if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) { | ||||
| 6115 | assert(I->getParent() == ScheduleStart->getParent() &&((void)0) | ||||
| 6116 | "Instruction is in wrong basic block.")((void)0); | ||||
| 6117 | initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); | ||||
| 6118 | ScheduleStart = I; | ||||
| 6119 | if (isOneOf(S, I) != I) | ||||
| 6120 | CheckSheduleForI(I); | ||||
| 6121 | LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *Ido { } while (false) | ||||
| 6122 | << "\n")do { } while (false); | ||||
| 6123 | return true; | ||||
| 6124 | } | ||||
| 6125 | assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&((void)0) | ||||
| 6126 | "Expected to reach top of the basic block or instruction down the "((void)0) | ||||
| 6127 | "lower end.")((void)0); | ||||
| 6128 | assert(I->getParent() == ScheduleEnd->getParent() &&((void)0) | ||||
| 6129 | "Instruction is in wrong basic block.")((void)0); | ||||
| 6130 | initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, | ||||
| |||||
| 6131 | nullptr); | ||||
| 6132 | ScheduleEnd = I->getNextNode(); | ||||
| 6133 | if (isOneOf(S, I) != I) | ||||
| 6134 | CheckSheduleForI(I); | ||||
| 6135 | assert(ScheduleEnd && "tried to vectorize a terminator?")((void)0); | ||||
| 6136 | LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n")do { } while (false); | ||||
| 6137 | return true; | ||||
| 6138 | } | ||||
| 6139 | |||||
| 6140 | void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, | ||||
| 6141 | Instruction *ToI, | ||||
| 6142 | ScheduleData *PrevLoadStore, | ||||
| 6143 | ScheduleData *NextLoadStore) { | ||||
| 6144 | ScheduleData *CurrentLoadStore = PrevLoadStore; | ||||
| 6145 | for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { | ||||
| 6146 | ScheduleData *SD = ScheduleDataMap[I]; | ||||
| 6147 | if (!SD) { | ||||
| 6148 | SD = allocateScheduleDataChunks(); | ||||
| 6149 | ScheduleDataMap[I] = SD; | ||||
| 6150 | SD->Inst = I; | ||||
| 6151 | } | ||||
| 6152 | assert(!isInSchedulingRegion(SD) &&((void)0) | ||||
| 6153 | "new ScheduleData already in scheduling region")((void)0); | ||||
| 6154 | SD->init(SchedulingRegionID, I); | ||||
| 6155 | |||||
| 6156 | if (I->mayReadOrWriteMemory() && | ||||
| 6157 | (!isa<IntrinsicInst>(I) || | ||||
| 6158 | (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect && | ||||
| 6159 | cast<IntrinsicInst>(I)->getIntrinsicID() != | ||||
| 6160 | Intrinsic::pseudoprobe))) { | ||||
| 6161 | // Update the linked list of memory accessing instructions. | ||||
| 6162 | if (CurrentLoadStore) { | ||||
| 6163 | CurrentLoadStore->NextLoadStore = SD; | ||||
| 6164 | } else { | ||||
| 6165 | FirstLoadStoreInRegion = SD; | ||||
| 6166 | } | ||||
| 6167 | CurrentLoadStore = SD; | ||||
| 6168 | } | ||||
| 6169 | } | ||||
| 6170 | if (NextLoadStore) { | ||||
| 6171 | if (CurrentLoadStore) | ||||
| 6172 | CurrentLoadStore->NextLoadStore = NextLoadStore; | ||||
| 6173 | } else { | ||||
| 6174 | LastLoadStoreInRegion = CurrentLoadStore; | ||||
| 6175 | } | ||||
| 6176 | } | ||||
| 6177 | |||||
| 6178 | void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, | ||||
| 6179 | bool InsertInReadyList, | ||||
| 6180 | BoUpSLP *SLP) { | ||||
| 6181 | assert(SD->isSchedulingEntity())((void)0); | ||||
| 6182 | |||||
| 6183 | SmallVector<ScheduleData *, 10> WorkList; | ||||
| 6184 | WorkList.push_back(SD); | ||||
| 6185 | |||||
| 6186 | while (!WorkList.empty()) { | ||||
| 6187 | ScheduleData *SD = WorkList.pop_back_val(); | ||||
| 6188 | |||||
| 6189 | ScheduleData *BundleMember = SD; | ||||
| 6190 | while (BundleMember) { | ||||
| 6191 | assert(isInSchedulingRegion(BundleMember))((void)0); | ||||
| 6192 | if (!BundleMember->hasValidDependencies()) { | ||||
| 6193 | |||||
| 6194 | LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMemberdo { } while (false) | ||||
| 6195 | << "\n")do { } while (false); | ||||
| 6196 | BundleMember->Dependencies = 0; | ||||
| 6197 | BundleMember->resetUnscheduledDeps(); | ||||
| 6198 | |||||
| 6199 | // Handle def-use chain dependencies. | ||||
| 6200 | if (BundleMember->OpValue != BundleMember->Inst) { | ||||
| 6201 | ScheduleData *UseSD = getScheduleData(BundleMember->Inst); | ||||
| 6202 | if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { | ||||
| 6203 | BundleMember->Dependencies++; | ||||
| 6204 | ScheduleData *DestBundle = UseSD->FirstInBundle; | ||||
| 6205 | if (!DestBundle->IsScheduled) | ||||
| 6206 | BundleMember->incrementUnscheduledDeps(1); | ||||
| 6207 | if (!DestBundle->hasValidDependencies()) | ||||
| 6208 | WorkList.push_back(DestBundle); | ||||
| 6209 | } | ||||
| 6210 | } else { | ||||
| 6211 | for (User *U : BundleMember->Inst->users()) { | ||||
| 6212 | if (isa<Instruction>(U)) { | ||||
| 6213 | ScheduleData *UseSD = getScheduleData(U); | ||||
| 6214 | if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { | ||||
| 6215 | BundleMember->Dependencies++; | ||||
| 6216 | ScheduleData *DestBundle = UseSD->FirstInBundle; | ||||
| 6217 | if (!DestBundle->IsScheduled) | ||||
| 6218 | BundleMember->incrementUnscheduledDeps(1); | ||||
| 6219 | if (!DestBundle->hasValidDependencies()) | ||||
| 6220 | WorkList.push_back(DestBundle); | ||||
| 6221 | } | ||||
| 6222 | } else { | ||||
| 6223 | // I'm not sure if this can ever happen. But we need to be safe. | ||||
| 6224 | // This lets the instruction/bundle never be scheduled and | ||||
| 6225 | // eventually disable vectorization. | ||||
| 6226 | BundleMember->Dependencies++; | ||||
| 6227 | BundleMember->incrementUnscheduledDeps(1); | ||||
| 6228 | } | ||||
| 6229 | } | ||||
| 6230 | } | ||||
| 6231 | |||||
| 6232 | // Handle the memory dependencies. | ||||
| 6233 | ScheduleData *DepDest = BundleMember->NextLoadStore; | ||||
| 6234 | if (DepDest) { | ||||
| 6235 | Instruction *SrcInst = BundleMember->Inst; | ||||
| 6236 | MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); | ||||
| 6237 | bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); | ||||
| 6238 | unsigned numAliased = 0; | ||||
| 6239 | unsigned DistToSrc = 1; | ||||
| 6240 | |||||
| 6241 | while (DepDest) { | ||||
| 6242 | assert(isInSchedulingRegion(DepDest))((void)0); | ||||
| 6243 | |||||
| 6244 | // We have two limits to reduce the complexity: | ||||
| 6245 | // 1) AliasedCheckLimit: It's a small limit to reduce calls to | ||||
| 6246 | // SLP->isAliased (which is the expensive part in this loop). | ||||
| 6247 | // 2) MaxMemDepDistance: It's for very large blocks and it aborts | ||||
| 6248 | // the whole loop (even if the loop is fast, it's quadratic). | ||||
| 6249 | // It's important for the loop break condition (see below) to | ||||
| 6250 | // check this limit even between two read-only instructions. | ||||
| 6251 | if (DistToSrc >= MaxMemDepDistance || | ||||
| 6252 | ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && | ||||
| 6253 | (numAliased >= AliasedCheckLimit || | ||||
| 6254 | SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { | ||||
| 6255 | |||||
| 6256 | // We increment the counter only if the locations are aliased | ||||
| 6257 | // (instead of counting all alias checks). This gives a better | ||||
| 6258 | // balance between reduced runtime and accurate dependencies. | ||||
| 6259 | numAliased++; | ||||
| 6260 | |||||
| 6261 | DepDest->MemoryDependencies.push_back(BundleMember); | ||||
| 6262 | BundleMember->Dependencies++; | ||||
| 6263 | ScheduleData *DestBundle = DepDest->FirstInBundle; | ||||
| 6264 | if (!DestBundle->IsScheduled) { | ||||
| 6265 | BundleMember->incrementUnscheduledDeps(1); | ||||
| 6266 | } | ||||
| 6267 | if (!DestBundle->hasValidDependencies()) { | ||||
| 6268 | WorkList.push_back(DestBundle); | ||||
| 6269 | } | ||||
| 6270 | } | ||||
| 6271 | DepDest = DepDest->NextLoadStore; | ||||
| 6272 | |||||
| 6273 | // Example, explaining the loop break condition: Let's assume our | ||||
| 6274 | // starting instruction is i0 and MaxMemDepDistance = 3. | ||||
| 6275 | // | ||||
| 6276 | // +--------v--v--v | ||||
| 6277 | // i0,i1,i2,i3,i4,i5,i6,i7,i8 | ||||
| 6278 | // +--------^--^--^ | ||||
| 6279 | // | ||||
| 6280 | // MaxMemDepDistance let us stop alias-checking at i3 and we add | ||||
| 6281 | // dependencies from i0 to i3,i4,.. (even if they are not aliased). | ||||
| 6282 | // Previously we already added dependencies from i3 to i6,i7,i8 | ||||
| 6283 | // (because of MaxMemDepDistance). As we added a dependency from | ||||
| 6284 | // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 | ||||
| 6285 | // and we can abort this loop at i6. | ||||
| 6286 | if (DistToSrc >= 2 * MaxMemDepDistance) | ||||
| 6287 | break; | ||||
| 6288 | DistToSrc++; | ||||
| 6289 | } | ||||
| 6290 | } | ||||
| 6291 | } | ||||
| 6292 | BundleMember = BundleMember->NextInBundle; | ||||
| 6293 | } | ||||
| 6294 | if (InsertInReadyList && SD->isReady()) { | ||||
| 6295 | ReadyInsts.push_back(SD); | ||||
| 6296 | LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Instdo { } while (false) | ||||
| 6297 | << "\n")do { } while (false); | ||||
| 6298 | } | ||||
| 6299 | } | ||||
| 6300 | } | ||||
| 6301 | |||||
| 6302 | void BoUpSLP::BlockScheduling::resetSchedule() { | ||||
| 6303 | assert(ScheduleStart &&((void)0) | ||||
| 6304 | "tried to reset schedule on block which has not been scheduled")((void)0); | ||||
| 6305 | for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { | ||||
| 6306 | doForAllOpcodes(I, [&](ScheduleData *SD) { | ||||
| 6307 | assert(isInSchedulingRegion(SD) &&((void)0) | ||||
| 6308 | "ScheduleData not in scheduling region")((void)0); | ||||
| 6309 | SD->IsScheduled = false; | ||||
| 6310 | SD->resetUnscheduledDeps(); | ||||
| 6311 | }); | ||||
| 6312 | } | ||||
| 6313 | ReadyInsts.clear(); | ||||
| 6314 | } | ||||
| 6315 | |||||
| 6316 | void BoUpSLP::scheduleBlock(BlockScheduling *BS) { | ||||
| 6317 | if (!BS->ScheduleStart) | ||||
| 6318 | return; | ||||
| 6319 | |||||
| 6320 | LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n")do { } while (false); | ||||
| 6321 | |||||
| 6322 | BS->resetSchedule(); | ||||
| 6323 | |||||
| 6324 | // For the real scheduling we use a more sophisticated ready-list: it is | ||||
| 6325 | // sorted by the original instruction location. This lets the final schedule | ||||
| 6326 | // be as close as possible to the original instruction order. | ||||
| 6327 | struct ScheduleDataCompare { | ||||
| 6328 | bool operator()(ScheduleData *SD1, ScheduleData *SD2) const { | ||||
| 6329 | return SD2->SchedulingPriority < SD1->SchedulingPriority; | ||||
| 6330 | } | ||||
| 6331 | }; | ||||
| 6332 | std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; | ||||
| 6333 | |||||
| 6334 | // Ensure that all dependency data is updated and fill the ready-list with | ||||
| 6335 | // initial instructions. | ||||
| 6336 | int Idx = 0; | ||||
| 6337 | int NumToSchedule = 0; | ||||
| 6338 | for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; | ||||
| 6339 | I = I->getNextNode()) { | ||||
| 6340 | BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { | ||||
| 6341 | assert((isa<InsertElementInst>(SD->Inst) ||((void)0) | ||||
| 6342 | SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) &&((void)0) | ||||
| 6343 | "scheduler and vectorizer bundle mismatch")((void)0); | ||||
| 6344 | SD->FirstInBundle->SchedulingPriority = Idx++; | ||||
| 6345 | if (SD->isSchedulingEntity()) { | ||||
| 6346 | BS->calculateDependencies(SD, false, this); | ||||
| 6347 | NumToSchedule++; | ||||
| 6348 | } | ||||
| 6349 | }); | ||||
| 6350 | } | ||||
| 6351 | BS->initialFillReadyList(ReadyInsts); | ||||
| 6352 | |||||
| 6353 | Instruction *LastScheduledInst = BS->ScheduleEnd; | ||||
| 6354 | |||||
| 6355 | // Do the "real" scheduling. | ||||
| 6356 | while (!ReadyInsts.empty()) { | ||||
| 6357 | ScheduleData *picked = *ReadyInsts.begin(); | ||||
| 6358 | ReadyInsts.erase(ReadyInsts.begin()); | ||||
| 6359 | |||||
| 6360 | // Move the scheduled instruction(s) to their dedicated places, if not | ||||
| 6361 | // there yet. | ||||
| 6362 | ScheduleData *BundleMember = picked; | ||||
| 6363 | while (BundleMember) { | ||||
| 6364 | Instruction *pickedInst = BundleMember->Inst; | ||||
| 6365 | if (pickedInst->getNextNode() != LastScheduledInst) { | ||||
| 6366 | BS->BB->getInstList().remove(pickedInst); | ||||
| 6367 | BS->BB->getInstList().insert(LastScheduledInst->getIterator(), | ||||
| 6368 | pickedInst); | ||||
| 6369 | } | ||||
| 6370 | LastScheduledInst = pickedInst; | ||||
| 6371 | BundleMember = BundleMember->NextInBundle; | ||||
| 6372 | } | ||||
| 6373 | |||||
| 6374 | BS->schedule(picked, ReadyInsts); | ||||
| 6375 | NumToSchedule--; | ||||
| 6376 | } | ||||
| 6377 | assert(NumToSchedule == 0 && "could not schedule all instructions")((void)0); | ||||
| 6378 | |||||
| 6379 | // Avoid duplicate scheduling of the block. | ||||
| 6380 | BS->ScheduleStart = nullptr; | ||||
| 6381 | } | ||||
| 6382 | |||||
| 6383 | unsigned BoUpSLP::getVectorElementSize(Value *V) { | ||||
| 6384 | // If V is a store, just return the width of the stored value (or value | ||||
| 6385 | // truncated just before storing) without traversing the expression tree. | ||||
| 6386 | // This is the common case. | ||||
| 6387 | if (auto *Store = dyn_cast<StoreInst>(V)) { | ||||
| 6388 | if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) | ||||
| 6389 | return DL->getTypeSizeInBits(Trunc->getSrcTy()); | ||||
| 6390 | return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); | ||||
| 6391 | } | ||||
| 6392 | |||||
| 6393 | if (auto *IEI = dyn_cast<InsertElementInst>(V)) | ||||
| 6394 | return getVectorElementSize(IEI->getOperand(1)); | ||||
| 6395 | |||||
| 6396 | auto E = InstrElementSize.find(V); | ||||
| 6397 | if (E != InstrElementSize.end()) | ||||
| 6398 | return E->second; | ||||
| 6399 | |||||
| 6400 | // If V is not a store, we can traverse the expression tree to find loads | ||||
| 6401 | // that feed it. The type of the loaded value may indicate a more suitable | ||||
| 6402 | // width than V's type. We want to base the vector element size on the width | ||||
| 6403 | // of memory operations where possible. | ||||
| 6404 | SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist; | ||||
| 6405 | SmallPtrSet<Instruction *, 16> Visited; | ||||
| 6406 | if (auto *I = dyn_cast<Instruction>(V)) { | ||||
| 6407 | Worklist.emplace_back(I, I->getParent()); | ||||
| 6408 | Visited.insert(I); | ||||
| 6409 | } | ||||
| 6410 | |||||
| 6411 | // Traverse the expression tree in bottom-up order looking for loads. If we | ||||
| 6412 | // encounter an instruction we don't yet handle, we give up. | ||||
| 6413 | auto Width = 0u; | ||||
| 6414 | while (!Worklist.empty()) { | ||||
| 6415 | Instruction *I; | ||||
| 6416 | BasicBlock *Parent; | ||||
| 6417 | std::tie(I, Parent) = Worklist.pop_back_val(); | ||||
| 6418 | |||||
| 6419 | // We should only be looking at scalar instructions here. If the current | ||||
| 6420 | // instruction has a vector type, skip. | ||||
| 6421 | auto *Ty = I->getType(); | ||||
| 6422 | if (isa<VectorType>(Ty)) | ||||
| 6423 | continue; | ||||
| 6424 | |||||
| 6425 | // If the current instruction is a load, update MaxWidth to reflect the | ||||
| 6426 | // width of the loaded value. | ||||
| 6427 | if (isa<LoadInst>(I) || isa<ExtractElementInst>(I) || | ||||
| 6428 | isa<ExtractValueInst>(I)) | ||||
| 6429 | Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty)); | ||||
| 6430 | |||||
| 6431 | // Otherwise, we need to visit the operands of the instruction. We only | ||||
| 6432 | // handle the interesting cases from buildTree here. If an operand is an | ||||
| 6433 | // instruction we haven't yet visited and from the same basic block as the | ||||
| 6434 | // user or the use is a PHI node, we add it to the worklist. | ||||
| 6435 | else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) || | ||||
| 6436 | isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I) || | ||||
| 6437 | isa<UnaryOperator>(I)) { | ||||
| 6438 | for (Use &U : I->operands()) | ||||
| 6439 | if (auto *J = dyn_cast<Instruction>(U.get())) | ||||
| 6440 | if (Visited.insert(J).second && | ||||
| 6441 | (isa<PHINode>(I) || J->getParent() == Parent)) | ||||
| 6442 | Worklist.emplace_back(J, J->getParent()); | ||||
| 6443 | } else { | ||||
| 6444 | break; | ||||
| 6445 | } | ||||
| 6446 | } | ||||
| 6447 | |||||
| 6448 | // If we didn't encounter a memory access in the expression tree, or if we | ||||
| 6449 | // gave up for some reason, just return the width of V. Otherwise, return the | ||||
| 6450 | // maximum width we found. | ||||
| 6451 | if (!Width) { | ||||
| 6452 | if (auto *CI = dyn_cast<CmpInst>(V)) | ||||
| 6453 | V = CI->getOperand(0); | ||||
| 6454 | Width = DL->getTypeSizeInBits(V->getType()); | ||||
| 6455 | } | ||||
| 6456 | |||||
| 6457 | for (Instruction *I : Visited) | ||||
| 6458 | InstrElementSize[I] = Width; | ||||
| 6459 | |||||
| 6460 | return Width; | ||||
| 6461 | } | ||||
| 6462 | |||||
| 6463 | // Determine if a value V in a vectorizable expression Expr can be demoted to a | ||||
| 6464 | // smaller type with a truncation. We collect the values that will be demoted | ||||
| 6465 | // in ToDemote and additional roots that require investigating in Roots. | ||||
| 6466 | static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, | ||||
| 6467 | SmallVectorImpl<Value *> &ToDemote, | ||||
| 6468 | SmallVectorImpl<Value *> &Roots) { | ||||
| 6469 | // We can always demote constants. | ||||
| 6470 | if (isa<Constant>(V)) { | ||||
| 6471 | ToDemote.push_back(V); | ||||
| 6472 | return true; | ||||
| 6473 | } | ||||
| 6474 | |||||
| 6475 | // If the value is not an instruction in the expression with only one use, it | ||||
| 6476 | // cannot be demoted. | ||||
| 6477 | auto *I = dyn_cast<Instruction>(V); | ||||
| 6478 | if (!I || !I->hasOneUse() || !Expr.count(I)) | ||||
| 6479 | return false; | ||||
| 6480 | |||||
| 6481 | switch (I->getOpcode()) { | ||||
| 6482 | |||||
| 6483 | // We can always demote truncations and extensions. Since truncations can | ||||
| 6484 | // seed additional demotion, we save the truncated value. | ||||
| 6485 | case Instruction::Trunc: | ||||
| 6486 | Roots.push_back(I->getOperand(0)); | ||||
| 6487 | break; | ||||
| 6488 | case Instruction::ZExt: | ||||
| 6489 | case Instruction::SExt: | ||||
| 6490 | if (isa<ExtractElementInst>(I->getOperand(0)) || | ||||
| 6491 | isa<InsertElementInst>(I->getOperand(0))) | ||||
| 6492 | return false; | ||||
| 6493 | break; | ||||
| 6494 | |||||
| 6495 | // We can demote certain binary operations if we can demote both of their | ||||
| 6496 | // operands. | ||||
| 6497 | case Instruction::Add: | ||||
| 6498 | case Instruction::Sub: | ||||
| 6499 | case Instruction::Mul: | ||||
| 6500 | case Instruction::And: | ||||
| 6501 | case Instruction::Or: | ||||
| 6502 | case Instruction::Xor: | ||||
| 6503 | if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) || | ||||
| 6504 | !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots)) | ||||
| 6505 | return false; | ||||
| 6506 | break; | ||||
| 6507 | |||||
| 6508 | // We can demote selects if we can demote their true and false values. | ||||
| 6509 | case Instruction::Select: { | ||||
| 6510 | SelectInst *SI = cast<SelectInst>(I); | ||||
| 6511 | if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || | ||||
| 6512 | !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) | ||||
| 6513 | return false; | ||||
| 6514 | break; | ||||
| 6515 | } | ||||
| 6516 | |||||
| 6517 | // We can demote phis if we can demote all their incoming operands. Note that | ||||
| 6518 | // we don't need to worry about cycles since we ensure single use above. | ||||
| 6519 | case Instruction::PHI: { | ||||
| 6520 | PHINode *PN = cast<PHINode>(I); | ||||
| 6521 | for (Value *IncValue : PN->incoming_values()) | ||||
| 6522 | if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) | ||||
| 6523 | return false; | ||||
| 6524 | break; | ||||
| 6525 | } | ||||
| 6526 | |||||
| 6527 | // Otherwise, conservatively give up. | ||||
| 6528 | default: | ||||
| 6529 | return false; | ||||
| 6530 | } | ||||
| 6531 | |||||
| 6532 | // Record the value that we can demote. | ||||
| 6533 | ToDemote.push_back(V); | ||||
| 6534 | return true; | ||||
| 6535 | } | ||||
| 6536 | |||||
| 6537 | void BoUpSLP::computeMinimumValueSizes() { | ||||
| 6538 | // If there are no external uses, the expression tree must be rooted by a | ||||
| 6539 | // store. We can't demote in-memory values, so there is nothing to do here. | ||||
| 6540 | if (ExternalUses.empty()) | ||||
| 6541 | return; | ||||
| 6542 | |||||
| 6543 | // We only attempt to truncate integer expressions. | ||||
| 6544 | auto &TreeRoot = VectorizableTree[0]->Scalars; | ||||
| 6545 | auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType()); | ||||
| 6546 | if (!TreeRootIT) | ||||
| 6547 | return; | ||||
| 6548 | |||||
| 6549 | // If the expression is not rooted by a store, these roots should have | ||||
| 6550 | // external uses. We will rely on InstCombine to rewrite the expression in | ||||
| 6551 | // the narrower type. However, InstCombine only rewrites single-use values. | ||||
| 6552 | // This means that if a tree entry other than a root is used externally, it | ||||
| 6553 | // must have multiple uses and InstCombine will not rewrite it. The code | ||||
| 6554 | // below ensures that only the roots are used externally. | ||||
| 6555 | SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end()); | ||||
| 6556 | for (auto &EU : ExternalUses) | ||||
| 6557 | if (!Expr.erase(EU.Scalar)) | ||||
| 6558 | return; | ||||
| 6559 | if (!Expr.empty()) | ||||
| 6560 | return; | ||||
| 6561 | |||||
| 6562 | // Collect the scalar values of the vectorizable expression. We will use this | ||||
| 6563 | // context to determine which values can be demoted. If we see a truncation, | ||||
| 6564 | // we mark it as seeding another demotion. | ||||
| 6565 | for (auto &EntryPtr : VectorizableTree) | ||||
| 6566 | Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); | ||||
| 6567 | |||||
| 6568 | // Ensure the roots of the vectorizable tree don't form a cycle. They must | ||||
| 6569 | // have a single external user that is not in the vectorizable tree. | ||||
| 6570 | for (auto *Root : TreeRoot) | ||||
| 6571 | if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) | ||||
| 6572 | return; | ||||
| 6573 | |||||
| 6574 | // Conservatively determine if we can actually truncate the roots of the | ||||
| 6575 | // expression. Collect the values that can be demoted in ToDemote and | ||||
| 6576 | // additional roots that require investigating in Roots. | ||||
| 6577 | SmallVector<Value *, 32> ToDemote; | ||||
| 6578 | SmallVector<Value *, 4> Roots; | ||||
| 6579 | for (auto *Root : TreeRoot) | ||||
| 6580 | if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) | ||||
| 6581 | return; | ||||
| 6582 | |||||
| 6583 | // The maximum bit width required to represent all the values that can be | ||||
| 6584 | // demoted without loss of precision. It would be safe to truncate the roots | ||||
| 6585 | // of the expression to this width. | ||||
| 6586 | auto MaxBitWidth = 8u; | ||||
| 6587 | |||||
| 6588 | // We first check if all the bits of the roots are demanded. If they're not, | ||||
| 6589 | // we can truncate the roots to this narrower type. | ||||
| 6590 | for (auto *Root : TreeRoot) { | ||||
| 6591 | auto Mask = DB->getDemandedBits(cast<Instruction>(Root)); | ||||
| 6592 | MaxBitWidth = std::max<unsigned>( | ||||
| 6593 | Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); | ||||
| 6594 | } | ||||
| 6595 | |||||
| 6596 | // True if the roots can be zero-extended back to their original type, rather | ||||
| 6597 | // than sign-extended. We know that if the leading bits are not demanded, we | ||||
| 6598 | // can safely zero-extend. So we initialize IsKnownPositive to True. | ||||
| 6599 | bool IsKnownPositive = true; | ||||
| 6600 | |||||
| 6601 | // If all the bits of the roots are demanded, we can try a little harder to | ||||
| 6602 | // compute a narrower type. This can happen, for example, if the roots are | ||||
| 6603 | // getelementptr indices. InstCombine promotes these indices to the pointer | ||||
| 6604 | // width. Thus, all their bits are technically demanded even though the | ||||
| 6605 | // address computation might be vectorized in a smaller type. | ||||
| 6606 | // | ||||
| 6607 | // We start by looking at each entry that can be demoted. We compute the | ||||
| 6608 | // maximum bit width required to store the scalar by using ValueTracking to | ||||
| 6609 | // compute the number of high-order bits we can truncate. | ||||
| 6610 | if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && | ||||
| 6611 | llvm::all_of(TreeRoot, [](Value *R) { | ||||
| 6612 | assert(R->hasOneUse() && "Root should have only one use!")((void)0); | ||||
| 6613 | return isa<GetElementPtrInst>(R->user_back()); | ||||
| 6614 | })) { | ||||
| 6615 | MaxBitWidth = 8u; | ||||
| 6616 | |||||
| 6617 | // Determine if the sign bit of all the roots is known to be zero. If not, | ||||
| 6618 | // IsKnownPositive is set to False. | ||||
| 6619 | IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) { | ||||
| 6620 | KnownBits Known = computeKnownBits(R, *DL); | ||||
| 6621 | return Known.isNonNegative(); | ||||
| 6622 | }); | ||||
| 6623 | |||||
| 6624 | // Determine the maximum number of bits required to store the scalar | ||||
| 6625 | // values. | ||||
| 6626 | for (auto *Scalar : ToDemote) { | ||||
| 6627 | auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT); | ||||
| 6628 | auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType()); | ||||
| 6629 | MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth); | ||||
| 6630 | } | ||||
| 6631 | |||||
| 6632 | // If we can't prove that the sign bit is zero, we must add one to the | ||||
| 6633 | // maximum bit width to account for the unknown sign bit. This preserves | ||||
| 6634 | // the existing sign bit so we can safely sign-extend the root back to the | ||||
| 6635 | // original type. Otherwise, if we know the sign bit is zero, we will | ||||
| 6636 | // zero-extend the root instead. | ||||
| 6637 | // | ||||
| 6638 | // FIXME: This is somewhat suboptimal, as there will be cases where adding | ||||
| 6639 | // one to the maximum bit width will yield a larger-than-necessary | ||||
| 6640 | // type. In general, we need to add an extra bit only if we can't | ||||
| 6641 | // prove that the upper bit of the original type is equal to the | ||||
| 6642 | // upper bit of the proposed smaller type. If these two bits are the | ||||
| 6643 | // same (either zero or one) we know that sign-extending from the | ||||
| 6644 | // smaller type will result in the same value. Here, since we can't | ||||
| 6645 | // yet prove this, we are just making the proposed smaller type | ||||
| 6646 | // larger to ensure correctness. | ||||
| 6647 | if (!IsKnownPositive) | ||||
| 6648 | ++MaxBitWidth; | ||||
| 6649 | } | ||||
| 6650 | |||||
| 6651 | // Round MaxBitWidth up to the next power-of-two. | ||||
| 6652 | if (!isPowerOf2_64(MaxBitWidth)) | ||||
| 6653 | MaxBitWidth = NextPowerOf2(MaxBitWidth); | ||||
| 6654 | |||||
| 6655 | // If the maximum bit width we compute is less than the with of the roots' | ||||
| 6656 | // type, we can proceed with the narrowing. Otherwise, do nothing. | ||||
| 6657 | if (MaxBitWidth >= TreeRootIT->getBitWidth()) | ||||
| 6658 | return; | ||||
| 6659 | |||||
| 6660 | // If we can truncate the root, we must collect additional values that might | ||||
| 6661 | // be demoted as a result. That is, those seeded by truncations we will | ||||
| 6662 | // modify. | ||||
| 6663 | while (!Roots.empty()) | ||||
| 6664 | collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots); | ||||
| 6665 | |||||
| 6666 | // Finally, map the values we can demote to the maximum bit with we computed. | ||||
| 6667 | for (auto *Scalar : ToDemote) | ||||
| 6668 | MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive); | ||||
| 6669 | } | ||||
| 6670 | |||||
| 6671 | namespace { | ||||
| 6672 | |||||
| 6673 | /// The SLPVectorizer Pass. | ||||
| 6674 | struct SLPVectorizer : public FunctionPass { | ||||
| 6675 | SLPVectorizerPass Impl; | ||||
| 6676 | |||||
| 6677 | /// Pass identification, replacement for typeid | ||||
| 6678 | static char ID; | ||||
| 6679 | |||||
| 6680 | explicit SLPVectorizer() : FunctionPass(ID) { | ||||
| 6681 | initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); | ||||
| 6682 | } | ||||
| 6683 | |||||
| 6684 | bool doInitialization(Module &M) override { | ||||
| 6685 | return false; | ||||
| 6686 | } | ||||
| 6687 | |||||
| 6688 | bool runOnFunction(Function &F) override { | ||||
| 6689 | if (skipFunction(F)) | ||||
| 6690 | return false; | ||||
| 6691 | |||||
| 6692 | auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); | ||||
| 6693 | auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); | ||||
| 6694 | auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); | ||||
| 6695 | auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; | ||||
| 6696 | auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); | ||||
| 6697 | auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); | ||||
| 6698 | auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); | ||||
| 6699 | auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); | ||||
| 6700 | auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); | ||||
| 6701 | auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); | ||||
| 6702 | |||||
| 6703 | return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); | ||||
| 6704 | } | ||||
| 6705 | |||||
| 6706 | void getAnalysisUsage(AnalysisUsage &AU) const override { | ||||
| 6707 | FunctionPass::getAnalysisUsage(AU); | ||||
| 6708 | AU.addRequired<AssumptionCacheTracker>(); | ||||
| 6709 | AU.addRequired<ScalarEvolutionWrapperPass>(); | ||||
| 6710 | AU.addRequired<AAResultsWrapperPass>(); | ||||
| 6711 | AU.addRequired<TargetTransformInfoWrapperPass>(); | ||||
| 6712 | AU.addRequired<LoopInfoWrapperPass>(); | ||||
| 6713 | AU.addRequired<DominatorTreeWrapperPass>(); | ||||
| 6714 | AU.addRequired<DemandedBitsWrapperPass>(); | ||||
| 6715 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); | ||||
| 6716 | AU.addRequired<InjectTLIMappingsLegacy>(); | ||||
| 6717 | AU.addPreserved<LoopInfoWrapperPass>(); | ||||
| 6718 | AU.addPreserved<DominatorTreeWrapperPass>(); | ||||
| 6719 | AU.addPreserved<AAResultsWrapperPass>(); | ||||
| 6720 | AU.addPreserved<GlobalsAAWrapperPass>(); | ||||
| 6721 | AU.setPreservesCFG(); | ||||
| 6722 | } | ||||
| 6723 | }; | ||||
| 6724 | |||||
| 6725 | } // end anonymous namespace | ||||
| 6726 | |||||
| 6727 | PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { | ||||
| 6728 | auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F); | ||||
| 6729 | auto *TTI = &AM.getResult<TargetIRAnalysis>(F); | ||||
| 6730 | auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F); | ||||
| 6731 | auto *AA = &AM.getResult<AAManager>(F); | ||||
| 6732 | auto *LI = &AM.getResult<LoopAnalysis>(F); | ||||
| 6733 | auto *DT = &AM.getResult<DominatorTreeAnalysis>(F); | ||||
| 6734 | auto *AC = &AM.getResult<AssumptionAnalysis>(F); | ||||
| 6735 | auto *DB = &AM.getResult<DemandedBitsAnalysis>(F); | ||||
| 6736 | auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); | ||||
| 6737 | |||||
| 6738 | bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); | ||||
| 6739 | if (!Changed) | ||||
| 6740 | return PreservedAnalyses::all(); | ||||
| 6741 | |||||
| 6742 | PreservedAnalyses PA; | ||||
| 6743 | PA.preserveSet<CFGAnalyses>(); | ||||
| 6744 | return PA; | ||||
| 6745 | } | ||||
| 6746 | |||||
| 6747 | bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, | ||||
| 6748 | TargetTransformInfo *TTI_, | ||||
| 6749 | TargetLibraryInfo *TLI_, AAResults *AA_, | ||||
| 6750 | LoopInfo *LI_, DominatorTree *DT_, | ||||
| 6751 | AssumptionCache *AC_, DemandedBits *DB_, | ||||
| 6752 | OptimizationRemarkEmitter *ORE_) { | ||||
| 6753 | if (!RunSLPVectorization) | ||||
| 6754 | return false; | ||||
| 6755 | SE = SE_; | ||||
| 6756 | TTI = TTI_; | ||||
| 6757 | TLI = TLI_; | ||||
| 6758 | AA = AA_; | ||||
| 6759 | LI = LI_; | ||||
| 6760 | DT = DT_; | ||||
| 6761 | AC = AC_; | ||||
| 6762 | DB = DB_; | ||||
| 6763 | DL = &F.getParent()->getDataLayout(); | ||||
| 6764 | |||||
| 6765 | Stores.clear(); | ||||
| 6766 | GEPs.clear(); | ||||
| 6767 | bool Changed = false; | ||||
| 6768 | |||||
| 6769 | // If the target claims to have no vector registers don't attempt | ||||
| 6770 | // vectorization. | ||||
| 6771 | if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) | ||||
| 6772 | return false; | ||||
| 6773 | |||||
| 6774 | // Don't vectorize when the attribute NoImplicitFloat is used. | ||||
| 6775 | if (F.hasFnAttribute(Attribute::NoImplicitFloat)) | ||||
| 6776 | return false; | ||||
| 6777 | |||||
| 6778 | LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n")do { } while (false); | ||||
| 6779 | |||||
| 6780 | // Use the bottom up slp vectorizer to construct chains that start with | ||||
| 6781 | // store instructions. | ||||
| 6782 | BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); | ||||
| 6783 | |||||
| 6784 | // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to | ||||
| 6785 | // delete instructions. | ||||
| 6786 | |||||
| 6787 | // Update DFS numbers now so that we can use them for ordering. | ||||
| 6788 | DT->updateDFSNumbers(); | ||||
| 6789 | |||||
| 6790 | // Scan the blocks in the function in post order. | ||||
| 6791 | for (auto BB : post_order(&F.getEntryBlock())) { | ||||
| 6792 | collectSeedInstructions(BB); | ||||
| 6793 | |||||
| 6794 | // Vectorize trees that end at stores. | ||||
| 6795 | if (!Stores.empty()) { | ||||
| 6796 | LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()do { } while (false) | ||||
| 6797 | << " underlying objects.\n")do { } while (false); | ||||
| 6798 | Changed |= vectorizeStoreChains(R); | ||||
| 6799 | } | ||||
| 6800 | |||||
| 6801 | // Vectorize trees that end at reductions. | ||||
| 6802 | Changed |= vectorizeChainsInBlock(BB, R); | ||||
| 6803 | |||||
| 6804 | // Vectorize the index computations of getelementptr instructions. This | ||||
| 6805 | // is primarily intended to catch gather-like idioms ending at | ||||
| 6806 | // non-consecutive loads. | ||||
| 6807 | if (!GEPs.empty()) { | ||||
| 6808 | LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()do { } while (false) | ||||
| 6809 | << " underlying objects.\n")do { } while (false); | ||||
| 6810 | Changed |= vectorizeGEPIndices(BB, R); | ||||
| 6811 | } | ||||
| 6812 | } | ||||
| 6813 | |||||
| 6814 | if (Changed) { | ||||
| 6815 | R.optimizeGatherSequence(); | ||||
| 6816 | LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n")do { } while (false); | ||||
| 6817 | } | ||||
| 6818 | return Changed; | ||||
| 6819 | } | ||||
| 6820 | |||||
| 6821 | /// Order may have elements assigned special value (size) which is out of | ||||
| 6822 | /// bounds. Such indices only appear on places which correspond to undef values | ||||
| 6823 | /// (see canReuseExtract for details) and used in order to avoid undef values | ||||
| 6824 | /// have effect on operands ordering. | ||||
| 6825 | /// The first loop below simply finds all unused indices and then the next loop | ||||
| 6826 | /// nest assigns these indices for undef values positions. | ||||
| 6827 | /// As an example below Order has two undef positions and they have assigned | ||||
| 6828 | /// values 3 and 7 respectively: | ||||
| 6829 | /// before: 6 9 5 4 9 2 1 0 | ||||
| 6830 | /// after: 6 3 5 4 7 2 1 0 | ||||
| 6831 | /// \returns Fixed ordering. | ||||
| 6832 | static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef<unsigned> Order) { | ||||
| 6833 | BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end()); | ||||
| 6834 | const unsigned Sz = NewOrder.size(); | ||||
| 6835 | SmallBitVector UsedIndices(Sz); | ||||
| 6836 | SmallVector<int> MaskedIndices; | ||||
| 6837 | for (int I = 0, E = NewOrder.size(); I < E; ++I) { | ||||
| 6838 | if (NewOrder[I] < Sz) | ||||
| 6839 | UsedIndices.set(NewOrder[I]); | ||||
| 6840 | else | ||||
| 6841 | MaskedIndices.push_back(I); | ||||
| 6842 | } | ||||
| 6843 | if (MaskedIndices.empty()) | ||||
| 6844 | return NewOrder; | ||||
| 6845 | SmallVector<int> AvailableIndices(MaskedIndices.size()); | ||||
| 6846 | unsigned Cnt = 0; | ||||
| 6847 | int Idx = UsedIndices.find_first(); | ||||
| 6848 | do { | ||||
| 6849 | AvailableIndices[Cnt] = Idx; | ||||
| 6850 | Idx = UsedIndices.find_next(Idx); | ||||
| 6851 | ++Cnt; | ||||
| 6852 | } while (Idx > 0); | ||||
| 6853 | assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.")((void)0); | ||||
| 6854 | for (int I = 0, E = MaskedIndices.size(); I < E; ++I) | ||||
| 6855 | NewOrder[MaskedIndices[I]] = AvailableIndices[I]; | ||||
| 6856 | return NewOrder; | ||||
| 6857 | } | ||||
| 6858 | |||||
| 6859 | bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, | ||||
| 6860 | unsigned Idx) { | ||||
| 6861 | LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()do { } while (false) | ||||
| 6862 | << "\n")do { } while (false); | ||||
| 6863 | const unsigned Sz = R.getVectorElementSize(Chain[0]); | ||||
| 6864 | const unsigned MinVF = R.getMinVecRegSize() / Sz; | ||||
| 6865 | unsigned VF = Chain.size(); | ||||
| 6866 | |||||
| 6867 | if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) | ||||
| 6868 | return false; | ||||
| 6869 | |||||
| 6870 | LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idxdo { } while (false) | ||||
| 6871 | << "\n")do { } while (false); | ||||
| 6872 | |||||
| 6873 | R.buildTree(Chain); | ||||
| 6874 | Optional<ArrayRef<unsigned>> Order = R.bestOrder(); | ||||
| 6875 | // TODO: Handle orders of size less than number of elements in the vector. | ||||
| 6876 | if (Order && Order->size() == Chain.size()) { | ||||
| 6877 | // TODO: reorder tree nodes without tree rebuilding. | ||||
| 6878 | SmallVector<Value *, 4> ReorderedOps(Chain.size()); | ||||
| 6879 | transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), | ||||
| 6880 | [Chain](const unsigned Idx) { return Chain[Idx]; }); | ||||
| 6881 | R.buildTree(ReorderedOps); | ||||
| 6882 | } | ||||
| 6883 | if (R.isTreeTinyAndNotFullyVectorizable()) | ||||
| 6884 | return false; | ||||
| 6885 | if (R.isLoadCombineCandidate()) | ||||
| 6886 | return false; | ||||
| 6887 | |||||
| 6888 | R.computeMinimumValueSizes(); | ||||
| 6889 | |||||
| 6890 | InstructionCost Cost = R.getTreeCost(); | ||||
| 6891 | |||||
| 6892 | LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n")do { } while (false); | ||||
| 6893 | if (Cost < -SLPCostThreshold) { | ||||
| 6894 | LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n")do { } while (false); | ||||
| 6895 | |||||
| 6896 | using namespace ore; | ||||
| 6897 | |||||
| 6898 | R.getORE()->emit(OptimizationRemark(SV_NAME"slp-vectorizer", "StoresVectorized", | ||||
| 6899 | cast<StoreInst>(Chain[0])) | ||||
| 6900 | << "Stores SLP vectorized with cost " << NV("Cost", Cost) | ||||
| 6901 | << " and with tree size " | ||||
| 6902 | << NV("TreeSize", R.getTreeSize())); | ||||
| 6903 | |||||
| 6904 | R.vectorizeTree(); | ||||
| 6905 | return true; | ||||
| 6906 | } | ||||
| 6907 | |||||
| 6908 | return false; | ||||
| 6909 | } | ||||
| 6910 | |||||
| 6911 | bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, | ||||
| 6912 | BoUpSLP &R) { | ||||
| 6913 | // We may run into multiple chains that merge into a single chain. We mark the | ||||
| 6914 | // stores that we vectorized so that we don't visit the same store twice. | ||||
| 6915 | BoUpSLP::ValueSet VectorizedStores; | ||||
| 6916 | bool Changed = false; | ||||
| 6917 | |||||
| 6918 | int E = Stores.size(); | ||||
| 6919 | SmallBitVector Tails(E, false); | ||||
| 6920 | int MaxIter = MaxStoreLookup.getValue(); | ||||
| 6921 | SmallVector<std::pair<int, int>, 16> ConsecutiveChain( | ||||
| 6922 | E, std::make_pair(E, INT_MAX2147483647)); | ||||
| 6923 | SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false)); | ||||
| 6924 | int IterCnt; | ||||
| 6925 | auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, | ||||
| 6926 | &CheckedPairs, | ||||
| 6927 | &ConsecutiveChain](int K, int Idx) { | ||||
| 6928 | if (IterCnt >= MaxIter) | ||||
| 6929 | return true; | ||||
| 6930 | if (CheckedPairs[Idx].test(K)) | ||||
| 6931 | return ConsecutiveChain[K].second == 1 && | ||||
| 6932 | ConsecutiveChain[K].first == Idx; | ||||
| 6933 | ++IterCnt; | ||||
| 6934 | CheckedPairs[Idx].set(K); | ||||
| 6935 | CheckedPairs[K].set(Idx); | ||||
| 6936 | Optional<int> Diff = getPointersDiff( | ||||
| 6937 | Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(), | ||||
| 6938 | Stores[Idx]->getValueOperand()->getType(), | ||||
| 6939 | Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); | ||||
| 6940 | if (!Diff || *Diff == 0) | ||||
| 6941 | return false; | ||||
| 6942 | int Val = *Diff; | ||||
| 6943 | if (Val < 0) { | ||||
| 6944 | if (ConsecutiveChain[Idx].second > -Val) { | ||||
| 6945 | Tails.set(K); | ||||
| 6946 | ConsecutiveChain[Idx] = std::make_pair(K, -Val); | ||||
| 6947 | } | ||||
| 6948 | return false; | ||||
| 6949 | } | ||||
| 6950 | if (ConsecutiveChain[K].second <= Val) | ||||
| 6951 | return false; | ||||
| 6952 | |||||
| 6953 | Tails.set(Idx); | ||||
| 6954 | ConsecutiveChain[K] = std::make_pair(Idx, Val); | ||||
| 6955 | return Val == 1; | ||||
| 6956 | }; | ||||
| 6957 | // Do a quadratic search on all of the given stores in reverse order and find | ||||
| 6958 | // all of the pairs of stores that follow each other. | ||||
| 6959 | for (int Idx = E - 1; Idx >= 0; --Idx) { | ||||
| 6960 | // If a store has multiple consecutive store candidates, search according | ||||
| 6961 | // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... | ||||
| 6962 | // This is because usually pairing with immediate succeeding or preceding | ||||
| 6963 | // candidate create the best chance to find slp vectorization opportunity. | ||||
| 6964 | const int MaxLookDepth = std::max(E - Idx, Idx + 1); | ||||
| 6965 | IterCnt = 0; | ||||
| 6966 | for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) | ||||
| 6967 | if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || | ||||
| 6968 | (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) | ||||
| 6969 | break; | ||||
| 6970 | } | ||||
| 6971 | |||||
| 6972 | // Tracks if we tried to vectorize stores starting from the given tail | ||||
| 6973 | // already. | ||||
| 6974 | SmallBitVector TriedTails(E, false); | ||||
| 6975 | // For stores that start but don't end a link in the chain: | ||||
| 6976 | for (int Cnt = E; Cnt > 0; --Cnt) { | ||||
| 6977 | int I = Cnt - 1; | ||||
| 6978 | if (ConsecutiveChain[I].first == E || Tails.test(I)) | ||||
| 6979 | continue; | ||||
| 6980 | // We found a store instr that starts a chain. Now follow the chain and try | ||||
| 6981 | // to vectorize it. | ||||
| 6982 | BoUpSLP::ValueList Operands; | ||||
| 6983 | // Collect the chain into a list. | ||||
| 6984 | while (I != E && !VectorizedStores.count(Stores[I])) { | ||||
| 6985 | Operands.push_back(Stores[I]); | ||||
| 6986 | Tails.set(I); | ||||
| 6987 | if (ConsecutiveChain[I].second != 1) { | ||||
| 6988 | // Mark the new end in the chain and go back, if required. It might be | ||||
| 6989 | // required if the original stores come in reversed order, for example. | ||||
| 6990 | if (ConsecutiveChain[I].first != E && | ||||
| 6991 | Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) && | ||||
| 6992 | !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) { | ||||
| 6993 | TriedTails.set(I); | ||||
| 6994 | Tails.reset(ConsecutiveChain[I].first); | ||||
| 6995 | if (Cnt < ConsecutiveChain[I].first + 2) | ||||
| 6996 | Cnt = ConsecutiveChain[I].first + 2; | ||||
| 6997 | } | ||||
| 6998 | break; | ||||
| 6999 | } | ||||
| 7000 | // Move to the next value in the chain. | ||||
| 7001 | I = ConsecutiveChain[I].first; | ||||
| 7002 | } | ||||
| 7003 | assert(!Operands.empty() && "Expected non-empty list of stores.")((void)0); | ||||
| 7004 | |||||
| 7005 | unsigned MaxVecRegSize = R.getMaxVecRegSize(); | ||||
| 7006 | unsigned EltSize = R.getVectorElementSize(Operands[0]); | ||||
| 7007 | unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); | ||||
| 7008 | |||||
| 7009 | unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize); | ||||
| 7010 | unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), | ||||
| 7011 | MaxElts); | ||||
| 7012 | |||||
| 7013 | // FIXME: Is division-by-2 the correct step? Should we assert that the | ||||
| 7014 | // register size is a power-of-2? | ||||
| 7015 | unsigned StartIdx = 0; | ||||
| 7016 | for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { | ||||
| 7017 | for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { | ||||
| 7018 | ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); | ||||
| 7019 | if (!VectorizedStores.count(Slice.front()) && | ||||
| 7020 | !VectorizedStores.count(Slice.back()) && | ||||
| 7021 | vectorizeStoreChain(Slice, R, Cnt)) { | ||||
| 7022 | // Mark the vectorized stores so that we don't vectorize them again. | ||||
| 7023 | VectorizedStores.insert(Slice.begin(), Slice.end()); | ||||
| 7024 | Changed = true; | ||||
| 7025 | // If we vectorized initial block, no need to try to vectorize it | ||||
| 7026 | // again. | ||||
| 7027 | if (Cnt == StartIdx) | ||||
| 7028 | StartIdx += Size; | ||||
| 7029 | Cnt += Size; | ||||
| 7030 | continue; | ||||
| 7031 | } | ||||
| 7032 | ++Cnt; | ||||
| 7033 | } | ||||
| 7034 | // Check if the whole array was vectorized already - exit. | ||||
| 7035 | if (StartIdx >= Operands.size()) | ||||
| 7036 | break; | ||||
| 7037 | } | ||||
| 7038 | } | ||||
| 7039 | |||||
| 7040 | return Changed; | ||||
| 7041 | } | ||||
| 7042 | |||||
| 7043 | void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { | ||||
| 7044 | // Initialize the collections. We will make a single pass over the block. | ||||
| 7045 | Stores.clear(); | ||||
| 7046 | GEPs.clear(); | ||||
| 7047 | |||||
| 7048 | // Visit the store and getelementptr instructions in BB and organize them in | ||||
| 7049 | // Stores and GEPs according to the underlying objects of their pointer | ||||
| 7050 | // operands. | ||||
| 7051 | for (Instruction &I : *BB) { | ||||
| 7052 | // Ignore store instructions that are volatile or have a pointer operand | ||||
| 7053 | // that doesn't point to a scalar type. | ||||
| 7054 | if (auto *SI = dyn_cast<StoreInst>(&I)) { | ||||
| 7055 | if (!SI->isSimple()) | ||||
| 7056 | continue; | ||||
| 7057 | if (!isValidElementType(SI->getValueOperand()->getType())) | ||||
| 7058 | continue; | ||||
| 7059 | Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI); | ||||
| 7060 | } | ||||
| 7061 | |||||
| 7062 | // Ignore getelementptr instructions that have more than one index, a | ||||
| 7063 | // constant index, or a pointer operand that doesn't point to a scalar | ||||
| 7064 | // type. | ||||
| 7065 | else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { | ||||
| 7066 | auto Idx = GEP->idx_begin()->get(); | ||||
| 7067 | if (GEP->getNumIndices() > 1 || isa<Constant>(Idx)) | ||||
| 7068 | continue; | ||||
| 7069 | if (!isValidElementType(Idx->getType())) | ||||
| 7070 | continue; | ||||
| 7071 | if (GEP->getType()->isVectorTy()) | ||||
| 7072 | continue; | ||||
| 7073 | GEPs[GEP->getPointerOperand()].push_back(GEP); | ||||
| 7074 | } | ||||
| 7075 | } | ||||
| 7076 | } | ||||
| 7077 | |||||
| 7078 | bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { | ||||
| 7079 | if (!A || !B) | ||||
| 7080 | return false; | ||||
| 7081 | Value *VL[] = {A, B}; | ||||
| 7082 | return tryToVectorizeList(VL, R, /*AllowReorder=*/true); | ||||
| 7083 | } | ||||
| 7084 | |||||
| 7085 | bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, | ||||
| 7086 | bool AllowReorder) { | ||||
| 7087 | if (VL.size() < 2) | ||||
| 7088 | return false; | ||||
| 7089 | |||||
| 7090 | LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "do { } while (false) | ||||
| 7091 | << VL.size() << ".\n")do { } while (false); | ||||
| 7092 | |||||
| 7093 | // Check that all of the parts are instructions of the same type, | ||||
| 7094 | // we permit an alternate opcode via InstructionsState. | ||||
| 7095 | InstructionsState S = getSameOpcode(VL); | ||||
| 7096 | if (!S.getOpcode()) | ||||
| 7097 | return false; | ||||
| 7098 | |||||
| 7099 | Instruction *I0 = cast<Instruction>(S.OpValue); | ||||
| 7100 | // Make sure invalid types (including vector type) are rejected before | ||||
| 7101 | // determining vectorization factor for scalar instructions. | ||||
| 7102 | for (Value *V : VL) { | ||||
| 7103 | Type *Ty = V->getType(); | ||||
| 7104 | if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) { | ||||
| 7105 | // NOTE: the following will give user internal llvm type name, which may | ||||
| 7106 | // not be useful. | ||||
| 7107 | R.getORE()->emit([&]() { | ||||
| 7108 | std::string type_str; | ||||
| 7109 | llvm::raw_string_ostream rso(type_str); | ||||
| 7110 | Ty->print(rso); | ||||
| 7111 | return OptimizationRemarkMissed(SV_NAME"slp-vectorizer", "UnsupportedType", I0) | ||||
| 7112 | << "Cannot SLP vectorize list: type " | ||||
| 7113 | << rso.str() + " is unsupported by vectorizer"; | ||||
| 7114 | }); | ||||
| 7115 | return false; | ||||
| 7116 | } | ||||
| 7117 | } | ||||
| 7118 | |||||
| 7119 | unsigned Sz = R.getVectorElementSize(I0); | ||||
| 7120 | unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); | ||||
| 7121 | unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); | ||||
| 7122 | MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); | ||||
| 7123 | if (MaxVF < 2) { | ||||
| 7124 | R.getORE()->emit([&]() { | ||||
| 7125 | return OptimizationRemarkMissed(SV_NAME"slp-vectorizer", "SmallVF", I0) | ||||
| 7126 | << "Cannot SLP vectorize list: vectorization factor " | ||||
| 7127 | << "less than 2 is not supported"; | ||||
| 7128 | }); | ||||
| 7129 | return false; | ||||
| 7130 | } | ||||
| 7131 | |||||
| 7132 | bool Changed = false; | ||||
| 7133 | bool CandidateFound = false; | ||||
| 7134 | InstructionCost MinCost = SLPCostThreshold.getValue(); | ||||
| 7135 | Type *ScalarTy = VL[0]->getType(); | ||||
| 7136 | if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) | ||||
| 7137 | ScalarTy = IE->getOperand(1)->getType(); | ||||
| 7138 | |||||
| 7139 | unsigned NextInst = 0, MaxInst = VL.size(); | ||||
| 7140 | for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { | ||||
| 7141 | // No actual vectorization should happen, if number of parts is the same as | ||||
| 7142 | // provided vectorization factor (i.e. the scalar type is used for vector | ||||
| 7143 | // code during codegen). | ||||
| 7144 | auto *VecTy = FixedVectorType::get(ScalarTy, VF); | ||||
| 7145 | if (TTI->getNumberOfParts(VecTy) == VF) | ||||
| 7146 | continue; | ||||
| 7147 | for (unsigned I = NextInst; I < MaxInst; ++I) { | ||||
| 7148 | unsigned OpsWidth = 0; | ||||
| 7149 | |||||
| 7150 | if (I + VF > MaxInst) | ||||
| 7151 | OpsWidth = MaxInst - I; | ||||
| 7152 | else | ||||
| 7153 | OpsWidth = VF; | ||||
| 7154 | |||||
| 7155 | if (!isPowerOf2_32(OpsWidth)) | ||||
| 7156 | continue; | ||||
| 7157 | |||||
| 7158 | if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) | ||||
| 7159 | break; | ||||
| 7160 | |||||
| 7161 | ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); | ||||
| 7162 | // Check that a previous iteration of this loop did not delete the Value. | ||||
| 7163 | if (llvm::any_of(Ops, [&R](Value *V) { | ||||
| 7164 | auto *I = dyn_cast<Instruction>(V); | ||||
| 7165 | return I && R.isDeleted(I); | ||||
| 7166 | })) | ||||
| 7167 | continue; | ||||
| 7168 | |||||
| 7169 | LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "do { } while (false) | ||||
| 7170 | << "\n")do { } while (false); | ||||
| 7171 | |||||
| 7172 | R.buildTree(Ops); | ||||
| 7173 | if (AllowReorder) { | ||||
| 7174 | Optional<ArrayRef<unsigned>> Order = R.bestOrder(); | ||||
| 7175 | if (Order) { | ||||
| 7176 | // TODO: reorder tree nodes without tree rebuilding. | ||||
| 7177 | SmallVector<Value *, 4> ReorderedOps(Ops.size()); | ||||
| 7178 | transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), | ||||
| 7179 | [Ops](const unsigned Idx) { return Ops[Idx]; }); | ||||
| 7180 | R.buildTree(ReorderedOps); | ||||
| 7181 | } | ||||
| 7182 | } | ||||
| 7183 | if (R.isTreeTinyAndNotFullyVectorizable()) | ||||
| 7184 | continue; | ||||
| 7185 | |||||
| 7186 | R.computeMinimumValueSizes(); | ||||
| 7187 | InstructionCost Cost = R.getTreeCost(); | ||||
| 7188 | CandidateFound = true; | ||||
| 7189 | MinCost = std::min(MinCost, Cost); | ||||
| 7190 | |||||
| 7191 | if (Cost < -SLPCostThreshold) { | ||||
| 7192 | LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n")do { } while (false); | ||||
| 7193 | R.getORE()->emit(OptimizationRemark(SV_NAME"slp-vectorizer", "VectorizedList", | ||||
| 7194 | cast<Instruction>(Ops[0])) | ||||
| 7195 | << "SLP vectorized with cost " << ore::NV("Cost", Cost) | ||||
| 7196 | << " and with tree size " | ||||
| 7197 | << ore::NV("TreeSize", R.getTreeSize())); | ||||
| 7198 | |||||
| 7199 | R.vectorizeTree(); | ||||
| 7200 | // Move to the next bundle. | ||||
| 7201 | I += VF - 1; | ||||
| 7202 | NextInst = I + 1; | ||||
| 7203 | Changed = true; | ||||
| 7204 | } | ||||
| 7205 | } | ||||
| 7206 | } | ||||
| 7207 | |||||
| 7208 | if (!Changed && CandidateFound) { | ||||
| 7209 | R.getORE()->emit([&]() { | ||||
| 7210 | return OptimizationRemarkMissed(SV_NAME"slp-vectorizer", "NotBeneficial", I0) | ||||
| 7211 | << "List vectorization was possible but not beneficial with cost " | ||||
| 7212 | << ore::NV("Cost", MinCost) << " >= " | ||||
| 7213 | << ore::NV("Treshold", -SLPCostThreshold); | ||||
| 7214 | }); | ||||
| 7215 | } else if (!Changed) { | ||||
| 7216 | R.getORE()->emit([&]() { | ||||
| 7217 | return OptimizationRemarkMissed(SV_NAME"slp-vectorizer", "NotPossible", I0) | ||||
| 7218 | << "Cannot SLP vectorize list: vectorization was impossible" | ||||
| 7219 | << " with available vectorization factors"; | ||||
| 7220 | }); | ||||
| 7221 | } | ||||
| 7222 | return Changed; | ||||
| 7223 | } | ||||
| 7224 | |||||
| 7225 | bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { | ||||
| 7226 | if (!I) | ||||
| 7227 | return false; | ||||
| 7228 | |||||
| 7229 | if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I)) | ||||
| 7230 | return false; | ||||
| 7231 | |||||
| 7232 | Value *P = I->getParent(); | ||||
| 7233 | |||||
| 7234 | // Vectorize in current basic block only. | ||||
| 7235 | auto *Op0 = dyn_cast<Instruction>(I->getOperand(0)); | ||||
| 7236 | auto *Op1 = dyn_cast<Instruction>(I->getOperand(1)); | ||||
| 7237 | if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) | ||||
| 7238 | return false; | ||||
| 7239 | |||||
| 7240 | // Try to vectorize V. | ||||
| 7241 | if (tryToVectorizePair(Op0, Op1, R)) | ||||
| 7242 | return true; | ||||
| 7243 | |||||
| 7244 | auto *A = dyn_cast<BinaryOperator>(Op0); | ||||
| 7245 | auto *B = dyn_cast<BinaryOperator>(Op1); | ||||
| 7246 | // Try to skip B. | ||||
| 7247 | if (B && B->hasOneUse()) { | ||||
| 7248 | auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); | ||||
| 7249 | auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); | ||||
| 7250 | if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) | ||||
| 7251 | return true; | ||||
| 7252 | if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) | ||||
| 7253 | return true; | ||||
| 7254 | } | ||||
| 7255 | |||||
| 7256 | // Try to skip A. | ||||
| 7257 | if (A && A->hasOneUse()) { | ||||
| 7258 | auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); | ||||
| 7259 | auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); | ||||
| 7260 | if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) | ||||
| 7261 | return true; | ||||
| 7262 | if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) | ||||
| 7263 | return true; | ||||
| 7264 | } | ||||
| 7265 | return false; | ||||
| 7266 | } | ||||
| 7267 | |||||
| 7268 | namespace { | ||||
| 7269 | |||||
| 7270 | /// Model horizontal reductions. | ||||
| 7271 | /// | ||||
| 7272 | /// A horizontal reduction is a tree of reduction instructions that has values | ||||
| 7273 | /// that can be put into a vector as its leaves. For example: | ||||
| 7274 | /// | ||||
| 7275 | /// mul mul mul mul | ||||
| 7276 | /// \ / \ / | ||||
| 7277 | /// + + | ||||
| 7278 | /// \ / | ||||
| 7279 | /// + | ||||
| 7280 | /// This tree has "mul" as its leaf values and "+" as its reduction | ||||
| 7281 | /// instructions. A reduction can feed into a store or a binary operation | ||||
| 7282 | /// feeding a phi. | ||||
| 7283 | /// ... | ||||
| 7284 | /// \ / | ||||
| 7285 | /// + | ||||
| 7286 | /// | | ||||
| 7287 | /// phi += | ||||
| 7288 | /// | ||||
| 7289 | /// Or: | ||||
| 7290 | /// ... | ||||
| 7291 | /// \ / | ||||
| 7292 | /// + | ||||
| 7293 | /// | | ||||
| 7294 | /// *p = | ||||
| 7295 | /// | ||||
| 7296 | class HorizontalReduction { | ||||
| 7297 | using ReductionOpsType = SmallVector<Value *, 16>; | ||||
| 7298 | using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; | ||||
| 7299 | ReductionOpsListType ReductionOps; | ||||
| 7300 | SmallVector<Value *, 32> ReducedVals; | ||||
| 7301 | // Use map vector to make stable output. | ||||
| 7302 | MapVector<Instruction *, Value *> ExtraArgs; | ||||
| 7303 | WeakTrackingVH ReductionRoot; | ||||
| 7304 | /// The type of reduction operation. | ||||
| 7305 | RecurKind RdxKind; | ||||
| 7306 | |||||
| 7307 | const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max(); | ||||
| 7308 | |||||
| 7309 | static bool isCmpSelMinMax(Instruction *I) { | ||||
| 7310 | return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && | ||||
| 7311 | RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); | ||||
| 7312 | } | ||||
| 7313 | |||||
| 7314 | // And/or are potentially poison-safe logical patterns like: | ||||
| 7315 | // select x, y, false | ||||
| 7316 | // select x, true, y | ||||
| 7317 | static bool isBoolLogicOp(Instruction *I) { | ||||
| 7318 | return match(I, m_LogicalAnd(m_Value(), m_Value())) || | ||||
| 7319 | match(I, m_LogicalOr(m_Value(), m_Value())); | ||||
| 7320 | } | ||||
| 7321 | |||||
| 7322 | /// Checks if instruction is associative and can be vectorized. | ||||
| 7323 | static bool isVectorizable(RecurKind Kind, Instruction *I) { | ||||
| 7324 | if (Kind == RecurKind::None) | ||||
| 7325 | return false; | ||||
| 7326 | |||||
| 7327 | // Integer ops that map to select instructions or intrinsics are fine. | ||||
| 7328 | if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) || | ||||
| 7329 | isBoolLogicOp(I)) | ||||
| 7330 | return true; | ||||
| 7331 | |||||
| 7332 | if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { | ||||
| 7333 | // FP min/max are associative except for NaN and -0.0. We do not | ||||
| 7334 | // have to rule out -0.0 here because the intrinsic semantics do not | ||||
| 7335 | // specify a fixed result for it. | ||||
| 7336 | return I->getFastMathFlags().noNaNs(); | ||||
| 7337 | } | ||||
| 7338 | |||||
| 7339 | return I->isAssociative(); | ||||
| 7340 | } | ||||
| 7341 | |||||
| 7342 | static Value *getRdxOperand(Instruction *I, unsigned Index) { | ||||
| 7343 | // Poison-safe 'or' takes the form: select X, true, Y | ||||
| 7344 | // To make that work with the normal operand processing, we skip the | ||||
| 7345 | // true value operand. | ||||
| 7346 | // TODO: Change the code and data structures to handle this without a hack. | ||||
| 7347 | if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1) | ||||
| 7348 | return I->getOperand(2); | ||||
| 7349 | return I->getOperand(Index); | ||||
| 7350 | } | ||||
| 7351 | |||||
| 7352 | /// Checks if the ParentStackElem.first should be marked as a reduction | ||||
| 7353 | /// operation with an extra argument or as extra argument itself. | ||||
| 7354 | void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, | ||||
| 7355 | Value *ExtraArg) { | ||||
| 7356 | if (ExtraArgs.count(ParentStackElem.first)) { | ||||
| 7357 | ExtraArgs[ParentStackElem.first] = nullptr; | ||||
| 7358 | // We ran into something like: | ||||
| 7359 | // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. | ||||
| 7360 | // The whole ParentStackElem.first should be considered as an extra value | ||||
| 7361 | // in this case. | ||||
| 7362 | // Do not perform analysis of remaining operands of ParentStackElem.first | ||||
| 7363 | // instruction, this whole instruction is an extra argument. | ||||
| 7364 | ParentStackElem.second = INVALID_OPERAND_INDEX; | ||||
| 7365 | } else { | ||||
| 7366 | // We ran into something like: | ||||
| 7367 | // ParentStackElem.first += ... + ExtraArg + ... | ||||
| 7368 | ExtraArgs[ParentStackElem.first] = ExtraArg; | ||||
| 7369 | } | ||||
| 7370 | } | ||||
| 7371 | |||||
| 7372 | /// Creates reduction operation with the current opcode. | ||||
| 7373 | static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, | ||||
| 7374 | Value *RHS, const Twine &Name, bool UseSelect) { | ||||
| 7375 | unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); | ||||
| 7376 | switch (Kind) { | ||||
| 7377 | case RecurKind::Add: | ||||
| 7378 | case RecurKind::Mul: | ||||
| 7379 | case RecurKind::Or: | ||||
| 7380 | case RecurKind::And: | ||||
| 7381 | case RecurKind::Xor: | ||||
| 7382 | case RecurKind::FAdd: | ||||
| 7383 | case RecurKind::FMul: | ||||
| 7384 | return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, | ||||
| 7385 | Name); | ||||
| 7386 | case RecurKind::FMax: | ||||
| 7387 | return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); | ||||
| 7388 | case RecurKind::FMin: | ||||
| 7389 | return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); | ||||
| 7390 | case RecurKind::SMax: | ||||
| 7391 | if (UseSelect) { | ||||
| 7392 | Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); | ||||
| 7393 | return Builder.CreateSelect(Cmp, LHS, RHS, Name); | ||||
| 7394 | } | ||||
| 7395 | return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS); | ||||
| 7396 | case RecurKind::SMin: | ||||
| 7397 | if (UseSelect) { | ||||
| 7398 | Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); | ||||
| 7399 | return Builder.CreateSelect(Cmp, LHS, RHS, Name); | ||||
| 7400 | } | ||||
| 7401 | return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS); | ||||
| 7402 | case RecurKind::UMax: | ||||
| 7403 | if (UseSelect) { | ||||
| 7404 | Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); | ||||
| 7405 | return Builder.CreateSelect(Cmp, LHS, RHS, Name); | ||||
| 7406 | } | ||||
| 7407 | return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS); | ||||
| 7408 | case RecurKind::UMin: | ||||
| 7409 | if (UseSelect) { | ||||
| 7410 | Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); | ||||
| 7411 | return Builder.CreateSelect(Cmp, LHS, RHS, Name); | ||||
| 7412 | } | ||||
| 7413 | return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS); | ||||
| 7414 | default: | ||||
| 7415 | llvm_unreachable("Unknown reduction operation.")__builtin_unreachable(); | ||||
| 7416 | } | ||||
| 7417 | } | ||||
| 7418 | |||||
| 7419 | /// Creates reduction operation with the current opcode with the IR flags | ||||
| 7420 | /// from \p ReductionOps. | ||||
| 7421 | static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, | ||||
| 7422 | Value *RHS, const Twine &Name, | ||||
| 7423 | const ReductionOpsListType &ReductionOps) { | ||||
| 7424 | bool UseSelect = ReductionOps.size() == 2; | ||||
| 7425 | assert((!UseSelect || isa<SelectInst>(ReductionOps[1][0])) &&((void)0) | ||||
| 7426 | "Expected cmp + select pairs for reduction")((void)0); | ||||
| 7427 | Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); | ||||
| 7428 | if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { | ||||
| 7429 | if (auto *Sel = dyn_cast<SelectInst>(Op)) { | ||||
| 7430 | propagateIRFlags(Sel->getCondition(), ReductionOps[0]); | ||||
| 7431 | propagateIRFlags(Op, ReductionOps[1]); | ||||
| 7432 | return Op; | ||||
| 7433 | } | ||||
| 7434 | } | ||||
| 7435 | propagateIRFlags(Op, ReductionOps[0]); | ||||
| 7436 | return Op; | ||||
| 7437 | } | ||||
| 7438 | |||||
| 7439 | /// Creates reduction operation with the current opcode with the IR flags | ||||
| 7440 | /// from \p I. | ||||
| 7441 | static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, | ||||
| 7442 | Value *RHS, const Twine &Name, Instruction *I) { | ||||
| 7443 | auto *SelI = dyn_cast<SelectInst>(I); | ||||
| 7444 | Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr); | ||||
| 7445 | if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { | ||||
| 7446 | if (auto *Sel = dyn_cast<SelectInst>(Op)) | ||||
| 7447 | propagateIRFlags(Sel->getCondition(), SelI->getCondition()); | ||||
| 7448 | } | ||||
| 7449 | propagateIRFlags(Op, I); | ||||
| 7450 | return Op; | ||||
| 7451 | } | ||||
| 7452 | |||||
| 7453 | static RecurKind getRdxKind(Instruction *I) { | ||||
| 7454 | assert(I && "Expected instruction for reduction matching")((void)0); | ||||
| 7455 | TargetTransformInfo::ReductionFlags RdxFlags; | ||||
| 7456 | if (match(I, m_Add(m_Value(), m_Value()))) | ||||
| 7457 | return RecurKind::Add; | ||||
| 7458 | if (match(I, m_Mul(m_Value(), m_Value()))) | ||||
| 7459 | return RecurKind::Mul; | ||||
| 7460 | if (match(I, m_And(m_Value(), m_Value())) || | ||||
| 7461 | match(I, m_LogicalAnd(m_Value(), m_Value()))) | ||||
| 7462 | return RecurKind::And; | ||||
| 7463 | if (match(I, m_Or(m_Value(), m_Value())) || | ||||
| 7464 | match(I, m_LogicalOr(m_Value(), m_Value()))) | ||||
| 7465 | return RecurKind::Or; | ||||
| 7466 | if (match(I, m_Xor(m_Value(), m_Value()))) | ||||
| 7467 | return RecurKind::Xor; | ||||
| 7468 | if (match(I, m_FAdd(m_Value(), m_Value()))) | ||||
| 7469 | return RecurKind::FAdd; | ||||
| 7470 | if (match(I, m_FMul(m_Value(), m_Value()))) | ||||
| 7471 | return RecurKind::FMul; | ||||
| 7472 | |||||
| 7473 | if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) | ||||
| 7474 | return RecurKind::FMax; | ||||
| 7475 | if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) | ||||
| 7476 | return RecurKind::FMin; | ||||
| 7477 | |||||
| 7478 | // This matches either cmp+select or intrinsics. SLP is expected to handle | ||||
| 7479 | // either form. | ||||
| 7480 | // TODO: If we are canonicalizing to intrinsics, we can remove several | ||||
| 7481 | // special-case paths that deal with selects. | ||||
| 7482 | if (match(I, m_SMax(m_Value(), m_Value()))) | ||||
| 7483 | return RecurKind::SMax; | ||||
| 7484 | if (match(I, m_SMin(m_Value(), m_Value()))) | ||||
| 7485 | return RecurKind::SMin; | ||||
| 7486 | if (match(I, m_UMax(m_Value(), m_Value()))) | ||||
| 7487 | return RecurKind::UMax; | ||||
| 7488 | if (match(I, m_UMin(m_Value(), m_Value()))) | ||||
| 7489 | return RecurKind::UMin; | ||||
| 7490 | |||||
| 7491 | if (auto *Select = dyn_cast<SelectInst>(I)) { | ||||
| 7492 | // Try harder: look for min/max pattern based on instructions producing | ||||
| 7493 | // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). | ||||
| 7494 | // During the intermediate stages of SLP, it's very common to have | ||||
| 7495 | // pattern like this (since optimizeGatherSequence is run only once | ||||
| 7496 | // at the end): | ||||
| 7497 | // %1 = extractelement <2 x i32> %a, i32 0 | ||||
| 7498 | // %2 = extractelement <2 x i32> %a, i32 1 | ||||
| 7499 | // %cond = icmp sgt i32 %1, %2 | ||||
| 7500 | // %3 = extractelement <2 x i32> %a, i32 0 | ||||
| 7501 | // %4 = extractelement <2 x i32> %a, i32 1 | ||||
| 7502 | // %select = select i1 %cond, i32 %3, i32 %4 | ||||
| 7503 | CmpInst::Predicate Pred; | ||||
| 7504 | Instruction *L1; | ||||
| 7505 | Instruction *L2; | ||||
| 7506 | |||||
| 7507 | Value *LHS = Select->getTrueValue(); | ||||
| 7508 | Value *RHS = Select->getFalseValue(); | ||||
| 7509 | Value *Cond = Select->getCondition(); | ||||
| 7510 | |||||
| 7511 | // TODO: Support inverse predicates. | ||||
| 7512 | if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) { | ||||
| 7513 | if (!isa<ExtractElementInst>(RHS) || | ||||
| 7514 | !L2->isIdenticalTo(cast<Instruction>(RHS))) | ||||
| 7515 | return RecurKind::None; | ||||
| 7516 | } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) { | ||||
| 7517 | if (!isa<ExtractElementInst>(LHS) || | ||||
| 7518 | !L1->isIdenticalTo(cast<Instruction>(LHS))) | ||||
| 7519 | return RecurKind::None; | ||||
| 7520 | } else { | ||||
| 7521 | if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS)) | ||||
| 7522 | return RecurKind::None; | ||||
| 7523 | if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) || | ||||
| 7524 | !L1->isIdenticalTo(cast<Instruction>(LHS)) || | ||||
| 7525 | !L2->isIdenticalTo(cast<Instruction>(RHS))) | ||||
| 7526 | return RecurKind::None; | ||||
| 7527 | } | ||||
| 7528 | |||||
| 7529 | TargetTransformInfo::ReductionFlags RdxFlags; | ||||
| 7530 | switch (Pred) { | ||||
| 7531 | default: | ||||
| 7532 | return RecurKind::None; | ||||
| 7533 | case CmpInst::ICMP_SGT: | ||||
| 7534 | case CmpInst::ICMP_SGE: | ||||
| 7535 | return RecurKind::SMax; | ||||
| 7536 | case CmpInst::ICMP_SLT: | ||||
| 7537 | case CmpInst::ICMP_SLE: | ||||
| 7538 | return RecurKind::SMin; | ||||
| 7539 | case CmpInst::ICMP_UGT: | ||||
| 7540 | case CmpInst::ICMP_UGE: | ||||
| 7541 | return RecurKind::UMax; | ||||
| 7542 | case CmpInst::ICMP_ULT: | ||||
| 7543 | case CmpInst::ICMP_ULE: | ||||
| 7544 | return RecurKind::UMin; | ||||
| 7545 | } | ||||
| 7546 | } | ||||
| 7547 | return RecurKind::None; | ||||
| 7548 | } | ||||
| 7549 | |||||
| 7550 | /// Get the index of the first operand. | ||||
| 7551 | static unsigned getFirstOperandIndex(Instruction *I) { | ||||
| 7552 | return isCmpSelMinMax(I) ? 1 : 0; | ||||
| 7553 | } | ||||
| 7554 | |||||
| 7555 | /// Total number of operands in the reduction operation. | ||||
| 7556 | static unsigned getNumberOfOperands(Instruction *I) { | ||||
| 7557 | return isCmpSelMinMax(I) ? 3 : 2; | ||||
| 7558 | } | ||||
| 7559 | |||||
| 7560 | /// Checks if the instruction is in basic block \p BB. | ||||
| 7561 | /// For a cmp+sel min/max reduction check that both ops are in \p BB. | ||||
| 7562 | static bool hasSameParent(Instruction *I, BasicBlock *BB) { | ||||
| 7563 | if (isCmpSelMinMax(I)) { | ||||
| 7564 | auto *Sel = cast<SelectInst>(I); | ||||
| 7565 | auto *Cmp = cast<Instruction>(Sel->getCondition()); | ||||
| 7566 | return Sel->getParent() == BB && Cmp->getParent() == BB; | ||||
| 7567 | } | ||||
| 7568 | return I->getParent() == BB; | ||||
| 7569 | } | ||||
| 7570 | |||||
| 7571 | /// Expected number of uses for reduction operations/reduced values. | ||||
| 7572 | static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) { | ||||
| 7573 | if (IsCmpSelMinMax) { | ||||
| 7574 | // SelectInst must be used twice while the condition op must have single | ||||
| 7575 | // use only. | ||||
| 7576 | if (auto *Sel = dyn_cast<SelectInst>(I)) | ||||
| 7577 | return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse(); | ||||
| 7578 | return I->hasNUses(2); | ||||
| 7579 | } | ||||
| 7580 | |||||
| 7581 | // Arithmetic reduction operation must be used once only. | ||||
| 7582 | return I->hasOneUse(); | ||||
| 7583 | } | ||||
| 7584 | |||||
| 7585 | /// Initializes the list of reduction operations. | ||||
| 7586 | void initReductionOps(Instruction *I) { | ||||
| 7587 | if (isCmpSelMinMax(I)) | ||||
| 7588 | ReductionOps.assign(2, ReductionOpsType()); | ||||
| 7589 | else | ||||
| 7590 | ReductionOps.assign(1, ReductionOpsType()); | ||||
| 7591 | } | ||||
| 7592 | |||||
| 7593 | /// Add all reduction operations for the reduction instruction \p I. | ||||
| 7594 | void addReductionOps(Instruction *I) { | ||||
| 7595 | if (isCmpSelMinMax(I)) { | ||||
| 7596 | ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); | ||||
| 7597 | ReductionOps[1].emplace_back(I); | ||||
| 7598 | } else { | ||||
| 7599 | ReductionOps[0].emplace_back(I); | ||||
| 7600 | } | ||||
| 7601 | } | ||||
| 7602 | |||||
| 7603 | static Value *getLHS(RecurKind Kind, Instruction *I) { | ||||
| 7604 | if (Kind == RecurKind::None) | ||||
| 7605 | return nullptr; | ||||
| 7606 | return I->getOperand(getFirstOperandIndex(I)); | ||||
| 7607 | } | ||||
| 7608 | static Value *getRHS(RecurKind Kind, Instruction *I) { | ||||
| 7609 | if (Kind == RecurKind::None) | ||||
| 7610 | return nullptr; | ||||
| 7611 | return I->getOperand(getFirstOperandIndex(I) + 1); | ||||
| 7612 | } | ||||
| 7613 | |||||
| 7614 | public: | ||||
| 7615 | HorizontalReduction() = default; | ||||
| 7616 | |||||
| 7617 | /// Try to find a reduction tree. | ||||
| 7618 | bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) { | ||||
| 7619 | assert((!Phi || is_contained(Phi->operands(), Inst)) &&((void)0) | ||||
| 7620 | "Phi needs to use the binary operator")((void)0); | ||||
| 7621 | assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||((void)0) | ||||
| 7622 | isa<IntrinsicInst>(Inst)) &&((void)0) | ||||
| 7623 | "Expected binop, select, or intrinsic for reduction matching")((void)0); | ||||
| 7624 | RdxKind = getRdxKind(Inst); | ||||
| 7625 | |||||
| 7626 | // We could have a initial reductions that is not an add. | ||||
| 7627 | // r *= v1 + v2 + v3 + v4 | ||||
| 7628 | // In such a case start looking for a tree rooted in the first '+'. | ||||
| 7629 | if (Phi) { | ||||
| 7630 | if (getLHS(RdxKind, Inst) == Phi) { | ||||
| 7631 | Phi = nullptr; | ||||
| 7632 | Inst = dyn_cast<Instruction>(getRHS(RdxKind, Inst)); | ||||
| 7633 | if (!Inst) | ||||
| 7634 | return false; | ||||
| 7635 | RdxKind = getRdxKind(Inst); | ||||
| 7636 | } else if (getRHS(RdxKind, Inst) == Phi) { | ||||
| 7637 | Phi = nullptr; | ||||
| 7638 | Inst = dyn_cast<Instruction>(getLHS(RdxKind, Inst)); | ||||
| 7639 | if (!Inst) | ||||
| 7640 | return false; | ||||
| 7641 | RdxKind = getRdxKind(Inst); | ||||
| 7642 | } | ||||
| 7643 | } | ||||
| 7644 | |||||
| 7645 | if (!isVectorizable(RdxKind, Inst)) | ||||
| 7646 | return false; | ||||
| 7647 | |||||
| 7648 | // Analyze "regular" integer/FP types for reductions - no target-specific | ||||
| 7649 | // types or pointers. | ||||
| 7650 | Type *Ty = Inst->getType(); | ||||
| 7651 | if (!isValidElementType(Ty) || Ty->isPointerTy()) | ||||
| 7652 | return false; | ||||
| 7653 | |||||
| 7654 | // Though the ultimate reduction may have multiple uses, its condition must | ||||
| 7655 | // have only single use. | ||||
| 7656 | if (auto *Sel = dyn_cast<SelectInst>(Inst)) | ||||
| 7657 | if (!Sel->getCondition()->hasOneUse()) | ||||
| 7658 | return false; | ||||
| 7659 | |||||
| 7660 | ReductionRoot = Inst; | ||||
| 7661 | |||||
| 7662 | // The opcode for leaf values that we perform a reduction on. | ||||
| 7663 | // For example: load(x) + load(y) + load(z) + fptoui(w) | ||||
| 7664 | // The leaf opcode for 'w' does not match, so we don't include it as a | ||||
| 7665 | // potential candidate for the reduction. | ||||
| 7666 | unsigned LeafOpcode = 0; | ||||
| 7667 | |||||
| 7668 | // Post-order traverse the reduction tree starting at Inst. We only handle | ||||
| 7669 | // true trees containing binary operators or selects. | ||||
| 7670 | SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; | ||||
| 7671 | Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst))); | ||||
| 7672 | initReductionOps(Inst); | ||||
| 7673 | while (!Stack.empty()) { | ||||
| 7674 | Instruction *TreeN = Stack.back().first; | ||||
| 7675 | unsigned EdgeToVisit = Stack.back().second++; | ||||
| 7676 | const RecurKind TreeRdxKind = getRdxKind(TreeN); | ||||
| 7677 | bool IsReducedValue = TreeRdxKind != RdxKind; | ||||
| 7678 | |||||
| 7679 | // Postorder visit. | ||||
| 7680 | if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) { | ||||
| 7681 | if (IsReducedValue) | ||||
| 7682 | ReducedVals.push_back(TreeN); | ||||
| 7683 | else { | ||||
| 7684 | auto ExtraArgsIter = ExtraArgs.find(TreeN); | ||||
| 7685 | if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) { | ||||
| 7686 | // Check if TreeN is an extra argument of its parent operation. | ||||
| 7687 | if (Stack.size() <= 1) { | ||||
| 7688 | // TreeN can't be an extra argument as it is a root reduction | ||||
| 7689 | // operation. | ||||
| 7690 | return false; | ||||
| 7691 | } | ||||
| 7692 | // Yes, TreeN is an extra argument, do not add it to a list of | ||||
| 7693 | // reduction operations. | ||||
| 7694 | // Stack[Stack.size() - 2] always points to the parent operation. | ||||
| 7695 | markExtraArg(Stack[Stack.size() - 2], TreeN); | ||||
| 7696 | ExtraArgs.erase(TreeN); | ||||
| 7697 | } else | ||||
| 7698 | addReductionOps(TreeN); | ||||
| 7699 | } | ||||
| 7700 | // Retract. | ||||
| 7701 | Stack.pop_back(); | ||||
| 7702 | continue; | ||||
| 7703 | } | ||||
| 7704 | |||||
| 7705 | // Visit operands. | ||||
| 7706 | Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit); | ||||
| 7707 | auto *EdgeInst = dyn_cast<Instruction>(EdgeVal); | ||||
| 7708 | if (!EdgeInst) { | ||||
| 7709 | // Edge value is not a reduction instruction or a leaf instruction. | ||||
| 7710 | // (It may be a constant, function argument, or something else.) | ||||
| 7711 | markExtraArg(Stack.back(), EdgeVal); | ||||
| 7712 | continue; | ||||
| 7713 | } | ||||
| 7714 | RecurKind EdgeRdxKind = getRdxKind(EdgeInst); | ||||
| 7715 | // Continue analysis if the next operand is a reduction operation or | ||||
| 7716 | // (possibly) a leaf value. If the leaf value opcode is not set, | ||||
| 7717 | // the first met operation != reduction operation is considered as the | ||||
| 7718 | // leaf opcode. | ||||
| 7719 | // Only handle trees in the current basic block. | ||||
| 7720 | // Each tree node needs to have minimal number of users except for the | ||||
| 7721 | // ultimate reduction. | ||||
| 7722 | const bool IsRdxInst = EdgeRdxKind == RdxKind; | ||||
| 7723 | if (EdgeInst != Phi && EdgeInst != Inst && | ||||
| 7724 | hasSameParent(EdgeInst, Inst->getParent()) && | ||||
| 7725 | hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) && | ||||
| 7726 | (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) { | ||||
| 7727 | if (IsRdxInst) { | ||||
| 7728 | // We need to be able to reassociate the reduction operations. | ||||
| 7729 | if (!isVectorizable(EdgeRdxKind, EdgeInst)) { | ||||
| 7730 | // I is an extra argument for TreeN (its parent operation). | ||||
| 7731 | markExtraArg(Stack.back(), EdgeInst); | ||||
| 7732 | continue; | ||||
| 7733 | } | ||||
| 7734 | } else if (!LeafOpcode) { | ||||
| 7735 | LeafOpcode = EdgeInst->getOpcode(); | ||||
| 7736 | } | ||||
| 7737 | Stack.push_back( | ||||
| 7738 | std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst))); | ||||
| 7739 | continue; | ||||
| 7740 | } | ||||
| 7741 | // I is an extra argument for TreeN (its parent operation). | ||||
| 7742 | markExtraArg(Stack.back(), EdgeInst); | ||||
| 7743 | } | ||||
| 7744 | return true; | ||||
| 7745 | } | ||||
| 7746 | |||||
| 7747 | /// Attempt to vectorize the tree found by matchAssociativeReduction. | ||||
| 7748 | bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { | ||||
| 7749 | // If there are a sufficient number of reduction values, reduce | ||||
| 7750 | // to a nearby power-of-2. We can safely generate oversized | ||||
| 7751 | // vectors and rely on the backend to split them to legal sizes. | ||||
| 7752 | unsigned NumReducedVals = ReducedVals.size(); | ||||
| 7753 | if (NumReducedVals < 4) | ||||
| 7754 | return false; | ||||
| 7755 | |||||
| 7756 | // Intersect the fast-math-flags from all reduction operations. | ||||
| 7757 | FastMathFlags RdxFMF; | ||||
| 7758 | RdxFMF.set(); | ||||
| 7759 | for (ReductionOpsType &RdxOp : ReductionOps) { | ||||
| 7760 | for (Value *RdxVal : RdxOp) { | ||||
| 7761 | if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal)) | ||||
| 7762 | RdxFMF &= FPMO->getFastMathFlags(); | ||||
| 7763 | } | ||||
| 7764 | } | ||||
| 7765 | |||||
| 7766 | IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); | ||||
| 7767 | Builder.setFastMathFlags(RdxFMF); | ||||
| 7768 | |||||
| 7769 | BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; | ||||
| 7770 | // The same extra argument may be used several times, so log each attempt | ||||
| 7771 | // to use it. | ||||
| 7772 | for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) { | ||||
| 7773 | assert(Pair.first && "DebugLoc must be set.")((void)0); | ||||
| 7774 | ExternallyUsedValues[Pair.second].push_back(Pair.first); | ||||
| 7775 | } | ||||
| 7776 | |||||
| 7777 | // The compare instruction of a min/max is the insertion point for new | ||||
| 7778 | // instructions and may be replaced with a new compare instruction. | ||||
| 7779 | auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { | ||||
| 7780 | assert(isa<SelectInst>(RdxRootInst) &&((void)0) | ||||
| 7781 | "Expected min/max reduction to have select root instruction")((void)0); | ||||
| 7782 | Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); | ||||
| 7783 | assert(isa<Instruction>(ScalarCond) &&((void)0) | ||||
| 7784 | "Expected min/max reduction to have compare condition")((void)0); | ||||
| 7785 | return cast<Instruction>(ScalarCond); | ||||
| 7786 | }; | ||||
| 7787 | |||||
| 7788 | // The reduction root is used as the insertion point for new instructions, | ||||
| 7789 | // so set it as externally used to prevent it from being deleted. | ||||
| 7790 | ExternallyUsedValues[ReductionRoot]; | ||||
| 7791 | SmallVector<Value *, 16> IgnoreList; | ||||
| 7792 | for (ReductionOpsType &RdxOp : ReductionOps) | ||||
| 7793 | IgnoreList.append(RdxOp.begin(), RdxOp.end()); | ||||
| 7794 | |||||
| 7795 | unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); | ||||
| 7796 | if (NumReducedVals > ReduxWidth) { | ||||
| 7797 | // In the loop below, we are building a tree based on a window of | ||||
| 7798 | // 'ReduxWidth' values. | ||||
| 7799 | // If the operands of those values have common traits (compare predicate, | ||||
| 7800 | // constant operand, etc), then we want to group those together to | ||||
| 7801 | // minimize the cost of the reduction. | ||||
| 7802 | |||||
| 7803 | // TODO: This should be extended to count common operands for | ||||
| 7804 | // compares and binops. | ||||
| 7805 | |||||
| 7806 | // Step 1: Count the number of times each compare predicate occurs. | ||||
| 7807 | SmallDenseMap<unsigned, unsigned> PredCountMap; | ||||
| 7808 | for (Value *RdxVal : ReducedVals) { | ||||
| 7809 | CmpInst::Predicate Pred; | ||||
| 7810 | if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) | ||||
| 7811 | ++PredCountMap[Pred]; | ||||
| 7812 | } | ||||
| 7813 | // Step 2: Sort the values so the most common predicates come first. | ||||
| 7814 | stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { | ||||
| 7815 | CmpInst::Predicate PredA, PredB; | ||||
| 7816 | if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && | ||||
| 7817 | match(B, m_Cmp(PredB, m_Value(), m_Value()))) { | ||||
| 7818 | return PredCountMap[PredA] > PredCountMap[PredB]; | ||||
| 7819 | } | ||||
| 7820 | return false; | ||||
| 7821 | }); | ||||
| 7822 | } | ||||
| 7823 | |||||
| 7824 | Value *VectorizedTree = nullptr; | ||||
| 7825 | unsigned i = 0; | ||||
| 7826 | while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { | ||||
| 7827 | ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth); | ||||
| 7828 | V.buildTree(VL, ExternallyUsedValues, IgnoreList); | ||||
| 7829 | Optional<ArrayRef<unsigned>> Order = V.bestOrder(); | ||||
| 7830 | if (Order) { | ||||
| 7831 | assert(Order->size() == VL.size() &&((void)0) | ||||
| 7832 | "Order size must be the same as number of vectorized "((void)0) | ||||
| 7833 | "instructions.")((void)0); | ||||
| 7834 | // TODO: reorder tree nodes without tree rebuilding. | ||||
| 7835 | SmallVector<Value *, 4> ReorderedOps(VL.size()); | ||||
| 7836 | transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), | ||||
| 7837 | [VL](const unsigned Idx) { return VL[Idx]; }); | ||||
| 7838 | V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); | ||||
| 7839 | } | ||||
| 7840 | if (V.isTreeTinyAndNotFullyVectorizable()) | ||||
| 7841 | break; | ||||
| 7842 | if (V.isLoadCombineReductionCandidate(RdxKind)) | ||||
| 7843 | break; | ||||
| 7844 | |||||
| 7845 | // For a poison-safe boolean logic reduction, do not replace select | ||||
| 7846 | // instructions with logic ops. All reduced values will be frozen (see | ||||
| 7847 | // below) to prevent leaking poison. | ||||
| 7848 | if (isa<SelectInst>(ReductionRoot) && | ||||
| 7849 | isBoolLogicOp(cast<Instruction>(ReductionRoot)) && | ||||
| 7850 | NumReducedVals != ReduxWidth) | ||||
| 7851 | break; | ||||
| 7852 | |||||
| 7853 | V.computeMinimumValueSizes(); | ||||
| 7854 | |||||
| 7855 | // Estimate cost. | ||||
| 7856 | InstructionCost TreeCost = | ||||
| 7857 | V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); | ||||
| 7858 | InstructionCost ReductionCost = | ||||
| 7859 | getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); | ||||
| 7860 | InstructionCost Cost = TreeCost + ReductionCost; | ||||
| 7861 | if (!Cost.isValid()) { | ||||
| 7862 | LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n")do { } while (false); | ||||
| 7863 | return false; | ||||
| 7864 | } | ||||
| 7865 | if (Cost >= -SLPCostThreshold) { | ||||
| 7866 | V.getORE()->emit([&]() { | ||||
| 7867 | return OptimizationRemarkMissed(SV_NAME"slp-vectorizer", "HorSLPNotBeneficial", | ||||
| 7868 | cast<Instruction>(VL[0])) | ||||
| 7869 | << "Vectorizing horizontal reduction is possible" | ||||
| 7870 | << "but not beneficial with cost " << ore::NV("Cost", Cost) | ||||
| 7871 | << " and threshold " | ||||
| 7872 | << ore::NV("Threshold", -SLPCostThreshold); | ||||
| 7873 | }); | ||||
| 7874 | break; | ||||
| 7875 | } | ||||
| 7876 | |||||
| 7877 | LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"do { } while (false) | ||||
| 7878 | << Cost << ". (HorRdx)\n")do { } while (false); | ||||
| 7879 | V.getORE()->emit([&]() { | ||||
| 7880 | return OptimizationRemark(SV_NAME"slp-vectorizer", "VectorizedHorizontalReduction", | ||||
| 7881 | cast<Instruction>(VL[0])) | ||||
| 7882 | << "Vectorized horizontal reduction with cost " | ||||
| 7883 | << ore::NV("Cost", Cost) << " and with tree size " | ||||
| 7884 | << ore::NV("TreeSize", V.getTreeSize()); | ||||
| 7885 | }); | ||||
| 7886 | |||||
| 7887 | // Vectorize a tree. | ||||
| 7888 | DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); | ||||
| 7889 | Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); | ||||
| 7890 | |||||
| 7891 | // Emit a reduction. If the root is a select (min/max idiom), the insert | ||||
| 7892 | // point is the compare condition of that select. | ||||
| 7893 | Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); | ||||
| 7894 | if (isCmpSelMinMax(RdxRootInst)) | ||||
| 7895 | Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); | ||||
| 7896 | else | ||||
| 7897 | Builder.SetInsertPoint(RdxRootInst); | ||||
| 7898 | |||||
| 7899 | // To prevent poison from leaking across what used to be sequential, safe, | ||||
| 7900 | // scalar boolean logic operations, the reduction operand must be frozen. | ||||
| 7901 | if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst)) | ||||
| 7902 | VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); | ||||
| 7903 | |||||
| 7904 | Value *ReducedSubTree = | ||||
| 7905 | emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); | ||||
| 7906 | |||||
| 7907 | if (!VectorizedTree) { | ||||
| 7908 | // Initialize the final value in the reduction. | ||||
| 7909 | VectorizedTree = ReducedSubTree; | ||||
| 7910 | } else { | ||||
| 7911 | // Update the final value in the reduction. | ||||
| 7912 | Builder.SetCurrentDebugLocation(Loc); | ||||
| 7913 | VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, | ||||
| 7914 | ReducedSubTree, "op.rdx", ReductionOps); | ||||
| 7915 | } | ||||
| 7916 | i += ReduxWidth; | ||||
| 7917 | ReduxWidth = PowerOf2Floor(NumReducedVals - i); | ||||
| 7918 | } | ||||
| 7919 | |||||
| 7920 | if (VectorizedTree) { | ||||
| 7921 | // Finish the reduction. | ||||
| 7922 | for (; i < NumReducedVals; ++i) { | ||||
| 7923 | auto *I = cast<Instruction>(ReducedVals[i]); | ||||
| 7924 | Builder.SetCurrentDebugLocation(I->getDebugLoc()); | ||||
| 7925 | VectorizedTree = | ||||
| 7926 | createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps); | ||||
| 7927 | } | ||||
| 7928 | for (auto &Pair : ExternallyUsedValues) { | ||||
| 7929 | // Add each externally used value to the final reduction. | ||||
| 7930 | for (auto *I : Pair.second) { | ||||
| 7931 | Builder.SetCurrentDebugLocation(I->getDebugLoc()); | ||||
| 7932 | VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, | ||||
| 7933 | Pair.first, "op.extra", I); | ||||
| 7934 | } | ||||
| 7935 | } | ||||
| 7936 | |||||
| 7937 | ReductionRoot->replaceAllUsesWith(VectorizedTree); | ||||
| 7938 | |||||
| 7939 | // Mark all scalar reduction ops for deletion, they are replaced by the | ||||
| 7940 | // vector reductions. | ||||
| 7941 | V.eraseInstructions(IgnoreList); | ||||
| 7942 | } | ||||
| 7943 | return VectorizedTree != nullptr; | ||||
| 7944 | } | ||||
| 7945 | |||||
| 7946 | unsigned numReductionValues() const { return ReducedVals.size(); } | ||||
| 7947 | |||||
| 7948 | private: | ||||
| 7949 | /// Calculate the cost of a reduction. | ||||
| 7950 | InstructionCost getReductionCost(TargetTransformInfo *TTI, | ||||
| 7951 | Value *FirstReducedVal, unsigned ReduxWidth, | ||||
| 7952 | FastMathFlags FMF) { | ||||
| 7953 | Type *ScalarTy = FirstReducedVal->getType(); | ||||
| 7954 | FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth); | ||||
| 7955 | InstructionCost VectorCost, ScalarCost; | ||||
| 7956 | switch (RdxKind) { | ||||
| 7957 | case RecurKind::Add: | ||||
| 7958 | case RecurKind::Mul: | ||||
| 7959 | case RecurKind::Or: | ||||
| 7960 | case RecurKind::And: | ||||
| 7961 | case RecurKind::Xor: | ||||
| 7962 | case RecurKind::FAdd: | ||||
| 7963 | case RecurKind::FMul: { | ||||
| 7964 | unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); | ||||
| 7965 | VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF); | ||||
| 7966 | ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy); | ||||
| 7967 | break; | ||||
| 7968 | } | ||||
| 7969 | case RecurKind::FMax: | ||||
| 7970 | case RecurKind::FMin: { | ||||
| 7971 | auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); | ||||
| 7972 | VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, | ||||
| 7973 | /*unsigned=*/false); | ||||
| 7974 | ScalarCost = | ||||
| 7975 | TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) + | ||||
| 7976 | TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, | ||||
| 7977 | CmpInst::makeCmpResultType(ScalarTy)); | ||||
| 7978 | break; | ||||
| 7979 | } | ||||
| 7980 | case RecurKind::SMax: | ||||
| 7981 | case RecurKind::SMin: | ||||
| 7982 | case RecurKind::UMax: | ||||
| 7983 | case RecurKind::UMin: { | ||||
| 7984 | auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy)); | ||||
| 7985 | bool IsUnsigned = | ||||
| 7986 | RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin; | ||||
| 7987 | VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned); | ||||
| 7988 | ScalarCost = | ||||
| 7989 | TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) + | ||||
| 7990 | TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, | ||||
| 7991 | CmpInst::makeCmpResultType(ScalarTy)); | ||||
| 7992 | break; | ||||
| 7993 | } | ||||
| 7994 | default: | ||||
| 7995 | llvm_unreachable("Expected arithmetic or min/max reduction operation")__builtin_unreachable(); | ||||
| 7996 | } | ||||
| 7997 | |||||
| 7998 | // Scalar cost is repeated for N-1 elements. | ||||
| 7999 | ScalarCost *= (ReduxWidth - 1); | ||||
| 8000 | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCostdo { } while (false) | ||||
| 8001 | << " for reduction that starts with " << *FirstReducedValdo { } while (false) | ||||
| 8002 | << " (It is a splitting reduction)\n")do { } while (false); | ||||
| 8003 | return VectorCost - ScalarCost; | ||||
| 8004 | } | ||||
| 8005 | |||||
| 8006 | /// Emit a horizontal reduction of the vectorized value. | ||||
| 8007 | Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, | ||||
| 8008 | unsigned ReduxWidth, const TargetTransformInfo *TTI) { | ||||
| 8009 | assert(VectorizedValue && "Need to have a vectorized tree node")((void)0); | ||||
| 8010 | assert(isPowerOf2_32(ReduxWidth) &&((void)0) | ||||
| 8011 | "We only handle power-of-two reductions for now")((void)0); | ||||
| 8012 | |||||
| 8013 | return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, | ||||
| 8014 | ReductionOps.back()); | ||||
| 8015 | } | ||||
| 8016 | }; | ||||
| 8017 | |||||
| 8018 | } // end anonymous namespace | ||||
| 8019 | |||||
| 8020 | static Optional<unsigned> getAggregateSize(Instruction *InsertInst) { | ||||
| 8021 | if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) | ||||
| 8022 | return cast<FixedVectorType>(IE->getType())->getNumElements(); | ||||
| 8023 | |||||
| 8024 | unsigned AggregateSize = 1; | ||||
| 8025 | auto *IV = cast<InsertValueInst>(InsertInst); | ||||
| 8026 | Type *CurrentType = IV->getType(); | ||||
| 8027 | do { | ||||
| 8028 | if (auto *ST = dyn_cast<StructType>(CurrentType)) { | ||||
| 8029 | for (auto *Elt : ST->elements()) | ||||
| 8030 | if (Elt != ST->getElementType(0)) // check homogeneity | ||||
| 8031 | return None; | ||||
| 8032 | AggregateSize *= ST->getNumElements(); | ||||
| 8033 | CurrentType = ST->getElementType(0); | ||||
| 8034 | } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { | ||||
| 8035 | AggregateSize *= AT->getNumElements(); | ||||
| 8036 | CurrentType = AT->getElementType(); | ||||
| 8037 | } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) { | ||||
| 8038 | AggregateSize *= VT->getNumElements(); | ||||
| 8039 | return AggregateSize; | ||||
| 8040 | } else if (CurrentType->isSingleValueType()) { | ||||
| 8041 | return AggregateSize; | ||||
| 8042 | } else { | ||||
| 8043 | return None; | ||||
| 8044 | } | ||||
| 8045 | } while (true); | ||||
| 8046 | } | ||||
| 8047 | |||||
| 8048 | static bool findBuildAggregate_rec(Instruction *LastInsertInst, | ||||
| 8049 | TargetTransformInfo *TTI, | ||||
| 8050 | SmallVectorImpl<Value *> &BuildVectorOpds, | ||||
| 8051 | SmallVectorImpl<Value *> &InsertElts, | ||||
| 8052 | unsigned OperandOffset) { | ||||
| 8053 | do { | ||||
| 8054 | Value *InsertedOperand = LastInsertInst->getOperand(1); | ||||
| 8055 | Optional<int> OperandIndex = getInsertIndex(LastInsertInst, OperandOffset); | ||||
| 8056 | if (!OperandIndex) | ||||
| 8057 | return false; | ||||
| 8058 | if (isa<InsertElementInst>(InsertedOperand) || | ||||
| 8059 | isa<InsertValueInst>(InsertedOperand)) { | ||||
| 8060 | if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI, | ||||
| 8061 | BuildVectorOpds, InsertElts, *OperandIndex)) | ||||
| 8062 | return false; | ||||
| 8063 | } else { | ||||
| 8064 | BuildVectorOpds[*OperandIndex] = InsertedOperand; | ||||
| 8065 | InsertElts[*OperandIndex] = LastInsertInst; | ||||
| 8066 | } | ||||
| 8067 | LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0)); | ||||
| 8068 | } while (LastInsertInst != nullptr && | ||||
| 8069 | (isa<InsertValueInst>(LastInsertInst) || | ||||
| 8070 | isa<InsertElementInst>(LastInsertInst)) && | ||||
| 8071 | LastInsertInst->hasOneUse()); | ||||
| 8072 | return true; | ||||
| 8073 | } | ||||
| 8074 | |||||
| 8075 | /// Recognize construction of vectors like | ||||
| 8076 | /// %ra = insertelement <4 x float> poison, float %s0, i32 0 | ||||
| 8077 | /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 | ||||
| 8078 | /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 | ||||
| 8079 | /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 | ||||
| 8080 | /// starting from the last insertelement or insertvalue instruction. | ||||
| 8081 | /// | ||||
| 8082 | /// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, | ||||
| 8083 | /// {{float, float}, {float, float}}, [2 x {float, float}] and so on. | ||||
| 8084 | /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. | ||||
| 8085 | /// | ||||
| 8086 | /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. | ||||
| 8087 | /// | ||||
| 8088 | /// \return true if it matches. | ||||
| 8089 | static bool findBuildAggregate(Instruction *LastInsertInst, | ||||
| 8090 | TargetTransformInfo *TTI, | ||||
| 8091 | SmallVectorImpl<Value *> &BuildVectorOpds, | ||||
| 8092 | SmallVectorImpl<Value *> &InsertElts) { | ||||
| 8093 | |||||
| 8094 | assert((isa<InsertElementInst>(LastInsertInst) ||((void)0) | ||||
| 8095 | isa<InsertValueInst>(LastInsertInst)) &&((void)0) | ||||
| 8096 | "Expected insertelement or insertvalue instruction!")((void)0); | ||||
| 8097 | |||||
| 8098 | assert((BuildVectorOpds.empty() && InsertElts.empty()) &&((void)0) | ||||
| 8099 | "Expected empty result vectors!")((void)0); | ||||
| 8100 | |||||
| 8101 | Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst); | ||||
| 8102 | if (!AggregateSize) | ||||
| 8103 | return false; | ||||
| 8104 | BuildVectorOpds.resize(*AggregateSize); | ||||
| 8105 | InsertElts.resize(*AggregateSize); | ||||
| 8106 | |||||
| 8107 | if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, | ||||
| 8108 | 0)) { | ||||
| 8109 | llvm::erase_value(BuildVectorOpds, nullptr); | ||||
| 8110 | llvm::erase_value(InsertElts, nullptr); | ||||
| 8111 | if (BuildVectorOpds.size() >= 2) | ||||
| 8112 | return true; | ||||
| 8113 | } | ||||
| 8114 | |||||
| 8115 | return false; | ||||
| 8116 | } | ||||
| 8117 | |||||
| 8118 | /// Try and get a reduction value from a phi node. | ||||
| 8119 | /// | ||||
| 8120 | /// Given a phi node \p P in a block \p ParentBB, consider possible reductions | ||||
| 8121 | /// if they come from either \p ParentBB or a containing loop latch. | ||||
| 8122 | /// | ||||
| 8123 | /// \returns A candidate reduction value if possible, or \code nullptr \endcode | ||||
| 8124 | /// if not possible. | ||||
| 8125 | static Value *getReductionValue(const DominatorTree *DT, PHINode *P, | ||||
| 8126 | BasicBlock *ParentBB, LoopInfo *LI) { | ||||
| 8127 | // There are situations where the reduction value is not dominated by the | ||||
| 8128 | // reduction phi. Vectorizing such cases has been reported to cause | ||||
| 8129 | // miscompiles. See PR25787. | ||||
| 8130 | auto DominatedReduxValue = [&](Value *R) { | ||||
| 8131 | return isa<Instruction>(R) && | ||||
| 8132 | DT->dominates(P->getParent(), cast<Instruction>(R)->getParent()); | ||||
| 8133 | }; | ||||
| 8134 | |||||
| 8135 | Value *Rdx = nullptr; | ||||
| 8136 | |||||
| 8137 | // Return the incoming value if it comes from the same BB as the phi node. | ||||
| 8138 | if (P->getIncomingBlock(0) == ParentBB) { | ||||
| 8139 | Rdx = P->getIncomingValue(0); | ||||
| 8140 | } else if (P->getIncomingBlock(1) == ParentBB) { | ||||
| 8141 | Rdx = P->getIncomingValue(1); | ||||
| 8142 | } | ||||
| 8143 | |||||
| 8144 | if (Rdx && DominatedReduxValue(Rdx)) | ||||
| 8145 | return Rdx; | ||||
| 8146 | |||||
| 8147 | // Otherwise, check whether we have a loop latch to look at. | ||||
| 8148 | Loop *BBL = LI->getLoopFor(ParentBB); | ||||
| 8149 | if (!BBL) | ||||
| 8150 | return nullptr; | ||||
| 8151 | BasicBlock *BBLatch = BBL->getLoopLatch(); | ||||
| 8152 | if (!BBLatch) | ||||
| 8153 | return nullptr; | ||||
| 8154 | |||||
| 8155 | // There is a loop latch, return the incoming value if it comes from | ||||
| 8156 | // that. This reduction pattern occasionally turns up. | ||||
| 8157 | if (P->getIncomingBlock(0) == BBLatch) { | ||||
| 8158 | Rdx = P->getIncomingValue(0); | ||||
| 8159 | } else if (P->getIncomingBlock(1) == BBLatch) { | ||||
| 8160 | Rdx = P->getIncomingValue(1); | ||||
| 8161 | } | ||||
| 8162 | |||||
| 8163 | if (Rdx && DominatedReduxValue(Rdx)) | ||||
| 8164 | return Rdx; | ||||
| 8165 | |||||
| 8166 | return nullptr; | ||||
| 8167 | } | ||||
| 8168 | |||||
| 8169 | static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { | ||||
| 8170 | if (match(I, m_BinOp(m_Value(V0), m_Value(V1)))) | ||||
| 8171 | return true; | ||||
| 8172 | if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1)))) | ||||
| 8173 | return true; | ||||
| 8174 | if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)))) | ||||
| 8175 | return true; | ||||
| 8176 | if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1)))) | ||||
| 8177 | return true; | ||||
| 8178 | if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1)))) | ||||
| 8179 | return true; | ||||
| 8180 | if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1)))) | ||||
| 8181 | return true; | ||||
| 8182 | if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1)))) | ||||
| 8183 | return true; | ||||
| 8184 | return false; | ||||
| 8185 | } | ||||
| 8186 | |||||
| 8187 | /// Attempt to reduce a horizontal reduction. | ||||
| 8188 | /// If it is legal to match a horizontal reduction feeding the phi node \a P | ||||
| 8189 | /// with reduction operators \a Root (or one of its operands) in a basic block | ||||
| 8190 | /// \a BB, then check if it can be done. If horizontal reduction is not found | ||||
| 8191 | /// and root instruction is a binary operation, vectorization of the operands is | ||||
| 8192 | /// attempted. | ||||
| 8193 | /// \returns true if a horizontal reduction was matched and reduced or operands | ||||
| 8194 | /// of one of the binary instruction were vectorized. | ||||
| 8195 | /// \returns false if a horizontal reduction was not matched (or not possible) | ||||
| 8196 | /// or no vectorization of any binary operation feeding \a Root instruction was | ||||
| 8197 | /// performed. | ||||
| 8198 | static bool tryToVectorizeHorReductionOrInstOperands( | ||||
| 8199 | PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, | ||||
| 8200 | TargetTransformInfo *TTI, | ||||
| 8201 | const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) { | ||||
| 8202 | if (!ShouldVectorizeHor) | ||||
| 8203 | return false; | ||||
| 8204 | |||||
| 8205 | if (!Root) | ||||
| 8206 | return false; | ||||
| 8207 | |||||
| 8208 | if (Root->getParent() != BB || isa<PHINode>(Root)) | ||||
| 8209 | return false; | ||||
| 8210 | // Start analysis starting from Root instruction. If horizontal reduction is | ||||
| 8211 | // found, try to vectorize it. If it is not a horizontal reduction or | ||||
| 8212 | // vectorization is not possible or not effective, and currently analyzed | ||||
| 8213 | // instruction is a binary operation, try to vectorize the operands, using | ||||
| 8214 | // pre-order DFS traversal order. If the operands were not vectorized, repeat | ||||
| 8215 | // the same procedure considering each operand as a possible root of the | ||||
| 8216 | // horizontal reduction. | ||||
| 8217 | // Interrupt the process if the Root instruction itself was vectorized or all | ||||
| 8218 | // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. | ||||
| 8219 | // Skip the analysis of CmpInsts.Compiler implements postanalysis of the | ||||
| 8220 | // CmpInsts so we can skip extra attempts in | ||||
| 8221 | // tryToVectorizeHorReductionOrInstOperands and save compile time. | ||||
| 8222 | SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0}); | ||||
| 8223 | SmallPtrSet<Value *, 8> VisitedInstrs; | ||||
| 8224 | bool Res = false; | ||||
| 8225 | while (!Stack.empty()) { | ||||
| 8226 | Instruction *Inst; | ||||
| 8227 | unsigned Level; | ||||
| 8228 | std::tie(Inst, Level) = Stack.pop_back_val(); | ||||
| 8229 | // Do not try to analyze instruction that has already been vectorized. | ||||
| 8230 | // This may happen when we vectorize instruction operands on a previous | ||||
| 8231 | // iteration while stack was populated before that happened. | ||||
| 8232 | if (R.isDeleted(Inst)) | ||||
| 8233 | continue; | ||||
| 8234 | Value *B0, *B1; | ||||
| 8235 | bool IsBinop = matchRdxBop(Inst, B0, B1); | ||||
| 8236 | bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); | ||||
| 8237 | if (IsBinop || IsSelect) { | ||||
| 8238 | HorizontalReduction HorRdx; | ||||
| 8239 | if (HorRdx.matchAssociativeReduction(P, Inst)) { | ||||
| 8240 | if (HorRdx.tryToReduce(R, TTI)) { | ||||
| 8241 | Res = true; | ||||
| 8242 | // Set P to nullptr to avoid re-analysis of phi node in | ||||
| 8243 | // matchAssociativeReduction function unless this is the root node. | ||||
| 8244 | P = nullptr; | ||||
| 8245 | continue; | ||||
| 8246 | } | ||||
| 8247 | } | ||||
| 8248 | if (P && IsBinop) { | ||||
| 8249 | Inst = dyn_cast<Instruction>(B0); | ||||
| 8250 | if (Inst == P) | ||||
| 8251 | Inst = dyn_cast<Instruction>(B1); | ||||
| 8252 | if (!Inst) { | ||||
| 8253 | // Set P to nullptr to avoid re-analysis of phi node in | ||||
| 8254 | // matchAssociativeReduction function unless this is the root node. | ||||
| 8255 | P = nullptr; | ||||
| 8256 | continue; | ||||
| 8257 | } | ||||
| 8258 | } | ||||
| 8259 | } | ||||
| 8260 | // Set P to nullptr to avoid re-analysis of phi node in | ||||
| 8261 | // matchAssociativeReduction function unless this is the root node. | ||||
| 8262 | P = nullptr; | ||||
| 8263 | // Do not try to vectorize CmpInst operands, this is done separately. | ||||
| 8264 | if (!isa<CmpInst>(Inst) && Vectorize(Inst, R)) { | ||||
| 8265 | Res = true; | ||||
| 8266 | continue; | ||||
| 8267 | } | ||||
| 8268 | |||||
| 8269 | // Try to vectorize operands. | ||||
| 8270 | // Continue analysis for the instruction from the same basic block only to | ||||
| 8271 | // save compile time. | ||||
| 8272 | if (++Level < RecursionMaxDepth) | ||||
| 8273 | for (auto *Op : Inst->operand_values()) | ||||
| 8274 | if (VisitedInstrs.insert(Op).second) | ||||
| 8275 | if (auto *I = dyn_cast<Instruction>(Op)) | ||||
| 8276 | // Do not try to vectorize CmpInst operands, this is done | ||||
| 8277 | // separately. | ||||
| 8278 | if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) && | ||||
| 8279 | I->getParent() == BB) | ||||
| 8280 | Stack.emplace_back(I, Level); | ||||
| 8281 | } | ||||
| 8282 | return Res; | ||||
| 8283 | } | ||||
| 8284 | |||||
| 8285 | bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, | ||||
| 8286 | BasicBlock *BB, BoUpSLP &R, | ||||
| 8287 | TargetTransformInfo *TTI) { | ||||
| 8288 | auto *I = dyn_cast_or_null<Instruction>(V); | ||||
| 8289 | if (!I) | ||||
| 8290 | return false; | ||||
| 8291 | |||||
| 8292 | if (!isa<BinaryOperator>(I)) | ||||
| 8293 | P = nullptr; | ||||
| 8294 | // Try to match and vectorize a horizontal reduction. | ||||
| 8295 | auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { | ||||
| 8296 | return tryToVectorize(I, R); | ||||
| 8297 | }; | ||||
| 8298 | return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, | ||||
| 8299 | ExtraVectorization); | ||||
| 8300 | } | ||||
| 8301 | |||||
| 8302 | bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, | ||||
| 8303 | BasicBlock *BB, BoUpSLP &R) { | ||||
| 8304 | const DataLayout &DL = BB->getModule()->getDataLayout(); | ||||
| 8305 | if (!R.canMapToVector(IVI->getType(), DL)) | ||||
| 8306 | return false; | ||||
| 8307 | |||||
| 8308 | SmallVector<Value *, 16> BuildVectorOpds; | ||||
| 8309 | SmallVector<Value *, 16> BuildVectorInsts; | ||||
| 8310 | if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) | ||||
| 8311 | return false; | ||||
| 8312 | |||||
| 8313 | LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n")do { } while (false); | ||||
| 8314 | // Aggregate value is unlikely to be processed in vector register, we need to | ||||
| 8315 | // extract scalars into scalar registers, so NeedExtraction is set true. | ||||
| 8316 | return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false); | ||||
| 8317 | } | ||||
| 8318 | |||||
| 8319 | bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, | ||||
| 8320 | BasicBlock *BB, BoUpSLP &R) { | ||||
| 8321 | SmallVector<Value *, 16> BuildVectorInsts; | ||||
| 8322 | SmallVector<Value *, 16> BuildVectorOpds; | ||||
| 8323 | SmallVector<int> Mask; | ||||
| 8324 | if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || | ||||
| 8325 | (llvm::all_of(BuildVectorOpds, | ||||
| 8326 | [](Value *V) { return isa<ExtractElementInst>(V); }) && | ||||
| 8327 | isShuffle(BuildVectorOpds, Mask))) | ||||
| 8328 | return false; | ||||
| 8329 | |||||
| 8330 | LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n")do { } while (false); | ||||
| 8331 | return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/true); | ||||
| 8332 | } | ||||
| 8333 | |||||
| 8334 | bool SLPVectorizerPass::vectorizeSimpleInstructions( | ||||
| 8335 | SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, | ||||
| 8336 | bool AtTerminator) { | ||||
| 8337 | bool OpsChanged = false; | ||||
| 8338 | SmallVector<Instruction *, 4> PostponedCmps; | ||||
| 8339 | for (auto *I : reverse(Instructions)) { | ||||
| 8340 | if (R.isDeleted(I)) | ||||
| 8341 | continue; | ||||
| 8342 | if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) | ||||
| 8343 | OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); | ||||
| 8344 | else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) | ||||
| 8345 | OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); | ||||
| 8346 | else if (isa<CmpInst>(I)) | ||||
| 8347 | PostponedCmps.push_back(I); | ||||
| 8348 | } | ||||
| 8349 | if (AtTerminator) { | ||||
| 8350 | // Try to find reductions first. | ||||
| 8351 | for (Instruction *I : PostponedCmps) { | ||||
| 8352 | if (R.isDeleted(I)) | ||||
| 8353 | continue; | ||||
| 8354 | for (Value *Op : I->operands()) | ||||
| 8355 | OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI); | ||||
| 8356 | } | ||||
| 8357 | // Try to vectorize operands as vector bundles. | ||||
| 8358 | for (Instruction *I : PostponedCmps) { | ||||
| 8359 | if (R.isDeleted(I)) | ||||
| 8360 | continue; | ||||
| 8361 | OpsChanged |= tryToVectorize(I, R); | ||||
| 8362 | } | ||||
| 8363 | Instructions.clear(); | ||||
| 8364 | } else { | ||||
| 8365 | // Insert in reverse order since the PostponedCmps vector was filled in | ||||
| 8366 | // reverse order. | ||||
| 8367 | Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend()); | ||||
| 8368 | } | ||||
| 8369 | return OpsChanged; | ||||
| 8370 | } | ||||
| 8371 | |||||
| 8372 | bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { | ||||
| 8373 | bool Changed = false; | ||||
| 8374 | SmallVector<Value *, 4> Incoming; | ||||
| 8375 | SmallPtrSet<Value *, 16> VisitedInstrs; | ||||
| 8376 | // Maps phi nodes to the non-phi nodes found in the use tree for each phi | ||||
| 8377 | // node. Allows better to identify the chains that can be vectorized in the | ||||
| 8378 | // better way. | ||||
| 8379 | DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes; | ||||
| 8380 | |||||
| 8381 | bool HaveVectorizedPhiNodes = true; | ||||
| 8382 | while (HaveVectorizedPhiNodes) { | ||||
| 8383 | HaveVectorizedPhiNodes = false; | ||||
| 8384 | |||||
| 8385 | // Collect the incoming values from the PHIs. | ||||
| 8386 | Incoming.clear(); | ||||
| 8387 | for (Instruction &I : *BB) { | ||||
| 8388 | PHINode *P = dyn_cast<PHINode>(&I); | ||||
| 8389 | if (!P) | ||||
| 8390 | break; | ||||
| 8391 | |||||
| 8392 | // No need to analyze deleted, vectorized and non-vectorizable | ||||
| 8393 | // instructions. | ||||
| 8394 | if (!VisitedInstrs.count(P) && !R.isDeleted(P) && | ||||
| 8395 | isValidElementType(P->getType())) | ||||
| 8396 | Incoming.push_back(P); | ||||
| 8397 | } | ||||
| 8398 | |||||
| 8399 | // Find the corresponding non-phi nodes for better matching when trying to | ||||
| 8400 | // build the tree. | ||||
| 8401 | for (Value *V : Incoming) { | ||||
| 8402 | SmallVectorImpl<Value *> &Opcodes = | ||||
| 8403 | PHIToOpcodes.try_emplace(V).first->getSecond(); | ||||
| 8404 | if (!Opcodes.empty()) | ||||
| 8405 | continue; | ||||
| 8406 | SmallVector<Value *, 4> Nodes(1, V); | ||||
| 8407 | SmallPtrSet<Value *, 4> Visited; | ||||
| 8408 | while (!Nodes.empty()) { | ||||
| 8409 | auto *PHI = cast<PHINode>(Nodes.pop_back_val()); | ||||
| 8410 | if (!Visited.insert(PHI).second) | ||||
| 8411 | continue; | ||||
| 8412 | for (Value *V : PHI->incoming_values()) { | ||||
| 8413 | if (auto *PHI1 = dyn_cast<PHINode>((V))) { | ||||
| 8414 | Nodes.push_back(PHI1); | ||||
| 8415 | continue; | ||||
| 8416 | } | ||||
| 8417 | Opcodes.emplace_back(V); | ||||
| 8418 | } | ||||
| 8419 | } | ||||
| 8420 | } | ||||
| 8421 | |||||
| 8422 | // Sort by type, parent, operands. | ||||
| 8423 | stable_sort(Incoming, [this, &PHIToOpcodes](Value *V1, Value *V2) { | ||||
| 8424 | assert(isValidElementType(V1->getType()) &&((void)0) | ||||
| 8425 | isValidElementType(V2->getType()) &&((void)0) | ||||
| 8426 | "Expected vectorizable types only.")((void)0); | ||||
| 8427 | // It is fine to compare type IDs here, since we expect only vectorizable | ||||
| 8428 | // types, like ints, floats and pointers, we don't care about other type. | ||||
| 8429 | if (V1->getType()->getTypeID() < V2->getType()->getTypeID()) | ||||
| 8430 | return true; | ||||
| 8431 | if (V1->getType()->getTypeID() > V2->getType()->getTypeID()) | ||||
| 8432 | return false; | ||||
| 8433 | ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; | ||||
| 8434 | ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; | ||||
| 8435 | if (Opcodes1.size() < Opcodes2.size()) | ||||
| 8436 | return true; | ||||
| 8437 | if (Opcodes1.size() > Opcodes2.size()) | ||||
| 8438 | return false; | ||||
| 8439 | for (int I = 0, E = Opcodes1.size(); I < E; ++I) { | ||||
| 8440 | // Undefs are compatible with any other value. | ||||
| 8441 | if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) | ||||
| 8442 | continue; | ||||
| 8443 | if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) | ||||
| 8444 | if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { | ||||
| 8445 | DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); | ||||
| 8446 | DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent()); | ||||
| 8447 | if (!NodeI1) | ||||
| 8448 | return NodeI2 != nullptr; | ||||
| 8449 | if (!NodeI2) | ||||
| 8450 | return false; | ||||
| 8451 | assert((NodeI1 == NodeI2) ==((void)0) | ||||
| 8452 | (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&((void)0) | ||||
| 8453 | "Different nodes should have different DFS numbers")((void)0); | ||||
| 8454 | if (NodeI1 != NodeI2) | ||||
| 8455 | return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); | ||||
| 8456 | InstructionsState S = getSameOpcode({I1, I2}); | ||||
| 8457 | if (S.getOpcode()) | ||||
| 8458 | continue; | ||||
| 8459 | return I1->getOpcode() < I2->getOpcode(); | ||||
| 8460 | } | ||||
| 8461 | if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) | ||||
| 8462 | continue; | ||||
| 8463 | if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) | ||||
| 8464 | return true; | ||||
| 8465 | if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) | ||||
| 8466 | return false; | ||||
| 8467 | } | ||||
| 8468 | return false; | ||||
| 8469 | }); | ||||
| 8470 | |||||
| 8471 | auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { | ||||
| 8472 | if (V1 == V2) | ||||
| 8473 | return true; | ||||
| 8474 | if (V1->getType() != V2->getType()) | ||||
| 8475 | return false; | ||||
| 8476 | ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; | ||||
| 8477 | ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; | ||||
| 8478 | if (Opcodes1.size() != Opcodes2.size()) | ||||
| 8479 | return false; | ||||
| 8480 | for (int I = 0, E = Opcodes1.size(); I < E; ++I) { | ||||
| 8481 | // Undefs are compatible with any other value. | ||||
| 8482 | if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) | ||||
| 8483 | continue; | ||||
| 8484 | if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) | ||||
| 8485 | if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { | ||||
| 8486 | if (I1->getParent() != I2->getParent()) | ||||
| 8487 | return false; | ||||
| 8488 | InstructionsState S = getSameOpcode({I1, I2}); | ||||
| 8489 | if (S.getOpcode()) | ||||
| 8490 | continue; | ||||
| 8491 | return false; | ||||
| 8492 | } | ||||
| 8493 | if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) | ||||
| 8494 | continue; | ||||
| 8495 | if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID()) | ||||
| 8496 | return false; | ||||
| 8497 | } | ||||
| 8498 | return true; | ||||
| 8499 | }; | ||||
| 8500 | |||||
| 8501 | // Try to vectorize elements base on their type. | ||||
| 8502 | SmallVector<Value *, 4> Candidates; | ||||
| 8503 | for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(), | ||||
| 8504 | E = Incoming.end(); | ||||
| 8505 | IncIt != E;) { | ||||
| 8506 | |||||
| 8507 | // Look for the next elements with the same type, parent and operand | ||||
| 8508 | // kinds. | ||||
| 8509 | SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; | ||||
| 8510 | while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) { | ||||
| 8511 | VisitedInstrs.insert(*SameTypeIt); | ||||
| 8512 | ++SameTypeIt; | ||||
| 8513 | } | ||||
| 8514 | |||||
| 8515 | // Try to vectorize them. | ||||
| 8516 | unsigned NumElts = (SameTypeIt - IncIt); | ||||
| 8517 | LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("do { } while (false) | ||||
| 8518 | << NumElts << ")\n")do { } while (false); | ||||
| 8519 | // The order in which the phi nodes appear in the program does not matter. | ||||
| 8520 | // So allow tryToVectorizeList to reorder them if it is beneficial. This | ||||
| 8521 | // is done when there are exactly two elements since tryToVectorizeList | ||||
| 8522 | // asserts that there are only two values when AllowReorder is true. | ||||
| 8523 | if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, | ||||
| 8524 | /*AllowReorder=*/true)) { | ||||
| 8525 | // Success start over because instructions might have been changed. | ||||
| 8526 | HaveVectorizedPhiNodes = true; | ||||
| 8527 | Changed = true; | ||||
| 8528 | } else if (NumElts < 4 && | ||||
| 8529 | (Candidates.empty() || | ||||
| 8530 | Candidates.front()->getType() == (*IncIt)->getType())) { | ||||
| 8531 | Candidates.append(IncIt, std::next(IncIt, NumElts)); | ||||
| 8532 | } | ||||
| 8533 | // Final attempt to vectorize phis with the same types. | ||||
| 8534 | if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) { | ||||
| 8535 | if (Candidates.size() > 1 && | ||||
| 8536 | tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) { | ||||
| 8537 | // Success start over because instructions might have been changed. | ||||
| 8538 | HaveVectorizedPhiNodes = true; | ||||
| 8539 | Changed = true; | ||||
| 8540 | } | ||||
| 8541 | Candidates.clear(); | ||||
| 8542 | } | ||||
| 8543 | |||||
| 8544 | // Start over at the next instruction of a different type (or the end). | ||||
| 8545 | IncIt = SameTypeIt; | ||||
| 8546 | } | ||||
| 8547 | } | ||||
| 8548 | |||||
| 8549 | VisitedInstrs.clear(); | ||||
| 8550 | |||||
| 8551 | SmallVector<Instruction *, 8> PostProcessInstructions; | ||||
| 8552 | SmallDenseSet<Instruction *, 4> KeyNodes; | ||||
| 8553 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { | ||||
| 8554 | // Skip instructions with scalable type. The num of elements is unknown at | ||||
| 8555 | // compile-time for scalable type. | ||||
| 8556 | if (isa<ScalableVectorType>(it->getType())) | ||||
| 8557 | continue; | ||||
| 8558 | |||||
| 8559 | // Skip instructions marked for the deletion. | ||||
| 8560 | if (R.isDeleted(&*it)) | ||||
| 8561 | continue; | ||||
| 8562 | // We may go through BB multiple times so skip the one we have checked. | ||||
| 8563 | if (!VisitedInstrs.insert(&*it).second) { | ||||
| 8564 | if (it->use_empty() && KeyNodes.contains(&*it) && | ||||
| 8565 | vectorizeSimpleInstructions(PostProcessInstructions, BB, R, | ||||
| 8566 | it->isTerminator())) { | ||||
| 8567 | // We would like to start over since some instructions are deleted | ||||
| 8568 | // and the iterator may become invalid value. | ||||
| 8569 | Changed = true; | ||||
| 8570 | it = BB->begin(); | ||||
| 8571 | e = BB->end(); | ||||
| 8572 | } | ||||
| 8573 | continue; | ||||
| 8574 | } | ||||
| 8575 | |||||
| 8576 | if (isa<DbgInfoIntrinsic>(it)) | ||||
| 8577 | continue; | ||||
| 8578 | |||||
| 8579 | // Try to vectorize reductions that use PHINodes. | ||||
| 8580 | if (PHINode *P = dyn_cast<PHINode>(it)) { | ||||
| 8581 | // Check that the PHI is a reduction PHI. | ||||
| 8582 | if (P->getNumIncomingValues() == 2) { | ||||
| 8583 | // Try to match and vectorize a horizontal reduction. | ||||
| 8584 | if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, | ||||
| 8585 | TTI)) { | ||||
| 8586 | Changed = true; | ||||
| 8587 | it = BB->begin(); | ||||
| 8588 | e = BB->end(); | ||||
| 8589 | continue; | ||||
| 8590 | } | ||||
| 8591 | } | ||||
| 8592 | // Try to vectorize the incoming values of the PHI, to catch reductions | ||||
| 8593 | // that feed into PHIs. | ||||
| 8594 | for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) { | ||||
| 8595 | // Skip if the incoming block is the current BB for now. Also, bypass | ||||
| 8596 | // unreachable IR for efficiency and to avoid crashing. | ||||
| 8597 | // TODO: Collect the skipped incoming values and try to vectorize them | ||||
| 8598 | // after processing BB. | ||||
| 8599 | if (BB == P->getIncomingBlock(I) || | ||||
| 8600 | !DT->isReachableFromEntry(P->getIncomingBlock(I))) | ||||
| 8601 | continue; | ||||
| 8602 | |||||
| 8603 | Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I), | ||||
| 8604 | P->getIncomingBlock(I), R, TTI); | ||||
| 8605 | } | ||||
| 8606 | continue; | ||||
| 8607 | } | ||||
| 8608 | |||||
| 8609 | // Ran into an instruction without users, like terminator, or function call | ||||
| 8610 | // with ignored return value, store. Ignore unused instructions (basing on | ||||
| 8611 | // instruction type, except for CallInst and InvokeInst). | ||||
| 8612 | if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) || | ||||
| 8613 | isa<InvokeInst>(it))) { | ||||
| 8614 | KeyNodes.insert(&*it); | ||||
| 8615 | bool OpsChanged = false; | ||||
| 8616 | if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) { | ||||
| 8617 | for (auto *V : it->operand_values()) { | ||||
| 8618 | // Try to match and vectorize a horizontal reduction. | ||||
| 8619 | OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); | ||||
| 8620 | } | ||||
| 8621 | } | ||||
| 8622 | // Start vectorization of post-process list of instructions from the | ||||
| 8623 | // top-tree instructions to try to vectorize as many instructions as | ||||
| 8624 | // possible. | ||||
| 8625 | OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R, | ||||
| 8626 | it->isTerminator()); | ||||
| 8627 | if (OpsChanged) { | ||||
| 8628 | // We would like to start over since some instructions are deleted | ||||
| 8629 | // and the iterator may become invalid value. | ||||
| 8630 | Changed = true; | ||||
| 8631 | it = BB->begin(); | ||||
| 8632 | e = BB->end(); | ||||
| 8633 | continue; | ||||
| 8634 | } | ||||
| 8635 | } | ||||
| 8636 | |||||
| 8637 | if (isa<InsertElementInst>(it) || isa<CmpInst>(it) || | ||||
| 8638 | isa<InsertValueInst>(it)) | ||||
| 8639 | PostProcessInstructions.push_back(&*it); | ||||
| 8640 | } | ||||
| 8641 | |||||
| 8642 | return Changed; | ||||
| 8643 | } | ||||
| 8644 | |||||
| 8645 | bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { | ||||
| 8646 | auto Changed = false; | ||||
| 8647 | for (auto &Entry : GEPs) { | ||||
| 8648 | // If the getelementptr list has fewer than two elements, there's nothing | ||||
| 8649 | // to do. | ||||
| 8650 | if (Entry.second.size() < 2) | ||||
| 8651 | continue; | ||||
| 8652 | |||||
| 8653 | LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "do { } while (false) | ||||
| 8654 | << Entry.second.size() << ".\n")do { } while (false); | ||||
| 8655 | |||||
| 8656 | // Process the GEP list in chunks suitable for the target's supported | ||||
| 8657 | // vector size. If a vector register can't hold 1 element, we are done. We | ||||
| 8658 | // are trying to vectorize the index computations, so the maximum number of | ||||
| 8659 | // elements is based on the size of the index expression, rather than the | ||||
| 8660 | // size of the GEP itself (the target's pointer size). | ||||
| 8661 | unsigned MaxVecRegSize = R.getMaxVecRegSize(); | ||||
| 8662 | unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin()); | ||||
| 8663 | if (MaxVecRegSize < EltSize) | ||||
| 8664 | continue; | ||||
| 8665 | |||||
| 8666 | unsigned MaxElts = MaxVecRegSize / EltSize; | ||||
| 8667 | for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { | ||||
| 8668 | auto Len = std::min<unsigned>(BE - BI, MaxElts); | ||||
| 8669 | ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len); | ||||
| 8670 | |||||
| 8671 | // Initialize a set a candidate getelementptrs. Note that we use a | ||||
| 8672 | // SetVector here to preserve program order. If the index computations | ||||
| 8673 | // are vectorizable and begin with loads, we want to minimize the chance | ||||
| 8674 | // of having to reorder them later. | ||||
| 8675 | SetVector<Value *> Candidates(GEPList.begin(), GEPList.end()); | ||||
| 8676 | |||||
| 8677 | // Some of the candidates may have already been vectorized after we | ||||
| 8678 | // initially collected them. If so, they are marked as deleted, so remove | ||||
| 8679 | // them from the set of candidates. | ||||
| 8680 | Candidates.remove_if( | ||||
| 8681 | [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); }); | ||||
| 8682 | |||||
| 8683 | // Remove from the set of candidates all pairs of getelementptrs with | ||||
| 8684 | // constant differences. Such getelementptrs are likely not good | ||||
| 8685 | // candidates for vectorization in a bottom-up phase since one can be | ||||
| 8686 | // computed from the other. We also ensure all candidate getelementptr | ||||
| 8687 | // indices are unique. | ||||
| 8688 | for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { | ||||
| 8689 | auto *GEPI = GEPList[I]; | ||||
| 8690 | if (!Candidates.count(GEPI)) | ||||
| 8691 | continue; | ||||
| 8692 | auto *SCEVI = SE->getSCEV(GEPList[I]); | ||||
| 8693 | for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { | ||||
| 8694 | auto *GEPJ = GEPList[J]; | ||||
| 8695 | auto *SCEVJ = SE->getSCEV(GEPList[J]); | ||||
| 8696 | if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) { | ||||
| 8697 | Candidates.remove(GEPI); | ||||
| 8698 | Candidates.remove(GEPJ); | ||||
| 8699 | } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { | ||||
| 8700 | Candidates.remove(GEPJ); | ||||
| 8701 | } | ||||
| 8702 | } | ||||
| 8703 | } | ||||
| 8704 | |||||
| 8705 | // We break out of the above computation as soon as we know there are | ||||
| 8706 | // fewer than two candidates remaining. | ||||
| 8707 | if (Candidates.size() < 2) | ||||
| 8708 | continue; | ||||
| 8709 | |||||
| 8710 | // Add the single, non-constant index of each candidate to the bundle. We | ||||
| 8711 | // ensured the indices met these constraints when we originally collected | ||||
| 8712 | // the getelementptrs. | ||||
| 8713 | SmallVector<Value *, 16> Bundle(Candidates.size()); | ||||
| 8714 | auto BundleIndex = 0u; | ||||
| 8715 | for (auto *V : Candidates) { | ||||
| 8716 | auto *GEP = cast<GetElementPtrInst>(V); | ||||
| 8717 | auto *GEPIdx = GEP->idx_begin()->get(); | ||||
| 8718 | assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx))((void)0); | ||||
| 8719 | Bundle[BundleIndex++] = GEPIdx; | ||||
| 8720 | } | ||||
| 8721 | |||||
| 8722 | // Try and vectorize the indices. We are currently only interested in | ||||
| 8723 | // gather-like cases of the form: | ||||
| 8724 | // | ||||
| 8725 | // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... | ||||
| 8726 | // | ||||
| 8727 | // where the loads of "a", the loads of "b", and the subtractions can be | ||||
| 8728 | // performed in parallel. It's likely that detecting this pattern in a | ||||
| 8729 | // bottom-up phase will be simpler and less costly than building a | ||||
| 8730 | // full-blown top-down phase beginning at the consecutive loads. | ||||
| 8731 | Changed |= tryToVectorizeList(Bundle, R); | ||||
| 8732 | } | ||||
| 8733 | } | ||||
| 8734 | return Changed; | ||||
| 8735 | } | ||||
| 8736 | |||||
| 8737 | bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { | ||||
| 8738 | bool Changed = false; | ||||
| 8739 | // Sort by type, base pointers and values operand. Value operands must be | ||||
| 8740 | // compatible (have the same opcode, same parent), otherwise it is | ||||
| 8741 | // definitely not profitable to try to vectorize them. | ||||
| 8742 | auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) { | ||||
| 8743 | if (V->getPointerOperandType()->getTypeID() < | ||||
| 8744 | V2->getPointerOperandType()->getTypeID()) | ||||
| 8745 | return true; | ||||
| 8746 | if (V->getPointerOperandType()->getTypeID() > | ||||
| 8747 | V2->getPointerOperandType()->getTypeID()) | ||||
| 8748 | return false; | ||||
| 8749 | // UndefValues are compatible with all other values. | ||||
| 8750 | if (isa<UndefValue>(V->getValueOperand()) || | ||||
| 8751 | isa<UndefValue>(V2->getValueOperand())) | ||||
| 8752 | return false; | ||||
| 8753 | if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand())) | ||||
| 8754 | if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { | ||||
| 8755 | DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = | ||||
| 8756 | DT->getNode(I1->getParent()); | ||||
| 8757 | DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = | ||||
| 8758 | DT->getNode(I2->getParent()); | ||||
| 8759 | assert(NodeI1 && "Should only process reachable instructions")((void)0); | ||||
| 8760 | assert(NodeI1 && "Should only process reachable instructions")((void)0); | ||||
| 8761 | assert((NodeI1 == NodeI2) ==((void)0) | ||||
| 8762 | (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&((void)0) | ||||
| 8763 | "Different nodes should have different DFS numbers")((void)0); | ||||
| 8764 | if (NodeI1 != NodeI2) | ||||
| 8765 | return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); | ||||
| 8766 | InstructionsState S = getSameOpcode({I1, I2}); | ||||
| 8767 | if (S.getOpcode()) | ||||
| 8768 | return false; | ||||
| 8769 | return I1->getOpcode() < I2->getOpcode(); | ||||
| 8770 | } | ||||
| 8771 | if (isa<Constant>(V->getValueOperand()) && | ||||
| 8772 | isa<Constant>(V2->getValueOperand())) | ||||
| 8773 | return false; | ||||
| 8774 | return V->getValueOperand()->getValueID() < | ||||
| 8775 | V2->getValueOperand()->getValueID(); | ||||
| 8776 | }; | ||||
| 8777 | |||||
| 8778 | auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) { | ||||
| 8779 | if (V1 == V2) | ||||
| 8780 | return true; | ||||
| 8781 | if (V1->getPointerOperandType() != V2->getPointerOperandType()) | ||||
| 8782 | return false; | ||||
| 8783 | // Undefs are compatible with any other value. | ||||
| 8784 | if (isa<UndefValue>(V1->getValueOperand()) || | ||||
| 8785 | isa<UndefValue>(V2->getValueOperand())) | ||||
| 8786 | return true; | ||||
| 8787 | if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand())) | ||||
| 8788 | if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) { | ||||
| 8789 | if (I1->getParent() != I2->getParent()) | ||||
| 8790 | return false; | ||||
| 8791 | InstructionsState S = getSameOpcode({I1, I2}); | ||||
| 8792 | return S.getOpcode() > 0; | ||||
| 8793 | } | ||||
| 8794 | if (isa<Constant>(V1->getValueOperand()) && | ||||
| 8795 | isa<Constant>(V2->getValueOperand())) | ||||
| 8796 | return true; | ||||
| 8797 | return V1->getValueOperand()->getValueID() == | ||||
| 8798 | V2->getValueOperand()->getValueID(); | ||||
| 8799 | }; | ||||
| 8800 | |||||
| 8801 | // Attempt to sort and vectorize each of the store-groups. | ||||
| 8802 | for (auto &Pair : Stores) { | ||||
| 8803 | if (Pair.second.size() < 2) | ||||
| 8804 | continue; | ||||
| 8805 | |||||
| 8806 | LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "do { } while (false) | ||||
| 8807 | << Pair.second.size() << ".\n")do { } while (false); | ||||
| 8808 | |||||
| 8809 | stable_sort(Pair.second, StoreSorter); | ||||
| 8810 | |||||
| 8811 | // Try to vectorize elements based on their compatibility. | ||||
| 8812 | for (ArrayRef<StoreInst *>::iterator IncIt = Pair.second.begin(), | ||||
| 8813 | E = Pair.second.end(); | ||||
| 8814 | IncIt != E;) { | ||||
| 8815 | |||||
| 8816 | // Look for the next elements with the same type. | ||||
| 8817 | ArrayRef<StoreInst *>::iterator SameTypeIt = IncIt; | ||||
| 8818 | Type *EltTy = (*IncIt)->getPointerOperand()->getType(); | ||||
| 8819 | |||||
| 8820 | while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt)) | ||||
| 8821 | ++SameTypeIt; | ||||
| 8822 | |||||
| 8823 | // Try to vectorize them. | ||||
| 8824 | unsigned NumElts = (SameTypeIt - IncIt); | ||||
| 8825 | LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("do { } while (false) | ||||
| 8826 | << NumElts << ")\n")do { } while (false); | ||||
| 8827 | if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() && | ||||
| 8828 | vectorizeStores(makeArrayRef(IncIt, NumElts), R)) { | ||||
| 8829 | // Success start over because instructions might have been changed. | ||||
| 8830 | Changed = true; | ||||
| 8831 | } | ||||
| 8832 | |||||
| 8833 | // Start over at the next instruction of a different type (or the end). | ||||
| 8834 | IncIt = SameTypeIt; | ||||
| 8835 | } | ||||
| 8836 | } | ||||
| 8837 | return Changed; | ||||
| 8838 | } | ||||
| 8839 | |||||
| 8840 | char SLPVectorizer::ID = 0; | ||||
| 8841 | |||||
| 8842 | static const char lv_name[] = "SLP Vectorizer"; | ||||
| 8843 | |||||
| 8844 | INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)static void *initializeSLPVectorizerPassOnce(PassRegistry & Registry) { | ||||
| 8845 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)initializeAAResultsWrapperPassPass(Registry); | ||||
| 8846 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry); | ||||
| 8847 | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)initializeAssumptionCacheTrackerPass(Registry); | ||||
| 8848 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)initializeScalarEvolutionWrapperPassPass(Registry); | ||||
| 8849 | INITIALIZE_PASS_DEPENDENCY(LoopSimplify)initializeLoopSimplifyPass(Registry); | ||||
| 8850 | INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)initializeDemandedBitsWrapperPassPass(Registry); | ||||
| 8851 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)initializeOptimizationRemarkEmitterWrapperPassPass(Registry); | ||||
| 8852 | INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)initializeInjectTLIMappingsLegacyPass(Registry); | ||||
| 8853 | INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)PassInfo *PI = new PassInfo( lv_name, "slp-vectorizer", & SLPVectorizer::ID, PassInfo::NormalCtor_t(callDefaultCtor< SLPVectorizer>), false, false); Registry.registerPass(*PI, true); return PI; } static llvm::once_flag InitializeSLPVectorizerPassFlag ; void llvm::initializeSLPVectorizerPass(PassRegistry &Registry ) { llvm::call_once(InitializeSLPVectorizerPassFlag, initializeSLPVectorizerPassOnce , std::ref(Registry)); } | ||||
| 8854 | |||||
| 8855 | Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); } |
| 1 | //===- llvm/ADT/ilist_iterator.h - Intrusive List Iterator ------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_ADT_ILIST_ITERATOR_H |
| 10 | #define LLVM_ADT_ILIST_ITERATOR_H |
| 11 | |
| 12 | #include "llvm/ADT/ilist_node.h" |
| 13 | #include <cassert> |
| 14 | #include <cstddef> |
| 15 | #include <iterator> |
| 16 | #include <type_traits> |
| 17 | |
| 18 | namespace llvm { |
| 19 | |
| 20 | namespace ilist_detail { |
| 21 | |
| 22 | /// Find const-correct node types. |
| 23 | template <class OptionsT, bool IsConst> struct IteratorTraits; |
| 24 | template <class OptionsT> struct IteratorTraits<OptionsT, false> { |
| 25 | using value_type = typename OptionsT::value_type; |
| 26 | using pointer = typename OptionsT::pointer; |
| 27 | using reference = typename OptionsT::reference; |
| 28 | using node_pointer = ilist_node_impl<OptionsT> *; |
| 29 | using node_reference = ilist_node_impl<OptionsT> &; |
| 30 | }; |
| 31 | template <class OptionsT> struct IteratorTraits<OptionsT, true> { |
| 32 | using value_type = const typename OptionsT::value_type; |
| 33 | using pointer = typename OptionsT::const_pointer; |
| 34 | using reference = typename OptionsT::const_reference; |
| 35 | using node_pointer = const ilist_node_impl<OptionsT> *; |
| 36 | using node_reference = const ilist_node_impl<OptionsT> &; |
| 37 | }; |
| 38 | |
| 39 | template <bool IsReverse> struct IteratorHelper; |
| 40 | template <> struct IteratorHelper<false> : ilist_detail::NodeAccess { |
| 41 | using Access = ilist_detail::NodeAccess; |
| 42 | |
| 43 | template <class T> static void increment(T *&I) { I = Access::getNext(*I); } |
| 44 | template <class T> static void decrement(T *&I) { I = Access::getPrev(*I); } |
| 45 | }; |
| 46 | template <> struct IteratorHelper<true> : ilist_detail::NodeAccess { |
| 47 | using Access = ilist_detail::NodeAccess; |
| 48 | |
| 49 | template <class T> static void increment(T *&I) { I = Access::getPrev(*I); } |
| 50 | template <class T> static void decrement(T *&I) { I = Access::getNext(*I); } |
| 51 | }; |
| 52 | |
| 53 | } // end namespace ilist_detail |
| 54 | |
| 55 | /// Iterator for intrusive lists based on ilist_node. |
| 56 | template <class OptionsT, bool IsReverse, bool IsConst> |
| 57 | class ilist_iterator : ilist_detail::SpecificNodeAccess<OptionsT> { |
| 58 | friend ilist_iterator<OptionsT, IsReverse, !IsConst>; |
| 59 | friend ilist_iterator<OptionsT, !IsReverse, IsConst>; |
| 60 | friend ilist_iterator<OptionsT, !IsReverse, !IsConst>; |
| 61 | |
| 62 | using Traits = ilist_detail::IteratorTraits<OptionsT, IsConst>; |
| 63 | using Access = ilist_detail::SpecificNodeAccess<OptionsT>; |
| 64 | |
| 65 | public: |
| 66 | using value_type = typename Traits::value_type; |
| 67 | using pointer = typename Traits::pointer; |
| 68 | using reference = typename Traits::reference; |
| 69 | using difference_type = ptrdiff_t; |
| 70 | using iterator_category = std::bidirectional_iterator_tag; |
| 71 | using const_pointer = typename OptionsT::const_pointer; |
| 72 | using const_reference = typename OptionsT::const_reference; |
| 73 | |
| 74 | private: |
| 75 | using node_pointer = typename Traits::node_pointer; |
| 76 | using node_reference = typename Traits::node_reference; |
| 77 | |
| 78 | node_pointer NodePtr = nullptr; |
| 79 | |
| 80 | public: |
| 81 | /// Create from an ilist_node. |
| 82 | explicit ilist_iterator(node_reference N) : NodePtr(&N) {} |
| 83 | |
| 84 | explicit ilist_iterator(pointer NP) : NodePtr(Access::getNodePtr(NP)) {} |
| 85 | explicit ilist_iterator(reference NR) : NodePtr(Access::getNodePtr(&NR)) {} |
| 86 | ilist_iterator() = default; |
| 87 | |
| 88 | // This is templated so that we can allow constructing a const iterator from |
| 89 | // a nonconst iterator... |
| 90 | template <bool RHSIsConst> |
| 91 | ilist_iterator(const ilist_iterator<OptionsT, IsReverse, RHSIsConst> &RHS, |
| 92 | std::enable_if_t<IsConst || !RHSIsConst, void *> = nullptr) |
| 93 | : NodePtr(RHS.NodePtr) {} |
| 94 | |
| 95 | // This is templated so that we can allow assigning to a const iterator from |
| 96 | // a nonconst iterator... |
| 97 | template <bool RHSIsConst> |
| 98 | std::enable_if_t<IsConst || !RHSIsConst, ilist_iterator &> |
| 99 | operator=(const ilist_iterator<OptionsT, IsReverse, RHSIsConst> &RHS) { |
| 100 | NodePtr = RHS.NodePtr; |
| 101 | return *this; |
| 102 | } |
| 103 | |
| 104 | /// Explicit conversion between forward/reverse iterators. |
| 105 | /// |
| 106 | /// Translate between forward and reverse iterators without changing range |
| 107 | /// boundaries. The resulting iterator will dereference (and have a handle) |
| 108 | /// to the previous node, which is somewhat unexpected; but converting the |
| 109 | /// two endpoints in a range will give the same range in reverse. |
| 110 | /// |
| 111 | /// This matches std::reverse_iterator conversions. |
| 112 | explicit ilist_iterator( |
| 113 | const ilist_iterator<OptionsT, !IsReverse, IsConst> &RHS) |
| 114 | : ilist_iterator(++RHS.getReverse()) {} |
| 115 | |
| 116 | /// Get a reverse iterator to the same node. |
| 117 | /// |
| 118 | /// Gives a reverse iterator that will dereference (and have a handle) to the |
| 119 | /// same node. Converting the endpoint iterators in a range will give a |
| 120 | /// different range; for range operations, use the explicit conversions. |
| 121 | ilist_iterator<OptionsT, !IsReverse, IsConst> getReverse() const { |
| 122 | if (NodePtr) |
| 123 | return ilist_iterator<OptionsT, !IsReverse, IsConst>(*NodePtr); |
| 124 | return ilist_iterator<OptionsT, !IsReverse, IsConst>(); |
| 125 | } |
| 126 | |
| 127 | /// Const-cast. |
| 128 | ilist_iterator<OptionsT, IsReverse, false> getNonConst() const { |
| 129 | if (NodePtr) |
| 130 | return ilist_iterator<OptionsT, IsReverse, false>( |
| 131 | const_cast<typename ilist_iterator<OptionsT, IsReverse, |
| 132 | false>::node_reference>(*NodePtr)); |
| 133 | return ilist_iterator<OptionsT, IsReverse, false>(); |
| 134 | } |
| 135 | |
| 136 | // Accessors... |
| 137 | reference operator*() const { |
| 138 | assert(!NodePtr->isKnownSentinel())((void)0); |
| 139 | return *Access::getValuePtr(NodePtr); |
| 140 | } |
| 141 | pointer operator->() const { return &operator*(); } |
| 142 | |
| 143 | // Comparison operators |
| 144 | friend bool operator==(const ilist_iterator &LHS, const ilist_iterator &RHS) { |
| 145 | return LHS.NodePtr == RHS.NodePtr; |
| 146 | } |
| 147 | friend bool operator!=(const ilist_iterator &LHS, const ilist_iterator &RHS) { |
| 148 | return LHS.NodePtr != RHS.NodePtr; |
| 149 | } |
| 150 | |
| 151 | // Increment and decrement operators... |
| 152 | ilist_iterator &operator--() { |
| 153 | NodePtr = IsReverse ? NodePtr->getNext() : NodePtr->getPrev(); |
| 154 | return *this; |
| 155 | } |
| 156 | ilist_iterator &operator++() { |
| 157 | NodePtr = IsReverse ? NodePtr->getPrev() : NodePtr->getNext(); |
| 158 | return *this; |
| 159 | } |
| 160 | ilist_iterator operator--(int) { |
| 161 | ilist_iterator tmp = *this; |
| 162 | --*this; |
| 163 | return tmp; |
| 164 | } |
| 165 | ilist_iterator operator++(int) { |
| 166 | ilist_iterator tmp = *this; |
| 167 | ++*this; |
| 168 | return tmp; |
| 169 | } |
| 170 | |
| 171 | /// Get the underlying ilist_node. |
| 172 | node_pointer getNodePtr() const { return static_cast<node_pointer>(NodePtr); } |
| 173 | |
| 174 | /// Check for end. Only valid if ilist_sentinel_tracking<true>. |
| 175 | bool isEnd() const { return NodePtr ? NodePtr->isSentinel() : false; } |
| 176 | }; |
| 177 | |
| 178 | template <typename From> struct simplify_type; |
| 179 | |
| 180 | /// Allow ilist_iterators to convert into pointers to a node automatically when |
| 181 | /// used by the dyn_cast, cast, isa mechanisms... |
| 182 | /// |
| 183 | /// FIXME: remove this, since there is no implicit conversion to NodeTy. |
| 184 | template <class OptionsT, bool IsConst> |
| 185 | struct simplify_type<ilist_iterator<OptionsT, false, IsConst>> { |
| 186 | using iterator = ilist_iterator<OptionsT, false, IsConst>; |
| 187 | using SimpleType = typename iterator::pointer; |
| 188 | |
| 189 | static SimpleType getSimplifiedValue(const iterator &Node) { return &*Node; } |
| 190 | }; |
| 191 | template <class OptionsT, bool IsConst> |
| 192 | struct simplify_type<const ilist_iterator<OptionsT, false, IsConst>> |
| 193 | : simplify_type<ilist_iterator<OptionsT, false, IsConst>> {}; |
| 194 | |
| 195 | } // end namespace llvm |
| 196 | |
| 197 | #endif // LLVM_ADT_ILIST_ITERATOR_H |