| File: | src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp |
| Warning: | line 1229, column 34: Called C++ object pointer is null |
| 1 | //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===// | ||||
| 2 | // | ||||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
| 6 | // | ||||
| 7 | //===----------------------------------------------------------------------===// | ||||
| 8 | // | ||||
| 9 | // This pass merges loads/stores to/from sequential memory addresses into vector | ||||
| 10 | // loads/stores. Although there's nothing GPU-specific in here, this pass is | ||||
| 11 | // motivated by the microarchitectural quirks of nVidia and AMD GPUs. | ||||
| 12 | // | ||||
| 13 | // (For simplicity below we talk about loads only, but everything also applies | ||||
| 14 | // to stores.) | ||||
| 15 | // | ||||
| 16 | // This pass is intended to be run late in the pipeline, after other | ||||
| 17 | // vectorization opportunities have been exploited. So the assumption here is | ||||
| 18 | // that immediately following our new vector load we'll need to extract out the | ||||
| 19 | // individual elements of the load, so we can operate on them individually. | ||||
| 20 | // | ||||
| 21 | // On CPUs this transformation is usually not beneficial, because extracting the | ||||
| 22 | // elements of a vector register is expensive on most architectures. It's | ||||
| 23 | // usually better just to load each element individually into its own scalar | ||||
| 24 | // register. | ||||
| 25 | // | ||||
| 26 | // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a | ||||
| 27 | // "vector load" loads directly into a series of scalar registers. In effect, | ||||
| 28 | // extracting the elements of the vector is free. It's therefore always | ||||
| 29 | // beneficial to vectorize a sequence of loads on these architectures. | ||||
| 30 | // | ||||
| 31 | // Vectorizing (perhaps a better name might be "coalescing") loads can have | ||||
| 32 | // large performance impacts on GPU kernels, and opportunities for vectorizing | ||||
| 33 | // are common in GPU code. This pass tries very hard to find such | ||||
| 34 | // opportunities; its runtime is quadratic in the number of loads in a BB. | ||||
| 35 | // | ||||
| 36 | // Some CPU architectures, such as ARM, have instructions that load into | ||||
| 37 | // multiple scalar registers, similar to a GPU vectorized load. In theory ARM | ||||
| 38 | // could use this pass (with some modifications), but currently it implements | ||||
| 39 | // its own pass to do something similar to what we do here. | ||||
| 40 | |||||
| 41 | #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" | ||||
| 42 | #include "llvm/ADT/APInt.h" | ||||
| 43 | #include "llvm/ADT/ArrayRef.h" | ||||
| 44 | #include "llvm/ADT/MapVector.h" | ||||
| 45 | #include "llvm/ADT/PostOrderIterator.h" | ||||
| 46 | #include "llvm/ADT/STLExtras.h" | ||||
| 47 | #include "llvm/ADT/SmallPtrSet.h" | ||||
| 48 | #include "llvm/ADT/SmallVector.h" | ||||
| 49 | #include "llvm/ADT/Statistic.h" | ||||
| 50 | #include "llvm/ADT/iterator_range.h" | ||||
| 51 | #include "llvm/Analysis/AliasAnalysis.h" | ||||
| 52 | #include "llvm/Analysis/AssumptionCache.h" | ||||
| 53 | #include "llvm/Analysis/MemoryLocation.h" | ||||
| 54 | #include "llvm/Analysis/ScalarEvolution.h" | ||||
| 55 | #include "llvm/Analysis/TargetTransformInfo.h" | ||||
| 56 | #include "llvm/Analysis/ValueTracking.h" | ||||
| 57 | #include "llvm/Analysis/VectorUtils.h" | ||||
| 58 | #include "llvm/IR/Attributes.h" | ||||
| 59 | #include "llvm/IR/BasicBlock.h" | ||||
| 60 | #include "llvm/IR/Constants.h" | ||||
| 61 | #include "llvm/IR/DataLayout.h" | ||||
| 62 | #include "llvm/IR/DerivedTypes.h" | ||||
| 63 | #include "llvm/IR/Dominators.h" | ||||
| 64 | #include "llvm/IR/Function.h" | ||||
| 65 | #include "llvm/IR/IRBuilder.h" | ||||
| 66 | #include "llvm/IR/InstrTypes.h" | ||||
| 67 | #include "llvm/IR/Instruction.h" | ||||
| 68 | #include "llvm/IR/Instructions.h" | ||||
| 69 | #include "llvm/IR/IntrinsicInst.h" | ||||
| 70 | #include "llvm/IR/Module.h" | ||||
| 71 | #include "llvm/IR/Type.h" | ||||
| 72 | #include "llvm/IR/User.h" | ||||
| 73 | #include "llvm/IR/Value.h" | ||||
| 74 | #include "llvm/InitializePasses.h" | ||||
| 75 | #include "llvm/Pass.h" | ||||
| 76 | #include "llvm/Support/Casting.h" | ||||
| 77 | #include "llvm/Support/Debug.h" | ||||
| 78 | #include "llvm/Support/KnownBits.h" | ||||
| 79 | #include "llvm/Support/MathExtras.h" | ||||
| 80 | #include "llvm/Support/raw_ostream.h" | ||||
| 81 | #include "llvm/Transforms/Utils/Local.h" | ||||
| 82 | #include "llvm/Transforms/Vectorize.h" | ||||
| 83 | #include <algorithm> | ||||
| 84 | #include <cassert> | ||||
| 85 | #include <cstdlib> | ||||
| 86 | #include <tuple> | ||||
| 87 | #include <utility> | ||||
| 88 | |||||
| 89 | using namespace llvm; | ||||
| 90 | |||||
| 91 | #define DEBUG_TYPE "load-store-vectorizer" | ||||
| 92 | |||||
| 93 | STATISTIC(NumVectorInstructions, "Number of vector accesses generated"); | ||||
| 94 | STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); | ||||
| 95 | |||||
| 96 | // FIXME: Assuming stack alignment of 4 is always good enough | ||||
| 97 | static const unsigned StackAdjustedAlignment = 4; | ||||
| 98 | |||||
| 99 | namespace { | ||||
| 100 | |||||
| 101 | /// ChainID is an arbitrary token that is allowed to be different only for the | ||||
| 102 | /// accesses that are guaranteed to be considered non-consecutive by | ||||
| 103 | /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions | ||||
| 104 | /// together and reducing the number of instructions the main search operates on | ||||
| 105 | /// at a time, i.e. this is to reduce compile time and nothing else as the main | ||||
| 106 | /// search has O(n^2) time complexity. The underlying type of ChainID should not | ||||
| 107 | /// be relied upon. | ||||
| 108 | using ChainID = const Value *; | ||||
| 109 | using InstrList = SmallVector<Instruction *, 8>; | ||||
| 110 | using InstrListMap = MapVector<ChainID, InstrList>; | ||||
| 111 | |||||
| 112 | class Vectorizer { | ||||
| 113 | Function &F; | ||||
| 114 | AliasAnalysis &AA; | ||||
| 115 | AssumptionCache &AC; | ||||
| 116 | DominatorTree &DT; | ||||
| 117 | ScalarEvolution &SE; | ||||
| 118 | TargetTransformInfo &TTI; | ||||
| 119 | const DataLayout &DL; | ||||
| 120 | IRBuilder<> Builder; | ||||
| 121 | |||||
| 122 | public: | ||||
| 123 | Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC, | ||||
| 124 | DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI) | ||||
| 125 | : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI), | ||||
| 126 | DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {} | ||||
| 127 | |||||
| 128 | bool run(); | ||||
| 129 | |||||
| 130 | private: | ||||
| 131 | unsigned getPointerAddressSpace(Value *I); | ||||
| 132 | |||||
| 133 | static const unsigned MaxDepth = 3; | ||||
| 134 | |||||
| 135 | bool isConsecutiveAccess(Value *A, Value *B); | ||||
| 136 | bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, | ||||
| 137 | unsigned Depth = 0) const; | ||||
| 138 | bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, | ||||
| 139 | unsigned Depth) const; | ||||
| 140 | bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta, | ||||
| 141 | unsigned Depth) const; | ||||
| 142 | |||||
| 143 | /// After vectorization, reorder the instructions that I depends on | ||||
| 144 | /// (the instructions defining its operands), to ensure they dominate I. | ||||
| 145 | void reorder(Instruction *I); | ||||
| 146 | |||||
| 147 | /// Returns the first and the last instructions in Chain. | ||||
| 148 | std::pair<BasicBlock::iterator, BasicBlock::iterator> | ||||
| 149 | getBoundaryInstrs(ArrayRef<Instruction *> Chain); | ||||
| 150 | |||||
| 151 | /// Erases the original instructions after vectorizing. | ||||
| 152 | void eraseInstructions(ArrayRef<Instruction *> Chain); | ||||
| 153 | |||||
| 154 | /// "Legalize" the vector type that would be produced by combining \p | ||||
| 155 | /// ElementSizeBits elements in \p Chain. Break into two pieces such that the | ||||
| 156 | /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is | ||||
| 157 | /// expected to have more than 4 elements. | ||||
| 158 | std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> | ||||
| 159 | splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits); | ||||
| 160 | |||||
| 161 | /// Finds the largest prefix of Chain that's vectorizable, checking for | ||||
| 162 | /// intervening instructions which may affect the memory accessed by the | ||||
| 163 | /// instructions within Chain. | ||||
| 164 | /// | ||||
| 165 | /// The elements of \p Chain must be all loads or all stores and must be in | ||||
| 166 | /// address order. | ||||
| 167 | ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain); | ||||
| 168 | |||||
| 169 | /// Collects load and store instructions to vectorize. | ||||
| 170 | std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB); | ||||
| 171 | |||||
| 172 | /// Processes the collected instructions, the \p Map. The values of \p Map | ||||
| 173 | /// should be all loads or all stores. | ||||
| 174 | bool vectorizeChains(InstrListMap &Map); | ||||
| 175 | |||||
| 176 | /// Finds the load/stores to consecutive memory addresses and vectorizes them. | ||||
| 177 | bool vectorizeInstructions(ArrayRef<Instruction *> Instrs); | ||||
| 178 | |||||
| 179 | /// Vectorizes the load instructions in Chain. | ||||
| 180 | bool | ||||
| 181 | vectorizeLoadChain(ArrayRef<Instruction *> Chain, | ||||
| 182 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed); | ||||
| 183 | |||||
| 184 | /// Vectorizes the store instructions in Chain. | ||||
| 185 | bool | ||||
| 186 | vectorizeStoreChain(ArrayRef<Instruction *> Chain, | ||||
| 187 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed); | ||||
| 188 | |||||
| 189 | /// Check if this load/store access is misaligned. | ||||
| 190 | bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, | ||||
| 191 | Align Alignment); | ||||
| 192 | }; | ||||
| 193 | |||||
| 194 | class LoadStoreVectorizerLegacyPass : public FunctionPass { | ||||
| 195 | public: | ||||
| 196 | static char ID; | ||||
| 197 | |||||
| 198 | LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { | ||||
| 199 | initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry()); | ||||
| 200 | } | ||||
| 201 | |||||
| 202 | bool runOnFunction(Function &F) override; | ||||
| 203 | |||||
| 204 | StringRef getPassName() const override { | ||||
| 205 | return "GPU Load and Store Vectorizer"; | ||||
| 206 | } | ||||
| 207 | |||||
| 208 | void getAnalysisUsage(AnalysisUsage &AU) const override { | ||||
| 209 | AU.addRequired<AAResultsWrapperPass>(); | ||||
| 210 | AU.addRequired<AssumptionCacheTracker>(); | ||||
| 211 | AU.addRequired<ScalarEvolutionWrapperPass>(); | ||||
| 212 | AU.addRequired<DominatorTreeWrapperPass>(); | ||||
| 213 | AU.addRequired<TargetTransformInfoWrapperPass>(); | ||||
| 214 | AU.setPreservesCFG(); | ||||
| 215 | } | ||||
| 216 | }; | ||||
| 217 | |||||
| 218 | } // end anonymous namespace | ||||
| 219 | |||||
| 220 | char LoadStoreVectorizerLegacyPass::ID = 0; | ||||
| 221 | |||||
| 222 | INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, | ||||
| 223 |                       "Vectorize load and Store instructions", false, false) | ||||
| 224 | INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) | ||||
| 225 | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) | ||||
| 226 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) | ||||
| 227 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) | ||||
| 228 | INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) | ||||
| 229 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) | ||||
| 230 | INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, | ||||
| 231 |                     "Vectorize load and store instructions", false, false) | ||||
| 232 | |||||
| 233 | Pass *llvm::createLoadStoreVectorizerPass() { | ||||
| 234 | return new LoadStoreVectorizerLegacyPass(); | ||||
| 235 | } | ||||
| 236 | |||||
| 237 | bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { | ||||
| 238 | // Don't vectorize when the attribute NoImplicitFloat is used. | ||||
| 239 | if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat)) | ||||
| 240 | return false; | ||||
| 241 | |||||
| 242 | AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); | ||||
| 243 | DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); | ||||
| 244 | ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); | ||||
| 245 | TargetTransformInfo &TTI = | ||||
| 246 | getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); | ||||
| 247 | |||||
| 248 | AssumptionCache &AC = | ||||
| 249 | getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); | ||||
| 250 | |||||
| 251 | Vectorizer V(F, AA, AC, DT, SE, TTI); | ||||
| 252 | return V.run(); | ||||
| 253 | } | ||||
| 254 | |||||
| 255 | PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { | ||||
| 256 | // Don't vectorize when the attribute NoImplicitFloat is used. | ||||
| 257 | if (F.hasFnAttribute(Attribute::NoImplicitFloat)) | ||||
| 258 | return PreservedAnalyses::all(); | ||||
| 259 | |||||
| 260 | AliasAnalysis &AA = AM.getResult<AAManager>(F); | ||||
| 261 | DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); | ||||
| 262 | ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); | ||||
| 263 | TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); | ||||
| 264 | AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); | ||||
| 265 | |||||
| 266 | Vectorizer V(F, AA, AC, DT, SE, TTI); | ||||
| 267 | bool Changed = V.run(); | ||||
| 268 | PreservedAnalyses PA; | ||||
| 269 | PA.preserveSet<CFGAnalyses>(); | ||||
| 270 | return Changed ? PA : PreservedAnalyses::all(); | ||||
| 271 | } | ||||
| 272 | |||||
| 273 | // The real propagateMetadata expects a SmallVector<Value*>, but we deal in | ||||
| 274 | // vectors of Instructions. | ||||
| 275 | static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) { | ||||
| 276 | SmallVector<Value *, 8> VL(IL.begin(), IL.end()); | ||||
| 277 | propagateMetadata(I, VL); | ||||
| 278 | } | ||||
| 279 | |||||
| 280 | // Vectorizer Implementation | ||||
| 281 | bool Vectorizer::run() { | ||||
| 282 | bool Changed = false; | ||||
| 283 | |||||
| 284 | // Scan the blocks in the function in post order. | ||||
| 285 | for (BasicBlock *BB : post_order(&F)) { | ||||
| 286 | InstrListMap LoadRefs, StoreRefs; | ||||
| 287 | std::tie(LoadRefs, StoreRefs) = collectInstructions(BB); | ||||
| 288 | Changed |= vectorizeChains(LoadRefs); | ||||
| 289 | Changed |= vectorizeChains(StoreRefs); | ||||
| 290 | } | ||||
| 291 | |||||
| 292 | return Changed; | ||||
| 293 | } | ||||
| 294 | |||||
| 295 | unsigned Vectorizer::getPointerAddressSpace(Value *I) { | ||||
| 296 | if (LoadInst *L = dyn_cast<LoadInst>(I)) | ||||
| 297 | return L->getPointerAddressSpace(); | ||||
| 298 | if (StoreInst *S = dyn_cast<StoreInst>(I)) | ||||
| 299 | return S->getPointerAddressSpace(); | ||||
| 300 | return -1; | ||||
| 301 | } | ||||
| 302 | |||||
| 303 | // FIXME: Merge with llvm::isConsecutiveAccess | ||||
| 304 | bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { | ||||
| 305 | Value *PtrA = getLoadStorePointerOperand(A); | ||||
| 306 | Value *PtrB = getLoadStorePointerOperand(B); | ||||
| 307 | unsigned ASA = getPointerAddressSpace(A); | ||||
| 308 | unsigned ASB = getPointerAddressSpace(B); | ||||
| 309 | |||||
| 310 | // Check that the address spaces match and that the pointers are valid. | ||||
| 311 | if (!PtrA || !PtrB || (ASA != ASB)) | ||||
| 312 | return false; | ||||
| 313 | |||||
| 314 | // Make sure that A and B are different pointers to types of the same size. | ||||
| 315 | Type *PtrATy = getLoadStoreType(A); | ||||
| 316 | Type *PtrBTy = getLoadStoreType(B); | ||||
| 317 | if (PtrA == PtrB || | ||||
| 318 | PtrATy->isVectorTy() != PtrBTy->isVectorTy() || | ||||
| 319 | DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || | ||||
| 320 | DL.getTypeStoreSize(PtrATy->getScalarType()) != | ||||
| 321 | DL.getTypeStoreSize(PtrBTy->getScalarType())) | ||||
| 322 | return false; | ||||
| 323 | |||||
| 324 | unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); | ||||
| 325 | APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); | ||||
| 326 | |||||
| 327 | return areConsecutivePointers(PtrA, PtrB, Size); | ||||
| 328 | } | ||||
| 329 | |||||
| 330 | bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, | ||||
| 331 | APInt PtrDelta, unsigned Depth) const { | ||||
| 332 | unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); | ||||
| 333 | APInt OffsetA(PtrBitWidth, 0); | ||||
| 334 | APInt OffsetB(PtrBitWidth, 0); | ||||
| 335 | PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); | ||||
| 336 | PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); | ||||
| 337 | |||||
| 338 | unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); | ||||
| 339 | |||||
| 340 | if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) | ||||
| 341 | return false; | ||||
| 342 | |||||
| 343 | // In case we have to shrink the pointer, | ||||
| 344 | // stripAndAccumulateInBoundsConstantOffsets should properly handle a | ||||
| 345 | // possible overflow and the value should fit into the smallest data type | ||||
| 346 | // used in the cast/gep chain. | ||||
| 347 | assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && | ||||
| 348 | OffsetB.getMinSignedBits() <= NewPtrBitWidth); | ||||
| 349 | |||||
| 350 | OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); | ||||
| 351 | OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); | ||||
| 352 | PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); | ||||
| 353 | |||||
| 354 | APInt OffsetDelta = OffsetB - OffsetA; | ||||
| 355 | |||||
| 356 | // Check if they are based on the same pointer. That makes the offsets | ||||
| 357 | // sufficient. | ||||
| 358 | if (PtrA == PtrB) | ||||
| 359 | return OffsetDelta == PtrDelta; | ||||
| 360 | |||||
| 361 | // Compute the necessary base pointer delta to have the necessary final delta | ||||
| 362 | // equal to the pointer delta requested. | ||||
| 363 | APInt BaseDelta = PtrDelta - OffsetDelta; | ||||
| 364 | |||||
| 365 | // Compute the distance with SCEV between the base pointers. | ||||
| 366 | const SCEV *PtrSCEVA = SE.getSCEV(PtrA); | ||||
| 367 | const SCEV *PtrSCEVB = SE.getSCEV(PtrB); | ||||
| 368 | const SCEV *C = SE.getConstant(BaseDelta); | ||||
| 369 | const SCEV *X = SE.getAddExpr(PtrSCEVA, C); | ||||
| 370 | if (X == PtrSCEVB) | ||||
| 371 | return true; | ||||
| 372 | |||||
| 373 | // The above check will not catch the cases where one of the pointers is | ||||
| 374 | // factorized but the other one is not, such as (C + (S * (A + B))) vs | ||||
| 375 | // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions | ||||
| 376 | // and getting the simplified difference. | ||||
| 377 | const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); | ||||
| 378 | if (C == Dist) | ||||
| 379 | return true; | ||||
| 380 | |||||
| 381 | // Sometimes even this doesn't work, because SCEV can't always see through | ||||
| 382 | // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking | ||||
| 383 | // things the hard way. | ||||
| 384 | return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth); | ||||
| 385 | } | ||||
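// Worked example (added annotation, not part of the upstream file; values
// assumed): for %a = getelementptr inbounds i8, i8* %p, i64 4 and
// %b = getelementptr inbounds i8, i8* %p, i64 8 queried with PtrDelta = 4,
// stripAndAccumulateInBoundsConstantOffsets() reduces both pointers to %p
// with OffsetA = 4 and OffsetB = 8. PtrA == PtrB then holds, and
// OffsetDelta = 8 - 4 == PtrDelta, so the fast path returns true without
// building any SCEV. The SCEV comparison and lookThroughComplexAddresses()
// only run when the stripped base pointers differ.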
| 386 | |||||
| 387 | static bool checkNoWrapFlags(Instruction *I, bool Signed) { | ||||
| 388 | BinaryOperator *BinOpI = cast<BinaryOperator>(I); | ||||
| 389 | return (Signed && BinOpI->hasNoSignedWrap()) || | ||||
| 390 | (!Signed && BinOpI->hasNoUnsignedWrap()); | ||||
| 391 | } | ||||
| 392 | |||||
| 393 | static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA, | ||||
| 394 | unsigned MatchingOpIdxA, Instruction *AddOpB, | ||||
| 395 | unsigned MatchingOpIdxB, bool Signed) { | ||||
| 396 | // If both OpA and OpB are adds with NSW/NUW and with | ||||
| 397 | // one of the operands being the same, we can guarantee that the | ||||
| 398 | // transformation is safe if we can prove that OpA won't overflow when | ||||
| 399 | // IdxDiff is added to the other operand of OpA. | ||||
| 400 | // For example: | ||||
| 401 | // %tmp7 = add nsw i32 %tmp2, %v0 | ||||
| 402 | // %tmp8 = sext i32 %tmp7 to i64 | ||||
| 403 | // ... | ||||
| 404 | // %tmp11 = add nsw i32 %v0, 1 | ||||
| 405 | // %tmp12 = add nsw i32 %tmp2, %tmp11 | ||||
| 406 | // %tmp13 = sext i32 %tmp12 to i64 | ||||
| 407 | // | ||||
| 408 | // Both %tmp7 and %tmp12 have the nsw flag and the first operand | ||||
| 409 | // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow | ||||
| 410 | // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the | ||||
| 411 | // nsw flag. | ||||
| 412 | assert(AddOpA->getOpcode() == Instruction::Add && | ||||
| 413 | AddOpB->getOpcode() == Instruction::Add && | ||||
| 414 | checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed)); | ||||
| 415 | if (AddOpA->getOperand(MatchingOpIdxA) == | ||||
| 416 | AddOpB->getOperand(MatchingOpIdxB)) { | ||||
| 417 | Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1); | ||||
| 418 | Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1); | ||||
| 419 | Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA); | ||||
| 420 | Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB); | ||||
| 421 | // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`. | ||||
| 422 | if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add && | ||||
| 423 | checkNoWrapFlags(OtherInstrB, Signed) && | ||||
| 424 | isa<ConstantInt>(OtherInstrB->getOperand(1))) { | ||||
| 425 | int64_t CstVal = | ||||
| 426 | cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue(); | ||||
| 427 | if (OtherInstrB->getOperand(0) == OtherOperandA && | ||||
| 428 | IdxDiff.getSExtValue() == CstVal) | ||||
| 429 | return true; | ||||
| 430 | } | ||||
| 431 | // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`. | ||||
| 432 | if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add && | ||||
| 433 | checkNoWrapFlags(OtherInstrA, Signed) && | ||||
| 434 | isa<ConstantInt>(OtherInstrA->getOperand(1))) { | ||||
| 435 | int64_t CstVal = | ||||
| 436 | cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue(); | ||||
| 437 | if (OtherInstrA->getOperand(0) == OtherOperandB && | ||||
| 438 | IdxDiff.getSExtValue() == -CstVal) | ||||
| 439 | return true; | ||||
| 440 | } | ||||
| 441 | // Match `x +nsw/nuw (y +nsw/nuw c)` and | ||||
| 442 | // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`. | ||||
| 443 | if (OtherInstrA && OtherInstrB && | ||||
| 444 | OtherInstrA->getOpcode() == Instruction::Add && | ||||
| 445 | OtherInstrB->getOpcode() == Instruction::Add && | ||||
| 446 | checkNoWrapFlags(OtherInstrA, Signed) && | ||||
| 447 | checkNoWrapFlags(OtherInstrB, Signed) && | ||||
| 448 | isa<ConstantInt>(OtherInstrA->getOperand(1)) && | ||||
| 449 | isa<ConstantInt>(OtherInstrB->getOperand(1))) { | ||||
| 450 | int64_t CstValA = | ||||
| 451 | cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue(); | ||||
| 452 | int64_t CstValB = | ||||
| 453 | cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue(); | ||||
| 454 | if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) && | ||||
| 455 | IdxDiff.getSExtValue() == (CstValB - CstValA)) | ||||
| 456 | return true; | ||||
| 457 | } | ||||
| 458 | } | ||||
| 459 | return false; | ||||
| 460 | } | ||||
| 461 | |||||
| 462 | bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, | ||||
| 463 | APInt PtrDelta, | ||||
| 464 | unsigned Depth) const { | ||||
| 465 | auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA); | ||||
| 466 | auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB); | ||||
| 467 | if (!GEPA || !GEPB) | ||||
| 468 | return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth); | ||||
| 469 | |||||
| 470 | // Look through GEPs after checking they're the same except for the last | ||||
| 471 | // index. | ||||
| 472 | if (GEPA->getNumOperands() != GEPB->getNumOperands() || | ||||
| 473 | GEPA->getPointerOperand() != GEPB->getPointerOperand()) | ||||
| 474 | return false; | ||||
| 475 | gep_type_iterator GTIA = gep_type_begin(GEPA); | ||||
| 476 | gep_type_iterator GTIB = gep_type_begin(GEPB); | ||||
| 477 | for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { | ||||
| 478 | if (GTIA.getOperand() != GTIB.getOperand()) | ||||
| 479 | return false; | ||||
| 480 | ++GTIA; | ||||
| 481 | ++GTIB; | ||||
| 482 | } | ||||
| 483 | |||||
| 484 | Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand()); | ||||
| 485 | Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand()); | ||||
| 486 | if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || | ||||
| 487 | OpA->getType() != OpB->getType()) | ||||
| 488 | return false; | ||||
| 489 | |||||
| 490 | if (PtrDelta.isNegative()) { | ||||
| 491 | if (PtrDelta.isMinSignedValue()) | ||||
| 492 | return false; | ||||
| 493 | PtrDelta.negate(); | ||||
| 494 | std::swap(OpA, OpB); | ||||
| 495 | } | ||||
| 496 | uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); | ||||
| 497 | if (PtrDelta.urem(Stride) != 0) | ||||
| 498 | return false; | ||||
| 499 | unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); | ||||
| 500 | APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth); | ||||
| 501 | |||||
| 502 | // Only look through a ZExt/SExt. | ||||
| 503 | if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) | ||||
| 504 | return false; | ||||
| 505 | |||||
| 506 | bool Signed = isa<SExtInst>(OpA); | ||||
| 507 | |||||
| 508 | // At this point A could be a function parameter, i.e. not an instruction | ||||
| 509 | Value *ValA = OpA->getOperand(0); | ||||
| 510 | OpB = dyn_cast<Instruction>(OpB->getOperand(0)); | ||||
| 511 | if (!OpB || ValA->getType() != OpB->getType()) | ||||
| 512 | return false; | ||||
| 513 | |||||
| 514 | // Now we need to prove that adding IdxDiff to ValA won't overflow. | ||||
| 515 | bool Safe = false; | ||||
| 516 | |||||
| 517 | // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to | ||||
| 518 | // ValA, we're okay. | ||||
| 519 | if (OpB->getOpcode() == Instruction::Add && | ||||
| 520 | isa<ConstantInt>(OpB->getOperand(1)) && | ||||
| 521 | IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) && | ||||
| 522 | checkNoWrapFlags(OpB, Signed)) | ||||
| 523 | Safe = true; | ||||
| 524 | |||||
| 525 | // Second attempt: check if we have eligible add NSW/NUW instruction | ||||
| 526 | // sequences. | ||||
| 527 | OpA = dyn_cast<Instruction>(ValA); | ||||
| 528 | if (!Safe && OpA && OpA->getOpcode() == Instruction::Add && | ||||
| 529 | OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) && | ||||
| 530 | checkNoWrapFlags(OpB, Signed)) { | ||||
| 531 | // In the checks below a matching operand in OpA and OpB is | ||||
| 532 | // an operand which is the same in those two instructions. | ||||
| 533 | // Below we account for possible orders of the operands of | ||||
| 534 | // these add instructions. | ||||
| 535 | for (unsigned MatchingOpIdxA : {0, 1}) | ||||
| 536 | for (unsigned MatchingOpIdxB : {0, 1}) | ||||
| 537 | if (!Safe) | ||||
| 538 | Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB, | ||||
| 539 | MatchingOpIdxB, Signed); | ||||
| 540 | } | ||||
| 541 | |||||
| 542 | unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); | ||||
| 543 | |||||
| 544 | // Third attempt: | ||||
| 545 | // If all bits set in IdxDiff, and any higher-order bits other than the sign | ||||
| 546 | // bit, are known to be zero in ValA, we can add IdxDiff to it while | ||||
| 547 | // guaranteeing no overflow of any sort. | ||||
| 548 | if (!Safe) { | ||||
| 549 | KnownBits Known(BitWidth); | ||||
| 550 | computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT); | ||||
| 551 | APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth()); | ||||
| 552 | if (Signed) | ||||
| 553 | BitsAllowedToBeSet.clearBit(BitWidth - 1); | ||||
| 554 | if (BitsAllowedToBeSet.ult(IdxDiff)) | ||||
| 555 | return false; | ||||
| 556 | } | ||||
| 557 | |||||
| 558 | const SCEV *OffsetSCEVA = SE.getSCEV(ValA); | ||||
| 559 | const SCEV *OffsetSCEVB = SE.getSCEV(OpB); | ||||
| 560 | const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); | ||||
| 561 | const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); | ||||
| 562 | return X == OffsetSCEVB; | ||||
| 563 | } | ||||
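// Worked example (added annotation, not part of the upstream file; the IR
// is hypothetical): a pattern this function accepts that plain SCEV
// reasoning misses:
//   %i1 = add nsw i32 %i, 1
//   %e0 = sext i32 %i to i64
//   %e1 = sext i32 %i1 to i64
//   %g0 = getelementptr inbounds float, float* %base, i64 %e0
//   %g1 = getelementptr inbounds float, float* %base, i64 %e1
// Querying %g0/%g1 with PtrDelta = 4 gives Stride = 4 and IdxDiff = 1;
// OpA/OpB are the two sexts, Signed is true, ValA = %i, and OpB becomes the
// nsw add. The "first attempt" check fires (constant 1, nsw, and
// IdxDiff.sle(1)), and the final SCEV comparison of %i + 1 against %i1
// confirms the GEPs are consecutive.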
| 564 | |||||
| 565 | bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, | ||||
| 566 | const APInt &PtrDelta, | ||||
| 567 | unsigned Depth) const { | ||||
| 568 | if (Depth++ == MaxDepth) | ||||
| 569 | return false; | ||||
| 570 | |||||
| 571 | if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) { | ||||
| 572 | if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) { | ||||
| 573 | return SelectA->getCondition() == SelectB->getCondition() && | ||||
| 574 | areConsecutivePointers(SelectA->getTrueValue(), | ||||
| 575 | SelectB->getTrueValue(), PtrDelta, Depth) && | ||||
| 576 | areConsecutivePointers(SelectA->getFalseValue(), | ||||
| 577 | SelectB->getFalseValue(), PtrDelta, Depth); | ||||
| 578 | } | ||||
| 579 | } | ||||
| 580 | return false; | ||||
| 581 | } | ||||
| 582 | |||||
| 583 | void Vectorizer::reorder(Instruction *I) { | ||||
| 584 | SmallPtrSet<Instruction *, 16> InstructionsToMove; | ||||
| 585 | SmallVector<Instruction *, 16> Worklist; | ||||
| 586 | |||||
| 587 | Worklist.push_back(I); | ||||
| 588 | while (!Worklist.empty()) { | ||||
| 589 | Instruction *IW = Worklist.pop_back_val(); | ||||
| 590 | int NumOperands = IW->getNumOperands(); | ||||
| 591 | for (int i = 0; i < NumOperands; i++) { | ||||
| 592 | Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); | ||||
| 593 | if (!IM || IM->getOpcode() == Instruction::PHI) | ||||
| 594 | continue; | ||||
| 595 | |||||
| 596 | // If IM is in another BB, no need to move it, because this pass only | ||||
| 597 | // vectorizes instructions within one BB. | ||||
| 598 | if (IM->getParent() != I->getParent()) | ||||
| 599 | continue; | ||||
| 600 | |||||
| 601 | if (!IM->comesBefore(I)) { | ||||
| 602 | InstructionsToMove.insert(IM); | ||||
| 603 | Worklist.push_back(IM); | ||||
| 604 | } | ||||
| 605 | } | ||||
| 606 | } | ||||
| 607 | |||||
| 608 | // All instructions to move should follow I. Start from I, not from begin(). | ||||
| 609 | for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E; | ||||
| 610 | ++BBI) { | ||||
| 611 | if (!InstructionsToMove.count(&*BBI)) | ||||
| 612 | continue; | ||||
| 613 | Instruction *IM = &*BBI; | ||||
| 614 | --BBI; | ||||
| 615 | IM->removeFromParent(); | ||||
| 616 | IM->insertBefore(I); | ||||
| 617 | } | ||||
| 618 | } | ||||
| 619 | |||||
| 620 | std::pair<BasicBlock::iterator, BasicBlock::iterator> | ||||
| 621 | Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) { | ||||
| 622 | Instruction *C0 = Chain[0]; | ||||
| 623 | BasicBlock::iterator FirstInstr = C0->getIterator(); | ||||
| 624 | BasicBlock::iterator LastInstr = C0->getIterator(); | ||||
| 625 | |||||
| 626 | BasicBlock *BB = C0->getParent(); | ||||
| 627 | unsigned NumFound = 0; | ||||
| 628 | for (Instruction &I : *BB) { | ||||
| 629 | if (!is_contained(Chain, &I)) | ||||
| 630 | continue; | ||||
| 631 | |||||
| 632 | ++NumFound; | ||||
| 633 | if (NumFound == 1) { | ||||
| 634 | FirstInstr = I.getIterator(); | ||||
| 635 | } | ||||
| 636 | if (NumFound == Chain.size()) { | ||||
| 637 | LastInstr = I.getIterator(); | ||||
| 638 | break; | ||||
| 639 | } | ||||
| 640 | } | ||||
| 641 | |||||
| 642 | // Range is [first, last). | ||||
| 643 | return std::make_pair(FirstInstr, ++LastInstr); | ||||
| 644 | } | ||||
| 645 | |||||
| 646 | void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) { | ||||
| 647 | SmallVector<Instruction *, 16> Instrs; | ||||
| 648 | for (Instruction *I : Chain) { | ||||
| 649 | Value *PtrOperand = getLoadStorePointerOperand(I); | ||||
| 650 | assert(PtrOperand && "Instruction must have a pointer operand."); | ||||
| 651 | Instrs.push_back(I); | ||||
| 652 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand)) | ||||
| 653 | Instrs.push_back(GEP); | ||||
| 654 | } | ||||
| 655 | |||||
| 656 | // Erase instructions. | ||||
| 657 | for (Instruction *I : Instrs) | ||||
| 658 | if (I->use_empty()) | ||||
| 659 | I->eraseFromParent(); | ||||
| 660 | } | ||||
| 661 | |||||
| 662 | std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> | ||||
| 663 | Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain, | ||||
| 664 | unsigned ElementSizeBits) { | ||||
| 665 | unsigned ElementSizeBytes = ElementSizeBits / 8; | ||||
| 666 | unsigned SizeBytes = ElementSizeBytes * Chain.size(); | ||||
| 667 | unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes; | ||||
| 668 | if (NumLeft == Chain.size()) { | ||||
| 669 | if ((NumLeft & 1) == 0) | ||||
| 670 | NumLeft /= 2; // Split even in half | ||||
| 671 | else | ||||
| 672 | --NumLeft; // Split off last element | ||||
| 673 | } else if (NumLeft == 0) | ||||
| 674 | NumLeft = 1; | ||||
| 675 | return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); | ||||
| 676 | } | ||||
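// Worked example (added annotation, not part of the upstream file): for a
// hypothetical chain of three i16 stores, ElementSizeBits = 16 gives
// ElementSizeBytes = 2 and SizeBytes = 6, so
//   NumLeft = (6 - 6 % 4) / 2 = 2,
// and the 6-byte chain splits into a 4-byte piece (two elements) plus a
// 2-byte remainder (one element). With four i16 elements (8 bytes), NumLeft
// would equal Chain.size() and be even, so the chain is split in half
// instead.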
| 677 | |||||
| 678 | ArrayRef<Instruction *> | ||||
| 679 | Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { | ||||
| 680 | // These are in BB order, unlike Chain, which is in address order. | ||||
| 681 | SmallVector<Instruction *, 16> MemoryInstrs; | ||||
| 682 | SmallVector<Instruction *, 16> ChainInstrs; | ||||
| 683 | |||||
| 684 | bool IsLoadChain = isa<LoadInst>(Chain[0]); | ||||
| 685 | LLVM_DEBUG({ | ||||
| 686 | for (Instruction *I : Chain) { | ||||
| 687 | if (IsLoadChain) | ||||
| 688 | assert(isa<LoadInst>(I) && | ||||
| 689 | "All elements of Chain must be loads, or all must be stores."); | ||||
| 690 | else | ||||
| 691 | assert(isa<StoreInst>(I) && | ||||
| 692 | "All elements of Chain must be loads, or all must be stores."); | ||||
| 693 | } | ||||
| 694 | }); | ||||
| 695 | |||||
| 696 | for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { | ||||
| 697 | if (isa<LoadInst>(I) || isa<StoreInst>(I)) { | ||||
| 698 | if (!is_contained(Chain, &I)) | ||||
| 699 | MemoryInstrs.push_back(&I); | ||||
| 700 | else | ||||
| 701 | ChainInstrs.push_back(&I); | ||||
| 702 | } else if (isa<IntrinsicInst>(&I) && | ||||
| 703 | cast<IntrinsicInst>(&I)->getIntrinsicID() == | ||||
| 704 | Intrinsic::sideeffect) { | ||||
| 705 | // Ignore llvm.sideeffect calls. | ||||
| 706 | } else if (isa<IntrinsicInst>(&I) && | ||||
| 707 | cast<IntrinsicInst>(&I)->getIntrinsicID() == | ||||
| 708 | Intrinsic::pseudoprobe) { | ||||
| 709 | // Ignore llvm.pseudoprobe calls. | ||||
| 710 | } else if (isa<IntrinsicInst>(&I) && | ||||
| 711 | cast<IntrinsicInst>(&I)->getIntrinsicID() == Intrinsic::assume) { | ||||
| 712 | // Ignore llvm.assume calls. | ||||
| 713 | } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { | ||||
| 714 | LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I | ||||
| 715 | << '\n'); | ||||
| 716 | break; | ||||
| 717 | } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) { | ||||
| 718 | LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I | ||||
| 719 | << '\n'); | ||||
| 720 | break; | ||||
| 721 | } | ||||
| 722 | } | ||||
| 723 | |||||
| 724 | // Loop until we find an instruction in ChainInstrs that we can't vectorize. | ||||
| 725 | unsigned ChainInstrIdx = 0; | ||||
| 726 | Instruction *BarrierMemoryInstr = nullptr; | ||||
| 727 | |||||
| 728 | for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) { | ||||
| 729 | Instruction *ChainInstr = ChainInstrs[ChainInstrIdx]; | ||||
| 730 | |||||
| 731 | // If a barrier memory instruction was found, chain instructions that follow | ||||
| 732 | // will not be added to the valid prefix. | ||||
| 733 | if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr)) | ||||
| 734 | break; | ||||
| 735 | |||||
| 736 | // Check (in BB order) if any instruction prevents ChainInstr from being | ||||
| 737 | // vectorized. Find and store the first such "conflicting" instruction. | ||||
| 738 | for (Instruction *MemInstr : MemoryInstrs) { | ||||
| 739 | // If a barrier memory instruction was found, do not check past it. | ||||
| 740 | if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr)) | ||||
| 741 | break; | ||||
| 742 | |||||
| 743 | auto *MemLoad = dyn_cast<LoadInst>(MemInstr); | ||||
| 744 | auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr); | ||||
| 745 | if (MemLoad && ChainLoad) | ||||
| 746 | continue; | ||||
| 747 | |||||
| 748 | // We can ignore the alias if we have a load/store pair and the load | ||||
| 749 | // is known to be invariant. The load cannot be clobbered by the store. | ||||
| 750 | auto IsInvariantLoad = [](const LoadInst *LI) -> bool { | ||||
| 751 | return LI->hasMetadata(LLVMContext::MD_invariant_load); | ||||
| 752 | }; | ||||
| 753 | |||||
| 754 | // We can ignore the alias as long as the load comes before the store, | ||||
| 755 | // because that means we won't be moving the load past the store to | ||||
| 756 | // vectorize it (the vectorized load is inserted at the location of the | ||||
| 757 | // first load in the chain). | ||||
| 758 | if (isa<StoreInst>(MemInstr) && ChainLoad && | ||||
| 759 | (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr))) | ||||
| 760 | continue; | ||||
| 761 | |||||
| 762 | // Same case, but in reverse. | ||||
| 763 | if (MemLoad && isa<StoreInst>(ChainInstr) && | ||||
| 764 | (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr))) | ||||
| 765 | continue; | ||||
| 766 | |||||
| 767 | if (!AA.isNoAlias(MemoryLocation::get(MemInstr), | ||||
| 768 | MemoryLocation::get(ChainInstr))) { | ||||
| 769 | LLVM_DEBUG({ | ||||
| 770 | dbgs() << "LSV: Found alias:\n" | ||||
| 771 | " Aliasing instruction and pointer:\n" | ||||
| 772 | << " " << *MemInstr << '\n' | ||||
| 773 | << " " << *getLoadStorePointerOperand(MemInstr) << '\n' | ||||
| 774 | << " Aliased instruction and pointer:\n" | ||||
| 775 | << " " << *ChainInstr << '\n' | ||||
| 776 | << " " << *getLoadStorePointerOperand(ChainInstr) << '\n'; | ||||
| 777 | }); | ||||
| 778 | // Save this aliasing memory instruction as a barrier, but allow other | ||||
| 779 | // instructions that precede the barrier to be vectorized with this one. | ||||
| 780 | BarrierMemoryInstr = MemInstr; | ||||
| 781 | break; | ||||
| 782 | } | ||||
| 783 | } | ||||
| 784 | // Continue the search only for store chains, since vectorizing stores that | ||||
| 785 | // precede an aliasing load is valid. Conversely, vectorizing loads is valid | ||||
| 786 | // up to an aliasing store, but should not pull loads from further down in | ||||
| 787 | // the basic block. | ||||
| 788 | if (IsLoadChain && BarrierMemoryInstr) { | ||||
| 789 | // The BarrierMemoryInstr is a store that precedes ChainInstr. | ||||
| 790 | assert(BarrierMemoryInstr->comesBefore(ChainInstr)); | ||||
| 791 | break; | ||||
| 792 | } | ||||
| 793 | } | ||||
| 794 | |||||
| 795 | // Find the largest prefix of Chain whose elements are all in | ||||
| 796 | // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of | ||||
| 797 | // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB | ||||
| 798 | // order.) | ||||
| 799 | SmallPtrSet<Instruction *, 8> VectorizableChainInstrs( | ||||
| 800 | ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx); | ||||
| 801 | unsigned ChainIdx = 0; | ||||
| 802 | for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) { | ||||
| 803 | if (!VectorizableChainInstrs.count(Chain[ChainIdx])) | ||||
| 804 | break; | ||||
| 805 | } | ||||
| 806 | return Chain.slice(0, ChainIdx); | ||||
| 807 | } | ||||
| 808 | |||||
| 809 | static ChainID getChainID(const Value *Ptr) { | ||||
| 810 | const Value *ObjPtr = getUnderlyingObject(Ptr); | ||||
| 811 | if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { | ||||
| 812 | // The selects themselves are distinct instructions even if they share the | ||||
| 813 | // same condition and evaluate to consecutive pointers for true and false | ||||
| 814 | // values of the condition. Therefore using the selects themselves for | ||||
| 815 | // grouping instructions would put consecutive accesses into different lists | ||||
| 816 | // and they won't even be checked for being consecutive, and won't be | ||||
| 817 | // vectorized. | ||||
| 818 | return Sel->getCondition(); | ||||
| 819 | } | ||||
| 820 | return ObjPtr; | ||||
| 821 | } | ||||
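// Illustrative sketch (added annotation, not part of the upstream file;
// hypothetical IR) of why the condition is used as the ChainID:
//   %q0 = select i1 %c, i32* %a, i32* %b
//   %q1 = select i1 %c, i32* %a1, i32* %b1 ; %a1/%b1 = %a/%b plus 4 bytes
// getUnderlyingObject() does not look through selects, so %q0 and %q1 are
// two distinct underlying objects; keying on them would put accesses
// through %q0 and %q1 into separate lists. Keying both on %c keeps them in
// one list, where lookThroughSelects() can prove them consecutive.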
| 822 | |||||
| 823 | std::pair<InstrListMap, InstrListMap> | ||||
| 824 | Vectorizer::collectInstructions(BasicBlock *BB) { | ||||
| 825 | InstrListMap LoadRefs; | ||||
| 826 | InstrListMap StoreRefs; | ||||
| 827 | |||||
| 828 | for (Instruction &I : *BB) { | ||||
| 829 | if (!I.mayReadOrWriteMemory()) | ||||
| 830 | continue; | ||||
| 831 | |||||
| 832 | if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { | ||||
| 833 | if (!LI->isSimple()) | ||||
| 834 | continue; | ||||
| 835 | |||||
| 836 | // Skip if it's not legal. | ||||
| 837 | if (!TTI.isLegalToVectorizeLoad(LI)) | ||||
| 838 | continue; | ||||
| 839 | |||||
| 840 | Type *Ty = LI->getType(); | ||||
| 841 | if (!VectorType::isValidElementType(Ty->getScalarType())) | ||||
| 842 | continue; | ||||
| 843 | |||||
| 844 | // Skip weird non-byte sizes. They probably aren't worth the effort of | ||||
| 845 | // handling correctly. | ||||
| 846 | unsigned TySize = DL.getTypeSizeInBits(Ty); | ||||
| 847 | if ((TySize % 8) != 0) | ||||
| 848 | continue; | ||||
| 849 | |||||
| 850 | // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain | ||||
| 851 | // functions are currently using an integer type for the vectorized | ||||
| 852 | // load/store, and do not support casting between the integer type and a | ||||
| 853 | // vector of pointers (e.g. i64 to <2 x i16*>) | ||||
| 854 | if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) | ||||
| 855 | continue; | ||||
| 856 | |||||
| 857 | Value *Ptr = LI->getPointerOperand(); | ||||
| 858 | unsigned AS = Ptr->getType()->getPointerAddressSpace(); | ||||
| 859 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); | ||||
| 860 | |||||
| 861 | unsigned VF = VecRegSize / TySize; | ||||
| 862 | VectorType *VecTy = dyn_cast<VectorType>(Ty); | ||||
| 863 | |||||
| 864 | // No point in looking at these if they're too big to vectorize. | ||||
| 865 | if (TySize > VecRegSize / 2 || | ||||
| 866 | (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) | ||||
| 867 | continue; | ||||
| 868 | |||||
| 869 | // Make sure all the users of a vector are constant-index extracts. | ||||
| 870 | if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) { | ||||
| 871 | const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); | ||||
| 872 | return EEI && isa<ConstantInt>(EEI->getOperand(1)); | ||||
| 873 | })) | ||||
| 874 | continue; | ||||
| 875 | |||||
| 876 | // Save the load locations. | ||||
| 877 | const ChainID ID = getChainID(Ptr); | ||||
| 878 | LoadRefs[ID].push_back(LI); | ||||
| 879 | } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { | ||||
| 880 | if (!SI->isSimple()) | ||||
| 881 | continue; | ||||
| 882 | |||||
| 883 | // Skip if it's not legal. | ||||
| 884 | if (!TTI.isLegalToVectorizeStore(SI)) | ||||
| 885 | continue; | ||||
| 886 | |||||
| 887 | Type *Ty = SI->getValueOperand()->getType(); | ||||
| 888 | if (!VectorType::isValidElementType(Ty->getScalarType())) | ||||
| 889 | continue; | ||||
| 890 | |||||
| 891 | // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain | ||||
| 892 | // functions are currently using an integer type for the vectorized | ||||
| 893 | // load/store, and do not support casting between the integer type and a | ||||
| 894 | // vector of pointers (e.g. i64 to <2 x i16*>) | ||||
| 895 | if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) | ||||
| 896 | continue; | ||||
| 897 | |||||
| 898 | // Skip weird non-byte sizes. They probably aren't worth the effort of | ||||
| 899 | // handling correctly. | ||||
| 900 | unsigned TySize = DL.getTypeSizeInBits(Ty); | ||||
| 901 | if ((TySize % 8) != 0) | ||||
| 902 | continue; | ||||
| 903 | |||||
| 904 | Value *Ptr = SI->getPointerOperand(); | ||||
| 905 | unsigned AS = Ptr->getType()->getPointerAddressSpace(); | ||||
| 906 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); | ||||
| 907 | |||||
| 908 | unsigned VF = VecRegSize / TySize; | ||||
| 909 | VectorType *VecTy = dyn_cast<VectorType>(Ty); | ||||
| 910 | |||||
| 911 | // No point in looking at these if they're too big to vectorize. | ||||
| 912 | if (TySize > VecRegSize / 2 || | ||||
| 913 | (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) | ||||
| 914 | continue; | ||||
| 915 | |||||
| 916 | if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) { | ||||
| 917 | const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); | ||||
| 918 | return EEI && isa<ConstantInt>(EEI->getOperand(1)); | ||||
| 919 | })) | ||||
| 920 | continue; | ||||
| 921 | |||||
| 922 | // Save store location. | ||||
| 923 | const ChainID ID = getChainID(Ptr); | ||||
| 924 | StoreRefs[ID].push_back(SI); | ||||
| 925 | } | ||||
| 926 | } | ||||
| 927 | |||||
| 928 | return {LoadRefs, StoreRefs}; | ||||
| 929 | } | ||||
| 930 | |||||
| 931 | bool Vectorizer::vectorizeChains(InstrListMap &Map) { | ||||
| 932 | bool Changed = false; | ||||
| 933 | |||||
| 934 | for (const std::pair<ChainID, InstrList> &Chain : Map) { | ||||
| 935 | unsigned Size = Chain.second.size(); | ||||
| 936 | if (Size < 2) | ||||
| 937 | continue; | ||||
| 938 | |||||
| 939 | LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"); | ||||
| 940 | |||||
| 941 | // Process the stores in chunks of 64. | ||||
| 942 | for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) { | ||||
| 943 | unsigned Len = std::min<unsigned>(CE - CI, 64); | ||||
| 944 | ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len); | ||||
| 945 | Changed |= vectorizeInstructions(Chunk); | ||||
| 946 | } | ||||
| 947 | } | ||||
| 948 | |||||
| 949 | return Changed; | ||||
| 950 | } | ||||
| 951 | |||||
| 952 | bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { | ||||
| 953 | LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() | ||||
| 954 | << " instructions.\n"); | ||||
| 955 | SmallVector<int, 16> Heads, Tails; | ||||
| 956 | int ConsecutiveChain[64]; | ||||
| 957 | |||||
| 958 | // Do a quadratic search on all of the given loads/stores and find all of the | ||||
| 959 | // pairs of loads/stores that follow each other. | ||||
| 960 | for (int i = 0, e = Instrs.size(); i < e; ++i) { | ||||
| 961 | ConsecutiveChain[i] = -1; | ||||
| 962 | for (int j = e - 1; j >= 0; --j) { | ||||
| 963 | if (i == j) | ||||
| 964 | continue; | ||||
| 965 | |||||
| 966 | if (isConsecutiveAccess(Instrs[i], Instrs[j])) { | ||||
| 967 | if (ConsecutiveChain[i] != -1) { | ||||
| 968 | int CurDistance = std::abs(ConsecutiveChain[i] - i); | ||||
| 969 | int NewDistance = std::abs(ConsecutiveChain[i] - j); | ||||
| 970 | if (j < i || NewDistance > CurDistance) | ||||
| 971 | continue; // Should not insert. | ||||
| 972 | } | ||||
| 973 | |||||
| 974 | Tails.push_back(j); | ||||
| 975 | Heads.push_back(i); | ||||
| 976 | ConsecutiveChain[i] = j; | ||||
| 977 | } | ||||
| 978 | } | ||||
| 979 | } | ||||
| 980 | |||||
| 981 | bool Changed = false; | ||||
| 982 | SmallPtrSet<Instruction *, 16> InstructionsProcessed; | ||||
| 983 | |||||
| 984 | for (int Head : Heads) { | ||||
| 985 | if (InstructionsProcessed.count(Instrs[Head])) | ||||
| 986 | continue; | ||||
| 987 | bool LongerChainExists = false; | ||||
| 988 | for (unsigned TIt = 0; TIt < Tails.size(); TIt++) | ||||
| 989 | if (Head == Tails[TIt] && | ||||
| 990 | !InstructionsProcessed.count(Instrs[Heads[TIt]])) { | ||||
| 991 | LongerChainExists = true; | ||||
| 992 | break; | ||||
| 993 | } | ||||
| 994 | if (LongerChainExists) | ||||
| 995 | continue; | ||||
| 996 | |||||
| 997 | // We found an instr that starts a chain. Now follow the chain and try to | ||||
| 998 | // vectorize it. | ||||
| 999 | SmallVector<Instruction *, 16> Operands; | ||||
| 1000 | int I = Head; | ||||
| 1001 | while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { | ||||
| 1002 | if (InstructionsProcessed.count(Instrs[I])) | ||||
| 1003 | break; | ||||
| 1004 | |||||
| 1005 | Operands.push_back(Instrs[I]); | ||||
| 1006 | I = ConsecutiveChain[I]; | ||||
| 1007 | } | ||||
| 1008 | |||||
| 1009 | bool Vectorized = false; | ||||
| 1010 | if (isa<LoadInst>(*Operands.begin())) | ||||
| 1011 | Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed); | ||||
| 1012 | else | ||||
| 1013 | Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed); | ||||
| 1014 | |||||
| 1015 | Changed |= Vectorized; | ||||
| 1016 | } | ||||
| 1017 | |||||
| 1018 | return Changed; | ||||
| 1019 | } | ||||
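// Worked example (added annotation, not part of the upstream file): run the
// search above on a hypothetical Instrs = {A, B, C}, three i32 loads at
// byte offsets 0, 4, 8. isConsecutiveAccess(A, B) and (B, C) hold, so
// ConsecutiveChain = {A->B, B->C, C->none}, Heads = {A, B}, Tails = {B, C}.
// Processing head A: A is not the tail of any unprocessed head, so the walk
// A -> B -> C collects Operands = {A, B, C} and hands them to
// vectorizeLoadChain(). Head B is then skipped because vectorizing the
// first chain already put B into InstructionsProcessed.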
| 1020 | |||||
| 1021 | bool Vectorizer::vectorizeStoreChain( | ||||
| 1022 | ArrayRef<Instruction *> Chain, | ||||
| 1023 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { | ||||
| 1024 | StoreInst *S0 = cast<StoreInst>(Chain[0]); | ||||
| 1025 | |||||
| 1026 | // If the vector has an int element, default to int for the whole store. | ||||
| 1027 | Type *StoreTy = nullptr; | ||||
| 1028 | for (Instruction *I : Chain) { | ||||
| 1029 | StoreTy = cast<StoreInst>(I)->getValueOperand()->getType(); | ||||
| 1030 | if (StoreTy->isIntOrIntVectorTy()) | ||||
| 1031 | break; | ||||
| 1032 | |||||
| 1033 | if (StoreTy->isPtrOrPtrVectorTy()) { | ||||
| 1034 | StoreTy = Type::getIntNTy(F.getParent()->getContext(), | ||||
| 1035 | DL.getTypeSizeInBits(StoreTy)); | ||||
| 1036 | break; | ||||
| 1037 | } | ||||
| 1038 | } | ||||
| 1039 | assert(StoreTy && "Failed to find store type"); | ||||
| 1040 | |||||
| 1041 | unsigned Sz = DL.getTypeSizeInBits(StoreTy); | ||||
| 1042 | unsigned AS = S0->getPointerAddressSpace(); | ||||
| 1043 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); | ||||
| 1044 | unsigned VF = VecRegSize / Sz; | ||||
| 1045 | unsigned ChainSize = Chain.size(); | ||||
| 1046 | Align Alignment = S0->getAlign(); | ||||
| 1047 | |||||
| 1048 | if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { | ||||
| 1049 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | ||||
| 1050 | return false; | ||||
| 1051 | } | ||||
| 1052 | |||||
| 1053 | ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); | ||||
| 1054 | if (NewChain.empty()) { | ||||
| 1055 | // No vectorization possible. | ||||
| 1056 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | ||||
| 1057 | return false; | ||||
| 1058 | } | ||||
| 1059 | if (NewChain.size() == 1) { | ||||
| 1060 | // Failed after the first instruction. Discard it and try the smaller chain. | ||||
| 1061 | InstructionsProcessed->insert(NewChain.front()); | ||||
| 1062 | return false; | ||||
| 1063 | } | ||||
| 1064 | |||||
| 1065 | // Update Chain to the valid vectorizable subchain. | ||||
| 1066 | Chain = NewChain; | ||||
| 1067 | ChainSize = Chain.size(); | ||||
| 1068 | |||||
| 1069 | // Check if it's legal to vectorize this chain. If not, split the chain and | ||||
| 1070 | // try again. | ||||
| 1071 | unsigned EltSzInBytes = Sz / 8; | ||||
| 1072 | unsigned SzInBytes = EltSzInBytes * ChainSize; | ||||
| 1073 | |||||
| 1074 | FixedVectorType *VecTy; | ||||
| 1075 | auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy); | ||||
| 1076 | if (VecStoreTy) | ||||
| 1077 | VecTy = FixedVectorType::get(StoreTy->getScalarType(), | ||||
| 1078 | Chain.size() * VecStoreTy->getNumElements()); | ||||
| 1079 | else | ||||
| 1080 | VecTy = FixedVectorType::get(StoreTy, Chain.size()); | ||||
| 1081 | |||||
| 1082 | // If it's more than the max vector size or the target has a better | ||||
| 1083 | // vector factor, break it into two pieces. | ||||
| 1084 | unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); | ||||
| 1085 | if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { | ||||
| 1086 | LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." | ||||
| 1087 | " Creating two separate arrays.\n"); | ||||
| 1088 | return vectorizeStoreChain(Chain.slice(0, TargetVF), | ||||
| 1089 | InstructionsProcessed) | | ||||
| 1090 | vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); | ||||
| 1091 | } | ||||
| 1092 | |||||
| 1093 | LLVM_DEBUG({ | ||||
| 1094 | dbgs() << "LSV: Stores to vectorize:\n"; | ||||
| 1095 | for (Instruction *I : Chain) | ||||
| 1096 | dbgs() << " " << *I << "\n"; | ||||
| 1097 | }); | ||||
| 1098 | |||||
| 1099 | // We won't try again to vectorize the elements of the chain, regardless of | ||||
| 1100 | // whether we succeed below. | ||||
| 1101 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | ||||
| 1102 | |||||
| 1103 | // If the store is going to be misaligned, don't vectorize it. | ||||
| 1104 | if (accessIsMisaligned(SzInBytes, AS, Alignment)) { | ||||
| 1105 | if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { | ||||
| 1106 | auto Chains = splitOddVectorElts(Chain, Sz); | ||||
| 1107 | return vectorizeStoreChain(Chains.first, InstructionsProcessed) | | ||||
| 1108 | vectorizeStoreChain(Chains.second, InstructionsProcessed); | ||||
| 1109 | } | ||||
| 1110 | |||||
| 1111 | Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), | ||||
| 1112 | Align(StackAdjustedAlignment), | ||||
| 1113 | DL, S0, nullptr, &DT); | ||||
| 1114 | if (NewAlign >= Alignment) | ||||
| 1115 | Alignment = NewAlign; | ||||
| 1116 | else | ||||
| 1117 | return false; | ||||
| 1118 | } | ||||
| 1119 | |||||
| 1120 | if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { | ||||
| 1121 | auto Chains = splitOddVectorElts(Chain, Sz); | ||||
| 1122 | return vectorizeStoreChain(Chains.first, InstructionsProcessed) | | ||||
| 1123 | vectorizeStoreChain(Chains.second, InstructionsProcessed); | ||||
| 1124 | } | ||||
| 1125 | |||||
| 1126 | BasicBlock::iterator First, Last; | ||||
| 1127 | std::tie(First, Last) = getBoundaryInstrs(Chain); | ||||
| 1128 | Builder.SetInsertPoint(&*Last); | ||||
| 1129 | |||||
| 1130 | Value *Vec = UndefValue::get(VecTy); | ||||
| 1131 | |||||
| 1132 | if (VecStoreTy) { | ||||
| 1133 | unsigned VecWidth = VecStoreTy->getNumElements(); | ||||
| 1134 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { | ||||
| 1135 | StoreInst *Store = cast<StoreInst>(Chain[I]); | ||||
| 1136 | for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) { | ||||
| 1137 | unsigned NewIdx = J + I * VecWidth; | ||||
| 1138 | Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(), | ||||
| 1139 | Builder.getInt32(J)); | ||||
| 1140 | if (Extract->getType() != StoreTy->getScalarType()) | ||||
| 1141 | Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); | ||||
| 1142 | |||||
| 1143 | Value *Insert = | ||||
| 1144 | Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); | ||||
| 1145 | Vec = Insert; | ||||
| 1146 | } | ||||
| 1147 | } | ||||
| 1148 | } else { | ||||
| 1149 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { | ||||
| 1150 | StoreInst *Store = cast<StoreInst>(Chain[I]); | ||||
| 1151 | Value *Extract = Store->getValueOperand(); | ||||
| 1152 | if (Extract->getType() != StoreTy->getScalarType()) | ||||
| 1153 | Extract = | ||||
| 1154 | Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); | ||||
| 1155 | |||||
| 1156 | Value *Insert = | ||||
| 1157 | Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); | ||||
| 1158 | Vec = Insert; | ||||
| 1159 | } | ||||
| 1160 | } | ||||
| 1161 | |||||
| 1162 | StoreInst *SI = Builder.CreateAlignedStore( | ||||
| 1163 | Vec, | ||||
| 1164 | Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)), | ||||
| 1165 | Alignment); | ||||
| 1166 | propagateMetadata(SI, Chain); | ||||
| 1167 | |||||
| 1168 | eraseInstructions(Chain); | ||||
| 1169 | ++NumVectorInstructions; | ||||
| 1170 | NumScalarsVectorized += Chain.size(); | ||||
| 1171 | return true; | ||||
| 1172 | } | ||||
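| | // Editor's illustration (not part of the original source): given a chain of | ||||
| | // four consecutive i32 stores, the routine above builds one vector with four | ||||
| | // insertelement instructions and emits a single wide store, roughly: | ||||
| | //   store i32 %a, i32* %p0          %v0 = insertelement undef, %a, 0 | ||||
| | //   store i32 %b, i32* %p1          ... | ||||
| | //   store i32 %c, i32* %p2    =>    %v3 = insertelement %v2, %d, 3 | ||||
| | //   store i32 %d, i32* %p3          store <4 x i32> %v3, <4 x i32>* %p0 | ||||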
| 1173 | |||||
| 1174 | bool Vectorizer::vectorizeLoadChain( | ||||
| 1175 | ArrayRef<Instruction *> Chain, | ||||
| 1176 | SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { | ||||
| 1177 | LoadInst *L0 = cast<LoadInst>(Chain[0]); | ||||
| 1178 | |||||
| 1179 | // If the vector has an int element, default to int for the whole load. | ||||
| 1180 | Type *LoadTy = nullptr; | ||||
| 1181 | for (const auto &V : Chain) { | ||||
| 1182 | LoadTy = cast<LoadInst>(V)->getType(); | ||||
| 1183 | if (LoadTy->isIntOrIntVectorTy()) | ||||
| 1184 | break; | ||||
| 1185 | |||||
| 1186 | if (LoadTy->isPtrOrPtrVectorTy()) { | ||||
| 1187 | LoadTy = Type::getIntNTy(F.getParent()->getContext(), | ||||
| 1188 | DL.getTypeSizeInBits(LoadTy)); | ||||
| 1189 | break; | ||||
| 1190 | } | ||||
| 1191 | } | ||||
| 1192 | assert(LoadTy && "Can't determine LoadInst type from chain"); | ||||
| 1193 | |||||
| 1194 | unsigned Sz = DL.getTypeSizeInBits(LoadTy); | ||||
| 1195 | unsigned AS = L0->getPointerAddressSpace(); | ||||
| 1196 | unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); | ||||
| 1197 | unsigned VF = VecRegSize / Sz; | ||||
| 1198 | unsigned ChainSize = Chain.size(); | ||||
| 1199 | Align Alignment = L0->getAlign(); | ||||
| 1200 | |||||
| 1201 | if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { | ||||
| 1202 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | ||||
| 1203 | return false; | ||||
| 1204 | } | ||||
| 1205 | |||||
| 1206 | ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); | ||||
| 1207 | if (NewChain.empty()) { | ||||
| 1208 | // No vectorization possible. | ||||
| 1209 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | ||||
| 1210 | return false; | ||||
| 1211 | } | ||||
| 1212 | if (NewChain.size() == 1) { | ||||
| 1213 | // Failed after the first instruction. Discard it and try the smaller chain. | ||||
| 1214 | InstructionsProcessed->insert(NewChain.front()); | ||||
| 1215 | return false; | ||||
| 1216 | } | ||||
| 1217 | |||||
| 1218 | // Update Chain to the valid vectorizable subchain. | ||||
| 1219 | Chain = NewChain; | ||||
| 1220 | ChainSize = Chain.size(); | ||||
| 1221 | |||||
| 1222 | // Check if it's legal to vectorize this chain. If not, split the chain and | ||||
| 1223 | // try again. | ||||
| 1224 | unsigned EltSzInBytes = Sz / 8; | ||||
| 1225 | unsigned SzInBytes = EltSzInBytes * ChainSize; | ||||
| 1226 | VectorType *VecTy; | ||||
| 1227 | auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy); | ||||
| 1228 | if (VecLoadTy) | ||||
| 1229 | VecTy = FixedVectorType::get(LoadTy->getScalarType(), | ||||
| 1230 | Chain.size() * VecLoadTy->getNumElements()); | ||||
| 1231 | else | ||||
| 1232 | VecTy = FixedVectorType::get(LoadTy, Chain.size()); | ||||
| 1233 | |||||
| 1234 | // If it's more than the max vector size or the target has a better | ||||
| 1235 | // vector factor, break it into two pieces. | ||||
| 1236 | unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); | ||||
| 1237 | if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { | ||||
| 1238 | LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." | ||||
| 1239 | " Creating two separate arrays.\n"); | ||||
| 1240 | return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) | | ||||
| 1241 | vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); | ||||
| 1242 | } | ||||
| 1243 | |||||
| 1244 | // We won't try again to vectorize the elements of the chain, regardless of | ||||
| 1245 | // whether we succeed below. | ||||
| 1246 | InstructionsProcessed->insert(Chain.begin(), Chain.end()); | ||||
| 1247 | |||||
| 1248 | // If the load is going to be misaligned, don't vectorize it. | ||||
| 1249 | if (accessIsMisaligned(SzInBytes, AS, Alignment)) { | ||||
| 1250 | if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { | ||||
| 1251 | auto Chains = splitOddVectorElts(Chain, Sz); | ||||
| 1252 | return vectorizeLoadChain(Chains.first, InstructionsProcessed) | | ||||
| 1253 | vectorizeLoadChain(Chains.second, InstructionsProcessed); | ||||
| 1254 | } | ||||
| 1255 | |||||
| 1256 | Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(), | ||||
| 1257 | Align(StackAdjustedAlignment), | ||||
| 1258 | DL, L0, nullptr, &DT); | ||||
| 1259 | if (NewAlign >= Alignment) | ||||
| 1260 | Alignment = NewAlign; | ||||
| 1261 | else | ||||
| 1262 | return false; | ||||
| 1263 | } | ||||
| 1264 | |||||
| 1265 | if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { | ||||
| 1266 | auto Chains = splitOddVectorElts(Chain, Sz); | ||||
| 1267 | return vectorizeLoadChain(Chains.first, InstructionsProcessed) | | ||||
| 1268 | vectorizeLoadChain(Chains.second, InstructionsProcessed); | ||||
| 1269 | } | ||||
| 1270 | |||||
| 1271 | LLVM_DEBUG({ | ||||
| 1272 | dbgs() << "LSV: Loads to vectorize:\n"; | ||||
| 1273 | for (Instruction *I : Chain) | ||||
| 1274 | I->dump(); | ||||
| 1275 | }); | ||||
| 1276 | |||||
| 1277 | // getVectorizablePrefix already computed getBoundaryInstrs. The value of | ||||
| 1278 | // Last may have changed since then, but the value of First won't have. If it | ||||
| 1279 | // matters, we could compute getBoundaryInstrs only once and reuse it here. | ||||
| 1280 | BasicBlock::iterator First, Last; | ||||
| 1281 | std::tie(First, Last) = getBoundaryInstrs(Chain); | ||||
| 1282 | Builder.SetInsertPoint(&*First); | ||||
| 1283 | |||||
| 1284 | Value *Bitcast = | ||||
| 1285 | Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); | ||||
| 1286 | LoadInst *LI = | ||||
| 1287 | Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment)); | ||||
| 1288 | propagateMetadata(LI, Chain); | ||||
| 1289 | |||||
| 1290 | if (VecLoadTy) { | ||||
| 1291 | SmallVector<Instruction *, 16> InstrsToErase; | ||||
| 1292 | |||||
| 1293 | unsigned VecWidth = VecLoadTy->getNumElements(); | ||||
| 1294 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { | ||||
| 1295 | for (auto Use : Chain[I]->users()) { | ||||
| 1296 | // All users of vector loads are ExtractElement instructions with | ||||
| 1297 | // constant indices, otherwise we would have bailed before now. | ||||
| 1298 | Instruction *UI = cast<Instruction>(Use); | ||||
| 1299 | unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue(); | ||||
| 1300 | unsigned NewIdx = Idx + I * VecWidth; | ||||
| 1301 | Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx), | ||||
| 1302 | UI->getName()); | ||||
| 1303 | if (V->getType() != UI->getType()) | ||||
| 1304 | V = Builder.CreateBitCast(V, UI->getType()); | ||||
| 1305 | |||||
| 1306 | // Replace the old instruction. | ||||
| 1307 | UI->replaceAllUsesWith(V); | ||||
| 1308 | InstrsToErase.push_back(UI); | ||||
| 1309 | } | ||||
| 1310 | } | ||||
| 1311 | |||||
| 1312 | // Bitcast might not be an Instruction, if the value being loaded is a | ||||
| 1313 | // constant. In that case, no need to reorder anything. | ||||
| 1314 | if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) | ||||
| 1315 | reorder(BitcastInst); | ||||
| 1316 | |||||
| 1317 | for (auto I : InstrsToErase) | ||||
| 1318 | I->eraseFromParent(); | ||||
| 1319 | } else { | ||||
| 1320 | for (unsigned I = 0, E = Chain.size(); I != E; ++I) { | ||||
| 1321 | Value *CV = Chain[I]; | ||||
| 1322 | Value *V = | ||||
| 1323 | Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); | ||||
| 1324 | if (V->getType() != CV->getType()) { | ||||
| 1325 | V = Builder.CreateBitOrPointerCast(V, CV->getType()); | ||||
| 1326 | } | ||||
| 1327 | |||||
| 1328 | // Replace the old instruction. | ||||
| 1329 | CV->replaceAllUsesWith(V); | ||||
| 1330 | } | ||||
| 1331 | |||||
| 1332 | if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) | ||||
| 1333 | reorder(BitcastInst); | ||||
| 1334 | } | ||||
| 1335 | |||||
| 1336 | eraseInstructions(Chain); | ||||
| 1337 | |||||
| 1338 | ++NumVectorInstructions; | ||||
| 1339 | NumScalarsVectorized += Chain.size(); | ||||
| 1340 | return true; | ||||
| 1341 | } | ||||
| 1342 | |||||
| 1343 | bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, | ||||
| 1344 | Align Alignment) { | ||||
| 1345 | if (Alignment.value() % SzInBytes == 0) | ||||
| 1346 | return false; | ||||
| 1347 | |||||
| 1348 | bool Fast = false; | ||||
| 1349 | bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), | ||||
| 1350 | SzInBytes * 8, AddressSpace, | ||||
| 1351 | Alignment, &Fast); | ||||
| 1352 | LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows | ||||
| 1353 | << " and fast? " << Fast << "\n";); | ||||
| 1354 | return !Allows || !Fast; | ||||
| 1355 | } |
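| | // Editor's note (illustrative values, not in the original source): the early | ||||
| | // return fires only when the known alignment is a multiple of the access size, | ||||
| | // e.g. a 16-byte access at Align(16) is never misaligned, while the same | ||||
| | // access at Align(4) counts as misaligned unless TTI reports unaligned | ||||
| | // accesses as both allowed and fast. | ||||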
| 1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// | ||||
| 2 | // | ||||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
| 6 | // | ||||
| 7 | //===----------------------------------------------------------------------===// | ||||
| 8 | // | ||||
| 9 | // This file contains some functions that are useful for math stuff. | ||||
| 10 | // | ||||
| 11 | //===----------------------------------------------------------------------===// | ||||
| 12 | |||||
| 13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H | ||||
| 14 | #define LLVM_SUPPORT_MATHEXTRAS_H | ||||
| 15 | |||||
| 16 | #include "llvm/Support/Compiler.h" | ||||
| 17 | #include <cassert> | ||||
| 18 | #include <climits> | ||||
| 19 | #include <cmath> | ||||
| 20 | #include <cstdint> | ||||
| 21 | #include <cstring> | ||||
| 22 | #include <limits> | ||||
| 23 | #include <type_traits> | ||||
| 24 | |||||
| 25 | #ifdef __ANDROID_NDK__ | ||||
| 26 | #include <android/api-level.h> | ||||
| 27 | #endif | ||||
| 28 | |||||
| 29 | #ifdef _MSC_VER | ||||
| 30 | // Declare these intrinsics manually rather including intrin.h. It's very | ||||
| 31 | // expensive, and MathExtras.h is popular. | ||||
| 32 | // #include <intrin.h> | ||||
| 33 | extern "C" { | ||||
| 34 | unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); | ||||
| 35 | unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); | ||||
| 36 | unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); | ||||
| 37 | unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); | ||||
| 38 | } | ||||
| 39 | #endif | ||||
| 40 | |||||
| 41 | namespace llvm { | ||||
| 42 | |||||
| 43 | /// The behavior an operation has on an input of 0. | ||||
| 44 | enum ZeroBehavior { | ||||
| 45 | /// The returned value is undefined. | ||||
| 46 | ZB_Undefined, | ||||
| 47 | /// The returned value is numeric_limits<T>::max() | ||||
| 48 | ZB_Max, | ||||
| 49 | /// The returned value is numeric_limits<T>::digits | ||||
| 50 | ZB_Width | ||||
| 51 | }; | ||||
| 52 | |||||
| 53 | /// Mathematical constants. | ||||
| 54 | namespace numbers { | ||||
| 55 | // TODO: Track C++20 std::numbers. | ||||
| 56 | // TODO: Favor using the hexadecimal FP constants (requires C++17). | ||||
| 57 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 | ||||
| 58 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 | ||||
| 59 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 | ||||
| 60 | ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392 | ||||
| 61 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) | ||||
| 62 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) | ||||
| 63 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 | ||||
| 64 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 | ||||
| 65 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 | ||||
| 66 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 | ||||
| 67 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193 | ||||
| 68 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) | ||||
| 69 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 | ||||
| 70 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) | ||||
| 71 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 | ||||
| 72 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 | ||||
| 73 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 | ||||
| 74 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 | ||||
| 75 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 | ||||
| 76 | log2ef = 1.44269504F, // (0x1.715476P+0) | ||||
| 77 | log10ef = .434294482F, // (0x1.bcb7b2P-2) | ||||
| 78 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 | ||||
| 79 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 | ||||
| 80 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 | ||||
| 81 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 | ||||
| 82 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 | ||||
| 83 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) | ||||
| 84 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 | ||||
| 85 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) | ||||
| 86 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 | ||||
| 87 | } // namespace numbers | ||||
| 88 | |||||
| 89 | namespace detail { | ||||
| 90 | template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter { | ||||
| 91 | static unsigned count(T Val, ZeroBehavior) { | ||||
| 92 | if (!Val) | ||||
| 93 | return std::numeric_limits<T>::digits; | ||||
| 94 | if (Val & 0x1) | ||||
| 95 | return 0; | ||||
| 96 | |||||
| 97 | // Bisection method. | ||||
| 98 | unsigned ZeroBits = 0; | ||||
| 99 | T Shift = std::numeric_limits<T>::digits >> 1; | ||||
| 100 | T Mask = std::numeric_limits<T>::max() >> Shift; | ||||
| 101 | while (Shift) { | ||||
| 102 | if ((Val & Mask) == 0) { | ||||
| 103 | Val >>= Shift; | ||||
| 104 | ZeroBits |= Shift; | ||||
| 105 | } | ||||
| 106 | Shift >>= 1; | ||||
| 107 | Mask >>= Shift; | ||||
| 108 | } | ||||
| 109 | return ZeroBits; | ||||
| 110 | } | ||||
| 111 | }; | ||||
| 112 | |||||
| 113 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||
| 114 | template <typename T> struct TrailingZerosCounter<T, 4> { | ||||
| 115 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||
| 116 | if (ZB != ZB_Undefined && Val == 0) | ||||
| 117 | return 32; | ||||
| 118 | |||||
| 119 | #if __has_builtin(__builtin_ctz) || defined(__GNUC__) | ||||
| 120 | return __builtin_ctz(Val); | ||||
| 121 | #elif defined(_MSC_VER) | ||||
| 122 | unsigned long Index; | ||||
| 123 | _BitScanForward(&Index, Val); | ||||
| 124 | return Index; | ||||
| 125 | #endif | ||||
| 126 | } | ||||
| 127 | }; | ||||
| 128 | |||||
| 129 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||
| 130 | template <typename T> struct TrailingZerosCounter<T, 8> { | ||||
| 131 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||
| 132 | if (ZB != ZB_Undefined && Val == 0) | ||||
| 133 | return 64; | ||||
| 134 | |||||
| 135 | #if __has_builtin(__builtin_ctzll) || defined(__GNUC__) | ||||
| 136 | return __builtin_ctzll(Val); | ||||
| 137 | #elif defined(_MSC_VER) | ||||
| 138 | unsigned long Index; | ||||
| 139 | _BitScanForward64(&Index, Val); | ||||
| 140 | return Index; | ||||
| 141 | #endif | ||||
| 142 | } | ||||
| 143 | }; | ||||
| 144 | #endif | ||||
| 145 | #endif | ||||
| 146 | } // namespace detail | ||||
| 147 | |||||
| 148 | /// Count number of 0's from the least significant bit to the most | ||||
| 149 | /// stopping at the first 1. | ||||
| 150 | /// | ||||
| 151 | /// Only unsigned integral types are allowed. | ||||
| 152 | /// | ||||
| 153 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||
| 154 | /// valid arguments. | ||||
| 155 | template <typename T> | ||||
| 156 | unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||
| 157 | static_assert(std::numeric_limits<T>::is_integer && | ||||
| 158 | !std::numeric_limits<T>::is_signed, | ||||
| 159 | "Only unsigned integral types are allowed."); | ||||
| 160 | return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||
| 161 | } | ||||
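| | // Editor's sketch (hypothetical helper, not in the original header): a tiny | ||||
| | // sanity check built only from the documented behavior above. | ||||
| | inline bool checkCountTrailingZerosExamples() { | ||||
| |   return countTrailingZeros(0x10u) == 4 && // 16 == 0b10000: four trailing zeros | ||||
| |          countTrailingZeros(0u) == 32;     // ZB_Width default: returns the width | ||||
| | } | ||||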
| 162 | |||||
| 163 | namespace detail { | ||||
| 164 | template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter { | ||||
| 165 | static unsigned count(T Val, ZeroBehavior) { | ||||
| 166 | if (!Val) | ||||
| 167 | return std::numeric_limits<T>::digits; | ||||
| 168 | |||||
| 169 | // Bisection method. | ||||
| 170 | unsigned ZeroBits = 0; | ||||
| 171 | for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) { | ||||
| 172 | T Tmp = Val >> Shift; | ||||
| 173 | if (Tmp) | ||||
| 174 | Val = Tmp; | ||||
| 175 | else | ||||
| 176 | ZeroBits |= Shift; | ||||
| 177 | } | ||||
| 178 | return ZeroBits; | ||||
| 179 | } | ||||
| 180 | }; | ||||
| 181 | |||||
| 182 | #if defined(__GNUC__) || defined(_MSC_VER) | ||||
| 183 | template <typename T> struct LeadingZerosCounter<T, 4> { | ||||
| 184 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||
| 185 | if (ZB != ZB_Undefined && Val == 0) | ||||
| 186 | return 32; | ||||
| 187 | |||||
| 188 | #if __has_builtin(__builtin_clz) || defined(__GNUC__) | ||||
| 189 | return __builtin_clz(Val); | ||||
| 190 | #elif defined(_MSC_VER) | ||||
| 191 | unsigned long Index; | ||||
| 192 | _BitScanReverse(&Index, Val); | ||||
| 193 | return Index ^ 31; | ||||
| 194 | #endif | ||||
| 195 | } | ||||
| 196 | }; | ||||
| 197 | |||||
| 198 | #if !defined(_MSC_VER) || defined(_M_X64) | ||||
| 199 | template <typename T> struct LeadingZerosCounter<T, 8> { | ||||
| 200 | static unsigned count(T Val, ZeroBehavior ZB) { | ||||
| 201 | if (ZB != ZB_Undefined && Val == 0) | ||||
| 202 | return 64; | ||||
| 203 | |||||
| 204 | #if __has_builtin(__builtin_clzll) || defined(__GNUC__) | ||||
| 205 | return __builtin_clzll(Val); | ||||
| 206 | #elif defined(_MSC_VER) | ||||
| 207 | unsigned long Index; | ||||
| 208 | _BitScanReverse64(&Index, Val); | ||||
| 209 | return Index ^ 63; | ||||
| 210 | #endif | ||||
| 211 | } | ||||
| 212 | }; | ||||
| 213 | #endif | ||||
| 214 | #endif | ||||
| 215 | } // namespace detail | ||||
| 216 | |||||
| 217 | /// Count number of 0's from the most significant bit to the least | ||||
| 218 | /// stopping at the first 1. | ||||
| 219 | /// | ||||
| 220 | /// Only unsigned integral types are allowed. | ||||
| 221 | /// | ||||
| 222 | /// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are | ||||
| 223 | /// valid arguments. | ||||
| 224 | template <typename T> | ||||
| 225 | unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) { | ||||
| 226 | static_assert(std::numeric_limits<T>::is_integer && | ||||
| 227 | !std::numeric_limits<T>::is_signed, | ||||
| 228 | "Only unsigned integral types are allowed."); | ||||
| 229 | return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB); | ||||
| 230 | } | ||||
| 231 | |||||
| 232 | /// Get the index of the first set bit starting from the least | ||||
| 233 | /// significant bit. | ||||
| 234 | /// | ||||
| 235 | /// Only unsigned integral types are allowed. | ||||
| 236 | /// | ||||
| 237 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||
| 238 | /// valid arguments. | ||||
| 239 | template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||
| 240 | if (ZB == ZB_Max && Val == 0) | ||||
| 241 | return std::numeric_limits<T>::max(); | ||||
| 242 | |||||
| 243 | return countTrailingZeros(Val, ZB_Undefined); | ||||
| 244 | } | ||||
| 245 | |||||
| 246 | /// Create a bitmask with the N right-most bits set to 1, and all other | ||||
| 247 | /// bits set to 0. Only unsigned types are allowed. | ||||
| 248 | template <typename T> T maskTrailingOnes(unsigned N) { | ||||
| 249 | static_assert(std::is_unsigned<T>::value, "Invalid type!"); | ||||
| 250 | const unsigned Bits = CHAR_BIT * sizeof(T); | ||||
| 251 | assert(N <= Bits && "Invalid bit index"); | ||||
| 252 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); | ||||
| 253 | } | ||||
| 254 | |||||
| 255 | /// Create a bitmask with the N left-most bits set to 1, and all other | ||||
| 256 | /// bits set to 0. Only unsigned types are allowed. | ||||
| 257 | template <typename T> T maskLeadingOnes(unsigned N) { | ||||
| 258 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||
| 259 | } | ||||
| 260 | |||||
| 261 | /// Create a bitmask with the N right-most bits set to 0, and all other | ||||
| 262 | /// bits set to 1. Only unsigned types are allowed. | ||||
| 263 | template <typename T> T maskTrailingZeros(unsigned N) { | ||||
| 264 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||
| 265 | } | ||||
| 266 | |||||
| 267 | /// Create a bitmask with the N left-most bits set to 0, and all other | ||||
| 268 | /// bits set to 1. Only unsigned types are allowed. | ||||
| 269 | template <typename T> T maskLeadingZeros(unsigned N) { | ||||
| 270 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); | ||||
| 271 | } | ||||
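| | // Editor's sketch (hypothetical helper, not in the original header): the four | ||||
| | // mask builders above, shown on uint32_t with N == 8. | ||||
| | inline bool checkMaskExamples() { | ||||
| |   return maskTrailingOnes<uint32_t>(8) == 0x000000FFu && | ||||
| |          maskLeadingOnes<uint32_t>(8) == 0xFF000000u && | ||||
| |          maskTrailingZeros<uint32_t>(8) == 0xFFFFFF00u && | ||||
| |          maskLeadingZeros<uint32_t>(8) == 0x00FFFFFFu; | ||||
| | } | ||||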
| 272 | |||||
| 273 | /// Get the index of the last set bit starting from the least | ||||
| 274 | /// significant bit. | ||||
| 275 | /// | ||||
| 276 | /// Only unsigned integral types are allowed. | ||||
| 277 | /// | ||||
| 278 | /// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are | ||||
| 279 | /// valid arguments. | ||||
| 280 | template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) { | ||||
| 281 | if (ZB == ZB_Max && Val == 0) | ||||
| 282 | return std::numeric_limits<T>::max(); | ||||
| 283 | |||||
| 284 | // Use ^ instead of - because both gcc and llvm can remove the associated ^ | ||||
| 285 | // in the __builtin_clz intrinsic on x86. | ||||
| 286 | return countLeadingZeros(Val, ZB_Undefined) ^ | ||||
| 287 | (std::numeric_limits<T>::digits - 1); | ||||
| 288 | } | ||||
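| | // Editor's sketch (hypothetical helper, not in the original header): | ||||
| | // findFirstSet/findLastSet on 0x18 == 0b11000. | ||||
| | inline bool checkFindSetExamples() { | ||||
| |   return findFirstSet(0x18u) == 3 && // lowest set bit is bit 3 | ||||
| |          findLastSet(0x18u) == 4 &&  // highest set bit is bit 4 | ||||
| |          findFirstSet(0u) == std::numeric_limits<unsigned>::max(); // ZB_Max | ||||
| | } | ||||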
| 289 | |||||
| 290 | /// Macro compressed bit reversal table for 256 bits. | ||||
| 291 | /// | ||||
| 292 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable | ||||
| 293 | static const unsigned char BitReverseTable256[256] = { | ||||
| 294 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 | ||||
| 295 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) | ||||
| 296 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) | ||||
| 297 | R6(0), R6(2), R6(1), R6(3) | ||||
| 298 | #undef R2 | ||||
| 299 | #undef R4 | ||||
| 300 | #undef R6 | ||||
| 301 | }; | ||||
| 302 | |||||
| 303 | /// Reverse the bits in \p Val. | ||||
| 304 | template <typename T> | ||||
| 305 | T reverseBits(T Val) { | ||||
| 306 | unsigned char in[sizeof(Val)]; | ||||
| 307 | unsigned char out[sizeof(Val)]; | ||||
| 308 | std::memcpy(in, &Val, sizeof(Val)); | ||||
| 309 | for (unsigned i = 0; i < sizeof(Val); ++i) | ||||
| 310 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; | ||||
| 311 | std::memcpy(&Val, out, sizeof(Val)); | ||||
| 312 | return Val; | ||||
| 313 | } | ||||
| 314 | |||||
| 315 | #if __has_builtin(__builtin_bitreverse8) | ||||
| 316 | template<> | ||||
| 317 | inline uint8_t reverseBits<uint8_t>(uint8_t Val) { | ||||
| 318 | return __builtin_bitreverse8(Val); | ||||
| 319 | } | ||||
| 320 | #endif | ||||
| 321 | |||||
| 322 | #if __has_builtin(__builtin_bitreverse16) | ||||
| 323 | template<> | ||||
| 324 | inline uint16_t reverseBits<uint16_t>(uint16_t Val) { | ||||
| 325 | return __builtin_bitreverse16(Val); | ||||
| 326 | } | ||||
| 327 | #endif | ||||
| 328 | |||||
| 329 | #if __has_builtin(__builtin_bitreverse32) | ||||
| 330 | template<> | ||||
| 331 | inline uint32_t reverseBits<uint32_t>(uint32_t Val) { | ||||
| 332 | return __builtin_bitreverse32(Val); | ||||
| 333 | } | ||||
| 334 | #endif | ||||
| 335 | |||||
| 336 | #if __has_builtin(__builtin_bitreverse64) | ||||
| 337 | template<> | ||||
| 338 | inline uint64_t reverseBits<uint64_t>(uint64_t Val) { | ||||
| 339 | return __builtin_bitreverse64(Val); | ||||
| 340 | } | ||||
| 341 | #endif | ||||
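| | // Editor's sketch (hypothetical helper, not in the original header), placed | ||||
| | // after the specializations so the builtin paths are used when available. | ||||
| | inline bool checkReverseBitsExamples() { | ||||
| |   return reverseBits<uint8_t>(0x01) == 0x80 && // bit 0 moves to bit 7 | ||||
| |          reverseBits<uint32_t>(0x00000001u) == 0x80000000u; | ||||
| | } | ||||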
| 342 | |||||
| 343 | // NOTE: The following support functions use the _32/_64 extensions instead of | ||||
| 344 | // type overloading so that signed and unsigned integers can be used without | ||||
| 345 | // ambiguity. | ||||
| 346 | |||||
| 347 | /// Return the high 32 bits of a 64 bit value. | ||||
| 348 | constexpr inline uint32_t Hi_32(uint64_t Value) { | ||||
| 349 | return static_cast<uint32_t>(Value >> 32); | ||||
| 350 | } | ||||
| 351 | |||||
| 352 | /// Return the low 32 bits of a 64 bit value. | ||||
| 353 | constexpr inline uint32_t Lo_32(uint64_t Value) { | ||||
| 354 | return static_cast<uint32_t>(Value); | ||||
| 355 | } | ||||
| 356 | |||||
| 357 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. | ||||
| 358 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { | ||||
| 359 | return ((uint64_t)High << 32) | (uint64_t)Low; | ||||
| 360 | } | ||||
| 361 | |||||
| 362 | /// Checks if an integer fits into the given bit width. | ||||
| 363 | template <unsigned N> constexpr inline bool isInt(int64_t x) { | ||||
| 364 | return N >= 64 || (-(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1))); | ||||
| 365 | } | ||||
| 366 | // Template specializations to get better code for common cases. | ||||
| 367 | template <> constexpr inline bool isInt<8>(int64_t x) { | ||||
| 368 | return static_cast<int8_t>(x) == x; | ||||
| 369 | } | ||||
| 370 | template <> constexpr inline bool isInt<16>(int64_t x) { | ||||
| 371 | return static_cast<int16_t>(x) == x; | ||||
| 372 | } | ||||
| 373 | template <> constexpr inline bool isInt<32>(int64_t x) { | ||||
| 374 | return static_cast<int32_t>(x) == x; | ||||
| 375 | } | ||||
| 376 | |||||
| 377 | /// Checks if a signed integer is an N bit number shifted left by S. | ||||
| 378 | template <unsigned N, unsigned S> | ||||
| 379 | constexpr inline bool isShiftedInt(int64_t x) { | ||||
| 380 | static_assert( | ||||
| 381 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); | ||||
| 382 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); | ||||
| 383 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||
| 384 | } | ||||
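| | // Editor's sketch (illustrative, not in the original header): compile-time | ||||
| | // checks of the documented signed ranges. | ||||
| | static_assert(isInt<8>(127) && isInt<8>(-128) && !isInt<8>(128), | ||||
| |               "isInt<8> accepts exactly [-128, 127]"); | ||||
| | static_assert(isShiftedInt<8, 2>(508) && !isShiftedInt<8, 2>(510), | ||||
| |               "508 == 127 << 2; 510 is not a multiple of 1 << 2"); | ||||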
| 385 | |||||
| 386 | /// Checks if an unsigned integer fits into the given bit width. | ||||
| 387 | /// | ||||
| 388 | /// This is written as two functions rather than as simply | ||||
| 389 | /// | ||||
| 390 | /// return N >= 64 || X < (UINT64_C(1) << N); | ||||
| 391 | /// | ||||
| 392 | /// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting | ||||
| 393 | /// left too many places. | ||||
| 394 | template <unsigned N> | ||||
| 395 | constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) { | ||||
| 396 | static_assert(N > 0, "isUInt<0> doesn't make sense"); | ||||
| 397 | return X < (UINT64_C(1) << (N)); | ||||
| 398 | } | ||||
| 399 | template <unsigned N> | ||||
| 400 | constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) { | ||||
| 401 | return true; | ||||
| 402 | } | ||||
| 403 | |||||
| 404 | // Template specializations to get better code for common cases. | ||||
| 405 | template <> constexpr inline bool isUInt<8>(uint64_t x) { | ||||
| 406 | return static_cast<uint8_t>(x) == x; | ||||
| 407 | } | ||||
| 408 | template <> constexpr inline bool isUInt<16>(uint64_t x) { | ||||
| 409 | return static_cast<uint16_t>(x) == x; | ||||
| 410 | } | ||||
| 411 | template <> constexpr inline bool isUInt<32>(uint64_t x) { | ||||
| 412 | return static_cast<uint32_t>(x) == x; | ||||
| 413 | } | ||||
| 414 | |||||
| 415 | /// Checks if an unsigned integer is an N bit number shifted left by S. | ||||
| 416 | template <unsigned N, unsigned S> | ||||
| 417 | constexpr inline bool isShiftedUInt(uint64_t x) { | ||||
| 418 | static_assert( | ||||
| 419 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); | ||||
| 420 | static_assert(N + S <= 64, | ||||
| 421 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); | ||||
| 422 | // Per the two static_asserts above, S must be strictly less than 64. So | ||||
| 423 | // 1 << S is not undefined behavior. | ||||
| 424 | return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); | ||||
| 425 | } | ||||
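| | // Editor's sketch (illustrative, not in the original header): the unsigned | ||||
| | // counterparts of the checks above. | ||||
| | static_assert(isUInt<8>(255) && !isUInt<8>(256), | ||||
| |               "isUInt<8> accepts exactly [0, 255]"); | ||||
| | static_assert(isShiftedUInt<8, 4>(0xFF0) && !isShiftedUInt<8, 4>(0xFF8), | ||||
| |               "0xFF0 is an 8-bit value shifted left by 4; 0xFF8 is not"); | ||||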
| 426 | |||||
| 427 | /// Gets the maximum value for a N-bit unsigned integer. | ||||
| 428 | inline uint64_t maxUIntN(uint64_t N) { | ||||
| 429 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||
| 430 | |||||
| 431 | // uint64_t(1) << 64 is undefined behavior, so we can't do | ||||
| 432 | // (uint64_t(1) << N) - 1 | ||||
| 433 | // without checking first that N != 64. But this works and doesn't have a | ||||
| 434 | // branch. | ||||
| 435 | return UINT64_MAX >> (64 - N); | ||||
| 436 | } | ||||
| 437 | |||||
| 438 | /// Gets the minimum value for a N-bit signed integer. | ||||
| 439 | inline int64_t minIntN(int64_t N) { | ||||
| 440 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||
| 441 | |||||
| 442 | return UINT64_C(1) + ~(UINT64_C(1) << (N - 1)); | ||||
| 443 | } | ||||
| 444 | |||||
| 445 | /// Gets the maximum value for a N-bit signed integer. | ||||
| 446 | inline int64_t maxIntN(int64_t N) { | ||||
| 447 | assert(N > 0 && N <= 64 && "integer width out of range"); | ||||
| 448 | |||||
| 449 | // This relies on two's complement wraparound when N == 64, so we convert to | ||||
| 450 | // int64_t only at the very end to avoid UB. | ||||
| 451 | return (UINT64_C(1) << (N - 1)) - 1; | ||||
| 452 | } | ||||
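| | // Editor's sketch (hypothetical helper, not in the original header): the | ||||
| | // N-bit bounds for N == 8, plus the N == 64 edge cases. | ||||
| | inline bool checkIntNBoundsExamples() { | ||||
| |   return maxUIntN(8) == 255 && minIntN(8) == -128 && maxIntN(8) == 127 && | ||||
| |          maxUIntN(64) == UINT64_MAX && minIntN(64) == INT64_MIN; | ||||
| | } | ||||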
| 453 | |||||
| 454 | /// Checks if an unsigned integer fits into the given (dynamic) bit width. | ||||
| 455 | inline bool isUIntN(unsigned N, uint64_t x) { | ||||
| 456 | return N >= 64 || x <= maxUIntN(N); | ||||
| 457 | } | ||||
| 458 | |||||
| 459 | /// Checks if a signed integer fits into the given (dynamic) bit width. | ||||
| 460 | inline bool isIntN(unsigned N, int64_t x) { | ||||
| 461 | return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); | ||||
| 462 | } | ||||
| 463 | |||||
| 464 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||
| 465 | /// least significant bit with the remainder zero (32 bit version). | ||||
| 466 | /// Ex. isMask_32(0x0000FFFFU) == true. | ||||
| 467 | constexpr inline bool isMask_32(uint32_t Value) { | ||||
| 468 | return Value && ((Value + 1) & Value) == 0; | ||||
| 469 | } | ||||
| 470 | |||||
| 471 | /// Return true if the argument is a non-empty sequence of ones starting at the | ||||
| 472 | /// least significant bit with the remainder zero (64 bit version). | ||||
| 473 | constexpr inline bool isMask_64(uint64_t Value) { | ||||
| 474 | return Value && ((Value + 1) & Value) == 0; | ||||
| 475 | } | ||||
| 476 | |||||
| 477 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||
| 478 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. | ||||
| 479 | constexpr inline bool isShiftedMask_32(uint32_t Value) { | ||||
| 480 | return Value && isMask_32((Value - 1) | Value); | ||||
| 481 | } | ||||
| 482 | |||||
| 483 | /// Return true if the argument contains a non-empty sequence of ones with the | ||||
| 484 | /// remainder zero (64 bit version.) | ||||
| 485 | constexpr inline bool isShiftedMask_64(uint64_t Value) { | ||||
| 486 | return Value && isMask_64((Value - 1) | Value); | ||||
| 487 | } | ||||
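| | // Editor's sketch (illustrative, not in the original header): masks must be | ||||
| | // contiguous runs of ones; shifted masks may start above bit 0. | ||||
| | static_assert(isMask_32(0x0000FFFFu) && !isMask_32(0x0000FF00u), | ||||
| |               "a mask starts at bit 0"); | ||||
| | static_assert(isShiftedMask_32(0x0000FF00u) && !isShiftedMask_32(0x00FF00FFu), | ||||
| |               "a shifted mask is a single contiguous run"); | ||||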
| 488 | |||||
| 489 | /// Return true if the argument is a power of two > 0. | ||||
| 490 | /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) | ||||
| 491 | constexpr inline bool isPowerOf2_32(uint32_t Value) { | ||||
| 492 | return Value && !(Value & (Value - 1)); | ||||
| 493 | } | ||||
| 494 | |||||
| 495 | /// Return true if the argument is a power of two > 0 (64 bit edition.) | ||||
| 496 | constexpr inline bool isPowerOf2_64(uint64_t Value) { | ||||
| 497 | return Value && !(Value & (Value - 1)); | ||||
| 498 | } | ||||
| 499 | |||||
| 500 | /// Count the number of ones from the most significant bit to the first | ||||
| 501 | /// zero bit. | ||||
| 502 | /// | ||||
| 503 | /// Ex. countLeadingOnes(0xFF0FFF00) == 8. | ||||
| 504 | /// Only unsigned integral types are allowed. | ||||
| 505 | /// | ||||
| 506 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||
| 507 | /// ZB_Undefined are valid arguments. | ||||
| 508 | template <typename T> | ||||
| 509 | unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||
| 510 | static_assert(std::numeric_limits<T>::is_integer && | ||||
| 511 | !std::numeric_limits<T>::is_signed, | ||||
| 512 | "Only unsigned integral types are allowed."); | ||||
| 513 | return countLeadingZeros<T>(~Value, ZB); | ||||
| 514 | } | ||||
| 515 | |||||
| 516 | /// Count the number of ones from the least significant bit to the first | ||||
| 517 | /// zero bit. | ||||
| 518 | /// | ||||
| 519 | /// Ex. countTrailingOnes(0x00FF00FF) == 8. | ||||
| 520 | /// Only unsigned integral types are allowed. | ||||
| 521 | /// | ||||
| 522 | /// \param ZB the behavior on an input of all ones. Only ZB_Width and | ||||
| 523 | /// ZB_Undefined are valid arguments. | ||||
| 524 | template <typename T> | ||||
| 525 | unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) { | ||||
| 526 | static_assert(std::numeric_limits<T>::is_integer && | ||||
| 527 | !std::numeric_limits<T>::is_signed, | ||||
| 528 | "Only unsigned integral types are allowed."); | ||||
| 529 | return countTrailingZeros<T>(~Value, ZB); | ||||
| 530 | } | ||||
| 531 | |||||
| 532 | namespace detail { | ||||
| 533 | template <typename T, std::size_t SizeOfT> struct PopulationCounter { | ||||
| 534 | static unsigned count(T Value) { | ||||
| 535 | // Generic version, forward to 32 bits. | ||||
| 536 | static_assert(SizeOfT <= 4, "Not implemented!"); | ||||
| 537 | #if defined(__GNUC__) | ||||
| 538 | return __builtin_popcount(Value); | ||||
| 539 | #else | ||||
| 540 | uint32_t v = Value; | ||||
| 541 | v = v - ((v >> 1) & 0x55555555); | ||||
| 542 | v = (v & 0x33333333) + ((v >> 2) & 0x33333333); | ||||
| 543 | return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; | ||||
| 544 | #endif | ||||
| 545 | } | ||||
| 546 | }; | ||||
| 547 | |||||
| 548 | template <typename T> struct PopulationCounter<T, 8> { | ||||
| 549 | static unsigned count(T Value) { | ||||
| 550 | #if defined(__GNUC__) | ||||
| 551 | return __builtin_popcountll(Value); | ||||
| 552 | #else | ||||
| 553 | uint64_t v = Value; | ||||
| 554 | v = v - ((v >> 1) & 0x5555555555555555ULL); | ||||
| 555 | v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); | ||||
| 556 | v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; | ||||
| 557 | return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); | ||||
| 558 | #endif | ||||
| 559 | } | ||||
| 560 | }; | ||||
| 561 | } // namespace detail | ||||
| 562 | |||||
| 563 | /// Count the number of set bits in a value. | ||||
| 564 | /// Ex. countPopulation(0xF000F000) = 8 | ||||
| 565 | /// Returns 0 if the word is zero. | ||||
| 566 | template <typename T> | ||||
| 567 | inline unsigned countPopulation(T Value) { | ||||
| 568 | static_assert(std::numeric_limits<T>::is_integer && | ||||
| 569 | !std::numeric_limits<T>::is_signed, | ||||
| 570 | "Only unsigned integral types are allowed."); | ||||
| 571 | return detail::PopulationCounter<T, sizeof(T)>::count(Value); | ||||
| 572 | } | ||||
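| | // Editor's sketch (hypothetical helper, not in the original header), using | ||||
| | // the example from the doc comment above. | ||||
| | inline bool checkCountPopulationExamples() { | ||||
| |   return countPopulation(0xF000F000u) == 8 && | ||||
| |          countPopulation(UINT64_C(0xFFFFFFFFFFFFFFFF)) == 64; | ||||
| | } | ||||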
| 573 | |||||
| 574 | /// Compile time Log2. | ||||
| 575 | /// Valid only for positive powers of two. | ||||
| 576 | template <size_t kValue> constexpr inline size_t CTLog2() { | ||||
| 577 | static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), | ||||
| 578 | "Value is not a valid power of 2"); | ||||
| 579 | return 1 + CTLog2<kValue / 2>(); | ||||
| 580 | } | ||||
| 581 | |||||
| 582 | template <> constexpr inline size_t CTLog2<1>() { return 0; } | ||||
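| | // Editor's sketch (illustrative, not in the original header): CTLog2 is fully | ||||
| | // evaluated at compile time. | ||||
| | static_assert(CTLog2<1>() == 0 && CTLog2<64>() == 6, "2^0 == 1, 2^6 == 64"); | ||||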
| 583 | |||||
| 584 | /// Return the log base 2 of the specified value. | ||||
| 585 | inline double Log2(double Value) { | ||||
| 586 | #if defined(__ANDROID_API__) && __ANDROID_API__ < 18 | ||||
| 587 | return __builtin_log(Value) / __builtin_log(2.0); | ||||
| 588 | #else | ||||
| 589 | return log2(Value); | ||||
| 590 | #endif | ||||
| 591 | } | ||||
| 592 | |||||
| 593 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. | ||||
| 594 | /// (32 bit edition.) | ||||
| 595 | /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 | ||||
| 596 | inline unsigned Log2_32(uint32_t Value) { | ||||
| 597 | return 31 - countLeadingZeros(Value); | ||||
| 598 | } | ||||
| 599 | |||||
| 600 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. | ||||
| 601 | /// (64 bit edition.) | ||||
| 602 | inline unsigned Log2_64(uint64_t Value) { | ||||
| 603 | return 63 - countLeadingZeros(Value); | ||||
| 604 | } | ||||
| 605 | |||||
| 606 | /// Return the ceil log base 2 of the specified value, 32 if the value is zero. | ||||
| 607 | /// (32 bit edition). | ||||
| 608 | /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 | ||||
| 609 | inline unsigned Log2_32_Ceil(uint32_t Value) { | ||||
| 610 | return 32 - countLeadingZeros(Value - 1); | ||||
| 611 | } | ||||
| 612 | |||||
| 613 | /// Return the ceil log base 2 of the specified value, 64 if the value is zero. | ||||
| 614 | /// (64 bit edition.) | ||||
| 615 | inline unsigned Log2_64_Ceil(uint64_t Value) { | ||||
| 616 | return 64 - countLeadingZeros(Value - 1); | ||||
| 617 | } | ||||
| 618 | |||||
| 619 | /// Return the greatest common divisor of the values using Euclid's algorithm. | ||||
| 620 | template <typename T> | ||||
| 621 | inline T greatestCommonDivisor(T A, T B) { | ||||
| 622 | while (B) { | ||||
| 623 | T Tmp = B; | ||||
| 624 | B = A % B; | ||||
| 625 | A = Tmp; | ||||
| 626 | } | ||||
| 627 | return A; | ||||
| 628 | } | ||||
| 629 | |||||
| 630 | inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { | ||||
| 631 | return greatestCommonDivisor<uint64_t>(A, B); | ||||
| 632 | } | ||||
| 633 | |||||
| 634 | /// This function takes a 64-bit integer and returns the bit equivalent double. | ||||
| 635 | inline double BitsToDouble(uint64_t Bits) { | ||||
| 636 | double D; | ||||
| 637 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); | ||||
| 638 | memcpy(&D, &Bits, sizeof(Bits)); | ||||
| 639 | return D; | ||||
| 640 | } | ||||
| 641 | |||||
| 642 | /// This function takes a 32-bit integer and returns the bit equivalent float. | ||||
| 643 | inline float BitsToFloat(uint32_t Bits) { | ||||
| 644 | float F; | ||||
| 645 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); | ||||
| 646 | memcpy(&F, &Bits, sizeof(Bits)); | ||||
| 647 | return F; | ||||
| 648 | } | ||||
| 649 | |||||
| 650 | /// This function takes a double and returns the bit equivalent 64-bit integer. | ||||
| 651 | /// Note that copying doubles around changes the bits of NaNs on some hosts, | ||||
| 652 | /// notably x86, so this routine cannot be used if these bits are needed. | ||||
| 653 | inline uint64_t DoubleToBits(double Double) { | ||||
| 654 | uint64_t Bits; | ||||
| 655 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); | ||||
| 656 | memcpy(&Bits, &Double, sizeof(Double)); | ||||
| 657 | return Bits; | ||||
| 658 | } | ||||
| 659 | |||||
| 660 | /// This function takes a float and returns the bit equivalent 32-bit integer. | ||||
| 661 | /// Note that copying floats around changes the bits of NaNs on some hosts, | ||||
| 662 | /// notably x86, so this routine cannot be used if these bits are needed. | ||||
| 663 | inline uint32_t FloatToBits(float Float) { | ||||
| 664 | uint32_t Bits; | ||||
| 665 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); | ||||
| 666 | memcpy(&Bits, &Float, sizeof(Float)); | ||||
| 667 | return Bits; | ||||
| 668 | } | ||||
| 669 | |||||
| 670 | /// A and B are either alignments or offsets. Return the minimum alignment that | ||||
| 671 | /// may be assumed after adding the two together. | ||||
| 672 | constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { | ||||
| 673 | // The largest power of 2 that divides both A and B. | ||||
| 674 | // | ||||
| 675 | // Replace "-Value" by "1+~Value" in the following commented code to avoid | ||||
| 676 | // MSVC warning C4146 | ||||
| 677 | // return (A | B) & -(A | B); | ||||
| 678 | return (A | B) & (1 + ~(A | B)); | ||||
| 679 | } | ||||
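| | // Editor's sketch (illustrative, not in the original header): MinAlign picks | ||||
| | // the largest power of two dividing both inputs. | ||||
| | static_assert(MinAlign(8, 12) == 4, "largest power of two dividing 8 and 12"); | ||||
| | static_assert(MinAlign(16, 0) == 16, "an offset of 0 keeps the alignment"); | ||||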
| 680 | |||||
| 681 | /// Returns the next power of two (in 64-bits) that is strictly greater than A. | ||||
| 682 | /// Returns zero on overflow. | ||||
| 683 | inline uint64_t NextPowerOf2(uint64_t A) { | ||||
| 684 | A |= (A >> 1); | ||||
| 685 | A |= (A >> 2); | ||||
| 686 | A |= (A >> 4); | ||||
| 687 | A |= (A >> 8); | ||||
| 688 | A |= (A >> 16); | ||||
| 689 | A |= (A >> 32); | ||||
| 690 | return A + 1; | ||||
| 691 | } | ||||
| 692 | |||||
| 693 | /// Returns the power of two which is less than or equal to the given value. | ||||
| 694 | /// Essentially, it is a floor operation across the domain of powers of two. | ||||
| 695 | inline uint64_t PowerOf2Floor(uint64_t A) { | ||||
| 696 | if (!A) return 0; | ||||
| 697 | return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); | ||||
| 698 | } | ||||
| 699 | |||||
| 700 | /// Returns the power of two which is greater than or equal to the given value. | ||||
| 701 | /// Essentially, it is a ceil operation across the domain of powers of two. | ||||
| 702 | inline uint64_t PowerOf2Ceil(uint64_t A) { | ||||
| 703 | if (!A) | ||||
| 704 | return 0; | ||||
| 705 | return NextPowerOf2(A - 1); | ||||
| 706 | } | ||||
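| | // Editor's sketch (hypothetical helper, not in the original header): note | ||||
| | // that NextPowerOf2 is strictly greater, while PowerOf2Ceil is not. | ||||
| | inline bool checkPowerOf2Examples() { | ||||
| |   return NextPowerOf2(5) == 8 && NextPowerOf2(8) == 16 && | ||||
| |          PowerOf2Floor(5) == 4 && PowerOf2Ceil(5) == 8 && PowerOf2Ceil(8) == 8; | ||||
| | } | ||||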
| 707 | |||||
| 708 | /// Returns the next integer (mod 2**64) that is greater than or equal to | ||||
| 709 | /// \p Value and is a multiple of \p Align. \p Align must be non-zero. | ||||
| 710 | /// | ||||
| 711 | /// If non-zero \p Skew is specified, the return value will be a minimal | ||||
| 712 | /// integer that is greater than or equal to \p Value and equal to | ||||
| 713 | /// \p Align * N + \p Skew for some integer N. If \p Skew is larger than | ||||
| 714 | /// \p Align, its value is adjusted to '\p Skew mod \p Align'. | ||||
| 715 | /// | ||||
| 716 | /// Examples: | ||||
| 717 | /// \code | ||||
| 718 | /// alignTo(5, 8) = 8 | ||||
| 719 | /// alignTo(17, 8) = 24 | ||||
| 720 | /// alignTo(~0LL, 8) = 0 | ||||
| 721 | /// alignTo(321, 255) = 510 | ||||
| 722 | /// | ||||
| 723 | /// alignTo(5, 8, 7) = 7 | ||||
| 724 | /// alignTo(17, 8, 1) = 17 | ||||
| 725 | /// alignTo(~0LL, 8, 3) = 3 | ||||
| 726 | /// alignTo(321, 255, 42) = 552 | ||||
| 727 | /// \endcode | ||||
| 728 | inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { | ||||
| 729 | assert(Align != 0u && "Align can't be 0."); | ||||
| 730 | Skew %= Align; | ||||
| 731 | return (Value + Align - 1 - Skew) / Align * Align + Skew; | ||||
| 732 | } | ||||
| 733 | |||||
| 734 | /// Returns the next integer (mod 2**64) that is greater than or equal to | ||||
| 735 | /// \p Value and is a multiple of \c Align. \c Align must be non-zero. | ||||
| 736 | template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) { | ||||
| 737 | static_assert(Align != 0u, "Align must be non-zero"); | ||||
| 738 | return (Value + Align - 1) / Align * Align; | ||||
| 739 | } | ||||
| 740 | |||||
| 741 | /// Returns the integer ceil(Numerator / Denominator). | ||||
| 742 | inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { | ||||
| 743 | return alignTo(Numerator, Denominator) / Denominator; | ||||
| 744 | } | ||||
| 745 | |||||
| 746 | /// Returns the integer nearest(Numerator / Denominator). | ||||
| 747 | inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { | ||||
| 748 | return (Numerator + (Denominator / 2)) / Denominator; | ||||
| 749 | } | ||||
| 750 | |||||
| 751 | /// Returns the largest uint64_t that is less than or equal to \p Value and | ||||
| 752 | /// is congruent to \p Skew modulo \p Align. \p Align must be non-zero. | ||||
| 753 | inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { | ||||
| 754 | assert(Align != 0u && "Align can't be 0."); | ||||
| 755 | Skew %= Align; | ||||
| 756 | return (Value - Skew) / Align * Align + Skew; | ||||
| 757 | } | ||||
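| | // Editor's sketch (hypothetical helper, not in the original header): alignDown | ||||
| | // mirrors the alignTo examples above, rounding toward zero instead. | ||||
| | inline bool checkAlignExamples() { | ||||
| |   return alignTo(17, 8) == 24 && alignDown(17, 8) == 16 && | ||||
| |          alignTo(17, 8, 1) == 17 && alignDown(17, 8, 1) == 17; | ||||
| | } | ||||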
| 758 | |||||
| 759 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. | ||||
| 760 | /// Requires 0 < B <= 32. | ||||
| 761 | template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) { | ||||
| 762 | static_assert(B > 0, "Bit width can't be 0."); | ||||
| 763 | static_assert(B <= 32, "Bit width out of range."); | ||||
| 764 | return int32_t(X << (32 - B)) >> (32 - B); | ||||
| 765 | } | ||||
| 766 | |||||
| 767 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. | ||||
| 768 | /// Requires 0 < B <= 32. | ||||
| 769 | inline int32_t SignExtend32(uint32_t X, unsigned B) { | ||||
| 770 | assert(B > 0 && "Bit width can't be 0."); | ||||
| 771 | assert(B <= 32 && "Bit width out of range."); | ||||
| 772 | return int32_t(X << (32 - B)) >> (32 - B); | ||||
| 773 | } | ||||
| 774 | |||||
| 775 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. | ||||
| 776 | /// Requires 0 < B <= 64. | ||||
| 777 | template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) { | ||||
| 778 | static_assert(B > 0, "Bit width can't be 0."); | ||||
| 779 | static_assert(B <= 64, "Bit width out of range."); | ||||
| 780 | return int64_t(x << (64 - B)) >> (64 - B); | ||||
| 781 | } | ||||
| 782 | |||||
| 783 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. | ||||
| 784 | /// Requires 0 < B <= 64. | ||||
| 785 | inline int64_t SignExtend64(uint64_t X, unsigned B) { | ||||
| 786 | assert(B > 0 && "Bit width can't be 0."); | ||||
| 787 | assert(B <= 64 && "Bit width out of range."); | ||||
| 788 | return int64_t(X << (64 - B)) >> (64 - B); | ||||
| 789 | } | ||||
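| | // Editor's sketch (illustrative, not in the original header; assumes the | ||||
| | // usual two's-complement arithmetic right shift). | ||||
| | static_assert(SignExtend32<4>(0xFu) == -1, "0b1111 sign-extends to -1"); | ||||
| | static_assert(SignExtend64<8>(0x80u) == -128, "0x80 sign-extends to -128"); | ||||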
| 790 | |||||
| 791 | /// Subtract two unsigned integers, X and Y, of type T and return the absolute | ||||
| 792 | /// value of the result. | ||||
| 793 | template <typename T> | ||||
| 794 | std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) { | ||||
| 795 | return X > Y ? (X - Y) : (Y - X); | ||||
| 796 | } | ||||
| 797 | |||||
| 798 | /// Add two unsigned integers, X and Y, of type T. Clamp the result to the | ||||
| 799 | /// maximum representable value of T on overflow. ResultOverflowed indicates if | ||||
| 800 | /// the result is larger than the maximum representable value of type T. | ||||
| 801 | template <typename T> | ||||
| 802 | std::enable_if_t<std::is_unsigned<T>::value, T> | ||||
| 803 | SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { | ||||
| 804 | bool Dummy; | ||||
| 805 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||
| 806 | // Hacker's Delight, p. 29 | ||||
| 807 | T Z = X + Y; | ||||
| 808 | Overflowed = (Z < X || Z < Y); | ||||
| 809 | if (Overflowed) | ||||
| 810 | return std::numeric_limits<T>::max(); | ||||
| 811 | else | ||||
| 812 | return Z; | ||||
| 813 | } | ||||
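| | // Editor's sketch (hypothetical helper, not in the original header): the sum | ||||
| | // clamps to numeric_limits<uint8_t>::max() and reports the overflow. | ||||
| | inline bool checkSaturatingAddExample() { | ||||
| |   bool Overflowed = false; | ||||
| |   return SaturatingAdd<uint8_t>(200, 100, &Overflowed) == 255 && Overflowed; | ||||
| | } | ||||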
| 814 | |||||
| 815 | /// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the | ||||
| 816 | /// maximum representable value of T on overflow. ResultOverflowed indicates if | ||||
| 817 | /// the result is larger than the maximum representable value of type T. | ||||
| 818 | template <typename T> | ||||
| 819 | std::enable_if_t<std::is_unsigned<T>::value, T> | ||||
| 820 | SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { | ||||
| 821 | bool Dummy; | ||||
| 822 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||
| 823 | |||||
| 824 | // Hacker's Delight, p. 30 has a different algorithm, but we don't use that | ||||
| 825 | // because it fails for uint16_t (where multiplication can have undefined | ||||
| 826 | // behavior due to promotion to int), and requires a division in addition | ||||
| 827 | // to the multiplication. | ||||
| 828 | |||||
| 829 | Overflowed = false; | ||||
| 830 | |||||
| 831 | // Log2(Z) would be either Log2Z or Log2Z + 1. | ||||
| 832 | // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z | ||||
| 833 | // will necessarily be less than Log2Max as desired. | ||||
| 834 | int Log2Z = Log2_64(X) + Log2_64(Y); | ||||
| 835 | const T Max = std::numeric_limits<T>::max(); | ||||
| 836 | int Log2Max = Log2_64(Max); | ||||
| 837 | if (Log2Z < Log2Max) { | ||||
| 838 | return X * Y; | ||||
| 839 | } | ||||
| 840 | if (Log2Z > Log2Max) { | ||||
| 841 | Overflowed = true; | ||||
| 842 | return Max; | ||||
| 843 | } | ||||
| 844 | |||||
| 845 | // We're going to use the top bit, and maybe overflow one | ||||
| 846 | // bit past it. Multiply all but the bottom bit then add | ||||
| 847 | // that on at the end. | ||||
| 848 | T Z = (X >> 1) * Y; | ||||
| 849 | if (Z & ~(Max >> 1)) { | ||||
| 850 | Overflowed = true; | ||||
| 851 | return Max; | ||||
| 852 | } | ||||
| 853 | Z <<= 1; | ||||
| 854 | if (X & 1) | ||||
| 855 | return SaturatingAdd(Z, Y, ResultOverflowed); | ||||
| 856 | |||||
| 857 | return Z; | ||||
| 858 | } | ||||
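| | // Editor's sketch (hypothetical helper, not in the original header): 16 * 16 | ||||
| | // does not fit into uint8_t, so the product clamps to 255. | ||||
| | inline bool checkSaturatingMultiplyExample() { | ||||
| |   bool Overflowed = false; | ||||
| |   return SaturatingMultiply<uint8_t>(16, 16, &Overflowed) == 255 && Overflowed; | ||||
| | } | ||||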
| 859 | |||||
| 860 | /// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to | ||||
| 861 | /// the product. Clamp the result to the maximum representable value of T on | ||||
| 862 | /// overflow. ResultOverflowed indicates if the result is larger than the | ||||
| 863 | /// maximum representable value of type T. | ||||
| 864 | template <typename T> | ||||
| 865 | std::enable_if_t<std::is_unsigned<T>::value, T> | ||||
| 866 | SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) { | ||||
| 867 | bool Dummy; | ||||
| 868 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; | ||||
| 869 | |||||
| 870 | T Product = SaturatingMultiply(X, Y, &Overflowed); | ||||
| 871 | if (Overflowed) | ||||
| 872 | return Product; | ||||
| 873 | |||||
| 874 | return SaturatingAdd(A, Product, &Overflowed); | ||||
| 875 | } | ||||
| 876 | |||||
| 877 | /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. | ||||
| 878 | extern const float huge_valf; | ||||
| 879 | |||||
| 880 | |||||
| 881 | /// Add two signed integers, computing the two's complement truncated result, | ||||
| 882 | /// returning true if overflow occurred. | ||||
| 883 | template <typename T> | ||||
| 884 | std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) { | ||||
| 885 | #if __has_builtin(__builtin_add_overflow) | ||||
| 886 | return __builtin_add_overflow(X, Y, &Result); | ||||
| 887 | #else | ||||
| 888 | // Perform the unsigned addition. | ||||
| 889 | using U = std::make_unsigned_t<T>; | ||||
| 890 | const U UX = static_cast<U>(X); | ||||
| 891 | const U UY = static_cast<U>(Y); | ||||
| 892 | const U UResult = UX + UY; | ||||
| 893 | |||||
| 894 | // Convert to signed. | ||||
| 895 | Result = static_cast<T>(UResult); | ||||
| 896 | |||||
| 897 | // Adding two positive numbers should result in a positive number. | ||||
| 898 | if (X > 0 && Y > 0) | ||||
| 899 | return Result <= 0; | ||||
| 900 | // Adding two negatives should result in a negative number. | ||||
| 901 | if (X < 0 && Y < 0) | ||||
| 902 | return Result >= 0; | ||||
| 903 | return false; | ||||
| 904 | #endif | ||||
| 905 | } | ||||
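| | // Editor's sketch (hypothetical helper, not in the original header): | ||||
| | // INT32_MAX plus one wraps to INT32_MIN and the overflow is reported. | ||||
| | inline bool checkAddOverflowExample() { | ||||
| |   int32_t Result = 0; | ||||
| |   return AddOverflow<int32_t>(std::numeric_limits<int32_t>::max(), 1, Result) && | ||||
| |          Result == std::numeric_limits<int32_t>::min(); | ||||
| | } | ||||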
| 906 | |||||
| 907 | /// Subtract two signed integers, computing the two's complement truncated | ||||
| 908 | /// result, returning true if an overflow occurred. | ||||
| 909 | template <typename T> | ||||
| 910 | std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) { | ||||
| 911 | #if __has_builtin(__builtin_sub_overflow) | ||||
| 912 | return __builtin_sub_overflow(X, Y, &Result); | ||||
| 913 | #else | ||||
| 914 | // Perform the unsigned addition. | ||||
| 915 | using U = std::make_unsigned_t<T>; | ||||
| 916 | const U UX = static_cast<U>(X); | ||||
| 917 | const U UY = static_cast<U>(Y); | ||||
| 918 | const U UResult = UX - UY; | ||||
| 919 | |||||
| 920 | // Convert to signed. | ||||
| 921 | Result = static_cast<T>(UResult); | ||||
| 922 | |||||
| 923 | // Subtracting a positive number from a negative results in a negative number. | ||||
| 924 | if (X <= 0 && Y > 0) | ||||
| 925 | return Result >= 0; | ||||
| 926 | // Subtracting a negative number from a positive results in a positive number. | ||||
| 927 | if (X >= 0 && Y < 0) | ||||
| 928 | return Result <= 0; | ||||
| 929 | return false; | ||||
| 930 | #endif | ||||
| 931 | } | ||||
| 932 | |||||
| 933 | /// Multiply two signed integers, computing the two's complement truncated | ||||
| 934 | /// result, returning true if an overflow occurred. | ||||
| 935 | template <typename T> | ||||
| 936 | std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) { | ||||
| 937 | // Perform the unsigned multiplication on absolute values. | ||||
| 938 | using U = std::make_unsigned_t<T>; | ||||
| 939 | const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X); | ||||
| 940 | const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y); | ||||
| 941 | const U UResult = UX * UY; | ||||
| 942 | |||||
| 943 | // Convert to signed. | ||||
| 944 | const bool IsNegative = (X < 0) ^ (Y < 0); | ||||
| 945 | Result = IsNegative ? (0 - UResult) : UResult; | ||||
| 946 | |||||
| 947 | // If any of the args was 0, result is 0 and no overflow occurs. | ||||
| 948 | if (UX == 0 || UY == 0) | ||||
| 949 | return false; | ||||
| 950 | |||||
| 951 | // UX and UY are in [1, 2^n], where n is the number of digits. | ||||
| 952 | // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for | ||||
| 953 | // positive) divided by an argument compares to the other. | ||||
| 954 | if (IsNegative) | ||||
| 955 | return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY; | ||||
| 956 | else | ||||
| 957 | return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY; | ||||
| 958 | } | ||||
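| | // Editor's sketch (hypothetical helper, not in the original header): 65536^2 | ||||
| | // overflows int32_t, while 46340^2 == 2147395600 still fits. | ||||
| | inline bool checkMulOverflowExample() { | ||||
| |   int32_t Result = 0; | ||||
| |   return MulOverflow<int32_t>(65536, 65536, Result) && | ||||
| |          !MulOverflow<int32_t>(46340, 46340, Result) && Result == 2147395600; | ||||
| | } | ||||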
| 959 | |||||
| 960 | } // End llvm namespace | ||||
| 961 | |||||
| 962 | #endif |