File:    src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/CodeGen/ExpandMemCmp.cpp
Warning: line 430, column 60: Called C++ object pointer is null
  1 | //===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
  2 | //
  3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 | // See https://llvm.org/LICENSE.txt for license information.
  5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 | //
  7 | //===----------------------------------------------------------------------===//
  8 | //
  9 | // This pass tries to expand memcmp() calls into optimally-sized loads and
 10 | // compares for the target.
 11 | //
 12 | //===----------------------------------------------------------------------===//
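To make the pass's goal concrete: for a target that can do 8-byte loads, a call like memcmp(a, b, 8) used only for equality can be lowered to two loads and one compare. A minimal C-level sketch of the idea (the names a and b are illustrative; the pass itself emits LLVM IR, as the doc comment before expandMemCmp() below shows):

    // Before:
    if (memcmp(a, b, 8) == 0) { /* equal */ }

    // After, conceptually:
    uint64_t la, lb;
    memcpy(&la, a, 8);   // becomes a single 8-byte load
    memcpy(&lb, b, 8);
    if (la == lb) { /* equal */ }

For ordered results the loads are additionally byte-swapped on little-endian targets so that unsigned integer comparison matches memcmp's lexicographic byte order (see getLoadPair and getMemCmpOneBlock).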
 13 |
 14 | #include "llvm/ADT/Statistic.h"
 15 | #include "llvm/Analysis/ConstantFolding.h"
 16 | #include "llvm/Analysis/DomTreeUpdater.h"
 17 | #include "llvm/Analysis/LazyBlockFrequencyInfo.h"
 18 | #include "llvm/Analysis/ProfileSummaryInfo.h"
 19 | #include "llvm/Analysis/TargetLibraryInfo.h"
 20 | #include "llvm/Analysis/TargetTransformInfo.h"
 21 | #include "llvm/Analysis/ValueTracking.h"
 22 | #include "llvm/CodeGen/TargetLowering.h"
 23 | #include "llvm/CodeGen/TargetPassConfig.h"
 24 | #include "llvm/CodeGen/TargetSubtargetInfo.h"
 25 | #include "llvm/IR/Dominators.h"
 26 | #include "llvm/IR/IRBuilder.h"
 27 | #include "llvm/InitializePasses.h"
 28 | #include "llvm/Target/TargetMachine.h"
 29 | #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 30 | #include "llvm/Transforms/Utils/Local.h"
 31 | #include "llvm/Transforms/Utils/SizeOpts.h"
 32 |
 33 | using namespace llvm;
 34 |
 35 | #define DEBUG_TYPE "expandmemcmp"
 36 |
 37 | STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
 38 | STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
 39 | STATISTIC(NumMemCmpGreaterThanMax,
 40 |           "Number of memcmp calls with size greater than max size");
 41 | STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
 42 |
 43 | static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock(
 44 |     "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
 45 |     cl::desc("The number of loads per basic block for inline expansion of "
 46 |              "memcmp that is only being compared against zero."));
 47 |
 48 | static cl::opt<unsigned> MaxLoadsPerMemcmp(
 49 |     "max-loads-per-memcmp", cl::Hidden,
 50 |     cl::desc("Set maximum number of loads used in expanded memcmp"));
 51 |
 52 | static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(
 53 |     "max-loads-per-memcmp-opt-size", cl::Hidden,
 54 |     cl::desc("Set maximum number of loads used in expanded memcmp for -Os/Oz"));
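These are ordinary cl::opt flags, so any tool that parses LLVM command-line options can set them; when driving through clang, -mllvm is the usual route (an assumed invocation, shown for illustration only):

    clang -O2 -mllvm -max-loads-per-memcmp=4 \
          -mllvm -memcmp-num-loads-per-block=2 file.c

When a flag is not given on the command line, its getNumOccurrences() is zero and expandMemCmp() below keeps the defaults supplied by the target's enableMemCmpExpansion() options.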
 55 |
 56 | namespace {
 57 |
 58 |
 59 | // This class provides helper functions to expand a memcmp library call into an
 60 | // inline expansion.
 61 | class MemCmpExpansion {
 62 |   struct ResultBlock {
 63 |     BasicBlock *BB = nullptr;
 64 |     PHINode *PhiSrc1 = nullptr;
 65 |     PHINode *PhiSrc2 = nullptr;
 66 |
 67 |     ResultBlock() = default;
 68 |   };
 69 |
 70 |   CallInst *const CI;
 71 |   ResultBlock ResBlock;
 72 |   const uint64_t Size;
 73 |   unsigned MaxLoadSize;
 74 |   uint64_t NumLoadsNonOneByte;
 75 |   const uint64_t NumLoadsPerBlockForZeroCmp;
 76 |   std::vector<BasicBlock *> LoadCmpBlocks;
 77 |   BasicBlock *EndBlock;
 78 |   PHINode *PhiRes;
 79 |   const bool IsUsedForZeroCmp;
 80 |   const DataLayout &DL;
 81 |   DomTreeUpdater *DTU;
 82 |   IRBuilder<> Builder;
 83 |   // Represents the decomposition in blocks of the expansion. For example,
 84 |   // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
 85 |   // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
 86 |   struct LoadEntry {
 87 |     LoadEntry(unsigned LoadSize, uint64_t Offset)
 88 |         : LoadSize(LoadSize), Offset(Offset) {
 89 |     }
 90 |
 91 |     // The size of the load for this block, in bytes.
 92 |     unsigned LoadSize;
 93 |     // The offset of this load from the base pointer, in bytes.
 94 |     uint64_t Offset;
 95 |   };
 96 |   using LoadEntryVector = SmallVector<LoadEntry, 8>;
 97 |   LoadEntryVector LoadSequence;
 98 |
 99 |   void createLoadCmpBlocks();
100 |   void createResultBlock();
101 |   void setupResultBlockPHINodes();
102 |   void setupEndBlockPHINodes();
103 |   Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
104 |   void emitLoadCompareBlock(unsigned BlockIndex);
105 |   void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
106 |                                          unsigned &LoadIndex);
107 |   void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned OffsetBytes);
108 |   void emitMemCmpResultBlock();
109 |   Value *getMemCmpExpansionZeroCase();
110 |   Value *getMemCmpEqZeroOneBlock();
111 |   Value *getMemCmpOneBlock();
112 |   struct LoadPair {
113 |     Value *Lhs = nullptr;
114 |     Value *Rhs = nullptr;
115 |   };
116 |   LoadPair getLoadPair(Type *LoadSizeType, bool NeedsBSwap, Type *CmpSizeType,
117 |                        unsigned OffsetBytes);
118 |
119 |   static LoadEntryVector
120 |   computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
121 |                             unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte);
122 |   static LoadEntryVector
123 |   computeOverlappingLoadSequence(uint64_t Size, unsigned MaxLoadSize,
124 |                                  unsigned MaxNumLoads,
125 |                                  unsigned &NumLoadsNonOneByte);
126 |
127 | public:
128 |   MemCmpExpansion(CallInst *CI, uint64_t Size,
129 |                   const TargetTransformInfo::MemCmpExpansionOptions &Options,
130 |                   const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout,
131 |                   DomTreeUpdater *DTU);
132 |
133 |   unsigned getNumBlocks();
134 |   uint64_t getNumLoads() const { return LoadSequence.size(); }
135 |
136 |   Value *getMemCmpExpansion();
137 | };
138 |
139 | MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeGreedyLoadSequence(
140 |     uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
141 |     const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) {
142 |   NumLoadsNonOneByte = 0;
143 |   LoadEntryVector LoadSequence;
144 |   uint64_t Offset = 0;
145 |   while (Size && !LoadSizes.empty()) {
146 |     const unsigned LoadSize = LoadSizes.front();
147 |     const uint64_t NumLoadsForThisSize = Size / LoadSize;
148 |     if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
149 |       // Do not expand if the total number of loads is larger than what the
150 |       // target allows. Note that it's important that we exit before completing
151 |       // the expansion to avoid using a ton of memory to store the expansion for
152 |       // large sizes.
153 |       return {};
154 |     }
155 |     if (NumLoadsForThisSize > 0) {
156 |       for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
157 |         LoadSequence.push_back({LoadSize, Offset});
158 |         Offset += LoadSize;
159 |       }
160 |       if (LoadSize > 1)
161 |         ++NumLoadsNonOneByte;
162 |       Size = Size % LoadSize;
163 |     }
164 |     LoadSizes = LoadSizes.drop_front();
165 |   }
166 |   return LoadSequence;
167 | }
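A worked example of the greedy decomposition, with assumed inputs Size = 15, LoadSizes = {8, 4, 2, 1}, MaxNumLoads = 8:

    // Size = 15: 15 / 8 = 1 load  -> {8, 0},  remainder 7
    // Size =  7:  7 / 4 = 1 load  -> {4, 8},  remainder 3
    // Size =  3:  3 / 2 = 1 load  -> {2, 12}, remainder 1
    // Size =  1:  1 / 1 = 1 load  -> {1, 14}, remainder 0
    // LoadSequence = [{8,0}, {4,8}, {2,12}, {1,14}], NumLoadsNonOneByte = 3

Note that NumLoadsNonOneByte counts the distinct load sizes greater than one byte that were used (8, 4, and 2 here), not the total number of such loads.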
168 |
169 | MemCmpExpansion::LoadEntryVector
170 | MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
171 |                                                 const unsigned MaxLoadSize,
172 |                                                 const unsigned MaxNumLoads,
173 |                                                 unsigned &NumLoadsNonOneByte) {
174 |   // These are already handled by the greedy approach.
175 |   if (Size < 2 || MaxLoadSize < 2)
176 |     return {};
177 |
178 |   // We try to do as many non-overlapping loads as possible starting from the
179 |   // beginning.
180 |   const uint64_t NumNonOverlappingLoads = Size / MaxLoadSize;
181 |   assert(NumNonOverlappingLoads && "there must be at least one load");
182 |   // There remain 0 to (MaxLoadSize - 1) bytes to load; this is done with
183 |   // an overlapping load.
184 |   Size = Size - NumNonOverlappingLoads * MaxLoadSize;
185 |   // Bail if we do not need an overlapping load; that case is already
186 |   // handled by the greedy approach.
187 |   if (Size == 0)
188 |     return {};
189 |   // Bail if the number of loads (non-overlapping + potential overlapping one)
190 |   // is larger than the max allowed.
191 |   if ((NumNonOverlappingLoads + 1) > MaxNumLoads)
192 |     return {};
193 |
194 |   // Add non-overlapping loads.
195 |   LoadEntryVector LoadSequence;
196 |   uint64_t Offset = 0;
197 |   for (uint64_t I = 0; I < NumNonOverlappingLoads; ++I) {
198 |     LoadSequence.push_back({MaxLoadSize, Offset});
199 |     Offset += MaxLoadSize;
200 |   }
201 |
202 |   // Add the last overlapping load.
203 |   assert(Size > 0 && Size < MaxLoadSize && "broken invariant");
204 |   LoadSequence.push_back({MaxLoadSize, Offset - (MaxLoadSize - Size)});
205 |   NumLoadsNonOneByte = 1;
206 |   return LoadSequence;
207 | }
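For contrast, the overlapping decomposition of the same assumed Size = 15 with MaxLoadSize = 8: one non-overlapping 8-byte load covers bytes 0..7, and the 7-byte tail is covered by a second 8-byte load shifted back so that it ends exactly at byte 14:

    // NumNonOverlappingLoads = 15 / 8 = 1  -> {8, 0}, Offset = 8
    // tail: Size = 15 - 1 * 8 = 7
    // overlapping entry: {8, Offset - (8 - 7)} = {8, 7}
    // LoadSequence = [{8,0}, {8,7}]   (byte 7 is compared twice; harmless)

Two loads instead of the greedy four, which is why the constructor below prefers this sequence when the target sets AllowOverlappingLoads and the result is strictly shorter.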
208 |
209 | // Initialize the basic block structure required for expansion of a memcmp
210 | // call with the given maximum load size and memcmp size parameter.
211 | // This structure includes:
212 | // 1. A list of load compare blocks - LoadCmpBlocks.
213 | // 2. An EndBlock, split from the original instruction point, which is the
214 | //    block to return from.
215 | // 3. A ResultBlock, the block to branch to for an early exit when a
216 | //    LoadCmpBlock finds a difference.
217 | MemCmpExpansion::MemCmpExpansion(
218 |     CallInst *const CI, uint64_t Size,
219 |     const TargetTransformInfo::MemCmpExpansionOptions &Options,
220 |     const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout,
221 |     DomTreeUpdater *DTU)
222 |     : CI(CI), Size(Size), MaxLoadSize(0), NumLoadsNonOneByte(0),
223 |       NumLoadsPerBlockForZeroCmp(Options.NumLoadsPerBlock),
224 |       IsUsedForZeroCmp(IsUsedForZeroCmp), DL(TheDataLayout), DTU(DTU),
225 |       Builder(CI) {
226 |   assert(Size > 0 && "zero blocks");
227 |   // Scale the max size down if the target can load more bytes than we need.
228 |   llvm::ArrayRef<unsigned> LoadSizes(Options.LoadSizes);
229 |   while (!LoadSizes.empty() && LoadSizes.front() > Size) {
230 |     LoadSizes = LoadSizes.drop_front();
231 |   }
232 |   assert(!LoadSizes.empty() && "cannot load Size bytes");
233 |   MaxLoadSize = LoadSizes.front();
234 |   // Compute the decomposition.
235 |   unsigned GreedyNumLoadsNonOneByte = 0;
236 |   LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, Options.MaxNumLoads,
237 |                                            GreedyNumLoadsNonOneByte);
238 |   NumLoadsNonOneByte = GreedyNumLoadsNonOneByte;
239 |   assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
240 |   // If we allow overlapping loads and the load sequence is not already optimal,
241 |   // use overlapping loads.
242 |   if (Options.AllowOverlappingLoads &&
243 |       (LoadSequence.empty() || LoadSequence.size() > 2)) {
244 |     unsigned OverlappingNumLoadsNonOneByte = 0;
245 |     auto OverlappingLoads = computeOverlappingLoadSequence(
246 |         Size, MaxLoadSize, Options.MaxNumLoads, OverlappingNumLoadsNonOneByte);
247 |     if (!OverlappingLoads.empty() &&
248 |         (LoadSequence.empty() ||
249 |          OverlappingLoads.size() < LoadSequence.size())) {
250 |       LoadSequence = OverlappingLoads;
251 |       NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte;
252 |     }
253 |   }
254 |   assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
255 | }
256 |
257 | unsigned MemCmpExpansion::getNumBlocks() {
258 |   if (IsUsedForZeroCmp)
259 |     return getNumLoads() / NumLoadsPerBlockForZeroCmp +
260 |            (getNumLoads() % NumLoadsPerBlockForZeroCmp != 0 ? 1 : 0);
261 |   return getNumLoads();
262 | }
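getNumBlocks() is a ceiling division in the zero-equality case and the identity otherwise. With assumed numbers, 5 loads and NumLoadsPerBlockForZeroCmp = 2:

    // zero-equality: 5 / 2 + (5 % 2 != 0 ? 1 : 0) = 2 + 1 = 3 blocks
    // ordered:       5 loads -> 5 blocks (one load-compare block per load)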
263 |
264 | void MemCmpExpansion::createLoadCmpBlocks() {
265 |   for (unsigned i = 0; i < getNumBlocks(); i++) {
266 |     BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
267 |                                         EndBlock->getParent(), EndBlock);
268 |     LoadCmpBlocks.push_back(BB);
269 |   }
270 | }
271 |
272 | void MemCmpExpansion::createResultBlock() {
273 |   ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
274 |                                    EndBlock->getParent(), EndBlock);
275 | }
276 |
277 | MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
278 |                                                        bool NeedsBSwap,
279 |                                                        Type *CmpSizeType,
280 |                                                        unsigned OffsetBytes) {
281 |   // Get the memory source at offset `OffsetBytes`.
282 |   Value *LhsSource = CI->getArgOperand(0);
283 |   Value *RhsSource = CI->getArgOperand(1);
284 |   Align LhsAlign = LhsSource->getPointerAlignment(DL);
285 |   Align RhsAlign = RhsSource->getPointerAlignment(DL);
286 |   if (OffsetBytes > 0) {
287 |     auto *ByteType = Type::getInt8Ty(CI->getContext());
288 |     LhsSource = Builder.CreateConstGEP1_64(
289 |         ByteType, Builder.CreateBitCast(LhsSource, ByteType->getPointerTo()),
290 |         OffsetBytes);
291 |     RhsSource = Builder.CreateConstGEP1_64(
292 |         ByteType, Builder.CreateBitCast(RhsSource, ByteType->getPointerTo()),
293 |         OffsetBytes);
294 |     LhsAlign = commonAlignment(LhsAlign, OffsetBytes);
295 |     RhsAlign = commonAlignment(RhsAlign, OffsetBytes);
296 |   }
297 |   LhsSource = Builder.CreateBitCast(LhsSource, LoadSizeType->getPointerTo());
298 |   RhsSource = Builder.CreateBitCast(RhsSource, LoadSizeType->getPointerTo());
299 |
300 |   // Create a constant or a load from the source.
301 |   Value *Lhs = nullptr;
302 |   if (auto *C = dyn_cast<Constant>(LhsSource))
303 |     Lhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);
304 |   if (!Lhs)
305 |     Lhs = Builder.CreateAlignedLoad(LoadSizeType, LhsSource, LhsAlign);
306 |
307 |   Value *Rhs = nullptr;
308 |   if (auto *C = dyn_cast<Constant>(RhsSource))
309 |     Rhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);
310 |   if (!Rhs)
311 |     Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
312 |
313 |   // Swap bytes if required.
314 |   if (NeedsBSwap) {
315 |     Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
316 |                                                 Intrinsic::bswap, LoadSizeType);
317 |     Lhs = Builder.CreateCall(Bswap, Lhs);
318 |     Rhs = Builder.CreateCall(Bswap, Rhs);
319 |   }
320 |
321 |   // Zero extend if required.
322 |   if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
323 |     Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
324 |     Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
325 |   }
326 |   return {Lhs, Rhs};
327 | }
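A rough sketch of what getLoadPair() emits for assumed arguments LoadSizeType = i32, NeedsBSwap = true, CmpSizeType = i64, OffsetBytes = 8 (value names are illustrative, and the constant-folding path is not taken, i.e. neither source is a Constant):

    %lhs.i8   = bitcast i32* %a to i8*
    %lhs.addr = getelementptr i8, i8* %lhs.i8, i64 8
    %lhs.i32  = bitcast i8* %lhs.addr to i32*
    %lhs.ld   = load i32, i32* %lhs.i32, align 4   ; alignment from commonAlignment
    %lhs.bs   = call i32 @llvm.bswap.i32(i32 %lhs.ld)
    %lhs      = zext i32 %lhs.bs to i64
    ; ... the same sequence is emitted for the Rhs source ...

If a source is a Constant, ConstantFoldLoadFromConstPtr() is tried first, so comparisons against constant buffers may produce no load at all.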
328 |
329 | // This function creates the IR instructions for loading and comparing 1 byte.
330 | // It loads 1 byte from each source of the memcmp parameters at the given
331 | // OffsetBytes. It then subtracts the two loaded values and adds this result to
332 | // the final phi node for selecting the memcmp result.
333 | void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
334 |                                                unsigned OffsetBytes) {
335 |   BasicBlock *BB = LoadCmpBlocks[BlockIndex];
336 |   Builder.SetInsertPoint(BB);
337 |   const LoadPair Loads =
338 |       getLoadPair(Type::getInt8Ty(CI->getContext()), /*NeedsBSwap=*/false,
339 |                   Type::getInt32Ty(CI->getContext()), OffsetBytes);
340 |   Value *Diff = Builder.CreateSub(Loads.Lhs, Loads.Rhs);
341 |
342 |   PhiRes->addIncoming(Diff, BB);
343 |
344 |   if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
345 |     // Branch with an early exit to EndBlock if a difference was found;
346 |     // otherwise, continue to the next LoadCmpBlock.
347 |     Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
348 |                                     ConstantInt::get(Diff->getType(), 0));
349 |     BranchInst *CmpBr =
350 |         BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
351 |     if (DTU)
352 |       DTU->applyUpdates(
353 |           {{DominatorTree::Insert, BB, EndBlock},
354 |            {DominatorTree::Insert, BB, LoadCmpBlocks[BlockIndex + 1]}});
355 |     Builder.Insert(CmpBr);
356 |   } else {
357 |     // The last block has an unconditional branch to EndBlock.
358 |     BranchInst *CmpBr = BranchInst::Create(EndBlock);
359 |     if (DTU)
360 |       DTU->applyUpdates({{DominatorTree::Insert, BB, EndBlock}});
361 |     Builder.Insert(CmpBr);
362 |   }
363 | }
364 |
365 | /// Generate an equality comparison for one or more pairs of loaded values.
366 | /// This is used in the case where the memcmp() call is compared equal or not
367 | /// equal to zero.
368 | Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
369 |                                             unsigned &LoadIndex) {
370 |   assert(LoadIndex < getNumLoads() &&
371 |          "getCompareLoadPairs() called with no remaining loads");
372 |   std::vector<Value *> XorList, OrList;
373 |   Value *Diff = nullptr;
374 |
375 |   const unsigned NumLoads =
376 |       std::min(getNumLoads() - LoadIndex, NumLoadsPerBlockForZeroCmp);
377 |
378 |   // For a single-block expansion, start inserting before the memcmp call.
379 |   if (LoadCmpBlocks.empty())
380 |     Builder.SetInsertPoint(CI);
381 |   else
382 |     Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
383 |
384 |   Value *Cmp = nullptr;
385 |   // If we have multiple loads per block, we need to generate a composite
386 |   // comparison using xor+or. The type for the combinations is the largest load
387 |   // type.
388 |   IntegerType *const MaxLoadType =
389 |       NumLoads == 1 ? nullptr
390 |                     : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
391 |   for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
392 |     const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
393 |     const LoadPair Loads = getLoadPair(
394 |         IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8),
395 |         /*NeedsBSwap=*/false, MaxLoadType, CurLoadEntry.Offset);
396 |
397 |     if (NumLoads != 1) {
398 |       // If we have multiple loads per block, we need to generate a composite
399 |       // comparison using xor+or.
400 |       Diff = Builder.CreateXor(Loads.Lhs, Loads.Rhs);
401 |       Diff = Builder.CreateZExt(Diff, MaxLoadType);
402 |       XorList.push_back(Diff);
403 |     } else {
404 |       // If there's only one load per block, we just compare the loaded values.
405 |       Cmp = Builder.CreateICmpNE(Loads.Lhs, Loads.Rhs);
406 |     }
407 |   }
408 |
409 |   auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
410 |     std::vector<Value *> OutList;
411 |     for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
412 |       Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
413 |       OutList.push_back(Or);
414 |     }
415 |     if (InList.size() % 2 != 0)
416 |       OutList.push_back(InList.back());
417 |     return OutList;
418 |   };
419 |
420 |   if (!Cmp) {
421 |     // Pairwise OR the XOR results.
422 |     OrList = pairWiseOr(XorList);
423 |
424 |     // Pairwise OR the OR results until one result left.
425 |     while (OrList.size() != 1) {
426 |       OrList = pairWiseOr(OrList);
427 |     }
428 |
429 |     assert(Diff && "Failed to find comparison diff");
430 |     Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
431 |   }
432 |
433 |   return Cmp;
434 | }
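With an assumed four loads per block, the xor+or reduction built above forms a balanced tree (illustrative IR):

    %x0 = xor i64 %lhs0, %rhs0        ; one xor per load pair
    %x1 = xor i64 %lhs1, %rhs1
    %x2 = xor i64 %lhs2, %rhs2
    %x3 = xor i64 %lhs3, %rhs3
    %o0 = or i64 %x0, %x1             ; pairWiseOr, round 1
    %o1 = or i64 %x2, %x3
    %or = or i64 %o0, %o1             ; pairWiseOr, round 2
    %cmp = icmp ne i64 %or, 0         ; any differing byte sets a bit

This block is also the subject of the warning at the top of this report: at line 430, the analyzer flags Diff->getType() because Diff remains null when NumLoads == 1. On that path, however, Cmp was set inside the loop, so the if (!Cmp) block is never entered. The assert at line 429 records exactly this invariant, but in this NDEBUG build it expands to ((void)0), so the analyzer cannot rely on it; under the function's invariants the report appears to be a false positive.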
435 |
436 | void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
437 |                                                         unsigned &LoadIndex) {
438 |   Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
439 |
440 |   BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
441 |                            ? EndBlock
442 |                            : LoadCmpBlocks[BlockIndex + 1];
443 |   // Branch with an early exit to ResultBlock if a difference was found;
444 |   // otherwise, continue to the next LoadCmpBlock or EndBlock.
445 |   BasicBlock *BB = Builder.GetInsertBlock();
446 |   BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
447 |   Builder.Insert(CmpBr);
448 |   if (DTU)
449 |     DTU->applyUpdates({{DominatorTree::Insert, BB, ResBlock.BB},
450 |                        {DominatorTree::Insert, BB, NextBB}});
451 |
452 |   // Add a phi edge from the last LoadCmpBlock to EndBlock with a value of 0,
453 |   // since the early exit to ResultBlock was not taken (no difference was found
454 |   // in any of the bytes).
455 |   if (BlockIndex == LoadCmpBlocks.size() - 1) {
456 |     Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
457 |     PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
458 |   }
459 | }
460 |
461 | // This function creates the IR instructions for loading and comparing using
462 | // the given LoadSize. It loads the number of bytes specified by LoadSize from
463 | // each source of the memcmp parameters. It then compares the two loaded values
464 | // to see whether they differ. If a difference is found, it branches with an
465 | // early exit to the ResultBlock for calculating which source was larger.
466 | // Otherwise, it falls through to either the next LoadCmpBlock or the EndBlock
467 | // if this is the last LoadCmpBlock. Loading 1 byte is handled as a special
468 | // case through emitLoadCompareByteBlock, which simply subtracts the loaded
469 | // values and feeds the difference to the result phi node.
470 | void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
471 |   // There is one load per block in this case, BlockIndex == LoadIndex.
472 |   const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
473 |
474 |   if (CurLoadEntry.LoadSize == 1) {
475 |     MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, CurLoadEntry.Offset);
476 |     return;
477 |   }
478 |
479 |   Type *LoadSizeType =
480 |       IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
481 |   Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
482 |   assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
483 |
484 |   Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
485 |
486 |   const LoadPair Loads =
487 |       getLoadPair(LoadSizeType, /*NeedsBSwap=*/DL.isLittleEndian(), MaxLoadType,
488 |                   CurLoadEntry.Offset);
489 |
490 |   // Add the loaded values to the phi nodes for calculating the memcmp result
491 |   // only if the result is not used in a zero-equality comparison.
492 |   if (!IsUsedForZeroCmp) {
493 |     ResBlock.PhiSrc1->addIncoming(Loads.Lhs, LoadCmpBlocks[BlockIndex]);
494 |     ResBlock.PhiSrc2->addIncoming(Loads.Rhs, LoadCmpBlocks[BlockIndex]);
495 |   }
496 |
497 |   Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Loads.Lhs, Loads.Rhs);
498 |   BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
499 |                            ? EndBlock
500 |                            : LoadCmpBlocks[BlockIndex + 1];
501 |   // Branch with an early exit to ResultBlock if a difference was found;
502 |   // otherwise, continue to the next LoadCmpBlock or EndBlock.
503 |   BasicBlock *BB = Builder.GetInsertBlock();
504 |   BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
505 |   Builder.Insert(CmpBr);
506 |   if (DTU)
507 |     DTU->applyUpdates({{DominatorTree::Insert, BB, NextBB},
508 |                        {DominatorTree::Insert, BB, ResBlock.BB}});
509 |
510 |   // Add a phi edge from the last LoadCmpBlock to EndBlock with a value of 0,
511 |   // since the early exit to ResultBlock was not taken (no difference was found
512 |   // in any of the bytes).
513 |   if (BlockIndex == LoadCmpBlocks.size() - 1) {
514 |     Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
515 |     PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
516 |   }
517 | }
518 |
519 | // This function populates the ResultBlock with a sequence to calculate the
520 | // memcmp result. It compares the two loaded source values and returns -1 if
521 | // src1 < src2 and 1 if src1 > src2.
522 | void MemCmpExpansion::emitMemCmpResultBlock() {
523 |   // Special case: if the memcmp result is only used in a zero-equality
524 |   // comparison, the exact value need not be calculated; returning 1 suffices.
525 |   if (IsUsedForZeroCmp) {
526 |     BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
527 |     Builder.SetInsertPoint(ResBlock.BB, InsertPt);
528 |     Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
529 |     PhiRes->addIncoming(Res, ResBlock.BB);
530 |     BranchInst *NewBr = BranchInst::Create(EndBlock);
531 |     Builder.Insert(NewBr);
532 |     if (DTU)
533 |       DTU->applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}});
534 |     return;
535 |   }
536 |   BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
537 |   Builder.SetInsertPoint(ResBlock.BB, InsertPt);
538 |
539 |   Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
540 |                                   ResBlock.PhiSrc2);
541 |
542 |   Value *Res =
543 |       Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
544 |                            ConstantInt::get(Builder.getInt32Ty(), 1));
545 |
546 |   PhiRes->addIncoming(Res, ResBlock.BB);
547 |   BranchInst *NewBr = BranchInst::Create(EndBlock);
548 |   Builder.Insert(NewBr);
549 |   if (DTU)
550 |     DTU->applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}});
551 | }
552 |
553 | void MemCmpExpansion::setupResultBlockPHINodes() {
554 |   Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
555 |   Builder.SetInsertPoint(ResBlock.BB);
556 |   // Note: this assumes one load per block.
557 |   ResBlock.PhiSrc1 =
558 |       Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
559 |   ResBlock.PhiSrc2 =
560 |       Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
561 | }
562 |
563 | void MemCmpExpansion::setupEndBlockPHINodes() {
564 |   Builder.SetInsertPoint(&EndBlock->front());
565 |   PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
566 | }
567 |
568 | Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
569 |   unsigned LoadIndex = 0;
570 |   // This loop populates each of the LoadCmpBlocks with the IR sequence to
571 |   // handle multiple loads per block.
572 |   for (unsigned I = 0; I < getNumBlocks(); ++I) {
573 |     emitLoadCompareBlockMultipleLoads(I, LoadIndex);
574 |   }
575 |
576 |   emitMemCmpResultBlock();
577 |   return PhiRes;
578 | }
579 |
580 | /// A memcmp expansion that compares equality with 0 and only has one block of
581 | /// load and compare can bypass the compare, branch, and phi IR that is required
582 | /// in the general case.
583 | Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
584 |   unsigned LoadIndex = 0;
585 |   Value *Cmp = getCompareLoadPairs(0, LoadIndex);
586 |   assert(LoadIndex == getNumLoads() && "some entries were not consumed");
587 |   return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
588 | }
589 |
590 | /// A memcmp expansion that only has one block of load and compare can bypass
591 | /// the compare, branch, and phi IR that is required in the general case.
592 | Value *MemCmpExpansion::getMemCmpOneBlock() {
593 |   Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
594 |   bool NeedsBSwap = DL.isLittleEndian() && Size != 1;
595 |
596 |   // The i8 and i16 cases don't need compares. We zext the loaded values and
597 |   // subtract them to get the suitable negative, zero, or positive i32 result.
598 |   if (Size < 4) {
599 |     const LoadPair Loads =
600 |         getLoadPair(LoadSizeType, NeedsBSwap, Builder.getInt32Ty(),
601 |                     /*Offset*/ 0);
602 |     return Builder.CreateSub(Loads.Lhs, Loads.Rhs);
603 |   }
604 |
605 |   const LoadPair Loads = getLoadPair(LoadSizeType, NeedsBSwap, LoadSizeType,
606 |                                      /*Offset*/ 0);
607 |   // The result of memcmp is negative, zero, or positive, so produce that by
608 |   // subtracting 2 extended compare bits: sub (ugt, ult).
609 |   // If a target prefers to use selects to get -1/0/1, they should be able
610 |   // to transform this later. The inverse transform (going from selects to math)
611 |   // may not be possible in the DAG because the selects got converted into
612 |   // branches before we got there.
613 |   Value *CmpUGT = Builder.CreateICmpUGT(Loads.Lhs, Loads.Rhs);
614 |   Value *CmpULT = Builder.CreateICmpULT(Loads.Lhs, Loads.Rhs);
615 |   Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
616 |   Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
617 |   return Builder.CreateSub(ZextUGT, ZextULT);
618 | }
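The closing sub (zext (ugt), zext (ult)) arithmetic, enumerated case by case (a worked check, not from the file):

    // Lhs >  Rhs:  zext(true)  - zext(false) = 1 - 0 =  1
    // Lhs <  Rhs:  zext(false) - zext(true)  = 0 - 1 = -1
    // Lhs == Rhs:  zext(false) - zext(false) = 0 - 0 =  0

This matches memcmp's sign contract with straight-line code, no select or branch, which is the point of the comment above.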
619 |
620 | // This function expands the memcmp call into an inline expansion and returns
621 | // the memcmp result.
622 | Value *MemCmpExpansion::getMemCmpExpansion() {
623 |   // Create the basic block framework for a multi-block expansion.
624 |   if (getNumBlocks() != 1) {
625 |     BasicBlock *StartBlock = CI->getParent();
626 |     EndBlock = SplitBlock(StartBlock, CI, DTU, /*LI=*/nullptr,
627 |                           /*MSSAU=*/nullptr, "endblock");
628 |     setupEndBlockPHINodes();
629 |     createResultBlock();
630 |
631 |     // If the return value of memcmp is not used in a zero-equality
632 |     // comparison, we need to calculate which source was larger. The
633 |     // calculation requires the two loaded source values of each load compare
634 |     // block. These are saved in the phi nodes created by setupResultBlockPHINodes.
635 |     if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
636 |
637 |     // Create the number of required load compare basic blocks.
638 |     createLoadCmpBlocks();
639 |
640 |     // Update the terminator added by SplitBlock to branch to the first
641 |     // LoadCmpBlock.
642 |     StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
643 |     if (DTU)
644 |       DTU->applyUpdates({{DominatorTree::Insert, StartBlock, LoadCmpBlocks[0]},
645 |                          {DominatorTree::Delete, StartBlock, EndBlock}});
646 |   }
647 |
648 |   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
649 |
650 |   if (IsUsedForZeroCmp)
651 |     return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
652 |                                : getMemCmpExpansionZeroCase();
653 |
654 |   if (getNumBlocks() == 1)
655 |     return getMemCmpOneBlock();
656 |
657 |   for (unsigned I = 0; I < getNumBlocks(); ++I) {
658 |     emitLoadCompareBlock(I);
659 |   }
660 |
661 |   emitMemCmpResultBlock();
662 |   return PhiRes;
663 | }
664 |
665 | // This function checks whether an expansion of memcmp can be generated.
666 | // It requires a constant compare size that is less than the max inline size.
667 | // If an expansion cannot occur, it returns false to leave the call as a
668 | // library call. Otherwise, the call is replaced with a new IR sequence.
669 | /// We want to transform:
670 | /// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
671 | /// To:
672 | /// loadbb:
673 | ///  %0 = bitcast i32* %buffer2 to i8*
674 | ///  %1 = bitcast i32* %buffer1 to i8*
675 | ///  %2 = bitcast i8* %1 to i64*
676 | ///  %3 = bitcast i8* %0 to i64*
677 | ///  %4 = load i64, i64* %2
678 | ///  %5 = load i64, i64* %3
679 | ///  %6 = call i64 @llvm.bswap.i64(i64 %4)
680 | ///  %7 = call i64 @llvm.bswap.i64(i64 %5)
681 | ///  %8 = sub i64 %6, %7
682 | ///  %9 = icmp ne i64 %8, 0
683 | ///  br i1 %9, label %res_block, label %loadbb1
684 | /// res_block:                                       ; preds = %loadbb2,
685 | ///                                                    %loadbb1, %loadbb
686 | ///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
687 | ///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
688 | ///  %10 = icmp ult i64 %phi.src1, %phi.src2
689 | ///  %11 = select i1 %10, i32 -1, i32 1
690 | ///  br label %endblock
691 | /// loadbb1:                                         ; preds = %loadbb
692 | ///  %12 = bitcast i32* %buffer2 to i8*
693 | ///  %13 = bitcast i32* %buffer1 to i8*
694 | ///  %14 = bitcast i8* %13 to i32*
695 | ///  %15 = bitcast i8* %12 to i32*
696 | ///  %16 = getelementptr i32, i32* %14, i32 2
697 | ///  %17 = getelementptr i32, i32* %15, i32 2
698 | ///  %18 = load i32, i32* %16
699 | ///  %19 = load i32, i32* %17
700 | ///  %20 = call i32 @llvm.bswap.i32(i32 %18)
701 | ///  %21 = call i32 @llvm.bswap.i32(i32 %19)
702 | ///  %22 = zext i32 %20 to i64
703 | ///  %23 = zext i32 %21 to i64
704 | ///  %24 = sub i64 %22, %23
705 | ///  %25 = icmp ne i64 %24, 0
706 | ///  br i1 %25, label %res_block, label %loadbb2
707 | /// loadbb2:                                         ; preds = %loadbb1
708 | ///  %26 = bitcast i32* %buffer2 to i8*
709 | ///  %27 = bitcast i32* %buffer1 to i8*
710 | ///  %28 = bitcast i8* %27 to i16*
711 | ///  %29 = bitcast i8* %26 to i16*
712 | ///  %30 = getelementptr i16, i16* %28, i16 6
713 | ///  %31 = getelementptr i16, i16* %29, i16 6
714 | ///  %32 = load i16, i16* %30
715 | ///  %33 = load i16, i16* %31
716 | ///  %34 = call i16 @llvm.bswap.i16(i16 %32)
717 | ///  %35 = call i16 @llvm.bswap.i16(i16 %33)
718 | ///  %36 = zext i16 %34 to i64
719 | ///  %37 = zext i16 %35 to i64
720 | ///  %38 = sub i64 %36, %37
721 | ///  %39 = icmp ne i64 %38, 0
722 | ///  br i1 %39, label %res_block, label %loadbb3
723 | /// loadbb3:                                         ; preds = %loadbb2
724 | ///  %40 = bitcast i32* %buffer2 to i8*
725 | ///  %41 = bitcast i32* %buffer1 to i8*
726 | ///  %42 = getelementptr i8, i8* %41, i8 14
727 | ///  %43 = getelementptr i8, i8* %40, i8 14
728 | ///  %44 = load i8, i8* %42
729 | ///  %45 = load i8, i8* %43
730 | ///  %46 = zext i8 %44 to i32
731 | ///  %47 = zext i8 %45 to i32
732 | ///  %48 = sub i32 %46, %47
733 | ///  br label %endblock
734 | /// endblock:                                        ; preds = %res_block,
735 | ///                                                    %loadbb3
736 | ///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
737 | ///  ret i32 %phi.res
738 | static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
739 |                          const TargetLowering *TLI, const DataLayout *DL,
740 |                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
741 |                          DomTreeUpdater *DTU) {
742 |   NumMemCmpCalls++;
743 |
744 |   // Early exit from expansion if -Oz.
745 |   if (CI->getFunction()->hasMinSize())
746 |     return false;
747 |
748 |   // Early exit from expansion if size is not a constant.
749 |   ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
750 |   if (!SizeCast) {
751 |     NumMemCmpNotConstant++;
752 |     return false;
753 |   }
754 |   const uint64_t SizeVal = SizeCast->getZExtValue();
755 |
756 |   if (SizeVal == 0) {
757 |     return false;
758 |   }
759 |   // TTI call to check if target would like to expand memcmp. Also, get the
760 |   // available load sizes.
761 |   const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
762 |   bool OptForSize = CI->getFunction()->hasOptSize() ||
763 |                     llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
764 |   auto Options = TTI->enableMemCmpExpansion(OptForSize,
765 |                                             IsUsedForZeroCmp);
766 |   if (!Options) return false;
767 |
768 |   if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
769 |     Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
770 |
771 |   if (OptForSize &&
772 |       MaxLoadsPerMemcmpOptSize.getNumOccurrences())
773 |     Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
774 |
775 |   if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
776 |     Options.MaxNumLoads = MaxLoadsPerMemcmp;
777 |
778 |   MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL, DTU);
779 |
780 |   // Don't expand if this will require more loads than desired by the target.
781 |   if (Expansion.getNumLoads() == 0) {
782 |     NumMemCmpGreaterThanMax++;
783 |     return false;
784 |   }
785 |
786 |   NumMemCmpInlined++;
787 |
788 |   Value *Res = Expansion.getMemCmpExpansion();
789 |
790 |   // Replace call with result of expansion and erase call.
791 |   CI->replaceAllUsesWith(Res);
792 |   CI->eraseFromParent();
793 |
794 |   return true;
795 | }
796 |
797 | class ExpandMemCmpPass : public FunctionPass {
798 | public:
799 |   static char ID;
800 |
801 |   ExpandMemCmpPass() : FunctionPass(ID) {
802 |     initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
803 |   }
804 |
805 |   bool runOnFunction(Function &F) override {
806 |     if (skipFunction(F)) return false;
807 |
808 |     auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
809 |     if (!TPC) {
810 |       return false;
811 |     }
812 |     const TargetLowering *TL =
813 |         TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
814 |
815 |     const TargetLibraryInfo *TLI =
816 |         &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
817 |     const TargetTransformInfo *TTI =
818 |         &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
819 |     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
820 |     auto *BFI = (PSI && PSI->hasProfileSummary()) ?
821 |                 &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
822 |                 nullptr;
823 |     DominatorTree *DT = nullptr;
824 |     if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
825 |       DT = &DTWP->getDomTree();
826 |     auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI, DT);
827 |     return !PA.areAllPreserved();
828 |   }
829 |
830 | private:
831 |   void getAnalysisUsage(AnalysisUsage &AU) const override {
832 |     AU.addRequired<TargetLibraryInfoWrapperPass>();
833 |     AU.addRequired<TargetTransformInfoWrapperPass>();
834 |     AU.addRequired<ProfileSummaryInfoWrapperPass>();
835 |     AU.addPreserved<DominatorTreeWrapperPass>();
836 |     LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
837 |     FunctionPass::getAnalysisUsage(AU);
838 |   }
839 |
840 |   PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
841 |                             const TargetTransformInfo *TTI,
842 |                             const TargetLowering *TL, ProfileSummaryInfo *PSI,
843 |                             BlockFrequencyInfo *BFI, DominatorTree *DT);
844 |   // Returns true if a change was made.
845 |   bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
846 |                   const TargetTransformInfo *TTI, const TargetLowering *TL,
847 |                   const DataLayout &DL, ProfileSummaryInfo *PSI,
848 |                   BlockFrequencyInfo *BFI, DomTreeUpdater *DTU);
849 | };
850 |
851 | bool ExpandMemCmpPass::runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
852 |                                   const TargetTransformInfo *TTI,
853 |                                   const TargetLowering *TL,
854 |                                   const DataLayout &DL, ProfileSummaryInfo *PSI,
855 |                                   BlockFrequencyInfo *BFI,
856 |                                   DomTreeUpdater *DTU) {
857 |   for (Instruction &I : BB) {
858 |     CallInst *CI = dyn_cast<CallInst>(&I);
859 |     if (!CI) {
860 |       continue;
861 |     }
862 |     LibFunc Func;
863 |     if (TLI->getLibFunc(*CI, Func) &&
864 |         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
865 |         expandMemCmp(CI, TTI, TL, &DL, PSI, BFI, DTU)) {
866 |       return true;
867 |     }
868 |   }
869 |   return false;
870 | }
871 |
872 | PreservedAnalyses
873 | ExpandMemCmpPass::runImpl(Function &F, const TargetLibraryInfo *TLI,
874 |                           const TargetTransformInfo *TTI,
875 |                           const TargetLowering *TL, ProfileSummaryInfo *PSI,
876 |                           BlockFrequencyInfo *BFI, DominatorTree *DT) {
877 |   Optional<DomTreeUpdater> DTU;
878 |   if (DT)
879 |     DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);
880 |
881 |   const DataLayout &DL = F.getParent()->getDataLayout();
882 |   bool MadeChanges = false;
883 |   for (auto BBIt = F.begin(); BBIt != F.end();) {
884 |     if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI,
885 |                    DTU.hasValue() ? DTU.getPointer() : nullptr)) {
886 |       MadeChanges = true;
887 |       // If changes were made, restart the function from the beginning, since
888 |       // the structure of the function was changed.
889 |       BBIt = F.begin();
890 |     } else {
891 |       ++BBIt;
892 |     }
893 |   }
894 |   if (MadeChanges)
895 |     for (BasicBlock &BB : F)
896 |       SimplifyInstructionsInBlock(&BB);
897 |   if (!MadeChanges)
898 |     return PreservedAnalyses::all();
899 |   PreservedAnalyses PA;
900 |   PA.preserve<DominatorTreeAnalysis>();
901 |   return PA;
902 | }
903 |
904 | } // namespace
905 |
906 | char ExpandMemCmpPass::ID = 0;
907 | INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp",
908 |                       "Expand memcmp() to load/stores", false, false)
909 | INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
910 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
911 | INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
912 | INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
913 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
914 | INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
915 |                     "Expand memcmp() to load/stores", false, false)
916 |
917 | FunctionPass *llvm::createExpandMemCmpPass() {
918 |   return new ExpandMemCmpPass();
919 | }