| File: | src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Transforms/Scalar/LoopInterchange.cpp |
| Warning: | line 669, column 8 Called C++ object pointer is null |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | //===- LoopInterchange.cpp - Loop interchange pass-------------------------===// | |||
| 2 | // | |||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | |||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
| 6 | // | |||
| 7 | //===----------------------------------------------------------------------===// | |||
| 8 | // | |||
| 9 | // This Pass handles loop interchange transform. | |||
| 10 | // This pass interchanges loops to provide a more cache-friendly memory access | |||
| 11 | // patterns. | |||
| 12 | // | |||
| 13 | //===----------------------------------------------------------------------===// | |||
| 14 | ||||
| 15 | #include "llvm/Transforms/Scalar/LoopInterchange.h" | |||
| 16 | #include "llvm/ADT/STLExtras.h" | |||
| 17 | #include "llvm/ADT/SmallVector.h" | |||
| 18 | #include "llvm/ADT/Statistic.h" | |||
| 19 | #include "llvm/ADT/StringRef.h" | |||
| 20 | #include "llvm/Analysis/DependenceAnalysis.h" | |||
| 21 | #include "llvm/Analysis/LoopInfo.h" | |||
| 22 | #include "llvm/Analysis/LoopNestAnalysis.h" | |||
| 23 | #include "llvm/Analysis/LoopPass.h" | |||
| 24 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" | |||
| 25 | #include "llvm/Analysis/ScalarEvolution.h" | |||
| 26 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" | |||
| 27 | #include "llvm/IR/BasicBlock.h" | |||
| 28 | #include "llvm/IR/Constants.h" | |||
| 29 | #include "llvm/IR/DiagnosticInfo.h" | |||
| 30 | #include "llvm/IR/Dominators.h" | |||
| 31 | #include "llvm/IR/Function.h" | |||
| 32 | #include "llvm/IR/IRBuilder.h" | |||
| 33 | #include "llvm/IR/InstrTypes.h" | |||
| 34 | #include "llvm/IR/Instruction.h" | |||
| 35 | #include "llvm/IR/Instructions.h" | |||
| 36 | #include "llvm/IR/Type.h" | |||
| 37 | #include "llvm/IR/User.h" | |||
| 38 | #include "llvm/IR/Value.h" | |||
| 39 | #include "llvm/InitializePasses.h" | |||
| 40 | #include "llvm/Pass.h" | |||
| 41 | #include "llvm/Support/Casting.h" | |||
| 42 | #include "llvm/Support/CommandLine.h" | |||
| 43 | #include "llvm/Support/Debug.h" | |||
| 44 | #include "llvm/Support/ErrorHandling.h" | |||
| 45 | #include "llvm/Support/raw_ostream.h" | |||
| 46 | #include "llvm/Transforms/Scalar.h" | |||
| 47 | #include "llvm/Transforms/Utils.h" | |||
| 48 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" | |||
| 49 | #include "llvm/Transforms/Utils/LoopUtils.h" | |||
| 50 | #include <cassert> | |||
| 51 | #include <utility> | |||
| 52 | #include <vector> | |||
| 53 | ||||
| 54 | using namespace llvm; | |||
| 55 | ||||
| 56 | #define DEBUG_TYPE"loop-interchange" "loop-interchange" | |||
| 57 | ||||
| 58 | STATISTIC(LoopsInterchanged, "Number of loops interchanged")static llvm::Statistic LoopsInterchanged = {"loop-interchange" , "LoopsInterchanged", "Number of loops interchanged"}; | |||
| 59 | ||||
| 60 | static cl::opt<int> LoopInterchangeCostThreshold( | |||
| 61 | "loop-interchange-threshold", cl::init(0), cl::Hidden, | |||
| 62 | cl::desc("Interchange if you gain more than this number")); | |||
| 63 | ||||
| 64 | namespace { | |||
| 65 | ||||
| 66 | using LoopVector = SmallVector<Loop *, 8>; | |||
| 67 | ||||
| 68 | // TODO: Check if we can use a sparse matrix here. | |||
| 69 | using CharMatrix = std::vector<std::vector<char>>; | |||
| 70 | ||||
| 71 | } // end anonymous namespace | |||
| 72 | ||||
| 73 | // Maximum number of dependencies that can be handled in the dependency matrix. | |||
| 74 | static const unsigned MaxMemInstrCount = 100; | |||
| 75 | ||||
| 76 | // Maximum loop depth supported. | |||
| 77 | static const unsigned MaxLoopNestDepth = 10; | |||
| 78 | ||||
| 79 | #ifdef DUMP_DEP_MATRICIES | |||
| 80 | static void printDepMatrix(CharMatrix &DepMatrix) { | |||
| 81 | for (auto &Row : DepMatrix) { | |||
| 82 | for (auto D : Row) | |||
| 83 | LLVM_DEBUG(dbgs() << D << " ")do { } while (false); | |||
| 84 | LLVM_DEBUG(dbgs() << "\n")do { } while (false); | |||
| 85 | } | |||
| 86 | } | |||
| 87 | #endif | |||
| 88 | ||||
| 89 | static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, | |||
| 90 | Loop *L, DependenceInfo *DI) { | |||
| 91 | using ValueVector = SmallVector<Value *, 16>; | |||
| 92 | ||||
| 93 | ValueVector MemInstr; | |||
| 94 | ||||
| 95 | // For each block. | |||
| 96 | for (BasicBlock *BB : L->blocks()) { | |||
| 97 | // Scan the BB and collect legal loads and stores. | |||
| 98 | for (Instruction &I : *BB) { | |||
| 99 | if (!isa<Instruction>(I)) | |||
| 100 | return false; | |||
| 101 | if (auto *Ld = dyn_cast<LoadInst>(&I)) { | |||
| 102 | if (!Ld->isSimple()) | |||
| 103 | return false; | |||
| 104 | MemInstr.push_back(&I); | |||
| 105 | } else if (auto *St = dyn_cast<StoreInst>(&I)) { | |||
| 106 | if (!St->isSimple()) | |||
| 107 | return false; | |||
| 108 | MemInstr.push_back(&I); | |||
| 109 | } | |||
| 110 | } | |||
| 111 | } | |||
| 112 | ||||
| 113 | LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()do { } while (false) | |||
| 114 | << " Loads and Stores to analyze\n")do { } while (false); | |||
| 115 | ||||
| 116 | ValueVector::iterator I, IE, J, JE; | |||
| 117 | ||||
| 118 | for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { | |||
| 119 | for (J = I, JE = MemInstr.end(); J != JE; ++J) { | |||
| 120 | std::vector<char> Dep; | |||
| 121 | Instruction *Src = cast<Instruction>(*I); | |||
| 122 | Instruction *Dst = cast<Instruction>(*J); | |||
| 123 | if (Src == Dst) | |||
| 124 | continue; | |||
| 125 | // Ignore Input dependencies. | |||
| 126 | if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) | |||
| 127 | continue; | |||
| 128 | // Track Output, Flow, and Anti dependencies. | |||
| 129 | if (auto D = DI->depends(Src, Dst, true)) { | |||
| 130 | assert(D->isOrdered() && "Expected an output, flow or anti dep.")((void)0); | |||
| 131 | LLVM_DEBUG(StringRef DepType =do { } while (false) | |||
| 132 | D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";do { } while (false) | |||
| 133 | dbgs() << "Found " << DepTypedo { } while (false) | |||
| 134 | << " dependency between Src and Dst\n"do { } while (false) | |||
| 135 | << " Src:" << *Src << "\n Dst:" << *Dst << '\n')do { } while (false); | |||
| 136 | unsigned Levels = D->getLevels(); | |||
| 137 | char Direction; | |||
| 138 | for (unsigned II = 1; II <= Levels; ++II) { | |||
| 139 | const SCEV *Distance = D->getDistance(II); | |||
| 140 | const SCEVConstant *SCEVConst = | |||
| 141 | dyn_cast_or_null<SCEVConstant>(Distance); | |||
| 142 | if (SCEVConst) { | |||
| 143 | const ConstantInt *CI = SCEVConst->getValue(); | |||
| 144 | if (CI->isNegative()) | |||
| 145 | Direction = '<'; | |||
| 146 | else if (CI->isZero()) | |||
| 147 | Direction = '='; | |||
| 148 | else | |||
| 149 | Direction = '>'; | |||
| 150 | Dep.push_back(Direction); | |||
| 151 | } else if (D->isScalar(II)) { | |||
| 152 | Direction = 'S'; | |||
| 153 | Dep.push_back(Direction); | |||
| 154 | } else { | |||
| 155 | unsigned Dir = D->getDirection(II); | |||
| 156 | if (Dir == Dependence::DVEntry::LT || | |||
| 157 | Dir == Dependence::DVEntry::LE) | |||
| 158 | Direction = '<'; | |||
| 159 | else if (Dir == Dependence::DVEntry::GT || | |||
| 160 | Dir == Dependence::DVEntry::GE) | |||
| 161 | Direction = '>'; | |||
| 162 | else if (Dir == Dependence::DVEntry::EQ) | |||
| 163 | Direction = '='; | |||
| 164 | else | |||
| 165 | Direction = '*'; | |||
| 166 | Dep.push_back(Direction); | |||
| 167 | } | |||
| 168 | } | |||
| 169 | while (Dep.size() != Level) { | |||
| 170 | Dep.push_back('I'); | |||
| 171 | } | |||
| 172 | ||||
| 173 | DepMatrix.push_back(Dep); | |||
| 174 | if (DepMatrix.size() > MaxMemInstrCount) { | |||
| 175 | LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCountdo { } while (false) | |||
| 176 | << " dependencies inside loop\n")do { } while (false); | |||
| 177 | return false; | |||
| 178 | } | |||
| 179 | } | |||
| 180 | } | |||
| 181 | } | |||
| 182 | ||||
| 183 | return true; | |||
| 184 | } | |||
| 185 | ||||
| 186 | // A loop is moved from index 'from' to an index 'to'. Update the Dependence | |||
| 187 | // matrix by exchanging the two columns. | |||
| 188 | static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, | |||
| 189 | unsigned ToIndx) { | |||
| 190 | for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I) | |||
| 191 | std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]); | |||
| 192 | } | |||
| 193 | ||||
| 194 | // Checks if outermost non '=','S'or'I' dependence in the dependence matrix is | |||
| 195 | // '>' | |||
| 196 | static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, | |||
| 197 | unsigned Column) { | |||
| 198 | for (unsigned i = 0; i <= Column; ++i) { | |||
| 199 | if (DepMatrix[Row][i] == '<') | |||
| 200 | return false; | |||
| 201 | if (DepMatrix[Row][i] == '>') | |||
| 202 | return true; | |||
| 203 | } | |||
| 204 | // All dependencies were '=','S' or 'I' | |||
| 205 | return false; | |||
| 206 | } | |||
| 207 | ||||
| 208 | // Checks if no dependence exist in the dependency matrix in Row before Column. | |||
| 209 | static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, | |||
| 210 | unsigned Column) { | |||
| 211 | for (unsigned i = 0; i < Column; ++i) { | |||
| 212 | if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' && | |||
| 213 | DepMatrix[Row][i] != 'I') | |||
| 214 | return false; | |||
| 215 | } | |||
| 216 | return true; | |||
| 217 | } | |||
| 218 | ||||
| 219 | static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, | |||
| 220 | unsigned OuterLoopId, char InnerDep, | |||
| 221 | char OuterDep) { | |||
| 222 | if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) | |||
| 223 | return false; | |||
| 224 | ||||
| 225 | if (InnerDep == OuterDep) | |||
| 226 | return true; | |||
| 227 | ||||
| 228 | // It is legal to interchange if and only if after interchange no row has a | |||
| 229 | // '>' direction as the leftmost non-'='. | |||
| 230 | ||||
| 231 | if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I') | |||
| 232 | return true; | |||
| 233 | ||||
| 234 | if (InnerDep == '<') | |||
| 235 | return true; | |||
| 236 | ||||
| 237 | if (InnerDep == '>') { | |||
| 238 | // If OuterLoopId represents outermost loop then interchanging will make the | |||
| 239 | // 1st dependency as '>' | |||
| 240 | if (OuterLoopId == 0) | |||
| 241 | return false; | |||
| 242 | ||||
| 243 | // If all dependencies before OuterloopId are '=','S'or 'I'. Then | |||
| 244 | // interchanging will result in this row having an outermost non '=' | |||
| 245 | // dependency of '>' | |||
| 246 | if (!containsNoDependence(DepMatrix, Row, OuterLoopId)) | |||
| 247 | return true; | |||
| 248 | } | |||
| 249 | ||||
| 250 | return false; | |||
| 251 | } | |||
| 252 | ||||
| 253 | // Checks if it is legal to interchange 2 loops. | |||
| 254 | // [Theorem] A permutation of the loops in a perfect nest is legal if and only | |||
| 255 | // if the direction matrix, after the same permutation is applied to its | |||
| 256 | // columns, has no ">" direction as the leftmost non-"=" direction in any row. | |||
| 257 | static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, | |||
| 258 | unsigned InnerLoopId, | |||
| 259 | unsigned OuterLoopId) { | |||
| 260 | unsigned NumRows = DepMatrix.size(); | |||
| 261 | // For each row check if it is valid to interchange. | |||
| 262 | for (unsigned Row = 0; Row < NumRows; ++Row) { | |||
| 263 | char InnerDep = DepMatrix[Row][InnerLoopId]; | |||
| 264 | char OuterDep = DepMatrix[Row][OuterLoopId]; | |||
| 265 | if (InnerDep == '*' || OuterDep == '*') | |||
| 266 | return false; | |||
| 267 | if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep)) | |||
| 268 | return false; | |||
| 269 | } | |||
| 270 | return true; | |||
| 271 | } | |||
| 272 | ||||
| 273 | static LoopVector populateWorklist(Loop &L) { | |||
| 274 | LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "do { } while (false) | |||
| 275 | << L.getHeader()->getParent()->getName() << " Loop: %"do { } while (false) | |||
| 276 | << L.getHeader()->getName() << '\n')do { } while (false); | |||
| 277 | LoopVector LoopList; | |||
| 278 | Loop *CurrentLoop = &L; | |||
| 279 | const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); | |||
| 280 | while (!Vec->empty()) { | |||
| 281 | // The current loop has multiple subloops in it hence it is not tightly | |||
| 282 | // nested. | |||
| 283 | // Discard all loops above it added into Worklist. | |||
| 284 | if (Vec->size() != 1) | |||
| 285 | return {}; | |||
| 286 | ||||
| 287 | LoopList.push_back(CurrentLoop); | |||
| 288 | CurrentLoop = Vec->front(); | |||
| 289 | Vec = &CurrentLoop->getSubLoops(); | |||
| 290 | } | |||
| 291 | LoopList.push_back(CurrentLoop); | |||
| 292 | return LoopList; | |||
| 293 | } | |||
| 294 | ||||
| 295 | static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { | |||
| 296 | PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); | |||
| 297 | if (InnerIndexVar) | |||
| 298 | return InnerIndexVar; | |||
| 299 | if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) | |||
| 300 | return nullptr; | |||
| 301 | for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { | |||
| 302 | PHINode *PhiVar = cast<PHINode>(I); | |||
| 303 | Type *PhiTy = PhiVar->getType(); | |||
| 304 | if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && | |||
| 305 | !PhiTy->isPointerTy()) | |||
| 306 | return nullptr; | |||
| 307 | const SCEVAddRecExpr *AddRec = | |||
| 308 | dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar)); | |||
| 309 | if (!AddRec || !AddRec->isAffine()) | |||
| 310 | continue; | |||
| 311 | const SCEV *Step = AddRec->getStepRecurrence(*SE); | |||
| 312 | if (!isa<SCEVConstant>(Step)) | |||
| 313 | continue; | |||
| 314 | // Found the induction variable. | |||
| 315 | // FIXME: Handle loops with more than one induction variable. Note that, | |||
| 316 | // currently, legality makes sure we have only one induction variable. | |||
| 317 | return PhiVar; | |||
| 318 | } | |||
| 319 | return nullptr; | |||
| 320 | } | |||
| 321 | ||||
| 322 | namespace { | |||
| 323 | ||||
| 324 | /// LoopInterchangeLegality checks if it is legal to interchange the loop. | |||
| 325 | class LoopInterchangeLegality { | |||
| 326 | public: | |||
| 327 | LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, | |||
| 328 | OptimizationRemarkEmitter *ORE) | |||
| 329 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} | |||
| 330 | ||||
| 331 | /// Check if the loops can be interchanged. | |||
| 332 | bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, | |||
| 333 | CharMatrix &DepMatrix); | |||
| 334 | ||||
| 335 | /// Check if the loop structure is understood. We do not handle triangular | |||
| 336 | /// loops for now. | |||
| 337 | bool isLoopStructureUnderstood(PHINode *InnerInductionVar); | |||
| 338 | ||||
| 339 | bool currentLimitations(); | |||
| 340 | ||||
| 341 | const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const { | |||
| 342 | return OuterInnerReductions; | |||
| 343 | } | |||
| 344 | ||||
| 345 | private: | |||
| 346 | bool tightlyNested(Loop *Outer, Loop *Inner); | |||
| 347 | bool containsUnsafeInstructions(BasicBlock *BB); | |||
| 348 | ||||
| 349 | /// Discover induction and reduction PHIs in the header of \p L. Induction | |||
| 350 | /// PHIs are added to \p Inductions, reductions are added to | |||
| 351 | /// OuterInnerReductions. When the outer loop is passed, the inner loop needs | |||
| 352 | /// to be passed as \p InnerLoop. | |||
| 353 | bool findInductionAndReductions(Loop *L, | |||
| 354 | SmallVector<PHINode *, 8> &Inductions, | |||
| 355 | Loop *InnerLoop); | |||
| 356 | ||||
| 357 | Loop *OuterLoop; | |||
| 358 | Loop *InnerLoop; | |||
| 359 | ||||
| 360 | ScalarEvolution *SE; | |||
| 361 | ||||
| 362 | /// Interface to emit optimization remarks. | |||
| 363 | OptimizationRemarkEmitter *ORE; | |||
| 364 | ||||
| 365 | /// Set of reduction PHIs taking part of a reduction across the inner and | |||
| 366 | /// outer loop. | |||
| 367 | SmallPtrSet<PHINode *, 4> OuterInnerReductions; | |||
| 368 | }; | |||
| 369 | ||||
| 370 | /// LoopInterchangeProfitability checks if it is profitable to interchange the | |||
| 371 | /// loop. | |||
| 372 | class LoopInterchangeProfitability { | |||
| 373 | public: | |||
| 374 | LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, | |||
| 375 | OptimizationRemarkEmitter *ORE) | |||
| 376 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} | |||
| 377 | ||||
| 378 | /// Check if the loop interchange is profitable. | |||
| 379 | bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, | |||
| 380 | CharMatrix &DepMatrix); | |||
| 381 | ||||
| 382 | private: | |||
| 383 | int getInstrOrderCost(); | |||
| 384 | ||||
| 385 | Loop *OuterLoop; | |||
| 386 | Loop *InnerLoop; | |||
| 387 | ||||
| 388 | /// Scev analysis. | |||
| 389 | ScalarEvolution *SE; | |||
| 390 | ||||
| 391 | /// Interface to emit optimization remarks. | |||
| 392 | OptimizationRemarkEmitter *ORE; | |||
| 393 | }; | |||
| 394 | ||||
| 395 | /// LoopInterchangeTransform interchanges the loop. | |||
| 396 | class LoopInterchangeTransform { | |||
| 397 | public: | |||
| 398 | LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, | |||
| 399 | LoopInfo *LI, DominatorTree *DT, | |||
| 400 | const LoopInterchangeLegality &LIL) | |||
| 401 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {} | |||
| 402 | ||||
| 403 | /// Interchange OuterLoop and InnerLoop. | |||
| 404 | bool transform(); | |||
| 405 | void restructureLoops(Loop *NewInner, Loop *NewOuter, | |||
| 406 | BasicBlock *OrigInnerPreHeader, | |||
| 407 | BasicBlock *OrigOuterPreHeader); | |||
| 408 | void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); | |||
| 409 | ||||
| 410 | private: | |||
| 411 | bool adjustLoopLinks(); | |||
| 412 | bool adjustLoopBranches(); | |||
| 413 | ||||
| 414 | Loop *OuterLoop; | |||
| 415 | Loop *InnerLoop; | |||
| 416 | ||||
| 417 | /// Scev analysis. | |||
| 418 | ScalarEvolution *SE; | |||
| 419 | ||||
| 420 | LoopInfo *LI; | |||
| 421 | DominatorTree *DT; | |||
| 422 | ||||
| 423 | const LoopInterchangeLegality &LIL; | |||
| 424 | }; | |||
| 425 | ||||
| 426 | struct LoopInterchange { | |||
| 427 | ScalarEvolution *SE = nullptr; | |||
| 428 | LoopInfo *LI = nullptr; | |||
| 429 | DependenceInfo *DI = nullptr; | |||
| 430 | DominatorTree *DT = nullptr; | |||
| 431 | ||||
| 432 | /// Interface to emit optimization remarks. | |||
| 433 | OptimizationRemarkEmitter *ORE; | |||
| 434 | ||||
| 435 | LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, | |||
| 436 | DominatorTree *DT, OptimizationRemarkEmitter *ORE) | |||
| 437 | : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {} | |||
| 438 | ||||
| 439 | bool run(Loop *L) { | |||
| 440 | if (L->getParentLoop()) | |||
| 441 | return false; | |||
| 442 | ||||
| 443 | return processLoopList(populateWorklist(*L)); | |||
| 444 | } | |||
| 445 | ||||
| 446 | bool run(LoopNest &LN) { | |||
| 447 | const auto &LoopList = LN.getLoops(); | |||
| 448 | for (unsigned I = 1; I < LoopList.size(); ++I) | |||
| 449 | if (LoopList[I]->getParentLoop() != LoopList[I - 1]) | |||
| 450 | return false; | |||
| 451 | return processLoopList(LoopList); | |||
| 452 | } | |||
| 453 | ||||
| 454 | bool isComputableLoopNest(ArrayRef<Loop *> LoopList) { | |||
| 455 | for (Loop *L : LoopList) { | |||
| 456 | const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); | |||
| 457 | if (isa<SCEVCouldNotCompute>(ExitCountOuter)) { | |||
| 458 | LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n")do { } while (false); | |||
| 459 | return false; | |||
| 460 | } | |||
| 461 | if (L->getNumBackEdges() != 1) { | |||
| 462 | LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n")do { } while (false); | |||
| 463 | return false; | |||
| 464 | } | |||
| 465 | if (!L->getExitingBlock()) { | |||
| 466 | LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n")do { } while (false); | |||
| 467 | return false; | |||
| 468 | } | |||
| 469 | } | |||
| 470 | return true; | |||
| 471 | } | |||
| 472 | ||||
| 473 | unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) { | |||
| 474 | // TODO: Add a better heuristic to select the loop to be interchanged based | |||
| 475 | // on the dependence matrix. Currently we select the innermost loop. | |||
| 476 | return LoopList.size() - 1; | |||
| 477 | } | |||
| 478 | ||||
| 479 | bool processLoopList(ArrayRef<Loop *> LoopList) { | |||
| 480 | bool Changed = false; | |||
| 481 | unsigned LoopNestDepth = LoopList.size(); | |||
| 482 | if (LoopNestDepth < 2) { | |||
| 483 | LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n")do { } while (false); | |||
| 484 | return false; | |||
| 485 | } | |||
| 486 | if (LoopNestDepth > MaxLoopNestDepth) { | |||
| 487 | LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "do { } while (false) | |||
| 488 | << MaxLoopNestDepth << "\n")do { } while (false); | |||
| 489 | return false; | |||
| 490 | } | |||
| 491 | if (!isComputableLoopNest(LoopList)) { | |||
| 492 | LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n")do { } while (false); | |||
| 493 | return false; | |||
| 494 | } | |||
| 495 | ||||
| 496 | LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepthdo { } while (false) | |||
| 497 | << "\n")do { } while (false); | |||
| 498 | ||||
| 499 | CharMatrix DependencyMatrix; | |||
| 500 | Loop *OuterMostLoop = *(LoopList.begin()); | |||
| 501 | if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, | |||
| 502 | OuterMostLoop, DI)) { | |||
| 503 | LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n")do { } while (false); | |||
| 504 | return false; | |||
| 505 | } | |||
| 506 | #ifdef DUMP_DEP_MATRICIES | |||
| 507 | LLVM_DEBUG(dbgs() << "Dependence before interchange\n")do { } while (false); | |||
| 508 | printDepMatrix(DependencyMatrix); | |||
| 509 | #endif | |||
| 510 | ||||
| 511 | // Get the Outermost loop exit. | |||
| 512 | BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock(); | |||
| 513 | if (!LoopNestExit) { | |||
| 514 | LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block")do { } while (false); | |||
| 515 | return false; | |||
| 516 | } | |||
| 517 | ||||
| 518 | unsigned SelecLoopId = selectLoopForInterchange(LoopList); | |||
| 519 | // Move the selected loop outwards to the best possible position. | |||
| 520 | Loop *LoopToBeInterchanged = LoopList[SelecLoopId]; | |||
| 521 | for (unsigned i = SelecLoopId; i > 0; i--) { | |||
| 522 | bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[i - 1], i, | |||
| 523 | i - 1, DependencyMatrix); | |||
| 524 | if (!Interchanged) | |||
| 525 | return Changed; | |||
| 526 | // Update the DependencyMatrix | |||
| 527 | interChangeDependencies(DependencyMatrix, i, i - 1); | |||
| 528 | #ifdef DUMP_DEP_MATRICIES | |||
| 529 | LLVM_DEBUG(dbgs() << "Dependence after interchange\n")do { } while (false); | |||
| 530 | printDepMatrix(DependencyMatrix); | |||
| 531 | #endif | |||
| 532 | Changed |= Interchanged; | |||
| 533 | } | |||
| 534 | return Changed; | |||
| 535 | } | |||
| 536 | ||||
| 537 | bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId, | |||
| 538 | unsigned OuterLoopId, | |||
| 539 | std::vector<std::vector<char>> &DependencyMatrix) { | |||
| 540 | LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopIddo { } while (false) | |||
| 541 | << " and OuterLoopId = " << OuterLoopId << "\n")do { } while (false); | |||
| 542 | LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); | |||
| 543 | if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { | |||
| 544 | LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n")do { } while (false); | |||
| 545 | return false; | |||
| 546 | } | |||
| 547 | LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n")do { } while (false); | |||
| 548 | LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); | |||
| 549 | if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { | |||
| 550 | LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n")do { } while (false); | |||
| 551 | return false; | |||
| 552 | } | |||
| 553 | ||||
| 554 | ORE->emit([&]() { | |||
| 555 | return OptimizationRemark(DEBUG_TYPE"loop-interchange", "Interchanged", | |||
| 556 | InnerLoop->getStartLoc(), | |||
| 557 | InnerLoop->getHeader()) | |||
| 558 | << "Loop interchanged with enclosing loop."; | |||
| 559 | }); | |||
| 560 | ||||
| 561 | LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL); | |||
| 562 | LIT.transform(); | |||
| 563 | LLVM_DEBUG(dbgs() << "Loops interchanged.\n")do { } while (false); | |||
| 564 | LoopsInterchanged++; | |||
| 565 | ||||
| 566 | assert(InnerLoop->isLCSSAForm(*DT) &&((void)0) | |||
| 567 | "Inner loop not left in LCSSA form after loop interchange!")((void)0); | |||
| 568 | assert(OuterLoop->isLCSSAForm(*DT) &&((void)0) | |||
| 569 | "Outer loop not left in LCSSA form after loop interchange!")((void)0); | |||
| 570 | ||||
| 571 | return true; | |||
| 572 | } | |||
| 573 | }; | |||
| 574 | ||||
| 575 | } // end anonymous namespace | |||
| 576 | ||||
| 577 | bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) { | |||
| 578 | return any_of(*BB, [](const Instruction &I) { | |||
| 579 | return I.mayHaveSideEffects() || I.mayReadFromMemory(); | |||
| 580 | }); | |||
| 581 | } | |||
| 582 | ||||
| 583 | bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { | |||
| 584 | BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); | |||
| 585 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); | |||
| 586 | BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); | |||
| 587 | ||||
| 588 | LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n")do { } while (false); | |||
| 589 | ||||
| 590 | // A perfectly nested loop will not have any branch in between the outer and | |||
| 591 | // inner block i.e. outer header will branch to either inner preheader and | |||
| 592 | // outerloop latch. | |||
| 593 | BranchInst *OuterLoopHeaderBI = | |||
| 594 | dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); | |||
| 595 | if (!OuterLoopHeaderBI) | |||
| 596 | return false; | |||
| 597 | ||||
| 598 | for (BasicBlock *Succ : successors(OuterLoopHeaderBI)) | |||
| 599 | if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() && | |||
| 600 | Succ != OuterLoopLatch) | |||
| 601 | return false; | |||
| 602 | ||||
| 603 | LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n")do { } while (false); | |||
| 604 | // We do not have any basic block in between now make sure the outer header | |||
| 605 | // and outer loop latch doesn't contain any unsafe instructions. | |||
| 606 | if (containsUnsafeInstructions(OuterLoopHeader) || | |||
| 607 | containsUnsafeInstructions(OuterLoopLatch)) | |||
| 608 | return false; | |||
| 609 | ||||
| 610 | // Also make sure the inner loop preheader does not contain any unsafe | |||
| 611 | // instructions. Note that all instructions in the preheader will be moved to | |||
| 612 | // the outer loop header when interchanging. | |||
| 613 | if (InnerLoopPreHeader != OuterLoopHeader && | |||
| 614 | containsUnsafeInstructions(InnerLoopPreHeader)) | |||
| 615 | return false; | |||
| 616 | ||||
| 617 | BasicBlock *InnerLoopExit = InnerLoop->getExitBlock(); | |||
| 618 | // Ensure the inner loop exit block flows to the outer loop latch possibly | |||
| 619 | // through empty blocks. | |||
| 620 | const BasicBlock &SuccInner = | |||
| 621 | LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch); | |||
| 622 | if (&SuccInner != OuterLoopLatch) { | |||
| 623 | LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExitdo { } while (false) | |||
| 624 | << " does not lead to the outer loop latch.\n";)do { } while (false); | |||
| 625 | return false; | |||
| 626 | } | |||
| 627 | // The inner loop exit block does flow to the outer loop latch and not some | |||
| 628 | // other BBs, now make sure it contains safe instructions, since it will be | |||
| 629 | // moved into the (new) inner loop after interchange. | |||
| 630 | if (containsUnsafeInstructions(InnerLoopExit)) | |||
| 631 | return false; | |||
| 632 | ||||
| 633 | LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n")do { } while (false); | |||
| 634 | // We have a perfect loop nest. | |||
| 635 | return true; | |||
| 636 | } | |||
| 637 | ||||
| 638 | bool LoopInterchangeLegality::isLoopStructureUnderstood( | |||
| 639 | PHINode *InnerInduction) { | |||
| 640 | unsigned Num = InnerInduction->getNumOperands(); | |||
| 641 | BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); | |||
| 642 | for (unsigned i = 0; i < Num; ++i) { | |||
| 643 | Value *Val = InnerInduction->getOperand(i); | |||
| 644 | if (isa<Constant>(Val)) | |||
| 645 | continue; | |||
| 646 | Instruction *I = dyn_cast<Instruction>(Val); | |||
| 647 | if (!I) | |||
| 648 | return false; | |||
| 649 | // TODO: Handle triangular loops. | |||
| 650 | // e.g. for(int i=0;i<N;i++) | |||
| 651 | // for(int j=i;j<N;j++) | |||
| 652 | unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); | |||
| 653 | if (InnerInduction->getIncomingBlock(IncomBlockIndx) == | |||
| 654 | InnerLoopPreheader && | |||
| 655 | !OuterLoop->isLoopInvariant(I)) { | |||
| 656 | return false; | |||
| 657 | } | |||
| 658 | } | |||
| 659 | ||||
| 660 | // TODO: Handle triangular loops of another form. | |||
| 661 | // e.g. for(int i=0;i<N;i++) | |||
| 662 | // for(int j=0;j<i;j++) | |||
| 663 | // or, | |||
| 664 | // for(int i=0;i<N;i++) | |||
| 665 | // for(int j=0;j*i<N;j++) | |||
| 666 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); | |||
| 667 | BranchInst *InnerLoopLatchBI = | |||
| 668 | dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); | |||
| 669 | if (!InnerLoopLatchBI->isConditional()) | |||
| ||||
| 670 | return false; | |||
| 671 | if (CmpInst *InnerLoopCmp = | |||
| 672 | dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) { | |||
| 673 | Value *Op0 = InnerLoopCmp->getOperand(0); | |||
| 674 | Value *Op1 = InnerLoopCmp->getOperand(1); | |||
| 675 | ||||
| 676 | // LHS and RHS of the inner loop exit condition, e.g., | |||
| 677 | // in "for(int j=0;j<i;j++)", LHS is j and RHS is i. | |||
| 678 | Value *Left = nullptr; | |||
| 679 | Value *Right = nullptr; | |||
| 680 | ||||
| 681 | // Check if V only involves inner loop induction variable. | |||
| 682 | // Return true if V is InnerInduction, or a cast from | |||
| 683 | // InnerInduction, or a binary operator that involves | |||
| 684 | // InnerInduction and a constant. | |||
| 685 | std::function<bool(Value *)> IsPathToIndVar; | |||
| 686 | IsPathToIndVar = [&InnerInduction, &IsPathToIndVar](Value *V) -> bool { | |||
| 687 | if (V == InnerInduction) | |||
| 688 | return true; | |||
| 689 | if (isa<Constant>(V)) | |||
| 690 | return true; | |||
| 691 | Instruction *I = dyn_cast<Instruction>(V); | |||
| 692 | if (!I) | |||
| 693 | return false; | |||
| 694 | if (isa<CastInst>(I)) | |||
| 695 | return IsPathToIndVar(I->getOperand(0)); | |||
| 696 | if (isa<BinaryOperator>(I)) | |||
| 697 | return IsPathToIndVar(I->getOperand(0)) && | |||
| 698 | IsPathToIndVar(I->getOperand(1)); | |||
| 699 | return false; | |||
| 700 | }; | |||
| 701 | ||||
| 702 | if (IsPathToIndVar(Op0) && !isa<Constant>(Op0)) { | |||
| 703 | Left = Op0; | |||
| 704 | Right = Op1; | |||
| 705 | } else if (IsPathToIndVar(Op1) && !isa<Constant>(Op1)) { | |||
| 706 | Left = Op1; | |||
| 707 | Right = Op0; | |||
| 708 | } | |||
| 709 | ||||
| 710 | if (Left == nullptr) | |||
| 711 | return false; | |||
| 712 | ||||
| 713 | const SCEV *S = SE->getSCEV(Right); | |||
| 714 | if (!SE->isLoopInvariant(S, OuterLoop)) | |||
| 715 | return false; | |||
| 716 | } | |||
| 717 | ||||
| 718 | return true; | |||
| 719 | } | |||
| 720 | ||||
| 721 | // If SV is a LCSSA PHI node with a single incoming value, return the incoming | |||
| 722 | // value. | |||
| 723 | static Value *followLCSSA(Value *SV) { | |||
| 724 | PHINode *PHI = dyn_cast<PHINode>(SV); | |||
| 725 | if (!PHI) | |||
| 726 | return SV; | |||
| 727 | ||||
| 728 | if (PHI->getNumIncomingValues() != 1) | |||
| 729 | return SV; | |||
| 730 | return followLCSSA(PHI->getIncomingValue(0)); | |||
| 731 | } | |||
| 732 | ||||
| 733 | // Check V's users to see if it is involved in a reduction in L. | |||
| 734 | static PHINode *findInnerReductionPhi(Loop *L, Value *V) { | |||
| 735 | // Reduction variables cannot be constants. | |||
| 736 | if (isa<Constant>(V)) | |||
| 737 | return nullptr; | |||
| 738 | ||||
| 739 | for (Value *User : V->users()) { | |||
| 740 | if (PHINode *PHI = dyn_cast<PHINode>(User)) { | |||
| 741 | if (PHI->getNumIncomingValues() == 1) | |||
| 742 | continue; | |||
| 743 | RecurrenceDescriptor RD; | |||
| 744 | if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) | |||
| 745 | return PHI; | |||
| 746 | return nullptr; | |||
| 747 | } | |||
| 748 | } | |||
| 749 | ||||
| 750 | return nullptr; | |||
| 751 | } | |||
| 752 | ||||
| 753 | bool LoopInterchangeLegality::findInductionAndReductions( | |||
| 754 | Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) { | |||
| 755 | if (!L->getLoopLatch() || !L->getLoopPredecessor()) | |||
| 756 | return false; | |||
| 757 | for (PHINode &PHI : L->getHeader()->phis()) { | |||
| 758 | RecurrenceDescriptor RD; | |||
| 759 | InductionDescriptor ID; | |||
| 760 | if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) | |||
| 761 | Inductions.push_back(&PHI); | |||
| 762 | else { | |||
| 763 | // PHIs in inner loops need to be part of a reduction in the outer loop, | |||
| 764 | // discovered when checking the PHIs of the outer loop earlier. | |||
| 765 | if (!InnerLoop) { | |||
| 766 | if (!OuterInnerReductions.count(&PHI)) { | |||
| 767 | LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "do { } while (false) | |||
| 768 | "across the outer loop.\n")do { } while (false); | |||
| 769 | return false; | |||
| 770 | } | |||
| 771 | } else { | |||
| 772 | assert(PHI.getNumIncomingValues() == 2 &&((void)0) | |||
| 773 | "Phis in loop header should have exactly 2 incoming values")((void)0); | |||
| 774 | // Check if we have a PHI node in the outer loop that has a reduction | |||
| 775 | // result from the inner loop as an incoming value. | |||
| 776 | Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch())); | |||
| 777 | PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V); | |||
| 778 | if (!InnerRedPhi || | |||
| 779 | !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) { | |||
| 780 | LLVM_DEBUG(do { } while (false) | |||
| 781 | dbgs()do { } while (false) | |||
| 782 | << "Failed to recognize PHI as an induction or reduction.\n")do { } while (false); | |||
| 783 | return false; | |||
| 784 | } | |||
| 785 | OuterInnerReductions.insert(&PHI); | |||
| 786 | OuterInnerReductions.insert(InnerRedPhi); | |||
| 787 | } | |||
| 788 | } | |||
| 789 | } | |||
| 790 | return true; | |||
| 791 | } | |||
| 792 | ||||
| 793 | // This function indicates the current limitations in the transform as a result | |||
| 794 | // of which we do not proceed. | |||
| 795 | bool LoopInterchangeLegality::currentLimitations() { | |||
| 796 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); | |||
| 797 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); | |||
| 798 | ||||
| 799 | // transform currently expects the loop latches to also be the exiting | |||
| 800 | // blocks. | |||
| 801 | if (InnerLoop->getExitingBlock() != InnerLoopLatch || | |||
| ||||
| 802 | OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() || | |||
| 803 | !isa<BranchInst>(InnerLoopLatch->getTerminator()) || | |||
| 804 | !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) { | |||
| 805 | LLVM_DEBUG(do { } while (false) | |||
| 806 | dbgs() << "Loops where the latch is not the exiting block are not"do { } while (false) | |||
| 807 | << " supported currently.\n")do { } while (false); | |||
| 808 | ORE->emit([&]() { | |||
| 809 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "ExitingNotLatch", | |||
| 810 | OuterLoop->getStartLoc(), | |||
| 811 | OuterLoop->getHeader()) | |||
| 812 | << "Loops where the latch is not the exiting block cannot be" | |||
| 813 | " interchange currently."; | |||
| 814 | }); | |||
| 815 | return true; | |||
| 816 | } | |||
| 817 | ||||
| 818 | PHINode *InnerInductionVar; | |||
| 819 | SmallVector<PHINode *, 8> Inductions; | |||
| 820 | if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) { | |||
| 821 | LLVM_DEBUG(do { } while (false) | |||
| 822 | dbgs() << "Only outer loops with induction or reduction PHI nodes "do { } while (false) | |||
| 823 | << "are supported currently.\n")do { } while (false); | |||
| 824 | ORE->emit([&]() { | |||
| 825 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedPHIOuter", | |||
| 826 | OuterLoop->getStartLoc(), | |||
| 827 | OuterLoop->getHeader()) | |||
| 828 | << "Only outer loops with induction or reduction PHI nodes can be" | |||
| 829 | " interchanged currently."; | |||
| 830 | }); | |||
| 831 | return true; | |||
| 832 | } | |||
| 833 | ||||
| 834 | // TODO: Currently we handle only loops with 1 induction variable. | |||
| 835 | if (Inductions.size() != 1) { | |||
| 836 | LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "do { } while (false) | |||
| 837 | << "supported currently.\n")do { } while (false); | |||
| 838 | ORE->emit([&]() { | |||
| 839 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "MultiIndutionOuter", | |||
| 840 | OuterLoop->getStartLoc(), | |||
| 841 | OuterLoop->getHeader()) | |||
| 842 | << "Only outer loops with 1 induction variable can be " | |||
| 843 | "interchanged currently."; | |||
| 844 | }); | |||
| 845 | return true; | |||
| 846 | } | |||
| 847 | ||||
| 848 | Inductions.clear(); | |||
| 849 | if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) { | |||
| 850 | LLVM_DEBUG(do { } while (false) | |||
| 851 | dbgs() << "Only inner loops with induction or reduction PHI nodes "do { } while (false) | |||
| 852 | << "are supported currently.\n")do { } while (false); | |||
| 853 | ORE->emit([&]() { | |||
| 854 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedPHIInner", | |||
| 855 | InnerLoop->getStartLoc(), | |||
| 856 | InnerLoop->getHeader()) | |||
| 857 | << "Only inner loops with induction or reduction PHI nodes can be" | |||
| 858 | " interchange currently."; | |||
| 859 | }); | |||
| 860 | return true; | |||
| 861 | } | |||
| 862 | ||||
| 863 | // TODO: Currently we handle only loops with 1 induction variable. | |||
| 864 | if (Inductions.size() != 1) { | |||
| 865 | LLVM_DEBUG(do { } while (false) | |||
| 866 | dbgs() << "We currently only support loops with 1 induction variable."do { } while (false) | |||
| 867 | << "Failed to interchange due to current limitation\n")do { } while (false); | |||
| 868 | ORE->emit([&]() { | |||
| 869 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "MultiInductionInner", | |||
| 870 | InnerLoop->getStartLoc(), | |||
| 871 | InnerLoop->getHeader()) | |||
| 872 | << "Only inner loops with 1 induction variable can be " | |||
| 873 | "interchanged currently."; | |||
| 874 | }); | |||
| 875 | return true; | |||
| 876 | } | |||
| 877 | InnerInductionVar = Inductions.pop_back_val(); | |||
| 878 | ||||
| 879 | // TODO: Triangular loops are not handled for now. | |||
| 880 | if (!isLoopStructureUnderstood(InnerInductionVar)) { | |||
| 881 | LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n")do { } while (false); | |||
| 882 | ORE->emit([&]() { | |||
| 883 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedStructureInner", | |||
| 884 | InnerLoop->getStartLoc(), | |||
| 885 | InnerLoop->getHeader()) | |||
| 886 | << "Inner loop structure not understood currently."; | |||
| 887 | }); | |||
| 888 | return true; | |||
| 889 | } | |||
| 890 | ||||
| 891 | // TODO: Current limitation: Since we split the inner loop latch at the point | |||
| 892 | // were induction variable is incremented (induction.next); We cannot have | |||
| 893 | // more than 1 user of induction.next since it would result in broken code | |||
| 894 | // after split. | |||
| 895 | // e.g. | |||
| 896 | // for(i=0;i<N;i++) { | |||
| 897 | // for(j = 0;j<M;j++) { | |||
| 898 | // A[j+1][i+2] = A[j][i]+k; | |||
| 899 | // } | |||
| 900 | // } | |||
| 901 | Instruction *InnerIndexVarInc = nullptr; | |||
| 902 | if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) | |||
| 903 | InnerIndexVarInc = | |||
| 904 | dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1)); | |||
| 905 | else | |||
| 906 | InnerIndexVarInc = | |||
| 907 | dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); | |||
| 908 | ||||
| 909 | if (!InnerIndexVarInc) { | |||
| 910 | LLVM_DEBUG(do { } while (false) | |||
| 911 | dbgs() << "Did not find an instruction to increment the induction "do { } while (false) | |||
| 912 | << "variable.\n")do { } while (false); | |||
| 913 | ORE->emit([&]() { | |||
| 914 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "NoIncrementInInner", | |||
| 915 | InnerLoop->getStartLoc(), | |||
| 916 | InnerLoop->getHeader()) | |||
| 917 | << "The inner loop does not increment the induction variable."; | |||
| 918 | }); | |||
| 919 | return true; | |||
| 920 | } | |||
| 921 | ||||
| 922 | // Since we split the inner loop latch on this induction variable. Make sure | |||
| 923 | // we do not have any instruction between the induction variable and branch | |||
| 924 | // instruction. | |||
| 925 | ||||
| 926 | bool FoundInduction = false; | |||
| 927 | for (const Instruction &I : | |||
| 928 | llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) { | |||
| 929 | if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) || | |||
| 930 | isa<ZExtInst>(I)) | |||
| 931 | continue; | |||
| 932 | ||||
| 933 | // We found an instruction. If this is not induction variable then it is not | |||
| 934 | // safe to split this loop latch. | |||
| 935 | if (!I.isIdenticalTo(InnerIndexVarInc)) { | |||
| 936 | LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "do { } while (false) | |||
| 937 | << "variable increment and branch.\n")do { } while (false); | |||
| 938 | ORE->emit([&]() { | |||
| 939 | return OptimizationRemarkMissed( | |||
| 940 | DEBUG_TYPE"loop-interchange", "UnsupportedInsBetweenInduction", | |||
| 941 | InnerLoop->getStartLoc(), InnerLoop->getHeader()) | |||
| 942 | << "Found unsupported instruction between induction variable " | |||
| 943 | "increment and branch."; | |||
| 944 | }); | |||
| 945 | return true; | |||
| 946 | } | |||
| 947 | ||||
| 948 | FoundInduction = true; | |||
| 949 | break; | |||
| 950 | } | |||
| 951 | // The loop latch ended and we didn't find the induction variable return as | |||
| 952 | // current limitation. | |||
| 953 | if (!FoundInduction) { | |||
| 954 | LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n")do { } while (false); | |||
| 955 | ORE->emit([&]() { | |||
| 956 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "NoIndutionVariable", | |||
| 957 | InnerLoop->getStartLoc(), | |||
| 958 | InnerLoop->getHeader()) | |||
| 959 | << "Did not find the induction variable."; | |||
| 960 | }); | |||
| 961 | return true; | |||
| 962 | } | |||
| 963 | return false; | |||
| 964 | } | |||
| 965 | ||||
| 966 | // We currently only support LCSSA PHI nodes in the inner loop exit, if their | |||
| 967 | // users are either reduction PHIs or PHIs outside the outer loop (which means | |||
| 968 | // the we are only interested in the final value after the loop). | |||
| 969 | static bool | |||
| 970 | areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, | |||
| 971 | SmallPtrSetImpl<PHINode *> &Reductions) { | |||
| 972 | BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); | |||
| 973 | for (PHINode &PHI : InnerExit->phis()) { | |||
| 974 | // Reduction lcssa phi will have only 1 incoming block that from loop latch. | |||
| 975 | if (PHI.getNumIncomingValues() > 1) | |||
| 976 | return false; | |||
| 977 | if (any_of(PHI.users(), [&Reductions, OuterL](User *U) { | |||
| 978 | PHINode *PN = dyn_cast<PHINode>(U); | |||
| 979 | return !PN || | |||
| 980 | (!Reductions.count(PN) && OuterL->contains(PN->getParent())); | |||
| 981 | })) { | |||
| 982 | return false; | |||
| 983 | } | |||
| 984 | } | |||
| 985 | return true; | |||
| 986 | } | |||
| 987 | ||||
| 988 | // We currently support LCSSA PHI nodes in the outer loop exit, if their | |||
| 989 | // incoming values do not come from the outer loop latch or if the | |||
| 990 | // outer loop latch has a single predecessor. In that case, the value will | |||
| 991 | // be available if both the inner and outer loop conditions are true, which | |||
| 992 | // will still be true after interchanging. If we have multiple predecessor, | |||
| 993 | // that may not be the case, e.g. because the outer loop latch may be executed | |||
| 994 | // if the inner loop is not executed. | |||
| 995 | static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { | |||
| 996 | BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); | |||
| 997 | for (PHINode &PHI : LoopNestExit->phis()) { | |||
| 998 | // FIXME: We currently are not able to detect floating point reductions | |||
| 999 | // and have to use floating point PHIs as a proxy to prevent | |||
| 1000 | // interchanging in the presence of floating point reductions. | |||
| 1001 | if (PHI.getType()->isFloatingPointTy()) | |||
| 1002 | return false; | |||
| 1003 | for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { | |||
| 1004 | Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); | |||
| 1005 | if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) | |||
| 1006 | continue; | |||
| 1007 | ||||
| 1008 | // The incoming value is defined in the outer loop latch. Currently we | |||
| 1009 | // only support that in case the outer loop latch has a single predecessor. | |||
| 1010 | // This guarantees that the outer loop latch is executed if and only if | |||
| 1011 | // the inner loop is executed (because tightlyNested() guarantees that the | |||
| 1012 | // outer loop header only branches to the inner loop or the outer loop | |||
| 1013 | // latch). | |||
| 1014 | // FIXME: We could weaken this logic and allow multiple predecessors, | |||
| 1015 | // if the values are produced outside the loop latch. We would need | |||
| 1016 | // additional logic to update the PHI nodes in the exit block as | |||
| 1017 | // well. | |||
| 1018 | if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) | |||
| 1019 | return false; | |||
| 1020 | } | |||
| 1021 | } | |||
| 1022 | return true; | |||
| 1023 | } | |||
| 1024 | ||||
| 1025 | // In case of multi-level nested loops, it may occur that lcssa phis exist in | |||
| 1026 | // the latch of InnerLoop, i.e., when defs of the incoming values are further | |||
| 1027 | // inside the loopnest. Sometimes those incoming values are not available | |||
| 1028 | // after interchange, since the original inner latch will become the new outer | |||
| 1029 | // latch which may have predecessor paths that do not include those incoming | |||
| 1030 | // values. | |||
| 1031 | // TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of | |||
| 1032 | // multi-level loop nests. | |||
| 1033 | static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { | |||
| 1034 | if (InnerLoop->getSubLoops().empty()) | |||
| 1035 | return true; | |||
| 1036 | // If the original outer latch has only one predecessor, then values defined | |||
| 1037 | // further inside the looploop, e.g., in the innermost loop, will be available | |||
| 1038 | // at the new outer latch after interchange. | |||
| 1039 | if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr) | |||
| 1040 | return true; | |||
| 1041 | ||||
| 1042 | // The outer latch has more than one predecessors, i.e., the inner | |||
| 1043 | // exit and the inner header. | |||
| 1044 | // PHI nodes in the inner latch are lcssa phis where the incoming values | |||
| 1045 | // are defined further inside the loopnest. Check if those phis are used | |||
| 1046 | // in the original inner latch. If that is the case then bail out since | |||
| 1047 | // those incoming values may not be available at the new outer latch. | |||
| 1048 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); | |||
| 1049 | for (PHINode &PHI : InnerLoopLatch->phis()) { | |||
| 1050 | for (auto *U : PHI.users()) { | |||
| 1051 | Instruction *UI = cast<Instruction>(U); | |||
| 1052 | if (InnerLoopLatch == UI->getParent()) | |||
| 1053 | return false; | |||
| 1054 | } | |||
| 1055 | } | |||
| 1056 | return true; | |||
| 1057 | } | |||
| 1058 | ||||
| 1059 | bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, | |||
| 1060 | unsigned OuterLoopId, | |||
| 1061 | CharMatrix &DepMatrix) { | |||
| 1062 | if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { | |||
| 1063 | LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopIddo { } while (false) | |||
| 1064 | << " and OuterLoopId = " << OuterLoopIddo { } while (false) | |||
| 1065 | << " due to dependence\n")do { } while (false); | |||
| 1066 | ORE->emit([&]() { | |||
| 1067 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "Dependence", | |||
| 1068 | InnerLoop->getStartLoc(), | |||
| 1069 | InnerLoop->getHeader()) | |||
| 1070 | << "Cannot interchange loops due to dependences."; | |||
| 1071 | }); | |||
| 1072 | return false; | |||
| 1073 | } | |||
| 1074 | // Check if outer and inner loop contain legal instructions only. | |||
| 1075 | for (auto *BB : OuterLoop->blocks()) | |||
| 1076 | for (Instruction &I : BB->instructionsWithoutDebug()) | |||
| 1077 | if (CallInst *CI = dyn_cast<CallInst>(&I)) { | |||
| 1078 | // readnone functions do not prevent interchanging. | |||
| 1079 | if (CI->doesNotReadMemory()) | |||
| 1080 | continue; | |||
| 1081 | LLVM_DEBUG(do { } while (false) | |||
| 1082 | dbgs() << "Loops with call instructions cannot be interchanged "do { } while (false) | |||
| 1083 | << "safely.")do { } while (false); | |||
| 1084 | ORE->emit([&]() { | |||
| 1085 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "CallInst", | |||
| 1086 | CI->getDebugLoc(), | |||
| 1087 | CI->getParent()) | |||
| 1088 | << "Cannot interchange loops due to call instruction."; | |||
| 1089 | }); | |||
| 1090 | ||||
| 1091 | return false; | |||
| 1092 | } | |||
| 1093 | ||||
| 1094 | if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) { | |||
| 1095 | LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n")do { } while (false); | |||
| 1096 | ORE->emit([&]() { | |||
| 1097 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedInnerLatchPHI", | |||
| 1098 | InnerLoop->getStartLoc(), | |||
| 1099 | InnerLoop->getHeader()) | |||
| 1100 | << "Cannot interchange loops because unsupported PHI nodes found " | |||
| 1101 | "in inner loop latch."; | |||
| 1102 | }); | |||
| 1103 | return false; | |||
| 1104 | } | |||
| 1105 | ||||
| 1106 | // TODO: The loops could not be interchanged due to current limitations in the | |||
| 1107 | // transform module. | |||
| 1108 | if (currentLimitations()) { | |||
| 1109 | LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n")do { } while (false); | |||
| 1110 | return false; | |||
| 1111 | } | |||
| 1112 | ||||
| 1113 | // Check if the loops are tightly nested. | |||
| 1114 | if (!tightlyNested(OuterLoop, InnerLoop)) { | |||
| 1115 | LLVM_DEBUG(dbgs() << "Loops not tightly nested\n")do { } while (false); | |||
| 1116 | ORE->emit([&]() { | |||
| 1117 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "NotTightlyNested", | |||
| 1118 | InnerLoop->getStartLoc(), | |||
| 1119 | InnerLoop->getHeader()) | |||
| 1120 | << "Cannot interchange loops because they are not tightly " | |||
| 1121 | "nested."; | |||
| 1122 | }); | |||
| 1123 | return false; | |||
| 1124 | } | |||
| 1125 | ||||
| 1126 | if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, | |||
| 1127 | OuterInnerReductions)) { | |||
| 1128 | LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n")do { } while (false); | |||
| 1129 | ORE->emit([&]() { | |||
| 1130 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedExitPHI", | |||
| 1131 | InnerLoop->getStartLoc(), | |||
| 1132 | InnerLoop->getHeader()) | |||
| 1133 | << "Found unsupported PHI node in loop exit."; | |||
| 1134 | }); | |||
| 1135 | return false; | |||
| 1136 | } | |||
| 1137 | ||||
| 1138 | if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { | |||
| 1139 | LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n")do { } while (false); | |||
| 1140 | ORE->emit([&]() { | |||
| 1141 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedExitPHI", | |||
| 1142 | OuterLoop->getStartLoc(), | |||
| 1143 | OuterLoop->getHeader()) | |||
| 1144 | << "Found unsupported PHI node in loop exit."; | |||
| 1145 | }); | |||
| 1146 | return false; | |||
| 1147 | } | |||
| 1148 | ||||
| 1149 | return true; | |||
| 1150 | } | |||
| 1151 | ||||
| 1152 | int LoopInterchangeProfitability::getInstrOrderCost() { | |||
| 1153 | unsigned GoodOrder, BadOrder; | |||
| 1154 | BadOrder = GoodOrder = 0; | |||
| 1155 | for (BasicBlock *BB : InnerLoop->blocks()) { | |||
| 1156 | for (Instruction &Ins : *BB) { | |||
| 1157 | if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { | |||
| 1158 | unsigned NumOp = GEP->getNumOperands(); | |||
| 1159 | bool FoundInnerInduction = false; | |||
| 1160 | bool FoundOuterInduction = false; | |||
| 1161 | for (unsigned i = 0; i < NumOp; ++i) { | |||
| 1162 | // Skip operands that are not SCEV-able. | |||
| 1163 | if (!SE->isSCEVable(GEP->getOperand(i)->getType())) | |||
| 1164 | continue; | |||
| 1165 | ||||
| 1166 | const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); | |||
| 1167 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal); | |||
| 1168 | if (!AR) | |||
| 1169 | continue; | |||
| 1170 | ||||
| 1171 | // If we find the inner induction after an outer induction e.g. | |||
| 1172 | // for(int i=0;i<N;i++) | |||
| 1173 | // for(int j=0;j<N;j++) | |||
| 1174 | // A[i][j] = A[i-1][j-1]+k; | |||
| 1175 | // then it is a good order. | |||
| 1176 | if (AR->getLoop() == InnerLoop) { | |||
| 1177 | // We found an InnerLoop induction after OuterLoop induction. It is | |||
| 1178 | // a good order. | |||
| 1179 | FoundInnerInduction = true; | |||
| 1180 | if (FoundOuterInduction) { | |||
| 1181 | GoodOrder++; | |||
| 1182 | break; | |||
| 1183 | } | |||
| 1184 | } | |||
| 1185 | // If we find the outer induction after an inner induction e.g. | |||
| 1186 | // for(int i=0;i<N;i++) | |||
| 1187 | // for(int j=0;j<N;j++) | |||
| 1188 | // A[j][i] = A[j-1][i-1]+k; | |||
| 1189 | // then it is a bad order. | |||
| 1190 | if (AR->getLoop() == OuterLoop) { | |||
| 1191 | // We found an OuterLoop induction after InnerLoop induction. It is | |||
| 1192 | // a bad order. | |||
| 1193 | FoundOuterInduction = true; | |||
| 1194 | if (FoundInnerInduction) { | |||
| 1195 | BadOrder++; | |||
| 1196 | break; | |||
| 1197 | } | |||
| 1198 | } | |||
| 1199 | } | |||
| 1200 | } | |||
| 1201 | } | |||
| 1202 | } | |||
| 1203 | return GoodOrder - BadOrder; | |||
| 1204 | } | |||
| 1205 | ||||
| 1206 | static bool isProfitableForVectorization(unsigned InnerLoopId, | |||
| 1207 | unsigned OuterLoopId, | |||
| 1208 | CharMatrix &DepMatrix) { | |||
| 1209 | // TODO: Improve this heuristic to catch more cases. | |||
| 1210 | // If the inner loop is loop independent or doesn't carry any dependency it is | |||
| 1211 | // profitable to move this to outer position. | |||
| 1212 | for (auto &Row : DepMatrix) { | |||
| 1213 | if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I') | |||
| 1214 | return false; | |||
| 1215 | // TODO: We need to improve this heuristic. | |||
| 1216 | if (Row[OuterLoopId] != '=') | |||
| 1217 | return false; | |||
| 1218 | } | |||
| 1219 | // If outer loop has dependence and inner loop is loop independent then it is | |||
| 1220 | // profitable to interchange to enable parallelism. | |||
| 1221 | // If there are no dependences, interchanging will not improve anything. | |||
| 1222 | return !DepMatrix.empty(); | |||
| 1223 | } | |||
| 1224 | ||||
| 1225 | bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, | |||
| 1226 | unsigned OuterLoopId, | |||
| 1227 | CharMatrix &DepMatrix) { | |||
| 1228 | // TODO: Add better profitability checks. | |||
| 1229 | // e.g | |||
| 1230 | // 1) Construct dependency matrix and move the one with no loop carried dep | |||
| 1231 | // inside to enable vectorization. | |||
| 1232 | ||||
| 1233 | // This is rough cost estimation algorithm. It counts the good and bad order | |||
| 1234 | // of induction variables in the instruction and allows reordering if number | |||
| 1235 | // of bad orders is more than good. | |||
| 1236 | int Cost = getInstrOrderCost(); | |||
| 1237 | LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n")do { } while (false); | |||
| 1238 | if (Cost < -LoopInterchangeCostThreshold) | |||
| 1239 | return true; | |||
| 1240 | ||||
| 1241 | // It is not profitable as per current cache profitability model. But check if | |||
| 1242 | // we can move this loop outside to improve parallelism. | |||
| 1243 | if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) | |||
| 1244 | return true; | |||
| 1245 | ||||
| 1246 | ORE->emit([&]() { | |||
| 1247 | return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "InterchangeNotProfitable", | |||
| 1248 | InnerLoop->getStartLoc(), | |||
| 1249 | InnerLoop->getHeader()) | |||
| 1250 | << "Interchanging loops is too costly (cost=" | |||
| 1251 | << ore::NV("Cost", Cost) << ", threshold=" | |||
| 1252 | << ore::NV("Threshold", LoopInterchangeCostThreshold) | |||
| 1253 | << ") and it does not improve parallelism."; | |||
| 1254 | }); | |||
| 1255 | return false; | |||
| 1256 | } | |||
| 1257 | ||||
| 1258 | void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, | |||
| 1259 | Loop *InnerLoop) { | |||
| 1260 | for (Loop *L : *OuterLoop) | |||
| 1261 | if (L == InnerLoop) { | |||
| 1262 | OuterLoop->removeChildLoop(L); | |||
| 1263 | return; | |||
| 1264 | } | |||
| 1265 | llvm_unreachable("Couldn't find loop")__builtin_unreachable(); | |||
| 1266 | } | |||
| 1267 | ||||
| 1268 | /// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the | |||
| 1269 | /// new inner and outer loop after interchanging: NewInner is the original | |||
| 1270 | /// outer loop and NewOuter is the original inner loop. | |||
| 1271 | /// | |||
| 1272 | /// Before interchanging, we have the following structure | |||
| 1273 | /// Outer preheader | |||
| 1274 | // Outer header | |||
| 1275 | // Inner preheader | |||
| 1276 | // Inner header | |||
| 1277 | // Inner body | |||
| 1278 | // Inner latch | |||
| 1279 | // outer bbs | |||
| 1280 | // Outer latch | |||
| 1281 | // | |||
| 1282 | // After interchanging: | |||
| 1283 | // Inner preheader | |||
| 1284 | // Inner header | |||
| 1285 | // Outer preheader | |||
| 1286 | // Outer header | |||
| 1287 | // Inner body | |||
| 1288 | // outer bbs | |||
| 1289 | // Outer latch | |||
| 1290 | // Inner latch | |||
| 1291 | void LoopInterchangeTransform::restructureLoops( | |||
| 1292 | Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader, | |||
| 1293 | BasicBlock *OrigOuterPreHeader) { | |||
| 1294 | Loop *OuterLoopParent = OuterLoop->getParentLoop(); | |||
| 1295 | // The original inner loop preheader moves from the new inner loop to | |||
| 1296 | // the parent loop, if there is one. | |||
| 1297 | NewInner->removeBlockFromLoop(OrigInnerPreHeader); | |||
| 1298 | LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent); | |||
| 1299 | ||||
| 1300 | // Switch the loop levels. | |||
| 1301 | if (OuterLoopParent) { | |||
| 1302 | // Remove the loop from its parent loop. | |||
| 1303 | removeChildLoop(OuterLoopParent, NewInner); | |||
| 1304 | removeChildLoop(NewInner, NewOuter); | |||
| 1305 | OuterLoopParent->addChildLoop(NewOuter); | |||
| 1306 | } else { | |||
| 1307 | removeChildLoop(NewInner, NewOuter); | |||
| 1308 | LI->changeTopLevelLoop(NewInner, NewOuter); | |||
| 1309 | } | |||
| 1310 | while (!NewOuter->isInnermost()) | |||
| 1311 | NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin())); | |||
| 1312 | NewOuter->addChildLoop(NewInner); | |||
| 1313 | ||||
| 1314 | // BBs from the original inner loop. | |||
| 1315 | SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks()); | |||
| 1316 | ||||
| 1317 | // Add BBs from the original outer loop to the original inner loop (excluding | |||
| 1318 | // BBs already in inner loop) | |||
| 1319 | for (BasicBlock *BB : NewInner->blocks()) | |||
| 1320 | if (LI->getLoopFor(BB) == NewInner) | |||
| 1321 | NewOuter->addBlockEntry(BB); | |||
| 1322 | ||||
| 1323 | // Now remove inner loop header and latch from the new inner loop and move | |||
| 1324 | // other BBs (the loop body) to the new inner loop. | |||
| 1325 | BasicBlock *OuterHeader = NewOuter->getHeader(); | |||
| 1326 | BasicBlock *OuterLatch = NewOuter->getLoopLatch(); | |||
| 1327 | for (BasicBlock *BB : OrigInnerBBs) { | |||
| 1328 | // Nothing will change for BBs in child loops. | |||
| 1329 | if (LI->getLoopFor(BB) != NewOuter) | |||
| 1330 | continue; | |||
| 1331 | // Remove the new outer loop header and latch from the new inner loop. | |||
| 1332 | if (BB == OuterHeader || BB == OuterLatch) | |||
| 1333 | NewInner->removeBlockFromLoop(BB); | |||
| 1334 | else | |||
| 1335 | LI->changeLoopFor(BB, NewInner); | |||
| 1336 | } | |||
| 1337 | ||||
| 1338 | // The preheader of the original outer loop becomes part of the new | |||
| 1339 | // outer loop. | |||
| 1340 | NewOuter->addBlockEntry(OrigOuterPreHeader); | |||
| 1341 | LI->changeLoopFor(OrigOuterPreHeader, NewOuter); | |||
| 1342 | ||||
| 1343 | // Tell SE that we move the loops around. | |||
| 1344 | SE->forgetLoop(NewOuter); | |||
| 1345 | SE->forgetLoop(NewInner); | |||
| 1346 | } | |||
| 1347 | ||||
| 1348 | bool LoopInterchangeTransform::transform() { | |||
| 1349 | bool Transformed = false; | |||
| 1350 | Instruction *InnerIndexVar; | |||
| 1351 | ||||
| 1352 | if (InnerLoop->getSubLoops().empty()) { | |||
| 1353 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); | |||
| 1354 | LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n")do { } while (false); | |||
| 1355 | PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); | |||
| 1356 | if (!InductionPHI) { | |||
| 1357 | LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n")do { } while (false); | |||
| 1358 | return false; | |||
| 1359 | } | |||
| 1360 | ||||
| 1361 | if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) | |||
| 1362 | InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1)); | |||
| 1363 | else | |||
| 1364 | InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); | |||
| 1365 | ||||
| 1366 | // Ensure that InductionPHI is the first Phi node. | |||
| 1367 | if (&InductionPHI->getParent()->front() != InductionPHI) | |||
| 1368 | InductionPHI->moveBefore(&InductionPHI->getParent()->front()); | |||
| 1369 | ||||
| 1370 | // Create a new latch block for the inner loop. We split at the | |||
| 1371 | // current latch's terminator and then move the condition and all | |||
| 1372 | // operands that are not either loop-invariant or the induction PHI into the | |||
| 1373 | // new latch block. | |||
| 1374 | BasicBlock *NewLatch = | |||
| 1375 | SplitBlock(InnerLoop->getLoopLatch(), | |||
| 1376 | InnerLoop->getLoopLatch()->getTerminator(), DT, LI); | |||
| 1377 | ||||
| 1378 | SmallSetVector<Instruction *, 4> WorkList; | |||
| 1379 | unsigned i = 0; | |||
| 1380 | auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() { | |||
| 1381 | for (; i < WorkList.size(); i++) { | |||
| 1382 | // Duplicate instruction and move it the new latch. Update uses that | |||
| 1383 | // have been moved. | |||
| 1384 | Instruction *NewI = WorkList[i]->clone(); | |||
| 1385 | NewI->insertBefore(NewLatch->getFirstNonPHI()); | |||
| 1386 | assert(!NewI->mayHaveSideEffects() &&((void)0) | |||
| 1387 | "Moving instructions with side-effects may change behavior of "((void)0) | |||
| 1388 | "the loop nest!")((void)0); | |||
| 1389 | for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) { | |||
| 1390 | Instruction *UserI = cast<Instruction>(U.getUser()); | |||
| 1391 | if (!InnerLoop->contains(UserI->getParent()) || | |||
| 1392 | UserI->getParent() == NewLatch || UserI == InductionPHI) | |||
| 1393 | U.set(NewI); | |||
| 1394 | } | |||
| 1395 | // Add operands of moved instruction to the worklist, except if they are | |||
| 1396 | // outside the inner loop or are the induction PHI. | |||
| 1397 | for (Value *Op : WorkList[i]->operands()) { | |||
| 1398 | Instruction *OpI = dyn_cast<Instruction>(Op); | |||
| 1399 | if (!OpI || | |||
| 1400 | this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop || | |||
| 1401 | OpI == InductionPHI) | |||
| 1402 | continue; | |||
| 1403 | WorkList.insert(OpI); | |||
| 1404 | } | |||
| 1405 | } | |||
| 1406 | }; | |||
| 1407 | ||||
| 1408 | // FIXME: Should we interchange when we have a constant condition? | |||
| 1409 | Instruction *CondI = dyn_cast<Instruction>( | |||
| 1410 | cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator()) | |||
| 1411 | ->getCondition()); | |||
| 1412 | if (CondI) | |||
| 1413 | WorkList.insert(CondI); | |||
| 1414 | MoveInstructions(); | |||
| 1415 | WorkList.insert(cast<Instruction>(InnerIndexVar)); | |||
| 1416 | MoveInstructions(); | |||
| 1417 | ||||
| 1418 | // Splits the inner loops phi nodes out into a separate basic block. | |||
| 1419 | BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); | |||
| 1420 | SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); | |||
| 1421 | LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n")do { } while (false); | |||
| 1422 | } | |||
| 1423 | ||||
| 1424 | // Instructions in the original inner loop preheader may depend on values | |||
| 1425 | // defined in the outer loop header. Move them there, because the original | |||
| 1426 | // inner loop preheader will become the entry into the interchanged loop nest. | |||
| 1427 | // Currently we move all instructions and rely on LICM to move invariant | |||
| 1428 | // instructions outside the loop nest. | |||
| 1429 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); | |||
| 1430 | BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); | |||
| 1431 | if (InnerLoopPreHeader != OuterLoopHeader) { | |||
| 1432 | SmallPtrSet<Instruction *, 4> NeedsMoving; | |||
| 1433 | for (Instruction &I : | |||
| 1434 | make_early_inc_range(make_range(InnerLoopPreHeader->begin(), | |||
| 1435 | std::prev(InnerLoopPreHeader->end())))) | |||
| 1436 | I.moveBefore(OuterLoopHeader->getTerminator()); | |||
| 1437 | } | |||
| 1438 | ||||
| 1439 | Transformed |= adjustLoopLinks(); | |||
| 1440 | if (!Transformed) { | |||
| 1441 | LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n")do { } while (false); | |||
| 1442 | return false; | |||
| 1443 | } | |||
| 1444 | ||||
| 1445 | return true; | |||
| 1446 | } | |||
| 1447 | ||||
| 1448 | /// \brief Move all instructions except the terminator from FromBB right before | |||
| 1449 | /// InsertBefore | |||
| 1450 | static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { | |||
| 1451 | auto &ToList = InsertBefore->getParent()->getInstList(); | |||
| 1452 | auto &FromList = FromBB->getInstList(); | |||
| 1453 | ||||
| 1454 | ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), | |||
| 1455 | FromBB->getTerminator()->getIterator()); | |||
| 1456 | } | |||
| 1457 | ||||
| 1458 | /// Swap instructions between \p BB1 and \p BB2 but keep terminators intact. | |||
| 1459 | static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) { | |||
| 1460 | // Save all non-terminator instructions of BB1 into TempInstrs and unlink them | |||
| 1461 | // from BB1 afterwards. | |||
| 1462 | auto Iter = map_range(*BB1, [](Instruction &I) { return &I; }); | |||
| 1463 | SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end())); | |||
| 1464 | for (Instruction *I : TempInstrs) | |||
| 1465 | I->removeFromParent(); | |||
| 1466 | ||||
| 1467 | // Move instructions from BB2 to BB1. | |||
| 1468 | moveBBContents(BB2, BB1->getTerminator()); | |||
| 1469 | ||||
| 1470 | // Move instructions from TempInstrs to BB2. | |||
| 1471 | for (Instruction *I : TempInstrs) | |||
| 1472 | I->insertBefore(BB2->getTerminator()); | |||
| 1473 | } | |||
| 1474 | ||||
| 1475 | // Update BI to jump to NewBB instead of OldBB. Records updates to the | |||
| 1476 | // dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that | |||
| 1477 | // \p OldBB is exactly once in BI's successor list. | |||
| 1478 | static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, | |||
| 1479 | BasicBlock *NewBB, | |||
| 1480 | std::vector<DominatorTree::UpdateType> &DTUpdates, | |||
| 1481 | bool MustUpdateOnce = true) { | |||
| 1482 | assert((!MustUpdateOnce ||((void)0) | |||
| 1483 | llvm::count_if(successors(BI),((void)0) | |||
| 1484 | [OldBB](BasicBlock *BB) {((void)0) | |||
| 1485 | return BB == OldBB;((void)0) | |||
| 1486 | }) == 1) && "BI must jump to OldBB exactly once.")((void)0); | |||
| 1487 | bool Changed = false; | |||
| 1488 | for (Use &Op : BI->operands()) | |||
| 1489 | if (Op == OldBB) { | |||
| 1490 | Op.set(NewBB); | |||
| 1491 | Changed = true; | |||
| 1492 | } | |||
| 1493 | ||||
| 1494 | if (Changed) { | |||
| 1495 | DTUpdates.push_back( | |||
| 1496 | {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); | |||
| 1497 | DTUpdates.push_back( | |||
| 1498 | {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); | |||
| 1499 | } | |||
| 1500 | assert(Changed && "Expected a successor to be updated")((void)0); | |||
| 1501 | } | |||
| 1502 | ||||
| 1503 | // Move Lcssa PHIs to the right place. | |||
| 1504 | static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, | |||
| 1505 | BasicBlock *InnerLatch, BasicBlock *OuterHeader, | |||
| 1506 | BasicBlock *OuterLatch, BasicBlock *OuterExit, | |||
| 1507 | Loop *InnerLoop, LoopInfo *LI) { | |||
| 1508 | ||||
| 1509 | // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are | |||
| 1510 | // defined either in the header or latch. Those blocks will become header and | |||
| 1511 | // latch of the new outer loop, and the only possible users can PHI nodes | |||
| 1512 | // in the exit block of the loop nest or the outer loop header (reduction | |||
| 1513 | // PHIs, in that case, the incoming value must be defined in the inner loop | |||
| 1514 | // header). We can just substitute the user with the incoming value and remove | |||
| 1515 | // the PHI. | |||
| 1516 | for (PHINode &P : make_early_inc_range(InnerExit->phis())) { | |||
| 1517 | assert(P.getNumIncomingValues() == 1 &&((void)0) | |||
| 1518 | "Only loops with a single exit are supported!")((void)0); | |||
| 1519 | ||||
| 1520 | // Incoming values are guaranteed be instructions currently. | |||
| 1521 | auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch)); | |||
| 1522 | // Skip phis with incoming values from the inner loop body, excluding the | |||
| 1523 | // header and latch. | |||
| 1524 | if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader) | |||
| 1525 | continue; | |||
| 1526 | ||||
| 1527 | assert(all_of(P.users(),((void)0) | |||
| 1528 | [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {((void)0) | |||
| 1529 | return (cast<PHINode>(U)->getParent() == OuterHeader &&((void)0) | |||
| 1530 | IncI->getParent() == InnerHeader) ||((void)0) | |||
| 1531 | cast<PHINode>(U)->getParent() == OuterExit;((void)0) | |||
| 1532 | }) &&((void)0) | |||
| 1533 | "Can only replace phis iff the uses are in the loop nest exit or "((void)0) | |||
| 1534 | "the incoming value is defined in the inner header (it will "((void)0) | |||
| 1535 | "dominate all loop blocks after interchanging)")((void)0); | |||
| 1536 | P.replaceAllUsesWith(IncI); | |||
| 1537 | P.eraseFromParent(); | |||
| 1538 | } | |||
| 1539 | ||||
| 1540 | SmallVector<PHINode *, 8> LcssaInnerExit; | |||
| 1541 | for (PHINode &P : InnerExit->phis()) | |||
| 1542 | LcssaInnerExit.push_back(&P); | |||
| 1543 | ||||
| 1544 | SmallVector<PHINode *, 8> LcssaInnerLatch; | |||
| 1545 | for (PHINode &P : InnerLatch->phis()) | |||
| 1546 | LcssaInnerLatch.push_back(&P); | |||
| 1547 | ||||
| 1548 | // Lcssa PHIs for values used outside the inner loop are in InnerExit. | |||
| 1549 | // If a PHI node has users outside of InnerExit, it has a use outside the | |||
| 1550 | // interchanged loop and we have to preserve it. We move these to | |||
| 1551 | // InnerLatch, which will become the new exit block for the innermost | |||
| 1552 | // loop after interchanging. | |||
| 1553 | for (PHINode *P : LcssaInnerExit) | |||
| 1554 | P->moveBefore(InnerLatch->getFirstNonPHI()); | |||
| 1555 | ||||
| 1556 | // If the inner loop latch contains LCSSA PHIs, those come from a child loop | |||
| 1557 | // and we have to move them to the new inner latch. | |||
| 1558 | for (PHINode *P : LcssaInnerLatch) | |||
| 1559 | P->moveBefore(InnerExit->getFirstNonPHI()); | |||
| 1560 | ||||
| 1561 | // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have | |||
| 1562 | // incoming values defined in the outer loop, we have to add a new PHI | |||
| 1563 | // in the inner loop latch, which became the exit block of the outer loop, | |||
| 1564 | // after interchanging. | |||
| 1565 | if (OuterExit) { | |||
| 1566 | for (PHINode &P : OuterExit->phis()) { | |||
| 1567 | if (P.getNumIncomingValues() != 1) | |||
| 1568 | continue; | |||
| 1569 | // Skip Phis with incoming values defined in the inner loop. Those should | |||
| 1570 | // already have been updated. | |||
| 1571 | auto I = dyn_cast<Instruction>(P.getIncomingValue(0)); | |||
| 1572 | if (!I || LI->getLoopFor(I->getParent()) == InnerLoop) | |||
| 1573 | continue; | |||
| 1574 | ||||
| 1575 | PHINode *NewPhi = dyn_cast<PHINode>(P.clone()); | |||
| 1576 | NewPhi->setIncomingValue(0, P.getIncomingValue(0)); | |||
| 1577 | NewPhi->setIncomingBlock(0, OuterLatch); | |||
| 1578 | // We might have incoming edges from other BBs, i.e., the original outer | |||
| 1579 | // header. | |||
| 1580 | for (auto *Pred : predecessors(InnerLatch)) { | |||
| 1581 | if (Pred == OuterLatch) | |||
| 1582 | continue; | |||
| 1583 | NewPhi->addIncoming(P.getIncomingValue(0), Pred); | |||
| 1584 | } | |||
| 1585 | NewPhi->insertBefore(InnerLatch->getFirstNonPHI()); | |||
| 1586 | P.setIncomingValue(0, NewPhi); | |||
| 1587 | } | |||
| 1588 | } | |||
| 1589 | ||||
| 1590 | // Now adjust the incoming blocks for the LCSSA PHIs. | |||
| 1591 | // For PHIs moved from Inner's exit block, we need to replace Inner's latch | |||
| 1592 | // with the new latch. | |||
| 1593 | InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch); | |||
| 1594 | } | |||
| 1595 | ||||
| 1596 | bool LoopInterchangeTransform::adjustLoopBranches() { | |||
| 1597 | LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n")do { } while (false); | |||
| 1598 | std::vector<DominatorTree::UpdateType> DTUpdates; | |||
| 1599 | ||||
| 1600 | BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); | |||
| 1601 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); | |||
| 1602 | ||||
| 1603 | assert(OuterLoopPreHeader != OuterLoop->getHeader() &&((void)0) | |||
| 1604 | InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&((void)0) | |||
| 1605 | InnerLoopPreHeader && "Guaranteed by loop-simplify form")((void)0); | |||
| 1606 | // Ensure that both preheaders do not contain PHI nodes and have single | |||
| 1607 | // predecessors. This allows us to move them easily. We use | |||
| 1608 | // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing | |||
| 1609 | // preheaders do not satisfy those conditions. | |||
| 1610 | if (isa<PHINode>(OuterLoopPreHeader->begin()) || | |||
| 1611 | !OuterLoopPreHeader->getUniquePredecessor()) | |||
| 1612 | OuterLoopPreHeader = | |||
| 1613 | InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true); | |||
| 1614 | if (InnerLoopPreHeader == OuterLoop->getHeader()) | |||
| 1615 | InnerLoopPreHeader = | |||
| 1616 | InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true); | |||
| 1617 | ||||
| 1618 | // Adjust the loop preheader | |||
| 1619 | BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); | |||
| 1620 | BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); | |||
| 1621 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); | |||
| 1622 | BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); | |||
| 1623 | BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); | |||
| 1624 | BasicBlock *InnerLoopLatchPredecessor = | |||
| 1625 | InnerLoopLatch->getUniquePredecessor(); | |||
| 1626 | BasicBlock *InnerLoopLatchSuccessor; | |||
| 1627 | BasicBlock *OuterLoopLatchSuccessor; | |||
| 1628 | ||||
| 1629 | BranchInst *OuterLoopLatchBI = | |||
| 1630 | dyn_cast<BranchInst>(OuterLoopLatch->getTerminator()); | |||
| 1631 | BranchInst *InnerLoopLatchBI = | |||
| 1632 | dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); | |||
| 1633 | BranchInst *OuterLoopHeaderBI = | |||
| 1634 | dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); | |||
| 1635 | BranchInst *InnerLoopHeaderBI = | |||
| 1636 | dyn_cast<BranchInst>(InnerLoopHeader->getTerminator()); | |||
| 1637 | ||||
| 1638 | if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || | |||
| 1639 | !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || | |||
| 1640 | !InnerLoopHeaderBI) | |||
| 1641 | return false; | |||
| 1642 | ||||
| 1643 | BranchInst *InnerLoopLatchPredecessorBI = | |||
| 1644 | dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator()); | |||
| 1645 | BranchInst *OuterLoopPredecessorBI = | |||
| 1646 | dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator()); | |||
| 1647 | ||||
| 1648 | if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) | |||
| 1649 | return false; | |||
| 1650 | BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); | |||
| 1651 | if (!InnerLoopHeaderSuccessor) | |||
| 1652 | return false; | |||
| 1653 | ||||
| 1654 | // Adjust Loop Preheader and headers. | |||
| 1655 | // The branches in the outer loop predecessor and the outer loop header can | |||
| 1656 | // be unconditional branches or conditional branches with duplicates. Consider | |||
| 1657 | // this when updating the successors. | |||
| 1658 | updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, | |||
| 1659 | InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); | |||
| 1660 | // The outer loop header might or might not branch to the outer latch. | |||
| 1661 | // We are guaranteed to branch to the inner loop preheader. | |||
| 1662 | if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) { | |||
| 1663 | // In this case the outerLoopHeader should branch to the InnerLoopLatch. | |||
| 1664 | updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch, | |||
| 1665 | DTUpdates, | |||
| 1666 | /*MustUpdateOnce=*/false); | |||
| 1667 | } | |||
| 1668 | updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, | |||
| 1669 | InnerLoopHeaderSuccessor, DTUpdates, | |||
| 1670 | /*MustUpdateOnce=*/false); | |||
| 1671 | ||||
| 1672 | // Adjust reduction PHI's now that the incoming block has changed. | |||
| 1673 | InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader, | |||
| 1674 | OuterLoopHeader); | |||
| 1675 | ||||
| 1676 | updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor, | |||
| 1677 | OuterLoopPreHeader, DTUpdates); | |||
| 1678 | ||||
| 1679 | // -------------Adjust loop latches----------- | |||
| 1680 | if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) | |||
| 1681 | InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1); | |||
| 1682 | else | |||
| 1683 | InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); | |||
| 1684 | ||||
| 1685 | updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch, | |||
| 1686 | InnerLoopLatchSuccessor, DTUpdates); | |||
| 1687 | ||||
| 1688 | ||||
| 1689 | if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) | |||
| 1690 | OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); | |||
| 1691 | else | |||
| 1692 | OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); | |||
| 1693 | ||||
| 1694 | updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor, | |||
| 1695 | OuterLoopLatchSuccessor, DTUpdates); | |||
| 1696 | updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch, | |||
| 1697 | DTUpdates); | |||
| 1698 | ||||
| 1699 | DT->applyUpdates(DTUpdates); | |||
| 1700 | restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader, | |||
| 1701 | OuterLoopPreHeader); | |||
| 1702 | ||||
| 1703 | moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch, | |||
| 1704 | OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(), | |||
| 1705 | InnerLoop, LI); | |||
| 1706 | // For PHIs in the exit block of the outer loop, outer's latch has been | |||
| 1707 | // replaced by Inners'. | |||
| 1708 | OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); | |||
| 1709 | ||||
| 1710 | auto &OuterInnerReductions = LIL.getOuterInnerReductions(); | |||
| 1711 | // Now update the reduction PHIs in the inner and outer loop headers. | |||
| 1712 | SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; | |||
| 1713 | for (PHINode &PHI : InnerLoopHeader->phis()) { | |||
| 1714 | if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end()) | |||
| 1715 | continue; | |||
| 1716 | InnerLoopPHIs.push_back(cast<PHINode>(&PHI)); | |||
| 1717 | } | |||
| 1718 | for (PHINode &PHI : OuterLoopHeader->phis()) { | |||
| 1719 | if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end()) | |||
| 1720 | continue; | |||
| 1721 | OuterLoopPHIs.push_back(cast<PHINode>(&PHI)); | |||
| 1722 | } | |||
| 1723 | ||||
| 1724 | // Now move the remaining reduction PHIs from outer to inner loop header and | |||
| 1725 | // vice versa. The PHI nodes must be part of a reduction across the inner and | |||
| 1726 | // outer loop and all the remains to do is and updating the incoming blocks. | |||
| 1727 | for (PHINode *PHI : OuterLoopPHIs) { | |||
| 1728 | PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); | |||
| 1729 | assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node")((void)0); | |||
| 1730 | } | |||
| 1731 | for (PHINode *PHI : InnerLoopPHIs) { | |||
| 1732 | PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); | |||
| 1733 | assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node")((void)0); | |||
| 1734 | } | |||
| 1735 | ||||
| 1736 | // Update the incoming blocks for moved PHI nodes. | |||
| 1737 | OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader); | |||
| 1738 | OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch); | |||
| 1739 | InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader); | |||
| 1740 | InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); | |||
| 1741 | ||||
| 1742 | // Values defined in the outer loop header could be used in the inner loop | |||
| 1743 | // latch. In that case, we need to create LCSSA phis for them, because after | |||
| 1744 | // interchanging they will be defined in the new inner loop and used in the | |||
| 1745 | // new outer loop. | |||
| 1746 | IRBuilder<> Builder(OuterLoopHeader->getContext()); | |||
| 1747 | SmallVector<Instruction *, 4> MayNeedLCSSAPhis; | |||
| 1748 | for (Instruction &I : | |||
| 1749 | make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end()))) | |||
| 1750 | MayNeedLCSSAPhis.push_back(&I); | |||
| 1751 | formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE, Builder); | |||
| 1752 | ||||
| 1753 | return true; | |||
| 1754 | } | |||
| 1755 | ||||
| 1756 | bool LoopInterchangeTransform::adjustLoopLinks() { | |||
| 1757 | // Adjust all branches in the inner and outer loop. | |||
| 1758 | bool Changed = adjustLoopBranches(); | |||
| 1759 | if (Changed) { | |||
| 1760 | // We have interchanged the preheaders so we need to interchange the data in | |||
| 1761 | // the preheaders as well. This is because the content of the inner | |||
| 1762 | // preheader was previously executed inside the outer loop. | |||
| 1763 | BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); | |||
| 1764 | BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); | |||
| 1765 | swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader); | |||
| 1766 | } | |||
| 1767 | return Changed; | |||
| 1768 | } | |||
| 1769 | ||||
| 1770 | /// Main LoopInterchange Pass. | |||
| 1771 | struct LoopInterchangeLegacyPass : public LoopPass { | |||
| 1772 | static char ID; | |||
| 1773 | ||||
| 1774 | LoopInterchangeLegacyPass() : LoopPass(ID) { | |||
| 1775 | initializeLoopInterchangeLegacyPassPass(*PassRegistry::getPassRegistry()); | |||
| 1776 | } | |||
| 1777 | ||||
| 1778 | void getAnalysisUsage(AnalysisUsage &AU) const override { | |||
| 1779 | AU.addRequired<DependenceAnalysisWrapperPass>(); | |||
| 1780 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); | |||
| 1781 | ||||
| 1782 | getLoopAnalysisUsage(AU); | |||
| 1783 | } | |||
| 1784 | ||||
| 1785 | bool runOnLoop(Loop *L, LPPassManager &LPM) override { | |||
| 1786 | if (skipLoop(L)) | |||
| 1787 | return false; | |||
| 1788 | ||||
| 1789 | auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); | |||
| 1790 | auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); | |||
| 1791 | auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); | |||
| 1792 | auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); | |||
| 1793 | auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); | |||
| 1794 | ||||
| 1795 | return LoopInterchange(SE, LI, DI, DT, ORE).run(L); | |||
| 1796 | } | |||
| 1797 | }; | |||
| 1798 | ||||
| 1799 | char LoopInterchangeLegacyPass::ID = 0; | |||
| 1800 | ||||
| 1801 | INITIALIZE_PASS_BEGIN(LoopInterchangeLegacyPass, "loop-interchange",static void *initializeLoopInterchangeLegacyPassPassOnce(PassRegistry &Registry) { | |||
| 1802 | "Interchanges loops for cache reuse", false, false)static void *initializeLoopInterchangeLegacyPassPassOnce(PassRegistry &Registry) { | |||
| 1803 | INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry); | |||
| 1804 | INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)initializeDependenceAnalysisWrapperPassPass(Registry); | |||
| 1805 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)initializeOptimizationRemarkEmitterWrapperPassPass(Registry); | |||
| 1806 | ||||
| 1807 | INITIALIZE_PASS_END(LoopInterchangeLegacyPass, "loop-interchange",PassInfo *PI = new PassInfo( "Interchanges loops for cache reuse" , "loop-interchange", &LoopInterchangeLegacyPass::ID, PassInfo ::NormalCtor_t(callDefaultCtor<LoopInterchangeLegacyPass> ), false, false); Registry.registerPass(*PI, true); return PI ; } static llvm::once_flag InitializeLoopInterchangeLegacyPassPassFlag ; void llvm::initializeLoopInterchangeLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoopInterchangeLegacyPassPassFlag , initializeLoopInterchangeLegacyPassPassOnce, std::ref(Registry )); } | |||
| 1808 | "Interchanges loops for cache reuse", false, false)PassInfo *PI = new PassInfo( "Interchanges loops for cache reuse" , "loop-interchange", &LoopInterchangeLegacyPass::ID, PassInfo ::NormalCtor_t(callDefaultCtor<LoopInterchangeLegacyPass> ), false, false); Registry.registerPass(*PI, true); return PI ; } static llvm::once_flag InitializeLoopInterchangeLegacyPassPassFlag ; void llvm::initializeLoopInterchangeLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoopInterchangeLegacyPassPassFlag , initializeLoopInterchangeLegacyPassPassOnce, std::ref(Registry )); } | |||
| 1809 | ||||
| 1810 | Pass *llvm::createLoopInterchangePass() { | |||
| 1811 | return new LoopInterchangeLegacyPass(); | |||
| 1812 | } | |||
| 1813 | ||||
| 1814 | PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, | |||
| 1815 | LoopAnalysisManager &AM, | |||
| 1816 | LoopStandardAnalysisResults &AR, | |||
| 1817 | LPMUpdater &U) { | |||
| 1818 | Function &F = *LN.getParent(); | |||
| 1819 | ||||
| 1820 | DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); | |||
| 1821 | OptimizationRemarkEmitter ORE(&F); | |||
| 1822 | if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN)) | |||
| 1823 | return PreservedAnalyses::all(); | |||
| 1824 | return getLoopPassPreservedAnalyses(); | |||
| 1825 | } |