Bug Summary

File:src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Warning:line 1576, column 7
Called C++ object pointer is null

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopInterchange.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -D PIC -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -D_RET_PROTECTOR -ret-protector -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Scalar/LoopInterchange.h"
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/Analysis/DependenceAnalysis.h"
21#include "llvm/Analysis/LoopInfo.h"
22#include "llvm/Analysis/LoopNestAnalysis.h"
23#include "llvm/Analysis/LoopPass.h"
24#include "llvm/Analysis/OptimizationRemarkEmitter.h"
25#include "llvm/Analysis/ScalarEvolution.h"
26#include "llvm/Analysis/ScalarEvolutionExpressions.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/Constants.h"
29#include "llvm/IR/DiagnosticInfo.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
33#include "llvm/IR/InstrTypes.h"
34#include "llvm/IR/Instruction.h"
35#include "llvm/IR/Instructions.h"
36#include "llvm/IR/Type.h"
37#include "llvm/IR/User.h"
38#include "llvm/IR/Value.h"
39#include "llvm/InitializePasses.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/Casting.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/Debug.h"
44#include "llvm/Support/ErrorHandling.h"
45#include "llvm/Support/raw_ostream.h"
46#include "llvm/Transforms/Scalar.h"
47#include "llvm/Transforms/Utils.h"
48#include "llvm/Transforms/Utils/BasicBlockUtils.h"
49#include "llvm/Transforms/Utils/LoopUtils.h"
50#include <cassert>
51#include <utility>
52#include <vector>
53
54using namespace llvm;
55
56#define DEBUG_TYPE"loop-interchange" "loop-interchange"
57
58STATISTIC(LoopsInterchanged, "Number of loops interchanged")static llvm::Statistic LoopsInterchanged = {"loop-interchange"
, "LoopsInterchanged", "Number of loops interchanged"}
;
59
60static cl::opt<int> LoopInterchangeCostThreshold(
61 "loop-interchange-threshold", cl::init(0), cl::Hidden,
62 cl::desc("Interchange if you gain more than this number"));
63
64namespace {
65
66using LoopVector = SmallVector<Loop *, 8>;
67
68// TODO: Check if we can use a sparse matrix here.
69using CharMatrix = std::vector<std::vector<char>>;
70
71} // end anonymous namespace
72
73// Maximum number of dependencies that can be handled in the dependency matrix.
74static const unsigned MaxMemInstrCount = 100;
75
76// Maximum loop depth supported.
77static const unsigned MaxLoopNestDepth = 10;
78
79#ifdef DUMP_DEP_MATRICIES
80static void printDepMatrix(CharMatrix &DepMatrix) {
81 for (auto &Row : DepMatrix) {
82 for (auto D : Row)
83 LLVM_DEBUG(dbgs() << D << " ")do { } while (false);
84 LLVM_DEBUG(dbgs() << "\n")do { } while (false);
85 }
86}
87#endif
88
89static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
90 Loop *L, DependenceInfo *DI) {
91 using ValueVector = SmallVector<Value *, 16>;
92
93 ValueVector MemInstr;
94
95 // For each block.
96 for (BasicBlock *BB : L->blocks()) {
97 // Scan the BB and collect legal loads and stores.
98 for (Instruction &I : *BB) {
99 if (!isa<Instruction>(I))
100 return false;
101 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
102 if (!Ld->isSimple())
103 return false;
104 MemInstr.push_back(&I);
105 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
106 if (!St->isSimple())
107 return false;
108 MemInstr.push_back(&I);
109 }
110 }
111 }
112
113 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()do { } while (false)
114 << " Loads and Stores to analyze\n")do { } while (false);
115
116 ValueVector::iterator I, IE, J, JE;
117
118 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
119 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
120 std::vector<char> Dep;
121 Instruction *Src = cast<Instruction>(*I);
122 Instruction *Dst = cast<Instruction>(*J);
123 if (Src == Dst)
124 continue;
125 // Ignore Input dependencies.
126 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
127 continue;
128 // Track Output, Flow, and Anti dependencies.
129 if (auto D = DI->depends(Src, Dst, true)) {
130 assert(D->isOrdered() && "Expected an output, flow or anti dep.")((void)0);
131 LLVM_DEBUG(StringRef DepType =do { } while (false)
132 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";do { } while (false)
133 dbgs() << "Found " << DepTypedo { } while (false)
134 << " dependency between Src and Dst\n"do { } while (false)
135 << " Src:" << *Src << "\n Dst:" << *Dst << '\n')do { } while (false);
136 unsigned Levels = D->getLevels();
137 char Direction;
138 for (unsigned II = 1; II <= Levels; ++II) {
139 const SCEV *Distance = D->getDistance(II);
140 const SCEVConstant *SCEVConst =
141 dyn_cast_or_null<SCEVConstant>(Distance);
142 if (SCEVConst) {
143 const ConstantInt *CI = SCEVConst->getValue();
144 if (CI->isNegative())
145 Direction = '<';
146 else if (CI->isZero())
147 Direction = '=';
148 else
149 Direction = '>';
150 Dep.push_back(Direction);
151 } else if (D->isScalar(II)) {
152 Direction = 'S';
153 Dep.push_back(Direction);
154 } else {
155 unsigned Dir = D->getDirection(II);
156 if (Dir == Dependence::DVEntry::LT ||
157 Dir == Dependence::DVEntry::LE)
158 Direction = '<';
159 else if (Dir == Dependence::DVEntry::GT ||
160 Dir == Dependence::DVEntry::GE)
161 Direction = '>';
162 else if (Dir == Dependence::DVEntry::EQ)
163 Direction = '=';
164 else
165 Direction = '*';
166 Dep.push_back(Direction);
167 }
168 }
169 while (Dep.size() != Level) {
170 Dep.push_back('I');
171 }
172
173 DepMatrix.push_back(Dep);
174 if (DepMatrix.size() > MaxMemInstrCount) {
175 LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCountdo { } while (false)
176 << " dependencies inside loop\n")do { } while (false);
177 return false;
178 }
179 }
180 }
181 }
182
183 return true;
184}
185
186// A loop is moved from index 'from' to an index 'to'. Update the Dependence
187// matrix by exchanging the two columns.
188static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
189 unsigned ToIndx) {
190 for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I)
191 std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]);
192}
193
194// Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
195// '>'
196static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
197 unsigned Column) {
198 for (unsigned i = 0; i <= Column; ++i) {
199 if (DepMatrix[Row][i] == '<')
200 return false;
201 if (DepMatrix[Row][i] == '>')
202 return true;
203 }
204 // All dependencies were '=','S' or 'I'
205 return false;
206}
207
208// Checks if no dependence exist in the dependency matrix in Row before Column.
209static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
210 unsigned Column) {
211 for (unsigned i = 0; i < Column; ++i) {
212 if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
213 DepMatrix[Row][i] != 'I')
214 return false;
215 }
216 return true;
217}
218
219static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
220 unsigned OuterLoopId, char InnerDep,
221 char OuterDep) {
222 if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
223 return false;
224
225 if (InnerDep == OuterDep)
226 return true;
227
228 // It is legal to interchange if and only if after interchange no row has a
229 // '>' direction as the leftmost non-'='.
230
231 if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
232 return true;
233
234 if (InnerDep == '<')
235 return true;
236
237 if (InnerDep == '>') {
238 // If OuterLoopId represents outermost loop then interchanging will make the
239 // 1st dependency as '>'
240 if (OuterLoopId == 0)
241 return false;
242
243 // If all dependencies before OuterloopId are '=','S'or 'I'. Then
244 // interchanging will result in this row having an outermost non '='
245 // dependency of '>'
246 if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
247 return true;
248 }
249
250 return false;
251}
252
253// Checks if it is legal to interchange 2 loops.
254// [Theorem] A permutation of the loops in a perfect nest is legal if and only
255// if the direction matrix, after the same permutation is applied to its
256// columns, has no ">" direction as the leftmost non-"=" direction in any row.
257static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
258 unsigned InnerLoopId,
259 unsigned OuterLoopId) {
260 unsigned NumRows = DepMatrix.size();
261 // For each row check if it is valid to interchange.
262 for (unsigned Row = 0; Row < NumRows; ++Row) {
263 char InnerDep = DepMatrix[Row][InnerLoopId];
264 char OuterDep = DepMatrix[Row][OuterLoopId];
265 if (InnerDep == '*' || OuterDep == '*')
266 return false;
267 if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
268 return false;
269 }
270 return true;
271}
272
273static LoopVector populateWorklist(Loop &L) {
274 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "do { } while (false)
275 << L.getHeader()->getParent()->getName() << " Loop: %"do { } while (false)
276 << L.getHeader()->getName() << '\n')do { } while (false);
277 LoopVector LoopList;
278 Loop *CurrentLoop = &L;
279 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
280 while (!Vec->empty()) {
281 // The current loop has multiple subloops in it hence it is not tightly
282 // nested.
283 // Discard all loops above it added into Worklist.
284 if (Vec->size() != 1)
285 return {};
286
287 LoopList.push_back(CurrentLoop);
288 CurrentLoop = Vec->front();
289 Vec = &CurrentLoop->getSubLoops();
290 }
291 LoopList.push_back(CurrentLoop);
292 return LoopList;
293}
294
295static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
296 PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
297 if (InnerIndexVar)
298 return InnerIndexVar;
299 if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
300 return nullptr;
301 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
302 PHINode *PhiVar = cast<PHINode>(I);
303 Type *PhiTy = PhiVar->getType();
304 if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
305 !PhiTy->isPointerTy())
306 return nullptr;
307 const SCEVAddRecExpr *AddRec =
308 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
309 if (!AddRec || !AddRec->isAffine())
310 continue;
311 const SCEV *Step = AddRec->getStepRecurrence(*SE);
312 if (!isa<SCEVConstant>(Step))
313 continue;
314 // Found the induction variable.
315 // FIXME: Handle loops with more than one induction variable. Note that,
316 // currently, legality makes sure we have only one induction variable.
317 return PhiVar;
318 }
319 return nullptr;
320}
321
322namespace {
323
324/// LoopInterchangeLegality checks if it is legal to interchange the loop.
325class LoopInterchangeLegality {
326public:
327 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
328 OptimizationRemarkEmitter *ORE)
329 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
330
331 /// Check if the loops can be interchanged.
332 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
333 CharMatrix &DepMatrix);
334
335 /// Check if the loop structure is understood. We do not handle triangular
336 /// loops for now.
337 bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
338
339 bool currentLimitations();
340
341 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
342 return OuterInnerReductions;
343 }
344
345private:
346 bool tightlyNested(Loop *Outer, Loop *Inner);
347 bool containsUnsafeInstructions(BasicBlock *BB);
348
349 /// Discover induction and reduction PHIs in the header of \p L. Induction
350 /// PHIs are added to \p Inductions, reductions are added to
351 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
352 /// to be passed as \p InnerLoop.
353 bool findInductionAndReductions(Loop *L,
354 SmallVector<PHINode *, 8> &Inductions,
355 Loop *InnerLoop);
356
357 Loop *OuterLoop;
358 Loop *InnerLoop;
359
360 ScalarEvolution *SE;
361
362 /// Interface to emit optimization remarks.
363 OptimizationRemarkEmitter *ORE;
364
365 /// Set of reduction PHIs taking part of a reduction across the inner and
366 /// outer loop.
367 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
368};
369
370/// LoopInterchangeProfitability checks if it is profitable to interchange the
371/// loop.
372class LoopInterchangeProfitability {
373public:
374 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
375 OptimizationRemarkEmitter *ORE)
376 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
377
378 /// Check if the loop interchange is profitable.
379 bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
380 CharMatrix &DepMatrix);
381
382private:
383 int getInstrOrderCost();
384
385 Loop *OuterLoop;
386 Loop *InnerLoop;
387
388 /// Scev analysis.
389 ScalarEvolution *SE;
390
391 /// Interface to emit optimization remarks.
392 OptimizationRemarkEmitter *ORE;
393};
394
395/// LoopInterchangeTransform interchanges the loop.
396class LoopInterchangeTransform {
397public:
398 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
399 LoopInfo *LI, DominatorTree *DT,
400 const LoopInterchangeLegality &LIL)
401 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
402
403 /// Interchange OuterLoop and InnerLoop.
404 bool transform();
405 void restructureLoops(Loop *NewInner, Loop *NewOuter,
406 BasicBlock *OrigInnerPreHeader,
407 BasicBlock *OrigOuterPreHeader);
408 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
409
410private:
411 bool adjustLoopLinks();
412 bool adjustLoopBranches();
413
414 Loop *OuterLoop;
415 Loop *InnerLoop;
416
417 /// Scev analysis.
418 ScalarEvolution *SE;
419
420 LoopInfo *LI;
421 DominatorTree *DT;
422
423 const LoopInterchangeLegality &LIL;
424};
425
426struct LoopInterchange {
427 ScalarEvolution *SE = nullptr;
428 LoopInfo *LI = nullptr;
429 DependenceInfo *DI = nullptr;
430 DominatorTree *DT = nullptr;
431
432 /// Interface to emit optimization remarks.
433 OptimizationRemarkEmitter *ORE;
434
435 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
436 DominatorTree *DT, OptimizationRemarkEmitter *ORE)
437 : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
438
439 bool run(Loop *L) {
440 if (L->getParentLoop())
441 return false;
442
443 return processLoopList(populateWorklist(*L));
444 }
445
446 bool run(LoopNest &LN) {
447 const auto &LoopList = LN.getLoops();
448 for (unsigned I = 1; I < LoopList.size(); ++I)
449 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
450 return false;
451 return processLoopList(LoopList);
452 }
453
454 bool isComputableLoopNest(ArrayRef<Loop *> LoopList) {
455 for (Loop *L : LoopList) {
456 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
457 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
458 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n")do { } while (false);
459 return false;
460 }
461 if (L->getNumBackEdges() != 1) {
462 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n")do { } while (false);
463 return false;
464 }
465 if (!L->getExitingBlock()) {
466 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n")do { } while (false);
467 return false;
468 }
469 }
470 return true;
471 }
472
473 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
474 // TODO: Add a better heuristic to select the loop to be interchanged based
475 // on the dependence matrix. Currently we select the innermost loop.
476 return LoopList.size() - 1;
477 }
478
479 bool processLoopList(ArrayRef<Loop *> LoopList) {
480 bool Changed = false;
481 unsigned LoopNestDepth = LoopList.size();
482 if (LoopNestDepth < 2) {
483 LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n")do { } while (false);
484 return false;
485 }
486 if (LoopNestDepth > MaxLoopNestDepth) {
487 LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "do { } while (false)
488 << MaxLoopNestDepth << "\n")do { } while (false);
489 return false;
490 }
491 if (!isComputableLoopNest(LoopList)) {
492 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n")do { } while (false);
493 return false;
494 }
495
496 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepthdo { } while (false)
497 << "\n")do { } while (false);
498
499 CharMatrix DependencyMatrix;
500 Loop *OuterMostLoop = *(LoopList.begin());
501 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
502 OuterMostLoop, DI)) {
503 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n")do { } while (false);
504 return false;
505 }
506#ifdef DUMP_DEP_MATRICIES
507 LLVM_DEBUG(dbgs() << "Dependence before interchange\n")do { } while (false);
508 printDepMatrix(DependencyMatrix);
509#endif
510
511 // Get the Outermost loop exit.
512 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
513 if (!LoopNestExit) {
514 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block")do { } while (false);
515 return false;
516 }
517
518 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
519 // Move the selected loop outwards to the best possible position.
520 Loop *LoopToBeInterchanged = LoopList[SelecLoopId];
521 for (unsigned i = SelecLoopId; i > 0; i--) {
522 bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[i - 1], i,
523 i - 1, DependencyMatrix);
524 if (!Interchanged)
525 return Changed;
526 // Update the DependencyMatrix
527 interChangeDependencies(DependencyMatrix, i, i - 1);
528#ifdef DUMP_DEP_MATRICIES
529 LLVM_DEBUG(dbgs() << "Dependence after interchange\n")do { } while (false);
530 printDepMatrix(DependencyMatrix);
531#endif
532 Changed |= Interchanged;
533 }
534 return Changed;
535 }
536
537 bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
538 unsigned OuterLoopId,
539 std::vector<std::vector<char>> &DependencyMatrix) {
540 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopIddo { } while (false)
541 << " and OuterLoopId = " << OuterLoopId << "\n")do { } while (false);
542 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
543 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
544 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n")do { } while (false);
545 return false;
546 }
547 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n")do { } while (false);
548 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
549 if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
550 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n")do { } while (false);
551 return false;
552 }
553
554 ORE->emit([&]() {
555 return OptimizationRemark(DEBUG_TYPE"loop-interchange", "Interchanged",
556 InnerLoop->getStartLoc(),
557 InnerLoop->getHeader())
558 << "Loop interchanged with enclosing loop.";
559 });
560
561 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
562 LIT.transform();
563 LLVM_DEBUG(dbgs() << "Loops interchanged.\n")do { } while (false);
564 LoopsInterchanged++;
565
566 assert(InnerLoop->isLCSSAForm(*DT) &&((void)0)
567 "Inner loop not left in LCSSA form after loop interchange!")((void)0);
568 assert(OuterLoop->isLCSSAForm(*DT) &&((void)0)
569 "Outer loop not left in LCSSA form after loop interchange!")((void)0);
570
571 return true;
572 }
573};
574
575} // end anonymous namespace
576
577bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
578 return any_of(*BB, [](const Instruction &I) {
579 return I.mayHaveSideEffects() || I.mayReadFromMemory();
580 });
581}
582
583bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
584 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
585 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
586 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
587
588 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n")do { } while (false);
589
590 // A perfectly nested loop will not have any branch in between the outer and
591 // inner block i.e. outer header will branch to either inner preheader and
592 // outerloop latch.
593 BranchInst *OuterLoopHeaderBI =
594 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
595 if (!OuterLoopHeaderBI)
596 return false;
597
598 for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
599 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
600 Succ != OuterLoopLatch)
601 return false;
602
603 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n")do { } while (false);
604 // We do not have any basic block in between now make sure the outer header
605 // and outer loop latch doesn't contain any unsafe instructions.
606 if (containsUnsafeInstructions(OuterLoopHeader) ||
607 containsUnsafeInstructions(OuterLoopLatch))
608 return false;
609
610 // Also make sure the inner loop preheader does not contain any unsafe
611 // instructions. Note that all instructions in the preheader will be moved to
612 // the outer loop header when interchanging.
613 if (InnerLoopPreHeader != OuterLoopHeader &&
614 containsUnsafeInstructions(InnerLoopPreHeader))
615 return false;
616
617 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
618 // Ensure the inner loop exit block flows to the outer loop latch possibly
619 // through empty blocks.
620 const BasicBlock &SuccInner =
621 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
622 if (&SuccInner != OuterLoopLatch) {
623 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExitdo { } while (false)
624 << " does not lead to the outer loop latch.\n";)do { } while (false);
625 return false;
626 }
627 // The inner loop exit block does flow to the outer loop latch and not some
628 // other BBs, now make sure it contains safe instructions, since it will be
629 // moved into the (new) inner loop after interchange.
630 if (containsUnsafeInstructions(InnerLoopExit))
631 return false;
632
633 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n")do { } while (false);
634 // We have a perfect loop nest.
635 return true;
636}
637
638bool LoopInterchangeLegality::isLoopStructureUnderstood(
639 PHINode *InnerInduction) {
640 unsigned Num = InnerInduction->getNumOperands();
641 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
642 for (unsigned i = 0; i < Num; ++i) {
643 Value *Val = InnerInduction->getOperand(i);
644 if (isa<Constant>(Val))
645 continue;
646 Instruction *I = dyn_cast<Instruction>(Val);
647 if (!I)
648 return false;
649 // TODO: Handle triangular loops.
650 // e.g. for(int i=0;i<N;i++)
651 // for(int j=i;j<N;j++)
652 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
653 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
654 InnerLoopPreheader &&
655 !OuterLoop->isLoopInvariant(I)) {
656 return false;
657 }
658 }
659
660 // TODO: Handle triangular loops of another form.
661 // e.g. for(int i=0;i<N;i++)
662 // for(int j=0;j<i;j++)
663 // or,
664 // for(int i=0;i<N;i++)
665 // for(int j=0;j*i<N;j++)
666 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
667 BranchInst *InnerLoopLatchBI =
668 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
669 if (!InnerLoopLatchBI->isConditional())
670 return false;
671 if (CmpInst *InnerLoopCmp =
672 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
673 Value *Op0 = InnerLoopCmp->getOperand(0);
674 Value *Op1 = InnerLoopCmp->getOperand(1);
675
676 // LHS and RHS of the inner loop exit condition, e.g.,
677 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
678 Value *Left = nullptr;
679 Value *Right = nullptr;
680
681 // Check if V only involves inner loop induction variable.
682 // Return true if V is InnerInduction, or a cast from
683 // InnerInduction, or a binary operator that involves
684 // InnerInduction and a constant.
685 std::function<bool(Value *)> IsPathToIndVar;
686 IsPathToIndVar = [&InnerInduction, &IsPathToIndVar](Value *V) -> bool {
687 if (V == InnerInduction)
688 return true;
689 if (isa<Constant>(V))
690 return true;
691 Instruction *I = dyn_cast<Instruction>(V);
692 if (!I)
693 return false;
694 if (isa<CastInst>(I))
695 return IsPathToIndVar(I->getOperand(0));
696 if (isa<BinaryOperator>(I))
697 return IsPathToIndVar(I->getOperand(0)) &&
698 IsPathToIndVar(I->getOperand(1));
699 return false;
700 };
701
702 if (IsPathToIndVar(Op0) && !isa<Constant>(Op0)) {
703 Left = Op0;
704 Right = Op1;
705 } else if (IsPathToIndVar(Op1) && !isa<Constant>(Op1)) {
706 Left = Op1;
707 Right = Op0;
708 }
709
710 if (Left == nullptr)
711 return false;
712
713 const SCEV *S = SE->getSCEV(Right);
714 if (!SE->isLoopInvariant(S, OuterLoop))
715 return false;
716 }
717
718 return true;
719}
720
721// If SV is a LCSSA PHI node with a single incoming value, return the incoming
722// value.
723static Value *followLCSSA(Value *SV) {
724 PHINode *PHI = dyn_cast<PHINode>(SV);
725 if (!PHI)
726 return SV;
727
728 if (PHI->getNumIncomingValues() != 1)
729 return SV;
730 return followLCSSA(PHI->getIncomingValue(0));
731}
732
733// Check V's users to see if it is involved in a reduction in L.
734static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
735 // Reduction variables cannot be constants.
736 if (isa<Constant>(V))
737 return nullptr;
738
739 for (Value *User : V->users()) {
740 if (PHINode *PHI = dyn_cast<PHINode>(User)) {
741 if (PHI->getNumIncomingValues() == 1)
742 continue;
743 RecurrenceDescriptor RD;
744 if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
745 return PHI;
746 return nullptr;
747 }
748 }
749
750 return nullptr;
751}
752
753bool LoopInterchangeLegality::findInductionAndReductions(
754 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
755 if (!L->getLoopLatch() || !L->getLoopPredecessor())
756 return false;
757 for (PHINode &PHI : L->getHeader()->phis()) {
758 RecurrenceDescriptor RD;
759 InductionDescriptor ID;
760 if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
761 Inductions.push_back(&PHI);
762 else {
763 // PHIs in inner loops need to be part of a reduction in the outer loop,
764 // discovered when checking the PHIs of the outer loop earlier.
765 if (!InnerLoop) {
766 if (!OuterInnerReductions.count(&PHI)) {
767 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "do { } while (false)
768 "across the outer loop.\n")do { } while (false);
769 return false;
770 }
771 } else {
772 assert(PHI.getNumIncomingValues() == 2 &&((void)0)
773 "Phis in loop header should have exactly 2 incoming values")((void)0);
774 // Check if we have a PHI node in the outer loop that has a reduction
775 // result from the inner loop as an incoming value.
776 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
777 PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
778 if (!InnerRedPhi ||
779 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
780 LLVM_DEBUG(do { } while (false)
781 dbgs()do { } while (false)
782 << "Failed to recognize PHI as an induction or reduction.\n")do { } while (false);
783 return false;
784 }
785 OuterInnerReductions.insert(&PHI);
786 OuterInnerReductions.insert(InnerRedPhi);
787 }
788 }
789 }
790 return true;
791}
792
793// This function indicates the current limitations in the transform as a result
794// of which we do not proceed.
795bool LoopInterchangeLegality::currentLimitations() {
796 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
797 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
798
799 // transform currently expects the loop latches to also be the exiting
800 // blocks.
801 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
802 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
803 !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
804 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
805 LLVM_DEBUG(do { } while (false)
806 dbgs() << "Loops where the latch is not the exiting block are not"do { } while (false)
807 << " supported currently.\n")do { } while (false);
808 ORE->emit([&]() {
809 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "ExitingNotLatch",
810 OuterLoop->getStartLoc(),
811 OuterLoop->getHeader())
812 << "Loops where the latch is not the exiting block cannot be"
813 " interchange currently.";
814 });
815 return true;
816 }
817
818 PHINode *InnerInductionVar;
819 SmallVector<PHINode *, 8> Inductions;
820 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
821 LLVM_DEBUG(do { } while (false)
822 dbgs() << "Only outer loops with induction or reduction PHI nodes "do { } while (false)
823 << "are supported currently.\n")do { } while (false);
824 ORE->emit([&]() {
825 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedPHIOuter",
826 OuterLoop->getStartLoc(),
827 OuterLoop->getHeader())
828 << "Only outer loops with induction or reduction PHI nodes can be"
829 " interchanged currently.";
830 });
831 return true;
832 }
833
834 // TODO: Currently we handle only loops with 1 induction variable.
835 if (Inductions.size() != 1) {
836 LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "do { } while (false)
837 << "supported currently.\n")do { } while (false);
838 ORE->emit([&]() {
839 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "MultiIndutionOuter",
840 OuterLoop->getStartLoc(),
841 OuterLoop->getHeader())
842 << "Only outer loops with 1 induction variable can be "
843 "interchanged currently.";
844 });
845 return true;
846 }
847
848 Inductions.clear();
849 if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) {
850 LLVM_DEBUG(do { } while (false)
851 dbgs() << "Only inner loops with induction or reduction PHI nodes "do { } while (false)
852 << "are supported currently.\n")do { } while (false);
853 ORE->emit([&]() {
854 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedPHIInner",
855 InnerLoop->getStartLoc(),
856 InnerLoop->getHeader())
857 << "Only inner loops with induction or reduction PHI nodes can be"
858 " interchange currently.";
859 });
860 return true;
861 }
862
863 // TODO: Currently we handle only loops with 1 induction variable.
864 if (Inductions.size() != 1) {
865 LLVM_DEBUG(do { } while (false)
866 dbgs() << "We currently only support loops with 1 induction variable."do { } while (false)
867 << "Failed to interchange due to current limitation\n")do { } while (false);
868 ORE->emit([&]() {
869 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "MultiInductionInner",
870 InnerLoop->getStartLoc(),
871 InnerLoop->getHeader())
872 << "Only inner loops with 1 induction variable can be "
873 "interchanged currently.";
874 });
875 return true;
876 }
877 InnerInductionVar = Inductions.pop_back_val();
878
879 // TODO: Triangular loops are not handled for now.
880 if (!isLoopStructureUnderstood(InnerInductionVar)) {
881 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n")do { } while (false);
882 ORE->emit([&]() {
883 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedStructureInner",
884 InnerLoop->getStartLoc(),
885 InnerLoop->getHeader())
886 << "Inner loop structure not understood currently.";
887 });
888 return true;
889 }
890
891 // TODO: Current limitation: Since we split the inner loop latch at the point
892 // were induction variable is incremented (induction.next); We cannot have
893 // more than 1 user of induction.next since it would result in broken code
894 // after split.
895 // e.g.
896 // for(i=0;i<N;i++) {
897 // for(j = 0;j<M;j++) {
898 // A[j+1][i+2] = A[j][i]+k;
899 // }
900 // }
901 Instruction *InnerIndexVarInc = nullptr;
902 if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
903 InnerIndexVarInc =
904 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
905 else
906 InnerIndexVarInc =
907 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
908
909 if (!InnerIndexVarInc) {
910 LLVM_DEBUG(do { } while (false)
911 dbgs() << "Did not find an instruction to increment the induction "do { } while (false)
912 << "variable.\n")do { } while (false);
913 ORE->emit([&]() {
914 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "NoIncrementInInner",
915 InnerLoop->getStartLoc(),
916 InnerLoop->getHeader())
917 << "The inner loop does not increment the induction variable.";
918 });
919 return true;
920 }
921
922 // Since we split the inner loop latch on this induction variable. Make sure
923 // we do not have any instruction between the induction variable and branch
924 // instruction.
925
926 bool FoundInduction = false;
927 for (const Instruction &I :
928 llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
929 if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
930 isa<ZExtInst>(I))
931 continue;
932
933 // We found an instruction. If this is not induction variable then it is not
934 // safe to split this loop latch.
935 if (!I.isIdenticalTo(InnerIndexVarInc)) {
936 LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "do { } while (false)
937 << "variable increment and branch.\n")do { } while (false);
938 ORE->emit([&]() {
939 return OptimizationRemarkMissed(
940 DEBUG_TYPE"loop-interchange", "UnsupportedInsBetweenInduction",
941 InnerLoop->getStartLoc(), InnerLoop->getHeader())
942 << "Found unsupported instruction between induction variable "
943 "increment and branch.";
944 });
945 return true;
946 }
947
948 FoundInduction = true;
949 break;
950 }
951 // The loop latch ended and we didn't find the induction variable return as
952 // current limitation.
953 if (!FoundInduction) {
954 LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n")do { } while (false);
955 ORE->emit([&]() {
956 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "NoIndutionVariable",
957 InnerLoop->getStartLoc(),
958 InnerLoop->getHeader())
959 << "Did not find the induction variable.";
960 });
961 return true;
962 }
963 return false;
964}
965
966// We currently only support LCSSA PHI nodes in the inner loop exit, if their
967// users are either reduction PHIs or PHIs outside the outer loop (which means
968// the we are only interested in the final value after the loop).
969static bool
970areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
971 SmallPtrSetImpl<PHINode *> &Reductions) {
972 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
973 for (PHINode &PHI : InnerExit->phis()) {
974 // Reduction lcssa phi will have only 1 incoming block that from loop latch.
975 if (PHI.getNumIncomingValues() > 1)
976 return false;
977 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
978 PHINode *PN = dyn_cast<PHINode>(U);
979 return !PN ||
980 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
981 })) {
982 return false;
983 }
984 }
985 return true;
986}
987
988// We currently support LCSSA PHI nodes in the outer loop exit, if their
989// incoming values do not come from the outer loop latch or if the
990// outer loop latch has a single predecessor. In that case, the value will
991// be available if both the inner and outer loop conditions are true, which
992// will still be true after interchanging. If we have multiple predecessor,
993// that may not be the case, e.g. because the outer loop latch may be executed
994// if the inner loop is not executed.
995static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
996 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
997 for (PHINode &PHI : LoopNestExit->phis()) {
998 // FIXME: We currently are not able to detect floating point reductions
999 // and have to use floating point PHIs as a proxy to prevent
1000 // interchanging in the presence of floating point reductions.
1001 if (PHI.getType()->isFloatingPointTy())
1002 return false;
1003 for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
1004 Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
1005 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1006 continue;
1007
1008 // The incoming value is defined in the outer loop latch. Currently we
1009 // only support that in case the outer loop latch has a single predecessor.
1010 // This guarantees that the outer loop latch is executed if and only if
1011 // the inner loop is executed (because tightlyNested() guarantees that the
1012 // outer loop header only branches to the inner loop or the outer loop
1013 // latch).
1014 // FIXME: We could weaken this logic and allow multiple predecessors,
1015 // if the values are produced outside the loop latch. We would need
1016 // additional logic to update the PHI nodes in the exit block as
1017 // well.
1018 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1019 return false;
1020 }
1021 }
1022 return true;
1023}
1024
1025// In case of multi-level nested loops, it may occur that lcssa phis exist in
1026// the latch of InnerLoop, i.e., when defs of the incoming values are further
1027// inside the loopnest. Sometimes those incoming values are not available
1028// after interchange, since the original inner latch will become the new outer
1029// latch which may have predecessor paths that do not include those incoming
1030// values.
1031// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1032// multi-level loop nests.
1033static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1034 if (InnerLoop->getSubLoops().empty())
1035 return true;
1036 // If the original outer latch has only one predecessor, then values defined
1037 // further inside the looploop, e.g., in the innermost loop, will be available
1038 // at the new outer latch after interchange.
1039 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1040 return true;
1041
1042 // The outer latch has more than one predecessors, i.e., the inner
1043 // exit and the inner header.
1044 // PHI nodes in the inner latch are lcssa phis where the incoming values
1045 // are defined further inside the loopnest. Check if those phis are used
1046 // in the original inner latch. If that is the case then bail out since
1047 // those incoming values may not be available at the new outer latch.
1048 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1049 for (PHINode &PHI : InnerLoopLatch->phis()) {
1050 for (auto *U : PHI.users()) {
1051 Instruction *UI = cast<Instruction>(U);
1052 if (InnerLoopLatch == UI->getParent())
1053 return false;
1054 }
1055 }
1056 return true;
1057}
1058
1059bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1060 unsigned OuterLoopId,
1061 CharMatrix &DepMatrix) {
1062 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1063 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopIddo { } while (false)
1064 << " and OuterLoopId = " << OuterLoopIddo { } while (false)
1065 << " due to dependence\n")do { } while (false);
1066 ORE->emit([&]() {
1067 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "Dependence",
1068 InnerLoop->getStartLoc(),
1069 InnerLoop->getHeader())
1070 << "Cannot interchange loops due to dependences.";
1071 });
1072 return false;
1073 }
1074 // Check if outer and inner loop contain legal instructions only.
1075 for (auto *BB : OuterLoop->blocks())
1076 for (Instruction &I : BB->instructionsWithoutDebug())
1077 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
1078 // readnone functions do not prevent interchanging.
1079 if (CI->doesNotReadMemory())
1080 continue;
1081 LLVM_DEBUG(do { } while (false)
1082 dbgs() << "Loops with call instructions cannot be interchanged "do { } while (false)
1083 << "safely.")do { } while (false);
1084 ORE->emit([&]() {
1085 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "CallInst",
1086 CI->getDebugLoc(),
1087 CI->getParent())
1088 << "Cannot interchange loops due to call instruction.";
1089 });
1090
1091 return false;
1092 }
1093
1094 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1095 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n")do { } while (false);
1096 ORE->emit([&]() {
1097 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedInnerLatchPHI",
1098 InnerLoop->getStartLoc(),
1099 InnerLoop->getHeader())
1100 << "Cannot interchange loops because unsupported PHI nodes found "
1101 "in inner loop latch.";
1102 });
1103 return false;
1104 }
1105
1106 // TODO: The loops could not be interchanged due to current limitations in the
1107 // transform module.
1108 if (currentLimitations()) {
1109 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n")do { } while (false);
1110 return false;
1111 }
1112
1113 // Check if the loops are tightly nested.
1114 if (!tightlyNested(OuterLoop, InnerLoop)) {
1115 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n")do { } while (false);
1116 ORE->emit([&]() {
1117 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "NotTightlyNested",
1118 InnerLoop->getStartLoc(),
1119 InnerLoop->getHeader())
1120 << "Cannot interchange loops because they are not tightly "
1121 "nested.";
1122 });
1123 return false;
1124 }
1125
1126 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
1127 OuterInnerReductions)) {
1128 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n")do { } while (false);
1129 ORE->emit([&]() {
1130 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedExitPHI",
1131 InnerLoop->getStartLoc(),
1132 InnerLoop->getHeader())
1133 << "Found unsupported PHI node in loop exit.";
1134 });
1135 return false;
1136 }
1137
1138 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1139 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n")do { } while (false);
1140 ORE->emit([&]() {
1141 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "UnsupportedExitPHI",
1142 OuterLoop->getStartLoc(),
1143 OuterLoop->getHeader())
1144 << "Found unsupported PHI node in loop exit.";
1145 });
1146 return false;
1147 }
1148
1149 return true;
1150}
1151
1152int LoopInterchangeProfitability::getInstrOrderCost() {
1153 unsigned GoodOrder, BadOrder;
1154 BadOrder = GoodOrder = 0;
1155 for (BasicBlock *BB : InnerLoop->blocks()) {
1156 for (Instruction &Ins : *BB) {
1157 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1158 unsigned NumOp = GEP->getNumOperands();
1159 bool FoundInnerInduction = false;
1160 bool FoundOuterInduction = false;
1161 for (unsigned i = 0; i < NumOp; ++i) {
1162 // Skip operands that are not SCEV-able.
1163 if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
1164 continue;
1165
1166 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1167 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1168 if (!AR)
1169 continue;
1170
1171 // If we find the inner induction after an outer induction e.g.
1172 // for(int i=0;i<N;i++)
1173 // for(int j=0;j<N;j++)
1174 // A[i][j] = A[i-1][j-1]+k;
1175 // then it is a good order.
1176 if (AR->getLoop() == InnerLoop) {
1177 // We found an InnerLoop induction after OuterLoop induction. It is
1178 // a good order.
1179 FoundInnerInduction = true;
1180 if (FoundOuterInduction) {
1181 GoodOrder++;
1182 break;
1183 }
1184 }
1185 // If we find the outer induction after an inner induction e.g.
1186 // for(int i=0;i<N;i++)
1187 // for(int j=0;j<N;j++)
1188 // A[j][i] = A[j-1][i-1]+k;
1189 // then it is a bad order.
1190 if (AR->getLoop() == OuterLoop) {
1191 // We found an OuterLoop induction after InnerLoop induction. It is
1192 // a bad order.
1193 FoundOuterInduction = true;
1194 if (FoundInnerInduction) {
1195 BadOrder++;
1196 break;
1197 }
1198 }
1199 }
1200 }
1201 }
1202 }
1203 return GoodOrder - BadOrder;
1204}
1205
1206static bool isProfitableForVectorization(unsigned InnerLoopId,
1207 unsigned OuterLoopId,
1208 CharMatrix &DepMatrix) {
1209 // TODO: Improve this heuristic to catch more cases.
1210 // If the inner loop is loop independent or doesn't carry any dependency it is
1211 // profitable to move this to outer position.
1212 for (auto &Row : DepMatrix) {
1213 if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
1214 return false;
1215 // TODO: We need to improve this heuristic.
1216 if (Row[OuterLoopId] != '=')
1217 return false;
1218 }
1219 // If outer loop has dependence and inner loop is loop independent then it is
1220 // profitable to interchange to enable parallelism.
1221 // If there are no dependences, interchanging will not improve anything.
1222 return !DepMatrix.empty();
1223}
1224
1225bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
1226 unsigned OuterLoopId,
1227 CharMatrix &DepMatrix) {
1228 // TODO: Add better profitability checks.
1229 // e.g
1230 // 1) Construct dependency matrix and move the one with no loop carried dep
1231 // inside to enable vectorization.
1232
1233 // This is rough cost estimation algorithm. It counts the good and bad order
1234 // of induction variables in the instruction and allows reordering if number
1235 // of bad orders is more than good.
1236 int Cost = getInstrOrderCost();
1237 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n")do { } while (false);
1238 if (Cost < -LoopInterchangeCostThreshold)
1239 return true;
1240
1241 // It is not profitable as per current cache profitability model. But check if
1242 // we can move this loop outside to improve parallelism.
1243 if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
1244 return true;
1245
1246 ORE->emit([&]() {
1247 return OptimizationRemarkMissed(DEBUG_TYPE"loop-interchange", "InterchangeNotProfitable",
1248 InnerLoop->getStartLoc(),
1249 InnerLoop->getHeader())
1250 << "Interchanging loops is too costly (cost="
1251 << ore::NV("Cost", Cost) << ", threshold="
1252 << ore::NV("Threshold", LoopInterchangeCostThreshold)
1253 << ") and it does not improve parallelism.";
1254 });
1255 return false;
1256}
1257
1258void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1259 Loop *InnerLoop) {
1260 for (Loop *L : *OuterLoop)
1261 if (L == InnerLoop) {
1262 OuterLoop->removeChildLoop(L);
1263 return;
1264 }
1265 llvm_unreachable("Couldn't find loop")__builtin_unreachable();
1266}
1267
1268/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1269/// new inner and outer loop after interchanging: NewInner is the original
1270/// outer loop and NewOuter is the original inner loop.
1271///
1272/// Before interchanging, we have the following structure
1273/// Outer preheader
1274// Outer header
1275// Inner preheader
1276// Inner header
1277// Inner body
1278// Inner latch
1279// outer bbs
1280// Outer latch
1281//
1282// After interchanging:
1283// Inner preheader
1284// Inner header
1285// Outer preheader
1286// Outer header
1287// Inner body
1288// outer bbs
1289// Outer latch
1290// Inner latch
1291void LoopInterchangeTransform::restructureLoops(
1292 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1293 BasicBlock *OrigOuterPreHeader) {
1294 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1295 // The original inner loop preheader moves from the new inner loop to
1296 // the parent loop, if there is one.
1297 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1298 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1299
1300 // Switch the loop levels.
1301 if (OuterLoopParent) {
1302 // Remove the loop from its parent loop.
1303 removeChildLoop(OuterLoopParent, NewInner);
1304 removeChildLoop(NewInner, NewOuter);
1305 OuterLoopParent->addChildLoop(NewOuter);
1306 } else {
1307 removeChildLoop(NewInner, NewOuter);
1308 LI->changeTopLevelLoop(NewInner, NewOuter);
1309 }
1310 while (!NewOuter->isInnermost())
1311 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1312 NewOuter->addChildLoop(NewInner);
1313
1314 // BBs from the original inner loop.
1315 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1316
1317 // Add BBs from the original outer loop to the original inner loop (excluding
1318 // BBs already in inner loop)
1319 for (BasicBlock *BB : NewInner->blocks())
1320 if (LI->getLoopFor(BB) == NewInner)
1321 NewOuter->addBlockEntry(BB);
1322
1323 // Now remove inner loop header and latch from the new inner loop and move
1324 // other BBs (the loop body) to the new inner loop.
1325 BasicBlock *OuterHeader = NewOuter->getHeader();
1326 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1327 for (BasicBlock *BB : OrigInnerBBs) {
1328 // Nothing will change for BBs in child loops.
1329 if (LI->getLoopFor(BB) != NewOuter)
1330 continue;
1331 // Remove the new outer loop header and latch from the new inner loop.
1332 if (BB == OuterHeader || BB == OuterLatch)
1333 NewInner->removeBlockFromLoop(BB);
1334 else
1335 LI->changeLoopFor(BB, NewInner);
1336 }
1337
1338 // The preheader of the original outer loop becomes part of the new
1339 // outer loop.
1340 NewOuter->addBlockEntry(OrigOuterPreHeader);
1341 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1342
1343 // Tell SE that we move the loops around.
1344 SE->forgetLoop(NewOuter);
1345 SE->forgetLoop(NewInner);
1346}
1347
1348bool LoopInterchangeTransform::transform() {
1349 bool Transformed = false;
1350 Instruction *InnerIndexVar;
1351
1352 if (InnerLoop->getSubLoops().empty()) {
1353 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1354 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n")do { } while (false);
1355 PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
1356 if (!InductionPHI) {
1357 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n")do { } while (false);
1358 return false;
1359 }
1360
1361 if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1362 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
1363 else
1364 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
1365
1366 // Ensure that InductionPHI is the first Phi node.
1367 if (&InductionPHI->getParent()->front() != InductionPHI)
1368 InductionPHI->moveBefore(&InductionPHI->getParent()->front());
1369
1370 // Create a new latch block for the inner loop. We split at the
1371 // current latch's terminator and then move the condition and all
1372 // operands that are not either loop-invariant or the induction PHI into the
1373 // new latch block.
1374 BasicBlock *NewLatch =
1375 SplitBlock(InnerLoop->getLoopLatch(),
1376 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1377
1378 SmallSetVector<Instruction *, 4> WorkList;
1379 unsigned i = 0;
1380 auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
1381 for (; i < WorkList.size(); i++) {
1382 // Duplicate instruction and move it the new latch. Update uses that
1383 // have been moved.
1384 Instruction *NewI = WorkList[i]->clone();
1385 NewI->insertBefore(NewLatch->getFirstNonPHI());
1386 assert(!NewI->mayHaveSideEffects() &&((void)0)
1387 "Moving instructions with side-effects may change behavior of "((void)0)
1388 "the loop nest!")((void)0);
1389 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1390 Instruction *UserI = cast<Instruction>(U.getUser());
1391 if (!InnerLoop->contains(UserI->getParent()) ||
1392 UserI->getParent() == NewLatch || UserI == InductionPHI)
1393 U.set(NewI);
1394 }
1395 // Add operands of moved instruction to the worklist, except if they are
1396 // outside the inner loop or are the induction PHI.
1397 for (Value *Op : WorkList[i]->operands()) {
1398 Instruction *OpI = dyn_cast<Instruction>(Op);
1399 if (!OpI ||
1400 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1401 OpI == InductionPHI)
1402 continue;
1403 WorkList.insert(OpI);
1404 }
1405 }
1406 };
1407
1408 // FIXME: Should we interchange when we have a constant condition?
1409 Instruction *CondI = dyn_cast<Instruction>(
1410 cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
1411 ->getCondition());
1412 if (CondI)
1413 WorkList.insert(CondI);
1414 MoveInstructions();
1415 WorkList.insert(cast<Instruction>(InnerIndexVar));
1416 MoveInstructions();
1417
1418 // Splits the inner loops phi nodes out into a separate basic block.
1419 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1420 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1421 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n")do { } while (false);
1422 }
1423
1424 // Instructions in the original inner loop preheader may depend on values
1425 // defined in the outer loop header. Move them there, because the original
1426 // inner loop preheader will become the entry into the interchanged loop nest.
1427 // Currently we move all instructions and rely on LICM to move invariant
1428 // instructions outside the loop nest.
1429 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1430 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1431 if (InnerLoopPreHeader != OuterLoopHeader) {
1432 SmallPtrSet<Instruction *, 4> NeedsMoving;
1433 for (Instruction &I :
1434 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
1435 std::prev(InnerLoopPreHeader->end()))))
1436 I.moveBefore(OuterLoopHeader->getTerminator());
1437 }
1438
1439 Transformed |= adjustLoopLinks();
1440 if (!Transformed) {
1441 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n")do { } while (false);
1442 return false;
1443 }
1444
1445 return true;
1446}
1447
1448/// \brief Move all instructions except the terminator from FromBB right before
1449/// InsertBefore
1450static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1451 auto &ToList = InsertBefore->getParent()->getInstList();
1452 auto &FromList = FromBB->getInstList();
1453
1454 ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
1455 FromBB->getTerminator()->getIterator());
1456}
1457
1458/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1459static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1460 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1461 // from BB1 afterwards.
1462 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
1463 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
1464 for (Instruction *I : TempInstrs)
1465 I->removeFromParent();
1466
1467 // Move instructions from BB2 to BB1.
1468 moveBBContents(BB2, BB1->getTerminator());
1469
1470 // Move instructions from TempInstrs to BB2.
1471 for (Instruction *I : TempInstrs)
1472 I->insertBefore(BB2->getTerminator());
1473}
1474
1475// Update BI to jump to NewBB instead of OldBB. Records updates to the
1476// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1477// \p OldBB is exactly once in BI's successor list.
1478static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1479 BasicBlock *NewBB,
1480 std::vector<DominatorTree::UpdateType> &DTUpdates,
1481 bool MustUpdateOnce = true) {
1482 assert((!MustUpdateOnce ||((void)0)
1483 llvm::count_if(successors(BI),((void)0)
1484 [OldBB](BasicBlock *BB) {((void)0)
1485 return BB == OldBB;((void)0)
1486 }) == 1) && "BI must jump to OldBB exactly once.")((void)0);
1487 bool Changed = false;
1488 for (Use &Op : BI->operands())
1489 if (Op == OldBB) {
1490 Op.set(NewBB);
1491 Changed = true;
1492 }
1493
1494 if (Changed) {
1495 DTUpdates.push_back(
1496 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1497 DTUpdates.push_back(
1498 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1499 }
1500 assert(Changed && "Expected a successor to be updated")((void)0);
1501}
1502
1503// Move Lcssa PHIs to the right place.
1504static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1505 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1506 BasicBlock *OuterLatch, BasicBlock *OuterExit,
1507 Loop *InnerLoop, LoopInfo *LI) {
1508
1509 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1510 // defined either in the header or latch. Those blocks will become header and
1511 // latch of the new outer loop, and the only possible users can PHI nodes
1512 // in the exit block of the loop nest or the outer loop header (reduction
1513 // PHIs, in that case, the incoming value must be defined in the inner loop
1514 // header). We can just substitute the user with the incoming value and remove
1515 // the PHI.
1516 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1517 assert(P.getNumIncomingValues() == 1 &&((void)0)
1518 "Only loops with a single exit are supported!")((void)0);
1519
1520 // Incoming values are guaranteed be instructions currently.
1521 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1522 // Skip phis with incoming values from the inner loop body, excluding the
1523 // header and latch.
1524 if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader)
1525 continue;
1526
1527 assert(all_of(P.users(),((void)0)
1528 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {((void)0)
1529 return (cast<PHINode>(U)->getParent() == OuterHeader &&((void)0)
1530 IncI->getParent() == InnerHeader) ||((void)0)
1531 cast<PHINode>(U)->getParent() == OuterExit;((void)0)
1532 }) &&((void)0)
1533 "Can only replace phis iff the uses are in the loop nest exit or "((void)0)
1534 "the incoming value is defined in the inner header (it will "((void)0)
1535 "dominate all loop blocks after interchanging)")((void)0);
1536 P.replaceAllUsesWith(IncI);
1537 P.eraseFromParent();
1538 }
1539
1540 SmallVector<PHINode *, 8> LcssaInnerExit;
1541 for (PHINode &P : InnerExit->phis())
1542 LcssaInnerExit.push_back(&P);
1543
1544 SmallVector<PHINode *, 8> LcssaInnerLatch;
1545 for (PHINode &P : InnerLatch->phis())
1546 LcssaInnerLatch.push_back(&P);
1547
1548 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1549 // If a PHI node has users outside of InnerExit, it has a use outside the
1550 // interchanged loop and we have to preserve it. We move these to
1551 // InnerLatch, which will become the new exit block for the innermost
1552 // loop after interchanging.
1553 for (PHINode *P : LcssaInnerExit)
1
Assuming '__begin1' is equal to '__end1'
1554 P->moveBefore(InnerLatch->getFirstNonPHI());
1555
1556 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1557 // and we have to move them to the new inner latch.
1558 for (PHINode *P : LcssaInnerLatch)
2
Assuming '__begin1' is equal to '__end1'
1559 P->moveBefore(InnerExit->getFirstNonPHI());
1560
1561 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1562 // incoming values defined in the outer loop, we have to add a new PHI
1563 // in the inner loop latch, which became the exit block of the outer loop,
1564 // after interchanging.
1565 if (OuterExit) {
3
Assuming 'OuterExit' is non-null
4
Taking true branch
1566 for (PHINode &P : OuterExit->phis()) {
1567 if (P.getNumIncomingValues() != 1)
5
Assuming the condition is false
6
Taking false branch
1568 continue;
1569 // Skip Phis with incoming values defined in the inner loop. Those should
1570 // already have been updated.
1571 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1572 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
7
Assuming 'I' is non-null
8
Assuming the condition is false
9
Taking false branch
1573 continue;
1574
1575 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
10
Assuming the object is not a 'PHINode'
11
'NewPhi' initialized to a null pointer value
1576 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
12
Called C++ object pointer is null
1577 NewPhi->setIncomingBlock(0, OuterLatch);
1578 // We might have incoming edges from other BBs, i.e., the original outer
1579 // header.
1580 for (auto *Pred : predecessors(InnerLatch)) {
1581 if (Pred == OuterLatch)
1582 continue;
1583 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
1584 }
1585 NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
1586 P.setIncomingValue(0, NewPhi);
1587 }
1588 }
1589
1590 // Now adjust the incoming blocks for the LCSSA PHIs.
1591 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1592 // with the new latch.
1593 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1594}
1595
1596bool LoopInterchangeTransform::adjustLoopBranches() {
1597 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n")do { } while (false);
1598 std::vector<DominatorTree::UpdateType> DTUpdates;
1599
1600 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1601 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1602
1603 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&((void)0)
1604 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&((void)0)
1605 InnerLoopPreHeader && "Guaranteed by loop-simplify form")((void)0);
1606 // Ensure that both preheaders do not contain PHI nodes and have single
1607 // predecessors. This allows us to move them easily. We use
1608 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1609 // preheaders do not satisfy those conditions.
1610 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1611 !OuterLoopPreHeader->getUniquePredecessor())
1612 OuterLoopPreHeader =
1613 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1614 if (InnerLoopPreHeader == OuterLoop->getHeader())
1615 InnerLoopPreHeader =
1616 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1617
1618 // Adjust the loop preheader
1619 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1620 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1621 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1622 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1623 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1624 BasicBlock *InnerLoopLatchPredecessor =
1625 InnerLoopLatch->getUniquePredecessor();
1626 BasicBlock *InnerLoopLatchSuccessor;
1627 BasicBlock *OuterLoopLatchSuccessor;
1628
1629 BranchInst *OuterLoopLatchBI =
1630 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1631 BranchInst *InnerLoopLatchBI =
1632 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1633 BranchInst *OuterLoopHeaderBI =
1634 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1635 BranchInst *InnerLoopHeaderBI =
1636 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1637
1638 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1639 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1640 !InnerLoopHeaderBI)
1641 return false;
1642
1643 BranchInst *InnerLoopLatchPredecessorBI =
1644 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1645 BranchInst *OuterLoopPredecessorBI =
1646 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1647
1648 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1649 return false;
1650 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1651 if (!InnerLoopHeaderSuccessor)
1652 return false;
1653
1654 // Adjust Loop Preheader and headers.
1655 // The branches in the outer loop predecessor and the outer loop header can
1656 // be unconditional branches or conditional branches with duplicates. Consider
1657 // this when updating the successors.
1658 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
1659 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
1660 // The outer loop header might or might not branch to the outer latch.
1661 // We are guaranteed to branch to the inner loop preheader.
1662 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
1663 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
1664 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
1665 DTUpdates,
1666 /*MustUpdateOnce=*/false);
1667 }
1668 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
1669 InnerLoopHeaderSuccessor, DTUpdates,
1670 /*MustUpdateOnce=*/false);
1671
1672 // Adjust reduction PHI's now that the incoming block has changed.
1673 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
1674 OuterLoopHeader);
1675
1676 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
1677 OuterLoopPreHeader, DTUpdates);
1678
1679 // -------------Adjust loop latches-----------
1680 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1681 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1682 else
1683 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1684
1685 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
1686 InnerLoopLatchSuccessor, DTUpdates);
1687
1688
1689 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1690 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1691 else
1692 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1693
1694 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
1695 OuterLoopLatchSuccessor, DTUpdates);
1696 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1697 DTUpdates);
1698
1699 DT->applyUpdates(DTUpdates);
1700 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1701 OuterLoopPreHeader);
1702
1703 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1704 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
1705 InnerLoop, LI);
1706 // For PHIs in the exit block of the outer loop, outer's latch has been
1707 // replaced by Inners'.
1708 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1709
1710 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1711 // Now update the reduction PHIs in the inner and outer loop headers.
1712 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1713 for (PHINode &PHI : InnerLoopHeader->phis()) {
1714 if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end())
1715 continue;
1716 InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
1717 }
1718 for (PHINode &PHI : OuterLoopHeader->phis()) {
1719 if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end())
1720 continue;
1721 OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
1722 }
1723
1724 // Now move the remaining reduction PHIs from outer to inner loop header and
1725 // vice versa. The PHI nodes must be part of a reduction across the inner and
1726 // outer loop and all the remains to do is and updating the incoming blocks.
1727 for (PHINode *PHI : OuterLoopPHIs) {
1728 PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
1729 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node")((void)0);
1730 }
1731 for (PHINode *PHI : InnerLoopPHIs) {
1732 PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
1733 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node")((void)0);
1734 }
1735
1736 // Update the incoming blocks for moved PHI nodes.
1737 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
1738 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
1739 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
1740 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1741
1742 // Values defined in the outer loop header could be used in the inner loop
1743 // latch. In that case, we need to create LCSSA phis for them, because after
1744 // interchanging they will be defined in the new inner loop and used in the
1745 // new outer loop.
1746 IRBuilder<> Builder(OuterLoopHeader->getContext());
1747 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
1748 for (Instruction &I :
1749 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
1750 MayNeedLCSSAPhis.push_back(&I);
1751 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE, Builder);
1752
1753 return true;
1754}
1755
1756bool LoopInterchangeTransform::adjustLoopLinks() {
1757 // Adjust all branches in the inner and outer loop.
1758 bool Changed = adjustLoopBranches();
1759 if (Changed) {
1760 // We have interchanged the preheaders so we need to interchange the data in
1761 // the preheaders as well. This is because the content of the inner
1762 // preheader was previously executed inside the outer loop.
1763 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1764 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1765 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
1766 }
1767 return Changed;
1768}
1769
1770/// Main LoopInterchange Pass.
1771struct LoopInterchangeLegacyPass : public LoopPass {
1772 static char ID;
1773
1774 LoopInterchangeLegacyPass() : LoopPass(ID) {
1775 initializeLoopInterchangeLegacyPassPass(*PassRegistry::getPassRegistry());
1776 }
1777
1778 void getAnalysisUsage(AnalysisUsage &AU) const override {
1779 AU.addRequired<DependenceAnalysisWrapperPass>();
1780 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1781
1782 getLoopAnalysisUsage(AU);
1783 }
1784
1785 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1786 if (skipLoop(L))
1787 return false;
1788
1789 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1790 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1791 auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
1792 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1793 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1794
1795 return LoopInterchange(SE, LI, DI, DT, ORE).run(L);
1796 }
1797};
1798
1799char LoopInterchangeLegacyPass::ID = 0;
1800
1801INITIALIZE_PASS_BEGIN(LoopInterchangeLegacyPass, "loop-interchange",static void *initializeLoopInterchangeLegacyPassPassOnce(PassRegistry
&Registry) {
1802 "Interchanges loops for cache reuse", false, false)static void *initializeLoopInterchangeLegacyPassPassOnce(PassRegistry
&Registry) {
1803INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry);
1804INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)initializeDependenceAnalysisWrapperPassPass(Registry);
1805INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)initializeOptimizationRemarkEmitterWrapperPassPass(Registry);
1806
1807INITIALIZE_PASS_END(LoopInterchangeLegacyPass, "loop-interchange",PassInfo *PI = new PassInfo( "Interchanges loops for cache reuse"
, "loop-interchange", &LoopInterchangeLegacyPass::ID, PassInfo
::NormalCtor_t(callDefaultCtor<LoopInterchangeLegacyPass>
), false, false); Registry.registerPass(*PI, true); return PI
; } static llvm::once_flag InitializeLoopInterchangeLegacyPassPassFlag
; void llvm::initializeLoopInterchangeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopInterchangeLegacyPassPassFlag
, initializeLoopInterchangeLegacyPassPassOnce, std::ref(Registry
)); }
1808 "Interchanges loops for cache reuse", false, false)PassInfo *PI = new PassInfo( "Interchanges loops for cache reuse"
, "loop-interchange", &LoopInterchangeLegacyPass::ID, PassInfo
::NormalCtor_t(callDefaultCtor<LoopInterchangeLegacyPass>
), false, false); Registry.registerPass(*PI, true); return PI
; } static llvm::once_flag InitializeLoopInterchangeLegacyPassPassFlag
; void llvm::initializeLoopInterchangeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopInterchangeLegacyPassPassFlag
, initializeLoopInterchangeLegacyPassPassOnce, std::ref(Registry
)); }
1809
1810Pass *llvm::createLoopInterchangePass() {
1811 return new LoopInterchangeLegacyPass();
1812}
1813
1814PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
1815 LoopAnalysisManager &AM,
1816 LoopStandardAnalysisResults &AR,
1817 LPMUpdater &U) {
1818 Function &F = *LN.getParent();
1819
1820 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1821 OptimizationRemarkEmitter ORE(&F);
1822 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN))
1823 return PreservedAnalyses::all();
1824 return getLoopPassPreservedAnalyses();
1825}