File:    src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86/X86PartialReduction.cpp
Warning: line 238, column 26: Division by zero
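Editor's note: the flagged expression is `(i % NumElts)` on line 238, in the `NumElts < 16` padding path of `trySADReplacement`. The analyzer assumes `NumElts`, read from the operand's vector type, may be zero, which would make the modulo undefined. In practice a FixedVectorType in valid IR has at least one element, so this appears to be a false positive rather than a reachable division by zero.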
  1 | //===-- X86PartialReduction.cpp -------------------------------------------===//
  2 | //
  3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 | // See https://llvm.org/LICENSE.txt for license information.
  5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 | //
  7 | //===----------------------------------------------------------------------===//
  8 | //
  9 | // This pass looks for add instructions used by a horizontal reduction to see
 10 | // if we might be able to use pmaddwd or psadbw. Some cases of this require
 11 | // cross basic block knowledge and can't be done in SelectionDAG.
 12 | //
 13 | //===----------------------------------------------------------------------===//
 14 |
 15 | #include "X86.h"
 16 | #include "llvm/Analysis/ValueTracking.h"
 17 | #include "llvm/CodeGen/TargetPassConfig.h"
 18 | #include "llvm/IR/Constants.h"
 19 | #include "llvm/IR/Instructions.h"
 20 | #include "llvm/IR/IntrinsicsX86.h"
 21 | #include "llvm/IR/IRBuilder.h"
 22 | #include "llvm/IR/Operator.h"
 23 | #include "llvm/Pass.h"
 24 | #include "X86TargetMachine.h"
 25 |
 26 | using namespace llvm;
 27 |
 28 | #define DEBUG_TYPE "x86-partial-reduction"
 29 |
 30 | namespace {
 31 |
 32 | class X86PartialReduction : public FunctionPass {
 33 |   const DataLayout *DL;
 34 |   const X86Subtarget *ST;
 35 |
 36 | public:
 37 |   static char ID; // Pass identification, replacement for typeid.
 38 |
 39 |   X86PartialReduction() : FunctionPass(ID) { }
 40 |
 41 |   bool runOnFunction(Function &Fn) override;
 42 |
 43 |   void getAnalysisUsage(AnalysisUsage &AU) const override {
 44 |     AU.setPreservesCFG();
 45 |   }
 46 |
 47 |   StringRef getPassName() const override {
 48 |     return "X86 Partial Reduction";
 49 |   }
 50 |
 51 | private:
 52 |   bool tryMAddReplacement(Instruction *Op);
 53 |   bool trySADReplacement(Instruction *Op);
 54 | };
 55 | }
 56 |
 57 | FunctionPass *llvm::createX86PartialReductionPass() {
 58 |   return new X86PartialReduction();
 59 | }
 60 |
 61 | char X86PartialReduction::ID = 0;
 62 |
 63 | INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
 64 |                 "X86 Partial Reduction", false, false)
 65 |
 66 | bool X86PartialReduction::tryMAddReplacement(Instruction *Op) {
 67 |   if (!ST->hasSSE2())
 68 |     return false;
 69 |
 70 |   // Need at least 8 elements.
 71 |   if (cast<FixedVectorType>(Op->getType())->getNumElements() < 8)
 72 |     return false;
 73 |
 74 |   // Element type should be i32.
 75 |   if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
 76 |     return false;
 77 |
 78 |   auto *Mul = dyn_cast<BinaryOperator>(Op);
 79 |   if (!Mul || Mul->getOpcode() != Instruction::Mul)
 80 |     return false;
 81 |
 82 |   Value *LHS = Mul->getOperand(0);
 83 |   Value *RHS = Mul->getOperand(1);
 84 |
 85 |   // LHS and RHS should only be used once, or if they are the same then only
 86 |   // used twice. Only check this when SSE4.1 is enabled and we have zext/sext
 87 |   // instructions, otherwise we use punpck to emulate zero extend in stages. The
 88 |   // trunc we need to do likely won't introduce new instructions in that case.
 89 |   if (ST->hasSSE41()) {
 90 |     if (LHS == RHS) {
 91 |       if (!isa<Constant>(LHS) && !LHS->hasNUses(2))
 92 |         return false;
 93 |     } else {
 94 |       if (!isa<Constant>(LHS) && !LHS->hasOneUse())
 95 |         return false;
 96 |       if (!isa<Constant>(RHS) && !RHS->hasOneUse())
 97 |         return false;
 98 |     }
 99 |   }
100 |
101 |   auto CanShrinkOp = [&](Value *Op) {
102 |     auto IsFreeTruncation = [&](Value *Op) {
103 |       if (auto *Cast = dyn_cast<CastInst>(Op)) {
104 |         if (Cast->getParent() == Mul->getParent() &&
105 |             (Cast->getOpcode() == Instruction::SExt ||
106 |              Cast->getOpcode() == Instruction::ZExt) &&
107 |             Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 16)
108 |           return true;
109 |       }
110 |
111 |       return isa<Constant>(Op);
112 |     };
113 |
114 |     // If the operation can be freely truncated and has enough sign bits we
115 |     // can shrink.
116 |     if (IsFreeTruncation(Op) &&
117 |         ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
118 |       return true;
119 |
120 |     // SelectionDAG has limited support for truncating through an add or sub if
121 |     // the inputs are freely truncatable.
122 |     if (auto *BO = dyn_cast<BinaryOperator>(Op)) {
123 |       if (BO->getParent() == Mul->getParent() &&
124 |           IsFreeTruncation(BO->getOperand(0)) &&
125 |           IsFreeTruncation(BO->getOperand(1)) &&
126 |           ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
127 |         return true;
128 |     }
129 |
130 |     return false;
131 |   };
132 |
133 |   // Both Ops need to be shrinkable.
134 |   if (!CanShrinkOp(LHS) && !CanShrinkOp(RHS))
135 |     return false;
136 |
137 |   IRBuilder<> Builder(Mul);
138 |
139 |   auto *MulTy = cast<FixedVectorType>(Op->getType());
140 |   unsigned NumElts = MulTy->getNumElements();
141 |
142 |   // Extract even elements and odd elements and add them together. This will
143 |   // be pattern matched by SelectionDAG to pmaddwd. This instruction will be
144 |   // half the original width.
145 |   SmallVector<int, 16> EvenMask(NumElts / 2);
146 |   SmallVector<int, 16> OddMask(NumElts / 2);
147 |   for (int i = 0, e = NumElts / 2; i != e; ++i) {
148 |     EvenMask[i] = i * 2;
149 |     OddMask[i] = i * 2 + 1;
150 |   }
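    | // Illustration (editor's note, not part of the upstream source): for a
    | // v8i32 multiply, EvenMask = {0,2,4,6} and OddMask = {1,3,5,7}, so the add
    | // built below computes r[i] = m[2i] + m[2i+1] over the products m. That is
    | // exactly the v4i32 result pmaddwd produces from two v8i16 inputs, which is
    | // why the shrinkability checks above require the inputs to fit in 16 bits.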
151 |   // Creating a new mul so the replaceAllUsesWith below doesn't replace the
152 |   // uses in the shuffles we're creating.
153 |   Value *NewMul = Builder.CreateMul(Mul->getOperand(0), Mul->getOperand(1));
154 |   Value *EvenElts = Builder.CreateShuffleVector(NewMul, NewMul, EvenMask);
155 |   Value *OddElts = Builder.CreateShuffleVector(NewMul, NewMul, OddMask);
156 |   Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
157 |
158 |   // Concatenate zeroes to extend back to the original type.
159 |   SmallVector<int, 32> ConcatMask(NumElts);
160 |   std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
161 |   Value *Zero = Constant::getNullValue(MAdd->getType());
162 |   Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
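    | // Illustration (editor's note): with NumElts == 8, MAdd is v4i32 and
    | // ConcatMask = {0,1,2,3,4,5,6,7}; indices 0-3 select MAdd's lanes and
    | // indices 4-7 select lanes of Zero, rebuilding a v8i32 whose upper half is
    | // zero. The horizontal reduction that consumes this value sums the padding
    | // zeros harmlessly, so the overall result is unchanged.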
163 |
164 |   Mul->replaceAllUsesWith(Concat);
165 |   Mul->eraseFromParent();
166 |
167 |   return true;
168 | }
169 |
170 | bool X86PartialReduction::trySADReplacement(Instruction *Op) {
171 |   if (!ST->hasSSE2())
172 |     return false;
173 |
174 |   // TODO: There's nothing special about i32, any integer type above i16 should
175 |   // work just as well.
176 |   if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
177 |     return false;
178 |
179 |   // Operand should be a select.
180 |   auto *SI = dyn_cast<SelectInst>(Op);
181 |   if (!SI)
182 |     return false;
183 |
184 |   // Select needs to implement absolute value.
185 |   Value *LHS, *RHS;
186 |   auto SPR = matchSelectPattern(SI, LHS, RHS);
187 |   if (SPR.Flavor != SPF_ABS)
188 |     return false;
189 |
190 |   // Need a subtract of two values.
191 |   auto *Sub = dyn_cast<BinaryOperator>(LHS);
192 |   if (!Sub || Sub->getOpcode() != Instruction::Sub)
193 |     return false;
194 |
195 |   // Look for zero extend from i8.
196 |   auto getZeroExtendedVal = [](Value *Op) -> Value * {
197 |     if (auto *ZExt = dyn_cast<ZExtInst>(Op))
198 |       if (cast<VectorType>(ZExt->getOperand(0)->getType())
199 |               ->getElementType()
200 |               ->isIntegerTy(8))
201 |         return ZExt->getOperand(0);
202 |
203 |     return nullptr;
204 |   };
205 |
206 |   // Both operands of the subtract should be extends from vXi8.
207 |   Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
208 |   Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
209 |   if (!Op0 || !Op1)
210 |     return false;
211 |
212 |   IRBuilder<> Builder(SI);
213 |
214 |   auto *OpTy = cast<FixedVectorType>(Op->getType());
215 |   unsigned NumElts = OpTy->getNumElements();
216 |
217 |   unsigned IntrinsicNumElts;
218 |   Intrinsic::ID IID;
219 |   if (ST->hasBWI() && NumElts >= 64) {
220 |     IID = Intrinsic::x86_avx512_psad_bw_512;
221 |     IntrinsicNumElts = 64;
222 |   } else if (ST->hasAVX2() && NumElts >= 32) {
223 |     IID = Intrinsic::x86_avx2_psad_bw;
224 |     IntrinsicNumElts = 32;
225 |   } else {
226 |     IID = Intrinsic::x86_sse2_psad_bw;
227 |     IntrinsicNumElts = 64 / 4;
228 |   }
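    | // Note (editor's): psadbw computes, for each aligned group of eight u8
    | // lanes, the sum of absolute differences |a[j] - b[j]| and yields one u64
    | // per group. The widest available form whose byte width (16, 32, or 64)
    | // does not exceed the operand is selected here; inputs narrower than 16
    | // bytes are zero-padded below, wider ones are split across multiple calls.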
229 |
230 |   Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID);
231 |
232 |   if (NumElts < 16) {
233 |     // Pad input with zeroes.
234 |     SmallVector<int, 32> ConcatMask(16);
235 |     for (unsigned i = 0; i != NumElts; ++i)
236 |       ConcatMask[i] = i;
237 |     for (unsigned i = NumElts; i != 16; ++i)
238 |       ConcatMask[i] = (i % NumElts) + NumElts;
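    | // Note (editor's): this is the expression the analyzer flags. For
    | // NumElts == 4 the mask becomes {0,1,2,3, 4,5,6,7, 4,5,6,7, 4,5,6,7}:
    | // every index >= NumElts selects a lane of Zero, padding the vXi8 input
    | // out to the 16 bytes psadbw needs. `i % NumElts` divides by zero only if
    | // the vector type had zero elements, which valid IR rules out.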
239 |
240 |     Value *Zero = Constant::getNullValue(Op0->getType());
241 |     Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
242 |     Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
243 |     NumElts = 16;
244 |   }
245 |
246 |   // Intrinsics produce vXi64 and need to be casted to vXi32.
247 |   auto *I32Ty =
248 |       FixedVectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
249 |
250 |   assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
251 |   unsigned NumSplits = NumElts / IntrinsicNumElts;
252 |
253 |   // First collect the pieces we need.
254 |   SmallVector<Value *, 4> Ops(NumSplits);
255 |   for (unsigned i = 0; i != NumSplits; ++i) {
256 |     SmallVector<int, 64> ExtractMask(IntrinsicNumElts);
257 |     std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
258 |     Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
259 |     Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask);
260 |     Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
261 |     Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
262 |   }
263 |
264 |   assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits");
265 |   unsigned Stages = Log2_32(NumSplits);
266 |   for (unsigned s = Stages; s > 0; --s) {
267 |     unsigned NumConcatElts =
268 |         cast<FixedVectorType>(Ops[0]->getType())->getNumElements() * 2;
269 |     for (unsigned i = 0; i != 1U << (s - 1); ++i) {
270 |       SmallVector<int, 64> ConcatMask(NumConcatElts);
271 |       std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
272 |       Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask);
273 |     }
274 |   }
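    | // Illustration (editor's note): with NumSplits == 4 the loop runs two
    | // stages: s == 2 concatenates Ops[0],Ops[1] and Ops[2],Ops[3] into two
    | // vectors of twice the width, and s == 1 concatenates those into one, so
    | // Ops[0] ends up holding every psadbw result as a single wide vector.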
275 |
276 |   // At this point the final value should be in Ops[0]. Now we need to adjust
277 |   // it to the final original type.
278 |   NumElts = cast<FixedVectorType>(OpTy)->getNumElements();
279 |   if (NumElts == 2) {
280 |     // Extract down to 2 elements.
281 |     Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{0, 1});
282 |   } else if (NumElts >= 8) {
283 |     SmallVector<int, 32> ConcatMask(NumElts);
284 |     unsigned SubElts =
285 |         cast<FixedVectorType>(Ops[0]->getType())->getNumElements();
286 |     for (unsigned i = 0; i != SubElts; ++i)
287 |       ConcatMask[i] = i;
288 |     for (unsigned i = SubElts; i != NumElts; ++i)
289 |       ConcatMask[i] = (i % SubElts) + SubElts;
290 |
291 |     Value *Zero = Constant::getNullValue(Ops[0]->getType());
292 |     Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
293 |   }
294 |
295 |   SI->replaceAllUsesWith(Ops[0]);
296 |   SI->eraseFromParent();
297 |
298 |   return true;
299 | }
300 |
301 | // Walk backwards from the ExtractElementInst and determine if it is the end of
302 | // a horizontal reduction. Return the input to the reduction if we find one.
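    | // Illustration (editor's note): for a v4i32 reduction the expected IR is,
    | // from the bottom up,
    | //   s1 = shufflevector a, undef, <2, 3, u, u>
    | //   a1 = add a, s1
    | //   s2 = shufflevector a1, undef, <1, u, u, u>
    | //   a2 = add a1, s2
    | //   r  = extractelement a2, 0
    | // Stage i of the loop below checks the first 2^i mask entries against
    | // 2^i, 2^i + 1, ..., i.e. s2 at stage 0 and s1 at stage 1.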
303 | static Value *matchAddReduction(const ExtractElementInst &EE) {
304 |   // Make sure we're extracting index 0.
305 |   auto *Index = dyn_cast<ConstantInt>(EE.getIndexOperand());
306 |   if (!Index || !Index->isNullValue())
307 |     return nullptr;
308 |
309 |   const auto *BO = dyn_cast<BinaryOperator>(EE.getVectorOperand());
310 |   if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse())
311 |     return nullptr;
312 |
313 |   unsigned NumElems = cast<FixedVectorType>(BO->getType())->getNumElements();
314 |   // Ensure the reduction size is a power of 2.
315 |   if (!isPowerOf2_32(NumElems))
316 |     return nullptr;
317 |
318 |   const Value *Op = BO;
319 |   unsigned Stages = Log2_32(NumElems);
320 |   for (unsigned i = 0; i != Stages; ++i) {
321 |     const auto *BO = dyn_cast<BinaryOperator>(Op);
322 |     if (!BO || BO->getOpcode() != Instruction::Add)
323 |       return nullptr;
324 |
325 |     // If this isn't the first add, then it should only have 2 users, the
326 |     // shuffle and another add which we checked in the previous iteration.
327 |     if (i != 0 && !BO->hasNUses(2))
328 |       return nullptr;
329 |
330 |     Value *LHS = BO->getOperand(0);
331 |     Value *RHS = BO->getOperand(1);
332 |
333 |     auto *Shuffle = dyn_cast<ShuffleVectorInst>(LHS);
334 |     if (Shuffle) {
335 |       Op = RHS;
336 |     } else {
337 |       Shuffle = dyn_cast<ShuffleVectorInst>(RHS);
338 |       Op = LHS;
339 |     }
340 |
341 |     // The first operand of the shuffle should be the same as the other operand
342 |     // of the bin op.
343 |     if (!Shuffle || Shuffle->getOperand(0) != Op)
344 |       return nullptr;
345 |
346 |     // Verify the shuffle has the expected (at this stage of the pyramid) mask.
347 |     unsigned MaskEnd = 1 << i;
348 |     for (unsigned Index = 0; Index < MaskEnd; ++Index)
349 |       if (Shuffle->getMaskValue(Index) != (int)(MaskEnd + Index))
350 |         return nullptr;
351 |   }
352 |
353 |   return const_cast<Value *>(Op);
354 | }
355 |
356 | // See if this BO is reachable from this Phi by walking forward through single
357 | // use BinaryOperators with the same opcode. If we get back then we know we've
358 | // found a loop and it is safe to step through this Add to find more leaves.
359 | static bool isReachableFromPHI(PHINode *Phi, BinaryOperator *BO) {
360 |   // The PHI itself should only have one use.
361 |   if (!Phi->hasOneUse())
362 |     return false;
363 |
364 |   Instruction *U = cast<Instruction>(*Phi->user_begin());
365 |   if (U == BO)
366 |     return true;
367 |
368 |   while (U->hasOneUse() && U->getOpcode() == BO->getOpcode())
369 |     U = cast<Instruction>(*U->user_begin());
370 |
371 |   return U == BO;
372 | }
373 |
374 | // Collect all the leaves of the tree of adds that feeds into the horizontal
375 | // reduction. Root is the Value that is used by the horizontal reduction.
376 | // We look through single use phis, single use adds, or adds that are used by
377 | // a phi that forms a loop with the add.
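    | // Illustration (editor's note): in a typical reduction loop the tree is
    | //   acc = phi [ zeroinitializer, %entry ], [ %sum, %loop ]
    | //   sum = add %acc, %leaf
    | // collectLeaves walks through the phi/add cycle and records %leaf, so the
    | // pmaddwd/psadbw rewrites above fire on the value produced inside the loop.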
378 | static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
379 |   SmallPtrSet<Value *, 8> Visited;
380 |   SmallVector<Value *, 8> Worklist;
381 |   Worklist.push_back(Root);
382 |
383 |   while (!Worklist.empty()) {
384 |     Value *V = Worklist.pop_back_val();
385 |     if (!Visited.insert(V).second)
386 |       continue;
387 |
388 |     if (auto *PN = dyn_cast<PHINode>(V)) {
389 |       // PHI node should have single use unless it is the root node, then it
390 |       // has 2 uses.
391 |       if (!PN->hasNUses(PN == Root ? 2 : 1))
392 |         break;
393 |
394 |       // Push incoming values to the worklist.
395 |       append_range(Worklist, PN->incoming_values());
396 |
397 |       continue;
398 |     }
399 |
400 |     if (auto *BO = dyn_cast<BinaryOperator>(V)) {
401 |       if (BO->getOpcode() == Instruction::Add) {
402 |         // Simple case. Single use, just push its operands to the worklist.
403 |         if (BO->hasNUses(BO == Root ? 2 : 1)) {
404 |           append_range(Worklist, BO->operands());
405 |           continue;
406 |         }
407 |
408 |         // If there is additional use, make sure it is an unvisited phi that
409 |         // gets us back to this node.
410 |         if (BO->hasNUses(BO == Root ? 3 : 2)) {
411 |           PHINode *PN = nullptr;
412 |           for (auto *U : Root->users())
413 |             if (auto *P = dyn_cast<PHINode>(U))
414 |               if (!Visited.count(P))
415 |                 PN = P;
416 |
417 |           // If we didn't find a 2-input PHI then this isn't a case we can
418 |           // handle.
419 |           if (!PN || PN->getNumIncomingValues() != 2)
420 |             continue;
421 |
422 |           // Walk forward from this phi to see if it reaches back to this add.
423 |           if (!isReachableFromPHI(PN, BO))
424 |             continue;
425 |
426 |           // The phi forms a loop with this Add, push its operands.
427 |           append_range(Worklist, BO->operands());
428 |         }
429 |       }
430 |     }
431 |
432 |     // Not an add or phi, make it a leaf.
433 |     if (auto *I = dyn_cast<Instruction>(V)) {
434 |       if (!V->hasNUses(I == Root ? 2 : 1))
435 |         continue;
436 |
437 |       // Add this as a leaf.
438 |       Leaves.push_back(I);
439 |     }
440 |   }
441 | }
442 |
443 | bool X86PartialReduction::runOnFunction(Function &F) {
444 |   if (skipFunction(F))
445 |     return false;
446 |
447 |   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
448 |   if (!TPC)
449 |     return false;
450 |
451 |   auto &TM = TPC->getTM<X86TargetMachine>();
452 |   ST = TM.getSubtargetImpl(F);
453 |
454 |   DL = &F.getParent()->getDataLayout();
455 |
456 |   bool MadeChange = false;
457 |   for (auto &BB : F) {
458 |     for (auto &I : BB) {
459 |       auto *EE = dyn_cast<ExtractElementInst>(&I);
460 |       if (!EE)
461 |         continue;
462 |
463 |       // First find a reduction tree.
464 |       // FIXME: Do we need to handle other opcodes than Add?
465 |       Value *Root = matchAddReduction(*EE);
466 |       if (!Root)
467 |         continue;
468 |
469 |       SmallVector<Instruction *, 8> Leaves;
470 |       collectLeaves(Root, Leaves);
471 |
472 |       for (Instruction *I : Leaves) {
473 |         if (tryMAddReplacement(I)) {
474 |           MadeChange = true;
475 |           continue;
476 |         }
477 |
478 |         // Don't do SAD matching on the root node. SelectionDAG already
479 |         // has support for that and currently generates better code.
480 |         if (I != Root && trySADReplacement(I))
481 |           MadeChange = true;
482 |       }
483 |     }
484 |   }
485 |
486 |   return MadeChange;
487 | }

  1 | //===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
  2 | //
  3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 | // See https://llvm.org/LICENSE.txt for license information.
  5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 | //
  7 | //===----------------------------------------------------------------------===//
  8 | //
  9 | // This file declares the X86 specific subclass of TargetSubtargetInfo.
 10 | //
 11 | //===----------------------------------------------------------------------===//
 12 |
 13 | #ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
 14 | #define LLVM_LIB_TARGET_X86_X86SUBTARGET_H
 15 |
 16 | #include "X86FrameLowering.h"
 17 | #include "X86ISelLowering.h"
 18 | #include "X86InstrInfo.h"
 19 | #include "X86SelectionDAGInfo.h"
 20 | #include "llvm/ADT/Triple.h"
 21 | #include "llvm/CodeGen/TargetSubtargetInfo.h"
 22 | #include "llvm/IR/CallingConv.h"
 23 | #include <climits>
 24 | #include <memory>
 25 |
 26 | #define GET_SUBTARGETINFO_HEADER
 27 | #include "X86GenSubtargetInfo.inc"
 28 |
 29 | namespace llvm {
 30 |
 31 | class CallLowering;
 32 | class GlobalValue;
 33 | class InstructionSelector;
 34 | class LegalizerInfo;
 35 | class RegisterBankInfo;
 36 | class StringRef;
 37 | class TargetMachine;
 38 |
 39 | /// The X86 backend supports a number of different styles of PIC.
 40 | ///
 41 | namespace PICStyles {
 42 |
 43 | enum class Style {
 44 |   StubPIC,          // Used on i386-darwin in pic mode.
 45 |   GOT,              // Used on 32-bit ELF when in pic mode.
 46 |   RIPRel,           // Used on X86-64 when in pic mode.
 47 |   None              // Set when not in pic mode.
 48 | };
 49 |
 50 | } // end namespace PICStyles
 51 |
 52 | class X86Subtarget final : public X86GenSubtargetInfo {
 53 |   // NOTE: Do not add anything new to this list. Coarse, CPU name based flags
 54 |   // are not a good idea. We should be migrating away from these.
 55 |   enum X86ProcFamilyEnum {
 56 |     Others,
 57 |     IntelAtom,
 58 |     IntelSLM
 59 |   };
 60 |
 61 |   enum X86SSEEnum {
 62 |     NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
 63 |   };
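    | // Editor's note: the enumerators are ordered from weakest to strongest
    | // feature level; the hasSSE*/hasAVX* accessors further down rely on this
    | // by comparing with >= (e.g. hasSSE2() is X86SSELevel >= SSE2), so any
    | // AVX-capable subtarget also reports SSE2 support.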
 64 |
 65 |   enum X863DNowEnum {
 66 |     NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
 67 |   };
 68 |
 69 |   /// X86 processor family: Intel Atom, and others
 70 |   X86ProcFamilyEnum X86ProcFamily = Others;
 71 |
 72 |   /// Which PIC style to use
 73 |   PICStyles::Style PICStyle;
 74 |
 75 |   const TargetMachine &TM;
 76 |
 77 |   /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
 78 |   X86SSEEnum X86SSELevel = NoSSE;
 79 |
 80 |   /// MMX, 3DNow, 3DNow Athlon, or none supported.
 81 |   X863DNowEnum X863DNowLevel = NoThreeDNow;
 82 |
 83 |   /// True if the processor supports X87 instructions.
 84 |   bool HasX87 = false;
 85 |
 86 |   /// True if the processor supports CMPXCHG8B.
 87 |   bool HasCmpxchg8b = false;
 88 |
 89 |   /// True if this processor has NOPL instruction
 90 |   /// (generally pentium pro+).
 91 |   bool HasNOPL = false;
 92 |
 93 |   /// True if this processor has conditional move instructions
 94 |   /// (generally pentium pro+).
 95 |   bool HasCMov = false;
 96 |
 97 |   /// True if the processor supports X86-64 instructions.
 98 |   bool HasX86_64 = false;
 99 |
100 |   /// True if the processor supports POPCNT.
101 |   bool HasPOPCNT = false;
102 |
103 |   /// True if the processor supports SSE4A instructions.
104 |   bool HasSSE4A = false;
105 |
106 |   /// Target has AES instructions
107 |   bool HasAES = false;
108 |   bool HasVAES = false;
109 |
110 |   /// Target has FXSAVE/FXRESTOR instructions
111 |   bool HasFXSR = false;
112 |
113 |   /// Target has XSAVE instructions
114 |   bool HasXSAVE = false;
115 |
116 |   /// Target has XSAVEOPT instructions
117 |   bool HasXSAVEOPT = false;
118 |
119 |   /// Target has XSAVEC instructions
120 |   bool HasXSAVEC = false;
121 |
122 |   /// Target has XSAVES instructions
123 |   bool HasXSAVES = false;
124 |
125 |   /// Target has carry-less multiplication
126 |   bool HasPCLMUL = false;
127 |   bool HasVPCLMULQDQ = false;
128 |
129 |   /// Target has Galois Field Arithmetic instructions
130 |   bool HasGFNI = false;
131 |
132 |   /// Target has 3-operand fused multiply-add
133 |   bool HasFMA = false;
134 |
135 |   /// Target has 4-operand fused multiply-add
136 |   bool HasFMA4 = false;
137 |
138 |   /// Target has XOP instructions
139 |   bool HasXOP = false;
140 |
141 |   /// Target has TBM instructions.
142 |   bool HasTBM = false;
143 |
144 |   /// Target has LWP instructions
145 |   bool HasLWP = false;
146 |
147 |   /// True if the processor has the MOVBE instruction.
148 |   bool HasMOVBE = false;
149 |
150 |   /// True if the processor has the RDRAND instruction.
151 |   bool HasRDRAND = false;
152 |
153 |   /// Processor has 16-bit floating point conversion instructions.
154 |   bool HasF16C = false;
155 |
156 |   /// Processor has FS/GS base instructions.
157 |   bool HasFSGSBase = false;
158 |
159 |   /// Processor has LZCNT instruction.
160 |   bool HasLZCNT = false;
161 |
162 |   /// Processor has BMI1 instructions.
163 |   bool HasBMI = false;
164 |
165 |   /// Processor has BMI2 instructions.
166 |   bool HasBMI2 = false;
167 |
168 |   /// Processor has VBMI instructions.
169 |   bool HasVBMI = false;
170 |
171 |   /// Processor has VBMI2 instructions.
172 |   bool HasVBMI2 = false;
173 |
174 |   /// Processor has Integer Fused Multiply Add
175 |   bool HasIFMA = false;
176 |
177 |   /// Processor has RTM instructions.
178 |   bool HasRTM = false;
179 |
180 |   /// Processor has ADX instructions.
181 |   bool HasADX = false;
182 |
183 |   /// Processor has SHA instructions.
184 |   bool HasSHA = false;
185 |
186 |   /// Processor has PRFCHW instructions.
187 |   bool HasPRFCHW = false;
188 |
189 |   /// Processor has RDSEED instructions.
190 |   bool HasRDSEED = false;
191 |
192 |   /// Processor has LAHF/SAHF instructions in 64-bit mode.
193 |   bool HasLAHFSAHF64 = false;
194 |
195 |   /// Processor has MONITORX/MWAITX instructions.
196 |   bool HasMWAITX = false;
197 |
198 |   /// Processor has Cache Line Zero instruction
199 |   bool HasCLZERO = false;
200 |
201 |   /// Processor has Cache Line Demote instruction
202 |   bool HasCLDEMOTE = false;
203 |
204 |   /// Processor has MOVDIRI instruction (direct store integer).
205 |   bool HasMOVDIRI = false;
206 |
207 |   /// Processor has MOVDIR64B instruction (direct store 64 bytes).
208 |   bool HasMOVDIR64B = false;
209 |
210 |   /// Processor has ptwrite instruction.
211 |   bool HasPTWRITE = false;
212 |
213 |   /// Processor has Prefetch with intent to Write instruction
214 |   bool HasPREFETCHWT1 = false;
215 |
216 |   /// True if SHLD instructions are slow.
217 |   bool IsSHLDSlow = false;
218 |
219 |   /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and
220 |   /// PMULUDQ.
221 |   bool IsPMULLDSlow = false;
222 |
223 |   /// True if the PMADDWD instruction is slow compared to PMULLD.
224 |   bool IsPMADDWDSlow = false;
225 |
226 |   /// True if unaligned memory accesses of 16-bytes are slow.
227 |   bool IsUAMem16Slow = false;
228 |
229 |   /// True if unaligned memory accesses of 32-bytes are slow.
230 |   bool IsUAMem32Slow = false;
231 |
232 |   /// True if SSE operations can have unaligned memory operands.
233 |   /// This may require setting a configuration bit in the processor.
234 |   bool HasSSEUnalignedMem = false;
235 |
236 |   /// True if this processor has the CMPXCHG16B instruction;
237 |   /// this is true for most x86-64 chips, but not the first AMD chips.
238 |   bool HasCmpxchg16b = false;
239 |
240 |   /// True if the LEA instruction should be used for adjusting
241 |   /// the stack pointer. This is an optimization for Intel Atom processors.
242 |   bool UseLeaForSP = false;
243 |
244 |   /// True if POPCNT instruction has a false dependency on the destination register.
245 |   bool HasPOPCNTFalseDeps = false;
246 |
247 |   /// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
248 |   bool HasLZCNTFalseDeps = false;
249 |
250 |   /// True if it's preferable to combine to a single cross-lane shuffle
251 |   /// using a variable mask over multiple fixed shuffles.
252 |   bool HasFastVariableCrossLaneShuffle = false;
253 |
254 |   /// True if it's preferable to combine to a single per-lane shuffle
255 |   /// using a variable mask over multiple fixed shuffles.
256 |   bool HasFastVariablePerLaneShuffle = false;
257 |
258 |   /// True if vzeroupper instructions should be inserted after code that uses
259 |   /// ymm or zmm registers.
260 |   bool InsertVZEROUPPER = false;
261 |
262 |   /// True if there is no performance penalty for writing NOPs with up to
263 |   /// 7 bytes.
264 |   bool HasFast7ByteNOP = false;
265 |
266 |   /// True if there is no performance penalty for writing NOPs with up to
267 |   /// 11 bytes.
268 |   bool HasFast11ByteNOP = false;
269 |
270 |   /// True if there is no performance penalty for writing NOPs with up to
271 |   /// 15 bytes.
272 |   bool HasFast15ByteNOP = false;
273 |
274 |   /// True if gather is reasonably fast. This is true for Skylake client and
275 |   /// all AVX-512 CPUs.
276 |   bool HasFastGather = false;
277 |
278 |   /// True if hardware SQRTSS instruction is at least as fast (latency) as
279 |   /// RSQRTSS followed by a Newton-Raphson iteration.
280 |   bool HasFastScalarFSQRT = false;
281 |
282 |   /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
283 |   /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
284 |   bool HasFastVectorFSQRT = false;
285 |
286 |   /// True if 8-bit divisions are significantly faster than
287 |   /// 32-bit divisions and should be used when possible.
288 |   bool HasSlowDivide32 = false;
289 |
290 |   /// True if 32-bit divides are significantly faster than
291 |   /// 64-bit divisions and should be used when possible.
292 |   bool HasSlowDivide64 = false;
293 |
294 |   /// True if LZCNT instruction is fast.
295 |   bool HasFastLZCNT = false;
296 |
297 |   /// True if SHLD based rotate is fast.
298 |   bool HasFastSHLDRotate = false;
299 |
300 |   /// True if the processor supports macrofusion.
301 |   bool HasMacroFusion = false;
302 |
303 |   /// True if the processor supports branch fusion.
304 |   bool HasBranchFusion = false;
305 |
306 |   /// True if the processor has enhanced REP MOVSB/STOSB.
307 |   bool HasERMSB = false;
308 |
309 |   /// True if the processor has fast short REP MOV.
310 |   bool HasFSRM = false;
311 |
312 |   /// True if the short functions should be padded to prevent
313 |   /// a stall when returning too early.
314 |   bool PadShortFunctions = false;
315 |
316 |   /// True if two memory operand instructions should use a temporary register
317 |   /// instead.
318 |   bool SlowTwoMemOps = false;
319 |
320 |   /// True if the LEA instruction inputs have to be ready at address generation
321 |   /// (AG) time.
322 |   bool LEAUsesAG = false;
323 |
324 |   /// True if the LEA instruction with certain arguments is slow
325 |   bool SlowLEA = false;
326 |
327 |   /// True if the LEA instruction has all three source operands: base, index,
328 |   /// and offset or if the LEA instruction uses base and index registers where
329 |   /// the base is EBP, RBP, or R13
330 |   bool Slow3OpsLEA = false;
331 |
332 |   /// True if INC and DEC instructions are slow when writing to flags
333 |   bool SlowIncDec = false;
334 |
335 |   /// Processor has AVX-512 PreFetch Instructions
336 |   bool HasPFI = false;
337 |
338 |   /// Processor has AVX-512 Exponential and Reciprocal Instructions
339 |   bool HasERI = false;
340 |
341 |   /// Processor has AVX-512 Conflict Detection Instructions
342 |   bool HasCDI = false;
343 |
344 |   /// Processor has AVX-512 population count Instructions
345 |   bool HasVPOPCNTDQ = false;
346 |
347 |   /// Processor has AVX-512 Doubleword and Quadword instructions
348 |   bool HasDQI = false;
349 |
350 |   /// Processor has AVX-512 Byte and Word instructions
351 |   bool HasBWI = false;
352 |
353 |   /// Processor has AVX-512 Vector Length eXtensions
354 |   bool HasVLX = false;
355 |
356 |   /// Processor has PKU extensions
357 |   bool HasPKU = false;
358 |
359 |   /// Processor has AVX-512 Vector Neural Network Instructions
360 |   bool HasVNNI = false;
361 |
362 |   /// Processor has AVX Vector Neural Network Instructions
363 |   bool HasAVXVNNI = false;
364 |
365 |   /// Processor has AVX-512 bfloat16 floating-point extensions
366 |   bool HasBF16 = false;
367 |
368 |   /// Processor supports ENQCMD instructions
369 |   bool HasENQCMD = false;
370 |
371 |   /// Processor has AVX-512 Bit Algorithms instructions
372 |   bool HasBITALG = false;
373 |
374 |   /// Processor has AVX-512 vp2intersect instructions
375 |   bool HasVP2INTERSECT = false;
376 |
377 |   /// Processor supports CET SHSTK - Control-Flow Enforcement Technology
378 |   /// using Shadow Stack
379 |   bool HasSHSTK = false;
380 |
381 |   /// Processor supports Invalidate Process-Context Identifier
382 |   bool HasINVPCID = false;
383 |
384 |   /// Processor has Software Guard Extensions
385 |   bool HasSGX = false;
386 |
387 |   /// Processor supports Flush Cache Line instruction
388 |   bool HasCLFLUSHOPT = false;
389 |
390 |   /// Processor supports Cache Line Write Back instruction
391 |   bool HasCLWB = false;
392 |
393 |   /// Processor supports Write Back No Invalidate instruction
394 |   bool HasWBNOINVD = false;
395 |
396 |   /// Processor supports RDPID instruction
397 |   bool HasRDPID = false;
398 |
399 |   /// Processor supports WaitPKG instructions
400 |   bool HasWAITPKG = false;
401 |
402 |   /// Processor supports PCONFIG instruction
403 |   bool HasPCONFIG = false;
404 |
405 |   /// Processor supports key locker instructions
406 |   bool HasKL = false;
407 |
408 |   /// Processor supports key locker wide instructions
409 |   bool HasWIDEKL = false;
410 |
411 |   /// Processor supports HRESET instruction
412 |   bool HasHRESET = false;
413 |
414 |   /// Processor supports SERIALIZE instruction
415 |   bool HasSERIALIZE = false;
416 |
417 |   /// Processor supports TSXLDTRK instruction
418 |   bool HasTSXLDTRK = false;
419 |
420 |   /// Processor has AMX support
421 |   bool HasAMXTILE = false;
422 |   bool HasAMXBF16 = false;
423 |   bool HasAMXINT8 = false;
424 |
425 |   /// Processor supports User Level Interrupt instructions
426 |   bool HasUINTR = false;
427 |
428 |   /// Processor has a single uop BEXTR implementation.
429 |   bool HasFastBEXTR = false;
430 |
431 |   /// Try harder to combine to horizontal vector ops if they are fast.
432 |   bool HasFastHorizontalOps = false;
433 |
434 |   /// Prefer a left/right scalar logical shifts pair over a shift+and pair.
435 |   bool HasFastScalarShiftMasks = false;
436 |
437 |   /// Prefer a left/right vector logical shifts pair over a shift+and pair.
438 |   bool HasFastVectorShiftMasks = false;
439 |
440 |   /// Prefer a movbe over a single-use load + bswap / single-use bswap + store.
441 |   bool HasFastMOVBE = false;
442 |
443 |   /// Use a retpoline thunk rather than indirect calls to block speculative
444 |   /// execution.
445 |   bool UseRetpolineIndirectCalls = false;
446 |
447 |   /// Use a retpoline thunk or remove any indirect branch to block speculative
448 |   /// execution.
449 |   bool UseRetpolineIndirectBranches = false;
450 |
451 |   /// Deprecated flag, query `UseRetpolineIndirectCalls` and
452 |   /// `UseRetpolineIndirectBranches` instead.
453 |   bool DeprecatedUseRetpoline = false;
454 |
455 |   /// When using a retpoline thunk, call an externally provided thunk rather
456 |   /// than emitting one inside the compiler.
457 |   bool UseRetpolineExternalThunk = false;
458 |
459 |   /// Prevent generation of indirect call/branch instructions from memory,
460 |   /// and force all indirect call/branch instructions from a register to be
461 |   /// preceded by an LFENCE. Also decompose RET instructions into a
462 |   /// POP+LFENCE+JMP sequence.
463 |   bool UseLVIControlFlowIntegrity = false;
464 |
465 |   /// Enable Speculative Execution Side Effect Suppression
466 |   bool UseSpeculativeExecutionSideEffectSuppression = false;
467 |
468 |   /// Insert LFENCE instructions to prevent data speculatively injected into
469 |   /// loads from being used maliciously.
470 |   bool UseLVILoadHardening = false;
471 |
472 |   /// Use software floating point for code generation.
473 |   bool UseSoftFloat = false;
474 |
475 |   /// Use alias analysis during code generation.
476 |   bool UseAA = false;
477 |
478 |   /// The minimum alignment known to hold of the stack frame on
479 |   /// entry to the function and which must be maintained by every function.
480 |   Align stackAlignment = Align(4);
481 |
482 |   Align TileConfigAlignment = Align(4);
483 |
484 |   /// Whether function prologues should save register arguments on the stack.
485 |   bool SaveArgs = false;
486 |
487 |   /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
488 |   ///
489 |   // FIXME: this is a known good value for Yonah. How about others?
490 |   unsigned MaxInlineSizeThreshold = 128;
491 |
492 |   /// Indicates target prefers 128 bit instructions.
493 |   bool Prefer128Bit = false;
494 |
495 |   /// Indicates target prefers 256 bit instructions.
496 |   bool Prefer256Bit = false;
497 |
498 |   /// Indicates target prefers AVX512 mask registers.
499 |   bool PreferMaskRegisters = false;
500 |
501 |   /// Use Goldmont specific floating point div/sqrt costs.
502 |   bool UseGLMDivSqrtCosts = false;
503 |
504 |   /// What processor and OS we're targeting.
505 |   Triple TargetTriple;
506 |
507 |   /// GlobalISel related APIs.
508 |   std::unique_ptr<CallLowering> CallLoweringInfo;
509 |   std::unique_ptr<LegalizerInfo> Legalizer;
510 |   std::unique_ptr<RegisterBankInfo> RegBankInfo;
511 |   std::unique_ptr<InstructionSelector> InstSelector;
512 |
513 | private:
514 |   /// Override the stack alignment.
515 |   MaybeAlign StackAlignOverride;
516 |
517 |   /// Preferred vector width from function attribute.
518 |   unsigned PreferVectorWidthOverride;
519 |
520 |   /// Resolved preferred vector width from function attribute and subtarget
521 |   /// features.
522 |   unsigned PreferVectorWidth = UINT32_MAX;
523 |
524 |   /// Required vector width from function attribute.
525 |   unsigned RequiredVectorWidth;
526 |
527 |   /// True if compiling for 64-bit, false for 16-bit or 32-bit.
528 |   bool In64BitMode = false;
529 |
530 |   /// True if compiling for 32-bit, false for 16-bit or 64-bit.
531 |   bool In32BitMode = false;
532 |
533 |   /// True if compiling for 16-bit, false for 32-bit or 64-bit.
534 |   bool In16BitMode = false;
535 |
536 |   X86SelectionDAGInfo TSInfo;
537 |   // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
538 |   // X86TargetLowering needs.
539 |   X86InstrInfo InstrInfo;
540 |   X86TargetLowering TLInfo;
541 |   X86FrameLowering FrameLowering;
542 |
543 | public:
544 |   /// This constructor initializes the data members to match that
545 |   /// of the specified triple.
546 |   ///
547 |   X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
548 |                const X86TargetMachine &TM, MaybeAlign StackAlignOverride,
549 |                unsigned PreferVectorWidthOverride,
550 |                unsigned RequiredVectorWidth);
551 |
552 |   const X86TargetLowering *getTargetLowering() const override {
553 |     return &TLInfo;
554 |   }
555 |
556 |   const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
557 |
558 |   const X86FrameLowering *getFrameLowering() const override {
559 |     return &FrameLowering;
560 |   }
561 |
562 |   const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
563 |     return &TSInfo;
564 |   }
565 |
566 |   const X86RegisterInfo *getRegisterInfo() const override {
567 |     return &getInstrInfo()->getRegisterInfo();
568 |   }
569 |
570 |   bool getSaveArgs() const { return SaveArgs; }
571 |
572 |   unsigned getTileConfigSize() const { return 64; }
573 |   Align getTileConfigAlignment() const { return TileConfigAlignment; }
574 |
575 |   /// Returns the minimum alignment known to hold of the
576 |   /// stack frame on entry to the function and which must be maintained by every
577 |   /// function for this subtarget.
578 |   Align getStackAlignment() const { return stackAlignment; }
579 |
580 |   /// Returns the maximum memset / memcpy size
581 |   /// that still makes it profitable to inline the call.
582 |   unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
583 |
584 |   /// ParseSubtargetFeatures - Parses features string setting specified
585 |   /// subtarget options. Definition of function is auto generated by tblgen.
586 |   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
587 |
588 |   /// Methods used by Global ISel
589 |   const CallLowering *getCallLowering() const override;
590 |   InstructionSelector *getInstructionSelector() const override;
591 |   const LegalizerInfo *getLegalizerInfo() const override;
592 |   const RegisterBankInfo *getRegBankInfo() const override;
593 |
594 | private:
595 |   /// Initialize the full set of dependencies so we can use an initializer
596 |   /// list for X86Subtarget.
597 |   X86Subtarget &initializeSubtargetDependencies(StringRef CPU,
598 |                                                 StringRef TuneCPU,
599 |                                                 StringRef FS);
600 |   void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
601 |
602 | public:
603 |   /// Is this x86_64? (disregarding specific ABI / programming model)
604 |   bool is64Bit() const {
605 |     return In64BitMode;
606 |   }
607 |
608 |   bool is32Bit() const {
609 |     return In32BitMode;
610 |   }
611 |
612 |   bool is16Bit() const {
613 |     return In16BitMode;
614 |   }
615 |
616 |   /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
617 |   bool isTarget64BitILP32() const {
618 |     return In64BitMode && (TargetTriple.isX32() || TargetTriple.isOSNaCl());
619 |   }
620 |
621 |   /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
622 |   bool isTarget64BitLP64() const {
623 |     return In64BitMode && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl());
624 |   }
625 |
626 |   PICStyles::Style getPICStyle() const { return PICStyle; }
627 |   void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
628 |
629 |   bool hasX87() const { return HasX87; }
630 |   bool hasCmpxchg8b() const { return HasCmpxchg8b; }
631 |   bool hasNOPL() const { return HasNOPL; }
632 |   // SSE codegen depends on cmovs, and all SSE1+ processors support them.
633 |   // All 64-bit processors support cmov.
634 |   bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); }
635 |   bool hasSSE1() const { return X86SSELevel >= SSE1; }
636 |   bool hasSSE2() const { return X86SSELevel >= SSE2; }
637 |   bool hasSSE3() const { return X86SSELevel >= SSE3; }
638 |   bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
639 |   bool hasSSE41() const { return X86SSELevel >= SSE41; }
640 |   bool hasSSE42() const { return X86SSELevel >= SSE42; }
641 |   bool hasAVX() const { return X86SSELevel >= AVX; }
642 |   bool hasAVX2() const { return X86SSELevel >= AVX2; }
643 |   bool hasAVX512() const { return X86SSELevel >= AVX512F; }
644 |   bool hasInt256() const { return hasAVX2(); }
645 |   bool hasSSE4A() const { return HasSSE4A; }
646 |   bool hasMMX() const { return X863DNowLevel >= MMX; }
647 |   bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
648 |   bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
649 |   bool hasPOPCNT() const { return HasPOPCNT; }
650 |   bool hasAES() const { return HasAES; }
651 |   bool hasVAES() const { return HasVAES; }
652 |   bool hasFXSR() const { return HasFXSR; }
653 |   bool hasXSAVE() const { return HasXSAVE; }
654 |   bool hasXSAVEOPT() const { return HasXSAVEOPT; }
655 |   bool hasXSAVEC() const { return HasXSAVEC; }
656 |   bool hasXSAVES() const { return HasXSAVES; }
657 |   bool hasPCLMUL() const { return HasPCLMUL; }
658 |   bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
659 |   bool hasGFNI() const { return HasGFNI; }
660 |   // Prefer FMA4 to FMA - it's better for commutation/memory folding and
661 |   // has equal or better performance on all supported targets.
662 |   bool hasFMA() const { return HasFMA; }
| 663 | bool hasFMA4() const { return HasFMA4; } | ||||
| 664 | bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } | ||||
| 665 | bool hasXOP() const { return HasXOP; } | ||||
| 666 | bool hasTBM() const { return HasTBM; } | ||||
| 667 | bool hasLWP() const { return HasLWP; } | ||||
| 668 | bool hasMOVBE() const { return HasMOVBE; } | ||||
| 669 | bool hasRDRAND() const { return HasRDRAND; } | ||||
| 670 | bool hasF16C() const { return HasF16C; } | ||||
| 671 | bool hasFSGSBase() const { return HasFSGSBase; } | ||||
| 672 | bool hasLZCNT() const { return HasLZCNT; } | ||||
| 673 | bool hasBMI() const { return HasBMI; } | ||||
| 674 | bool hasBMI2() const { return HasBMI2; } | ||||
| 675 | bool hasVBMI() const { return HasVBMI; } | ||||
| 676 | bool hasVBMI2() const { return HasVBMI2; } | ||||
| 677 | bool hasIFMA() const { return HasIFMA; } | ||||
| 678 | bool hasRTM() const { return HasRTM; } | ||||
| 679 | bool hasADX() const { return HasADX; } | ||||
| 680 | bool hasSHA() const { return HasSHA; } | ||||
| 681 | bool hasPRFCHW() const { return HasPRFCHW; } | ||||
| 682 | bool hasPREFETCHWT1() const { return HasPREFETCHWT1; } | ||||
| 683 | bool hasPrefetchW() const { | ||||
| 684 | // The PREFETCHW instruction was added with 3DNow but later CPUs gave it | ||||
| 685 | // its own CPUID bit as part of deprecating 3DNow. Intel eventually added | ||||
| 686 | // it and KNL has another that prefetches to L2 cache. We assume the | ||||
| 687 | // L1 version exists if the L2 version does. | ||||
| 688 | return has3DNow() || hasPRFCHW() || hasPREFETCHWT1(); | ||||
| 689 | } | ||||
| 690 | bool hasSSEPrefetch() const { | ||||
| 691 | // We implicitly enable these when we have a write prefix supporting cache | ||||
| 692 | // level OR if we have prfchw, but don't already have a read prefetch from | ||||
| 693 | // 3dnow. | ||||
| 694 | return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1(); | ||||
| 695 | } | ||||
| 696 | bool hasRDSEED() const { return HasRDSEED; } | ||||
| 697 | bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); } | ||||
| 698 | bool hasMWAITX() const { return HasMWAITX; } | ||||
| 699 | bool hasCLZERO() const { return HasCLZERO; } | ||||
| 700 | bool hasCLDEMOTE() const { return HasCLDEMOTE; } | ||||
| 701 | bool hasMOVDIRI() const { return HasMOVDIRI; } | ||||
| 702 | bool hasMOVDIR64B() const { return HasMOVDIR64B; } | ||||
| 703 | bool hasPTWRITE() const { return HasPTWRITE; } | ||||
| 704 | bool isSHLDSlow() const { return IsSHLDSlow; } | ||||
| 705 | bool isPMULLDSlow() const { return IsPMULLDSlow; } | ||||
| 706 | bool isPMADDWDSlow() const { return IsPMADDWDSlow; } | ||||
| 707 | bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } | ||||
| 708 | bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } | ||||
| 709 | bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } | ||||
  bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); }
  bool useLeaForSP() const { return UseLeaForSP; }
  bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
  bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
  bool hasFastVariableCrossLaneShuffle() const {
    return HasFastVariableCrossLaneShuffle;
  }
  bool hasFastVariablePerLaneShuffle() const {
    return HasFastVariablePerLaneShuffle;
  }
  bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
  bool hasFastGather() const { return HasFastGather; }
  bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
  bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
  bool hasFastLZCNT() const { return HasFastLZCNT; }
  bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
  bool hasFastBEXTR() const { return HasFastBEXTR; }
  bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
  bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; }
  bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; }
  bool hasFastMOVBE() const { return HasFastMOVBE; }
  bool hasMacroFusion() const { return HasMacroFusion; }
  bool hasBranchFusion() const { return HasBranchFusion; }
  bool hasERMSB() const { return HasERMSB; }
  bool hasFSRM() const { return HasFSRM; }
  bool hasSlowDivide32() const { return HasSlowDivide32; }
  bool hasSlowDivide64() const { return HasSlowDivide64; }
  bool padShortFunctions() const { return PadShortFunctions; }
  bool slowTwoMemOps() const { return SlowTwoMemOps; }
  bool LEAusesAG() const { return LEAUsesAG; }
  bool slowLEA() const { return SlowLEA; }
  bool slow3OpsLEA() const { return Slow3OpsLEA; }
  bool slowIncDec() const { return SlowIncDec; }
  bool hasCDI() const { return HasCDI; }
  bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
  bool hasPFI() const { return HasPFI; }
  bool hasERI() const { return HasERI; }
  bool hasDQI() const { return HasDQI; }
  bool hasBWI() const { return HasBWI; }
  bool hasVLX() const { return HasVLX; }
  bool hasPKU() const { return HasPKU; }
  bool hasVNNI() const { return HasVNNI; }
  bool hasBF16() const { return HasBF16; }
  bool hasVP2INTERSECT() const { return HasVP2INTERSECT; }
  bool hasBITALG() const { return HasBITALG; }
  bool hasSHSTK() const { return HasSHSTK; }
  bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
  bool hasCLWB() const { return HasCLWB; }
  bool hasWBNOINVD() const { return HasWBNOINVD; }
  bool hasRDPID() const { return HasRDPID; }
  bool hasWAITPKG() const { return HasWAITPKG; }
  bool hasPCONFIG() const { return HasPCONFIG; }
  bool hasSGX() const { return HasSGX; }
  bool hasINVPCID() const { return HasINVPCID; }
  bool hasENQCMD() const { return HasENQCMD; }
  bool hasKL() const { return HasKL; }
  bool hasWIDEKL() const { return HasWIDEKL; }
  bool hasHRESET() const { return HasHRESET; }
  bool hasSERIALIZE() const { return HasSERIALIZE; }
  bool hasTSXLDTRK() const { return HasTSXLDTRK; }
  bool hasUINTR() const { return HasUINTR; }
  bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
  bool useRetpolineIndirectBranches() const {
    return UseRetpolineIndirectBranches;
  }
  bool hasAVXVNNI() const { return HasAVXVNNI; }
  bool hasAMXTILE() const { return HasAMXTILE; }
  bool hasAMXBF16() const { return HasAMXBF16; }
  bool hasAMXINT8() const { return HasAMXINT8; }
  bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }

  // These are generic getters that OR together all of the thunk types
  // supported by the subtarget. Therefore useIndirectThunk*() will return true
  // if any respective thunk feature is enabled.
  bool useIndirectThunkCalls() const {
    return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity();
  }
  bool useIndirectThunkBranches() const {
    return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity();
  }
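  // Illustrative (not in the original header): with only the retpoline
  // indirect-call feature enabled, useIndirectThunkCalls() is true while
  // useIndirectThunkBranches() is false; enabling LVI control-flow integrity
  // turns both on.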

  bool preferMaskRegisters() const { return PreferMaskRegisters; }
  bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
  bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
  bool useLVILoadHardening() const { return UseLVILoadHardening; }
  bool useSpeculativeExecutionSideEffectSuppression() const {
    return UseSpeculativeExecutionSideEffectSuppression;
  }

  unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
  unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }

  // Helper functions to determine when we should allow widening to 512-bit
  // during codegen.
  // TODO: Currently we're always allowing widening on CPUs without VLX,
  // because for many cases we don't have a better option.
  bool canExtendTo512DQ() const {
    return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
  }
  bool canExtendTo512BW() const {
    return hasBWI() && canExtendTo512DQ();
  }
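  // Illustrative examples (not in the original header):
  //  - KNL (AVX512F, no VLX): canExtendTo512DQ() is true regardless of the
  //    preferred vector width.
  //  - Skylake-AVX512 with a preferred width of 256: hasVLX() is true and
  //    getPreferVectorWidth() < 512, so canExtendTo512DQ() is false.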

  // If there are no 512-bit vectors and we prefer not to use 512-bit registers,
  // disable them in the legalizer.
  bool useAVX512Regs() const {
    return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
  }

  bool useBWIRegs() const {
    return hasBWI() && useAVX512Regs();
  }
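  // Illustrative (not in the original header): explicit 512-bit vector types
  // in the IR push RequiredVectorWidth above 256, so useAVX512Regs() remains
  // true even when the preferred vector width is only 256.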

  bool isXRaySupported() const override { return is64Bit(); }

  /// TODO: to be removed later and replaced with suitable properties
  bool isAtom() const { return X86ProcFamily == IntelAtom; }
  bool isSLM() const { return X86ProcFamily == IntelSLM; }
  bool useSoftFloat() const { return UseSoftFloat; }
  bool useAA() const override { return UseAA; }

  /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
  /// no-sse2). There isn't any reason to disable it if the target processor
  /// supports it.
  bool hasMFence() const { return hasSSE2() || is64Bit(); }

  const Triple &getTargetTriple() const { return TargetTriple; }

  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
  bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); }
  bool isTargetOpenBSD() const { return TargetTriple.isOSOpenBSD(); }
  bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); }
  bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); }
  bool isTargetPS4() const { return TargetTriple.isPS4CPU(); }

  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
  bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }

  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
  bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); }
  bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); }
  bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
  bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
  bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
  bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
  bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }

  bool isTargetWindowsMSVC() const {
    return TargetTriple.isWindowsMSVCEnvironment();
  }

  bool isTargetWindowsCoreCLR() const {
    return TargetTriple.isWindowsCoreCLREnvironment();
  }

  bool isTargetWindowsCygwin() const {
    return TargetTriple.isWindowsCygwinEnvironment();
  }

  bool isTargetWindowsGNU() const {
    return TargetTriple.isWindowsGNUEnvironment();
  }

  bool isTargetWindowsItanium() const {
    return TargetTriple.isWindowsItaniumEnvironment();
  }

  bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }

  bool isOSWindows() const { return TargetTriple.isOSWindows(); }

  bool isTargetWin64() const { return In64BitMode && isOSWindows(); }

  bool isTargetWin32() const { return !In64BitMode && isOSWindows(); }
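  // Illustrative (not in the original header): an x86_64-pc-windows-msvc
  // triple in 64-bit mode satisfies isTargetWin64(), while
  // i686-pc-windows-msvc satisfies isTargetWin32().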

  bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; }
  bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; }

  bool isPICStyleStubPIC() const {
    return PICStyle == PICStyles::Style::StubPIC;
  }

  bool isPositionIndependent() const;

  bool isCallingConvWin64(CallingConv::ID CC) const {
    switch (CC) {
    // On Win64, all these conventions just use the default convention.
    case CallingConv::C:
    case CallingConv::Fast:
    case CallingConv::Tail:
    case CallingConv::Swift:
    case CallingConv::SwiftTail:
    case CallingConv::X86_FastCall:
    case CallingConv::X86_StdCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::X86_VectorCall:
    case CallingConv::Intel_OCL_BI:
      return isTargetWin64();
    // This convention allows using the Win64 convention on other targets.
    case CallingConv::Win64:
      return true;
    // This convention allows using the SysV convention on Windows targets.
    case CallingConv::X86_64_SysV:
      return false;
    // Otherwise, who knows what this is.
    default:
      return false;
    }
  }
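  // Illustrative examples (not in the original header): on a Win64 target,
  //   isCallingConvWin64(CallingConv::C)           == true
  //   isCallingConvWin64(CallingConv::X86_64_SysV) == false
  // while on a non-Windows x86-64 target,
  //   isCallingConvWin64(CallingConv::Win64)       == true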

  /// Classify a global variable reference for the current subtarget according
  /// to how we should reference it in a non-pcrel context.
  unsigned char classifyLocalReference(const GlobalValue *GV) const;

  unsigned char classifyGlobalReference(const GlobalValue *GV,
                                        const Module &M) const;
  unsigned char classifyGlobalReference(const GlobalValue *GV) const;

  /// Classify a global function reference for the current subtarget.
  unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
                                                const Module &M) const;
  unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const;

  /// Classify a blockaddress reference for the current subtarget according to
  /// how we should reference it in a non-pcrel context.
  unsigned char classifyBlockAddressReference() const;

  /// Return true if the subtarget allows calls to immediate address.
  bool isLegalToCallImmediateAddr() const;

  /// If we are using indirect thunks, we need to expand indirectbr to avoid it
  /// lowering to an actual indirect jump.
  bool enableIndirectBrExpand() const override {
    return useIndirectThunkBranches();
  }

  /// Enable the MachineScheduler pass for all X86 subtargets.
  bool enableMachineScheduler() const override { return true; }

  bool enableEarlyIfConversion() const override;

  void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
                              &Mutations) const override;

  AntiDepBreakMode getAntiDepBreakMode() const override {
    return TargetSubtargetInfo::ANTIDEP_CRITICAL;
  }

  bool enableAdvancedRASplitCost() const override { return false; }
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H