| File: | src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp |
| Warning: | line 404, column 3 Forming reference to null pointer |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// | |||
| 2 | // | |||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | |||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
| 6 | // | |||
| 7 | //===----------------------------------------------------------------------===// | |||
| 8 | // | |||
| 9 | /// \file This pass tries to apply several peephole SDWA patterns. | |||
| 10 | /// | |||
| 11 | /// E.g. original: | |||
| 12 | /// V_LSHRREV_B32_e32 %0, 16, %1 | |||
| 13 | /// V_ADD_CO_U32_e32 %2, %0, %3 | |||
| 14 | /// V_LSHLREV_B32_e32 %4, 16, %2 | |||
| 15 | /// | |||
| 16 | /// Replace: | |||
| 17 | /// V_ADD_CO_U32_sdwa %4, %1, %3 | |||
| 18 | /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | |||
| 19 | /// | |||
| 20 | //===----------------------------------------------------------------------===// | |||
| 21 | ||||
| 22 | #include "AMDGPU.h" | |||
| 23 | #include "GCNSubtarget.h" | |||
| 24 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | |||
| 25 | #include "llvm/ADT/MapVector.h" | |||
| 26 | #include "llvm/ADT/Statistic.h" | |||
| 27 | #include "llvm/CodeGen/MachineFunctionPass.h" | |||
| 28 | ||||
| 29 | using namespace llvm; | |||
| 30 | ||||
| 31 | #define DEBUG_TYPE"si-peephole-sdwa" "si-peephole-sdwa" | |||
| 32 | ||||
| 33 | STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.")static llvm::Statistic NumSDWAPatternsFound = {"si-peephole-sdwa" , "NumSDWAPatternsFound", "Number of SDWA patterns found."}; | |||
| 34 | STATISTIC(NumSDWAInstructionsPeepholed,static llvm::Statistic NumSDWAInstructionsPeepholed = {"si-peephole-sdwa" , "NumSDWAInstructionsPeepholed", "Number of instruction converted to SDWA." } | |||
| 35 | "Number of instruction converted to SDWA.")static llvm::Statistic NumSDWAInstructionsPeepholed = {"si-peephole-sdwa" , "NumSDWAInstructionsPeepholed", "Number of instruction converted to SDWA." }; | |||
| 36 | ||||
| 37 | namespace { | |||
| 38 | ||||
| 39 | class SDWAOperand; | |||
| 40 | class SDWADstOperand; | |||
| 41 | ||||
| 42 | class SIPeepholeSDWA : public MachineFunctionPass { | |||
| 43 | public: | |||
| 44 | using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; | |||
| 45 | ||||
| 46 | private: | |||
| 47 | MachineRegisterInfo *MRI; | |||
| 48 | const SIRegisterInfo *TRI; | |||
| 49 | const SIInstrInfo *TII; | |||
| 50 | ||||
| 51 | MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; | |||
| 52 | MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches; | |||
| 53 | SmallVector<MachineInstr *, 8> ConvertedInstructions; | |||
| 54 | ||||
| 55 | Optional<int64_t> foldToImm(const MachineOperand &Op) const; | |||
| 56 | ||||
| 57 | public: | |||
| 58 | static char ID; | |||
| 59 | ||||
| 60 | SIPeepholeSDWA() : MachineFunctionPass(ID) { | |||
| 61 | initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); | |||
| 62 | } | |||
| 63 | ||||
| 64 | bool runOnMachineFunction(MachineFunction &MF) override; | |||
| 65 | void matchSDWAOperands(MachineBasicBlock &MBB); | |||
| 66 | std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); | |||
| 67 | bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; | |||
| 68 | void pseudoOpConvertToVOP2(MachineInstr &MI, | |||
| 69 | const GCNSubtarget &ST) const; | |||
| 70 | bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); | |||
| 71 | void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; | |||
| 72 | ||||
| 73 | StringRef getPassName() const override { return "SI Peephole SDWA"; } | |||
| 74 | ||||
| 75 | void getAnalysisUsage(AnalysisUsage &AU) const override { | |||
| 76 | AU.setPreservesCFG(); | |||
| 77 | MachineFunctionPass::getAnalysisUsage(AU); | |||
| 78 | } | |||
| 79 | }; | |||
| 80 | ||||
| 81 | class SDWAOperand { | |||
| 82 | private: | |||
| 83 | MachineOperand *Target; // Operand that would be used in converted instruction | |||
| 84 | MachineOperand *Replaced; // Operand that would be replace by Target | |||
| 85 | ||||
| 86 | public: | |||
| 87 | SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) | |||
| 88 | : Target(TargetOp), Replaced(ReplacedOp) { | |||
| 89 | assert(Target->isReg())((void)0); | |||
| 90 | assert(Replaced->isReg())((void)0); | |||
| 91 | } | |||
| 92 | ||||
| 93 | virtual ~SDWAOperand() = default; | |||
| 94 | ||||
| 95 | virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; | |||
| 96 | virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; | |||
| 97 | ||||
| 98 | MachineOperand *getTargetOperand() const { return Target; } | |||
| 99 | MachineOperand *getReplacedOperand() const { return Replaced; } | |||
| 100 | MachineInstr *getParentInst() const { return Target->getParent(); } | |||
| 101 | ||||
| 102 | MachineRegisterInfo *getMRI() const { | |||
| 103 | return &getParentInst()->getParent()->getParent()->getRegInfo(); | |||
| 104 | } | |||
| 105 | ||||
| 106 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
| 107 | virtual void print(raw_ostream& OS) const = 0; | |||
| 108 | void dump() const { print(dbgs()); } | |||
| 109 | #endif | |||
| 110 | }; | |||
| 111 | ||||
| 112 | using namespace AMDGPU::SDWA; | |||
| 113 | ||||
| 114 | class SDWASrcOperand : public SDWAOperand { | |||
| 115 | private: | |||
| 116 | SdwaSel SrcSel; | |||
| 117 | bool Abs; | |||
| 118 | bool Neg; | |||
| 119 | bool Sext; | |||
| 120 | ||||
| 121 | public: | |||
| 122 | SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, | |||
| 123 | SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, | |||
| 124 | bool Sext_ = false) | |||
| 125 | : SDWAOperand(TargetOp, ReplacedOp), | |||
| 126 | SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} | |||
| 127 | ||||
| 128 | MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; | |||
| 129 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; | |||
| 130 | ||||
| 131 | SdwaSel getSrcSel() const { return SrcSel; } | |||
| 132 | bool getAbs() const { return Abs; } | |||
| 133 | bool getNeg() const { return Neg; } | |||
| 134 | bool getSext() const { return Sext; } | |||
| 135 | ||||
| 136 | uint64_t getSrcMods(const SIInstrInfo *TII, | |||
| 137 | const MachineOperand *SrcOp) const; | |||
| 138 | ||||
| 139 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
| 140 | void print(raw_ostream& OS) const override; | |||
| 141 | #endif | |||
| 142 | }; | |||
| 143 | ||||
| 144 | class SDWADstOperand : public SDWAOperand { | |||
| 145 | private: | |||
| 146 | SdwaSel DstSel; | |||
| 147 | DstUnused DstUn; | |||
| 148 | ||||
| 149 | public: | |||
| 150 | ||||
| 151 | SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, | |||
| 152 | SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) | |||
| 153 | : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} | |||
| 154 | ||||
| 155 | MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; | |||
| 156 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; | |||
| 157 | ||||
| 158 | SdwaSel getDstSel() const { return DstSel; } | |||
| 159 | DstUnused getDstUnused() const { return DstUn; } | |||
| 160 | ||||
| 161 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
| 162 | void print(raw_ostream& OS) const override; | |||
| 163 | #endif | |||
| 164 | }; | |||
| 165 | ||||
| 166 | class SDWADstPreserveOperand : public SDWADstOperand { | |||
| 167 | private: | |||
| 168 | MachineOperand *Preserve; | |||
| 169 | ||||
| 170 | public: | |||
| 171 | SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, | |||
| 172 | MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) | |||
| 173 | : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), | |||
| 174 | Preserve(PreserveOp) {} | |||
| 175 | ||||
| 176 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; | |||
| 177 | ||||
| 178 | MachineOperand *getPreservedOperand() const { return Preserve; } | |||
| 179 | ||||
| 180 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
| 181 | void print(raw_ostream& OS) const override; | |||
| 182 | #endif | |||
| 183 | }; | |||
| 184 | ||||
| 185 | } // end anonymous namespace | |||
| 186 | ||||
| 187 | INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)static void *initializeSIPeepholeSDWAPassOnce(PassRegistry & Registry) { PassInfo *PI = new PassInfo( "SI Peephole SDWA", "si-peephole-sdwa" , &SIPeepholeSDWA::ID, PassInfo::NormalCtor_t(callDefaultCtor <SIPeepholeSDWA>), false, false); Registry.registerPass (*PI, true); return PI; } static llvm::once_flag InitializeSIPeepholeSDWAPassFlag ; void llvm::initializeSIPeepholeSDWAPass(PassRegistry &Registry ) { llvm::call_once(InitializeSIPeepholeSDWAPassFlag, initializeSIPeepholeSDWAPassOnce , std::ref(Registry)); } | |||
| 188 | ||||
| 189 | char SIPeepholeSDWA::ID = 0; | |||
| 190 | ||||
| 191 | char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; | |||
| 192 | ||||
| 193 | FunctionPass *llvm::createSIPeepholeSDWAPass() { | |||
| 194 | return new SIPeepholeSDWA(); | |||
| 195 | } | |||
| 196 | ||||
| 197 | ||||
| 198 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
| 199 | static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { | |||
| 200 | switch(Sel) { | |||
| 201 | case BYTE_0: OS << "BYTE_0"; break; | |||
| 202 | case BYTE_1: OS << "BYTE_1"; break; | |||
| 203 | case BYTE_2: OS << "BYTE_2"; break; | |||
| 204 | case BYTE_3: OS << "BYTE_3"; break; | |||
| 205 | case WORD_0: OS << "WORD_0"; break; | |||
| 206 | case WORD_1: OS << "WORD_1"; break; | |||
| 207 | case DWORD: OS << "DWORD"; break; | |||
| 208 | } | |||
| 209 | return OS; | |||
| 210 | } | |||
| 211 | ||||
| 212 | static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { | |||
| 213 | switch(Un) { | |||
| 214 | case UNUSED_PAD: OS << "UNUSED_PAD"; break; | |||
| 215 | case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; | |||
| 216 | case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; | |||
| 217 | } | |||
| 218 | return OS; | |||
| 219 | } | |||
| 220 | ||||
| 221 | LLVM_DUMP_METHOD__attribute__((noinline)) | |||
| 222 | void SDWASrcOperand::print(raw_ostream& OS) const { | |||
| 223 | OS << "SDWA src: " << *getTargetOperand() | |||
| 224 | << " src_sel:" << getSrcSel() | |||
| 225 | << " abs:" << getAbs() << " neg:" << getNeg() | |||
| 226 | << " sext:" << getSext() << '\n'; | |||
| 227 | } | |||
| 228 | ||||
| 229 | LLVM_DUMP_METHOD__attribute__((noinline)) | |||
| 230 | void SDWADstOperand::print(raw_ostream& OS) const { | |||
| 231 | OS << "SDWA dst: " << *getTargetOperand() | |||
| 232 | << " dst_sel:" << getDstSel() | |||
| 233 | << " dst_unused:" << getDstUnused() << '\n'; | |||
| 234 | } | |||
| 235 | ||||
| 236 | LLVM_DUMP_METHOD__attribute__((noinline)) | |||
| 237 | void SDWADstPreserveOperand::print(raw_ostream& OS) const { | |||
| 238 | OS << "SDWA preserve dst: " << *getTargetOperand() | |||
| 239 | << " dst_sel:" << getDstSel() | |||
| 240 | << " preserve:" << *getPreservedOperand() << '\n'; | |||
| 241 | } | |||
| 242 | ||||
| 243 | #endif | |||
| 244 | ||||
| 245 | static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { | |||
| 246 | assert(To.isReg() && From.isReg())((void)0); | |||
| 247 | To.setReg(From.getReg()); | |||
| 248 | To.setSubReg(From.getSubReg()); | |||
| 249 | To.setIsUndef(From.isUndef()); | |||
| 250 | if (To.isUse()) { | |||
| 251 | To.setIsKill(From.isKill()); | |||
| 252 | } else { | |||
| 253 | To.setIsDead(From.isDead()); | |||
| 254 | } | |||
| 255 | } | |||
| 256 | ||||
| 257 | static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { | |||
| 258 | return LHS.isReg() && | |||
| 259 | RHS.isReg() && | |||
| 260 | LHS.getReg() == RHS.getReg() && | |||
| 261 | LHS.getSubReg() == RHS.getSubReg(); | |||
| 262 | } | |||
| 263 | ||||
| 264 | static MachineOperand *findSingleRegUse(const MachineOperand *Reg, | |||
| 265 | const MachineRegisterInfo *MRI) { | |||
| 266 | if (!Reg->isReg() || !Reg->isDef()) | |||
| 267 | return nullptr; | |||
| 268 | ||||
| 269 | MachineOperand *ResMO = nullptr; | |||
| 270 | for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) { | |||
| 271 | // If there exist use of subreg of Reg then return nullptr | |||
| 272 | if (!isSameReg(UseMO, *Reg)) | |||
| 273 | return nullptr; | |||
| 274 | ||||
| 275 | // Check that there is only one instruction that uses Reg | |||
| 276 | if (!ResMO) { | |||
| 277 | ResMO = &UseMO; | |||
| 278 | } else if (ResMO->getParent() != UseMO.getParent()) { | |||
| 279 | return nullptr; | |||
| 280 | } | |||
| 281 | } | |||
| 282 | ||||
| 283 | return ResMO; | |||
| 284 | } | |||
| 285 | ||||
| 286 | static MachineOperand *findSingleRegDef(const MachineOperand *Reg, | |||
| 287 | const MachineRegisterInfo *MRI) { | |||
| 288 | if (!Reg->isReg()) | |||
| 289 | return nullptr; | |||
| 290 | ||||
| 291 | MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg()); | |||
| 292 | if (!DefInstr) | |||
| 293 | return nullptr; | |||
| 294 | ||||
| 295 | for (auto &DefMO : DefInstr->defs()) { | |||
| 296 | if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) | |||
| 297 | return &DefMO; | |||
| 298 | } | |||
| 299 | ||||
| 300 | // Ignore implicit defs. | |||
| 301 | return nullptr; | |||
| 302 | } | |||
| 303 | ||||
| 304 | uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, | |||
| 305 | const MachineOperand *SrcOp) const { | |||
| 306 | uint64_t Mods = 0; | |||
| 307 | const auto *MI = SrcOp->getParent(); | |||
| 308 | if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) { | |||
| 309 | if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { | |||
| 310 | Mods = Mod->getImm(); | |||
| 311 | } | |||
| 312 | } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) { | |||
| 313 | if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) { | |||
| 314 | Mods = Mod->getImm(); | |||
| 315 | } | |||
| 316 | } | |||
| 317 | if (Abs || Neg) { | |||
| 318 | assert(!Sext &&((void)0) | |||
| 319 | "Float and integer src modifiers can't be set simulteniously")((void)0); | |||
| 320 | Mods |= Abs ? SISrcMods::ABS : 0u; | |||
| 321 | Mods ^= Neg ? SISrcMods::NEG : 0u; | |||
| 322 | } else if (Sext) { | |||
| 323 | Mods |= SISrcMods::SEXT; | |||
| 324 | } | |||
| 325 | ||||
| 326 | return Mods; | |||
| 327 | } | |||
| 328 | ||||
| 329 | MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { | |||
| 330 | // For SDWA src operand potential instruction is one that use register | |||
| 331 | // defined by parent instruction | |||
| 332 | MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); | |||
| 333 | if (!PotentialMO) | |||
| 334 | return nullptr; | |||
| 335 | ||||
| 336 | return PotentialMO->getParent(); | |||
| 337 | } | |||
| 338 | ||||
| 339 | bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { | |||
| 340 | // Find operand in instruction that matches source operand and replace it with | |||
| 341 | // target operand. Set corresponding src_sel | |||
| 342 | bool IsPreserveSrc = false; | |||
| 343 | MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
| 344 | MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); | |||
| 345 | MachineOperand *SrcMods = | |||
| 346 | TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); | |||
| 347 | assert(Src && (Src->isReg() || Src->isImm()))((void)0); | |||
| 348 | if (!isSameReg(*Src, *getReplacedOperand())) { | |||
| ||||
| 349 | // If this is not src0 then it could be src1 | |||
| 350 | Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
| 351 | SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); | |||
| 352 | SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); | |||
| 353 | ||||
| 354 | if (!Src || | |||
| 355 | !isSameReg(*Src, *getReplacedOperand())) { | |||
| 356 | // It's possible this Src is a tied operand for | |||
| 357 | // UNUSED_PRESERVE, in which case we can either | |||
| 358 | // abandon the peephole attempt, or if legal we can | |||
| 359 | // copy the target operand into the tied slot | |||
| 360 | // if the preserve operation will effectively cause the same | |||
| 361 | // result by overwriting the rest of the dst. | |||
| 362 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
| 363 | MachineOperand *DstUnused = | |||
| 364 | TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); | |||
| 365 | ||||
| 366 | if (Dst && | |||
| 367 | DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { | |||
| 368 | // This will work if the tied src is acessing WORD_0, and the dst is | |||
| 369 | // writing WORD_1. Modifiers don't matter because all the bits that | |||
| 370 | // would be impacted are being overwritten by the dst. | |||
| 371 | // Any other case will not work. | |||
| 372 | SdwaSel DstSel = static_cast<SdwaSel>( | |||
| 373 | TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); | |||
| 374 | if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && | |||
| 375 | getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { | |||
| 376 | IsPreserveSrc = true; | |||
| 377 | auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
| 378 | AMDGPU::OpName::vdst); | |||
| 379 | auto TiedIdx = MI.findTiedOperandIdx(DstIdx); | |||
| 380 | Src = &MI.getOperand(TiedIdx); | |||
| 381 | SrcSel = nullptr; | |||
| 382 | SrcMods = nullptr; | |||
| 383 | } else { | |||
| 384 | // Not legal to convert this src | |||
| 385 | return false; | |||
| 386 | } | |||
| 387 | } | |||
| 388 | } | |||
| 389 | assert(Src && Src->isReg())((void)0); | |||
| 390 | ||||
| 391 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || | |||
| 392 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || | |||
| 393 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || | |||
| 394 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && | |||
| 395 | !isSameReg(*Src, *getReplacedOperand())) { | |||
| 396 | // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to | |||
| 397 | // src2. This is not allowed. | |||
| 398 | return false; | |||
| 399 | } | |||
| 400 | ||||
| 401 | assert(isSameReg(*Src, *getReplacedOperand()) &&((void)0) | |||
| 402 | (IsPreserveSrc || (SrcSel && SrcMods)))((void)0); | |||
| 403 | } | |||
| 404 | copyRegOperand(*Src, *getTargetOperand()); | |||
| ||||
| 405 | if (!IsPreserveSrc) { | |||
| 406 | SrcSel->setImm(getSrcSel()); | |||
| 407 | SrcMods->setImm(getSrcMods(TII, Src)); | |||
| 408 | } | |||
| 409 | getTargetOperand()->setIsKill(false); | |||
| 410 | return true; | |||
| 411 | } | |||
| 412 | ||||
| 413 | MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { | |||
| 414 | // For SDWA dst operand potential instruction is one that defines register | |||
| 415 | // that this operand uses | |||
| 416 | MachineRegisterInfo *MRI = getMRI(); | |||
| 417 | MachineInstr *ParentMI = getParentInst(); | |||
| 418 | ||||
| 419 | MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI); | |||
| 420 | if (!PotentialMO) | |||
| 421 | return nullptr; | |||
| 422 | ||||
| 423 | // Check that ParentMI is the only instruction that uses replaced register | |||
| 424 | for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) { | |||
| 425 | if (&UseInst != ParentMI) | |||
| 426 | return nullptr; | |||
| 427 | } | |||
| 428 | ||||
| 429 | return PotentialMO->getParent(); | |||
| 430 | } | |||
| 431 | ||||
| 432 | bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { | |||
| 433 | // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused | |||
| 434 | ||||
| 435 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || | |||
| 436 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || | |||
| 437 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || | |||
| 438 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && | |||
| 439 | getDstSel() != AMDGPU::SDWA::DWORD) { | |||
| 440 | // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD | |||
| 441 | return false; | |||
| 442 | } | |||
| 443 | ||||
| 444 | MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
| 445 | assert(Operand &&((void)0) | |||
| 446 | Operand->isReg() &&((void)0) | |||
| 447 | isSameReg(*Operand, *getReplacedOperand()))((void)0); | |||
| 448 | copyRegOperand(*Operand, *getTargetOperand()); | |||
| 449 | MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); | |||
| 450 | assert(DstSel)((void)0); | |||
| 451 | DstSel->setImm(getDstSel()); | |||
| 452 | MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); | |||
| 453 | assert(DstUnused)((void)0); | |||
| 454 | DstUnused->setImm(getDstUnused()); | |||
| 455 | ||||
| 456 | // Remove original instruction because it would conflict with our new | |||
| 457 | // instruction by register definition | |||
| 458 | getParentInst()->eraseFromParent(); | |||
| 459 | return true; | |||
| 460 | } | |||
| 461 | ||||
| 462 | bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, | |||
| 463 | const SIInstrInfo *TII) { | |||
| 464 | // MI should be moved right before v_or_b32. | |||
| 465 | // For this we should clear all kill flags on uses of MI src-operands or else | |||
| 466 | // we can encounter problem with use of killed operand. | |||
| 467 | for (MachineOperand &MO : MI.uses()) { | |||
| 468 | if (!MO.isReg()) | |||
| 469 | continue; | |||
| 470 | getMRI()->clearKillFlags(MO.getReg()); | |||
| 471 | } | |||
| 472 | ||||
| 473 | // Move MI before v_or_b32 | |||
| 474 | auto MBB = MI.getParent(); | |||
| 475 | MBB->remove(&MI); | |||
| 476 | MBB->insert(getParentInst(), &MI); | |||
| 477 | ||||
| 478 | // Add Implicit use of preserved register | |||
| 479 | MachineInstrBuilder MIB(*MBB->getParent(), MI); | |||
| 480 | MIB.addReg(getPreservedOperand()->getReg(), | |||
| 481 | RegState::ImplicitKill, | |||
| 482 | getPreservedOperand()->getSubReg()); | |||
| 483 | ||||
| 484 | // Tie dst to implicit use | |||
| 485 | MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst), | |||
| 486 | MI.getNumOperands() - 1); | |||
| 487 | ||||
| 488 | // Convert MI as any other SDWADstOperand and remove v_or_b32 | |||
| 489 | return SDWADstOperand::convertToSDWA(MI, TII); | |||
| 490 | } | |||
| 491 | ||||
| 492 | Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { | |||
| 493 | if (Op.isImm()) { | |||
| 494 | return Op.getImm(); | |||
| 495 | } | |||
| 496 | ||||
| 497 | // If this is not immediate then it can be copy of immediate value, e.g.: | |||
| 498 | // %1 = S_MOV_B32 255; | |||
| 499 | if (Op.isReg()) { | |||
| 500 | for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { | |||
| 501 | if (!isSameReg(Op, Def)) | |||
| 502 | continue; | |||
| 503 | ||||
| 504 | const MachineInstr *DefInst = Def.getParent(); | |||
| 505 | if (!TII->isFoldableCopy(*DefInst)) | |||
| 506 | return None; | |||
| 507 | ||||
| 508 | const MachineOperand &Copied = DefInst->getOperand(1); | |||
| 509 | if (!Copied.isImm()) | |||
| 510 | return None; | |||
| 511 | ||||
| 512 | return Copied.getImm(); | |||
| 513 | } | |||
| 514 | } | |||
| 515 | ||||
| 516 | return None; | |||
| 517 | } | |||
| 518 | ||||
| 519 | std::unique_ptr<SDWAOperand> | |||
| 520 | SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { | |||
| 521 | unsigned Opcode = MI.getOpcode(); | |||
| 522 | switch (Opcode) { | |||
| 523 | case AMDGPU::V_LSHRREV_B32_e32: | |||
| 524 | case AMDGPU::V_ASHRREV_I32_e32: | |||
| 525 | case AMDGPU::V_LSHLREV_B32_e32: | |||
| 526 | case AMDGPU::V_LSHRREV_B32_e64: | |||
| 527 | case AMDGPU::V_ASHRREV_I32_e64: | |||
| 528 | case AMDGPU::V_LSHLREV_B32_e64: { | |||
| 529 | // from: v_lshrrev_b32_e32 v1, 16/24, v0 | |||
| 530 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 | |||
| 531 | ||||
| 532 | // from: v_ashrrev_i32_e32 v1, 16/24, v0 | |||
| 533 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 | |||
| 534 | ||||
| 535 | // from: v_lshlrev_b32_e32 v1, 16/24, v0 | |||
| 536 | // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD | |||
| 537 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
| 538 | auto Imm = foldToImm(*Src0); | |||
| 539 | if (!Imm) | |||
| 540 | break; | |||
| 541 | ||||
| 542 | if (*Imm != 16 && *Imm != 24) | |||
| 543 | break; | |||
| 544 | ||||
| 545 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
| 546 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
| 547 | if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) | |||
| 548 | break; | |||
| 549 | ||||
| 550 | if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || | |||
| 551 | Opcode == AMDGPU::V_LSHLREV_B32_e64) { | |||
| 552 | return std::make_unique<SDWADstOperand>( | |||
| 553 | Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); | |||
| 554 | } else { | |||
| 555 | return std::make_unique<SDWASrcOperand>( | |||
| 556 | Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, | |||
| 557 | Opcode != AMDGPU::V_LSHRREV_B32_e32 && | |||
| 558 | Opcode != AMDGPU::V_LSHRREV_B32_e64); | |||
| 559 | } | |||
| 560 | break; | |||
| 561 | } | |||
| 562 | ||||
| 563 | case AMDGPU::V_LSHRREV_B16_e32: | |||
| 564 | case AMDGPU::V_ASHRREV_I16_e32: | |||
| 565 | case AMDGPU::V_LSHLREV_B16_e32: | |||
| 566 | case AMDGPU::V_LSHRREV_B16_e64: | |||
| 567 | case AMDGPU::V_ASHRREV_I16_e64: | |||
| 568 | case AMDGPU::V_LSHLREV_B16_e64: { | |||
| 569 | // from: v_lshrrev_b16_e32 v1, 8, v0 | |||
| 570 | // to SDWA src:v0 src_sel:BYTE_1 | |||
| 571 | ||||
| 572 | // from: v_ashrrev_i16_e32 v1, 8, v0 | |||
| 573 | // to SDWA src:v0 src_sel:BYTE_1 sext:1 | |||
| 574 | ||||
| 575 | // from: v_lshlrev_b16_e32 v1, 8, v0 | |||
| 576 | // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD | |||
| 577 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
| 578 | auto Imm = foldToImm(*Src0); | |||
| 579 | if (!Imm || *Imm != 8) | |||
| 580 | break; | |||
| 581 | ||||
| 582 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
| 583 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
| 584 | ||||
| 585 | if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) | |||
| 586 | break; | |||
| 587 | ||||
| 588 | if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || | |||
| 589 | Opcode == AMDGPU::V_LSHLREV_B16_e64) { | |||
| 590 | return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); | |||
| 591 | } else { | |||
| 592 | return std::make_unique<SDWASrcOperand>( | |||
| 593 | Src1, Dst, BYTE_1, false, false, | |||
| 594 | Opcode != AMDGPU::V_LSHRREV_B16_e32 && | |||
| 595 | Opcode != AMDGPU::V_LSHRREV_B16_e64); | |||
| 596 | } | |||
| 597 | break; | |||
| 598 | } | |||
| 599 | ||||
| 600 | case AMDGPU::V_BFE_I32_e64: | |||
| 601 | case AMDGPU::V_BFE_U32_e64: { | |||
| 602 | // e.g.: | |||
| 603 | // from: v_bfe_u32 v1, v0, 8, 8 | |||
| 604 | // to SDWA src:v0 src_sel:BYTE_1 | |||
| 605 | ||||
| 606 | // offset | width | src_sel | |||
| 607 | // ------------------------ | |||
| 608 | // 0 | 8 | BYTE_0 | |||
| 609 | // 0 | 16 | WORD_0 | |||
| 610 | // 0 | 32 | DWORD ? | |||
| 611 | // 8 | 8 | BYTE_1 | |||
| 612 | // 16 | 8 | BYTE_2 | |||
| 613 | // 16 | 16 | WORD_1 | |||
| 614 | // 24 | 8 | BYTE_3 | |||
| 615 | ||||
| 616 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
| 617 | auto Offset = foldToImm(*Src1); | |||
| 618 | if (!Offset) | |||
| 619 | break; | |||
| 620 | ||||
| 621 | MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); | |||
| 622 | auto Width = foldToImm(*Src2); | |||
| 623 | if (!Width) | |||
| 624 | break; | |||
| 625 | ||||
| 626 | SdwaSel SrcSel = DWORD; | |||
| 627 | ||||
| 628 | if (*Offset == 0 && *Width == 8) | |||
| 629 | SrcSel = BYTE_0; | |||
| 630 | else if (*Offset == 0 && *Width == 16) | |||
| 631 | SrcSel = WORD_0; | |||
| 632 | else if (*Offset == 0 && *Width == 32) | |||
| 633 | SrcSel = DWORD; | |||
| 634 | else if (*Offset == 8 && *Width == 8) | |||
| 635 | SrcSel = BYTE_1; | |||
| 636 | else if (*Offset == 16 && *Width == 8) | |||
| 637 | SrcSel = BYTE_2; | |||
| 638 | else if (*Offset == 16 && *Width == 16) | |||
| 639 | SrcSel = WORD_1; | |||
| 640 | else if (*Offset == 24 && *Width == 8) | |||
| 641 | SrcSel = BYTE_3; | |||
| 642 | else | |||
| 643 | break; | |||
| 644 | ||||
| 645 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
| 646 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
| 647 | ||||
| 648 | if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical()) | |||
| 649 | break; | |||
| 650 | ||||
| 651 | return std::make_unique<SDWASrcOperand>( | |||
| 652 | Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64); | |||
| 653 | } | |||
| 654 | ||||
| 655 | case AMDGPU::V_AND_B32_e32: | |||
| 656 | case AMDGPU::V_AND_B32_e64: { | |||
| 657 | // e.g.: | |||
| 658 | // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 | |||
| 659 | // to SDWA src:v0 src_sel:WORD_0/BYTE_0 | |||
| 660 | ||||
| 661 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
| 662 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
| 663 | auto ValSrc = Src1; | |||
| 664 | auto Imm = foldToImm(*Src0); | |||
| 665 | ||||
| 666 | if (!Imm) { | |||
| 667 | Imm = foldToImm(*Src1); | |||
| 668 | ValSrc = Src0; | |||
| 669 | } | |||
| 670 | ||||
| 671 | if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) | |||
| 672 | break; | |||
| 673 | ||||
| 674 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
| 675 | ||||
| 676 | if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical()) | |||
| 677 | break; | |||
| 678 | ||||
| 679 | return std::make_unique<SDWASrcOperand>( | |||
| 680 | ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); | |||
| 681 | } | |||
| 682 | ||||
| 683 | case AMDGPU::V_OR_B32_e32: | |||
| 684 | case AMDGPU::V_OR_B32_e64: { | |||
| 685 | // Patterns for dst_unused:UNUSED_PRESERVE. | |||
| 686 | // e.g., from: | |||
| 687 | // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD | |||
| 688 | // src1_sel:WORD_1 src2_sel:WORD1 | |||
| 689 | // v_add_f16_e32 v3, v1, v2 | |||
| 690 | // v_or_b32_e32 v4, v0, v3 | |||
| 691 | // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 | |||
| 692 | ||||
| 693 | // Check if one of operands of v_or_b32 is SDWA instruction | |||
| 694 | using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>; | |||
| 695 | auto CheckOROperandsForSDWA = | |||
| 696 | [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { | |||
| 697 | if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) | |||
| 698 | return CheckRetType(None); | |||
| 699 | ||||
| 700 | MachineOperand *Op1Def = findSingleRegDef(Op1, MRI); | |||
| 701 | if (!Op1Def) | |||
| 702 | return CheckRetType(None); | |||
| 703 | ||||
| 704 | MachineInstr *Op1Inst = Op1Def->getParent(); | |||
| 705 | if (!TII->isSDWA(*Op1Inst)) | |||
| 706 | return CheckRetType(None); | |||
| 707 | ||||
| 708 | MachineOperand *Op2Def = findSingleRegDef(Op2, MRI); | |||
| 709 | if (!Op2Def) | |||
| 710 | return CheckRetType(None); | |||
| 711 | ||||
| 712 | return CheckRetType(std::make_pair(Op1Def, Op2Def)); | |||
| 713 | }; | |||
| 714 | ||||
| 715 | MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
| 716 | MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
| 717 | assert(OrSDWA && OrOther)((void)0); | |||
| 718 | auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); | |||
| 719 | if (!Res) { | |||
| 720 | OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
| 721 | OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
| 722 | assert(OrSDWA && OrOther)((void)0); | |||
| 723 | Res = CheckOROperandsForSDWA(OrSDWA, OrOther); | |||
| 724 | if (!Res) | |||
| 725 | break; | |||
| 726 | } | |||
| 727 | ||||
| 728 | MachineOperand *OrSDWADef = Res->first; | |||
| 729 | MachineOperand *OrOtherDef = Res->second; | |||
| 730 | assert(OrSDWADef && OrOtherDef)((void)0); | |||
| 731 | ||||
| 732 | MachineInstr *SDWAInst = OrSDWADef->getParent(); | |||
| 733 | MachineInstr *OtherInst = OrOtherDef->getParent(); | |||
| 734 | ||||
| 735 | // Check that OtherInstr is actually bitwise compatible with SDWAInst = their | |||
| 736 | // destination patterns don't overlap. Compatible instruction can be either | |||
| 737 | // regular instruction with compatible bitness or SDWA instruction with | |||
| 738 | // correct dst_sel | |||
| 739 | // SDWAInst | OtherInst bitness / OtherInst dst_sel | |||
| 740 | // ----------------------------------------------------- | |||
| 741 | // DWORD | no / no | |||
| 742 | // WORD_0 | no / BYTE_2/3, WORD_1 | |||
| 743 | // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0 | |||
| 744 | // BYTE_0 | no / BYTE_1/2/3, WORD_1 | |||
| 745 | // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1 | |||
| 746 | // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0 | |||
| 747 | // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0 | |||
| 748 | // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK | |||
| 749 | // but v_add_f32 is not. | |||
| 750 | ||||
| 751 | // TODO: add support for non-SDWA instructions as OtherInst. | |||
| 752 | // For now this only works with SDWA instructions. For regular instructions | |||
| 753 | // there is no way to determine if the instruction writes only 8/16/24-bit | |||
| 754 | // out of full register size and all registers are at min 32-bit wide. | |||
| 755 | if (!TII->isSDWA(*OtherInst)) | |||
| 756 | break; | |||
| 757 | ||||
| 758 | SdwaSel DstSel = static_cast<SdwaSel>( | |||
| 759 | TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));; | |||
| 760 | SdwaSel OtherDstSel = static_cast<SdwaSel>( | |||
| 761 | TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); | |||
| 762 | ||||
| 763 | bool DstSelAgree = false; | |||
| 764 | switch (DstSel) { | |||
| 765 | case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || | |||
| 766 | (OtherDstSel == BYTE_3) || | |||
| 767 | (OtherDstSel == WORD_1)); | |||
| 768 | break; | |||
| 769 | case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || | |||
| 770 | (OtherDstSel == BYTE_1) || | |||
| 771 | (OtherDstSel == WORD_0)); | |||
| 772 | break; | |||
| 773 | case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || | |||
| 774 | (OtherDstSel == BYTE_2) || | |||
| 775 | (OtherDstSel == BYTE_3) || | |||
| 776 | (OtherDstSel == WORD_1)); | |||
| 777 | break; | |||
| 778 | case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || | |||
| 779 | (OtherDstSel == BYTE_2) || | |||
| 780 | (OtherDstSel == BYTE_3) || | |||
| 781 | (OtherDstSel == WORD_1)); | |||
| 782 | break; | |||
| 783 | case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || | |||
| 784 | (OtherDstSel == BYTE_1) || | |||
| 785 | (OtherDstSel == BYTE_3) || | |||
| 786 | (OtherDstSel == WORD_0)); | |||
| 787 | break; | |||
| 788 | case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || | |||
| 789 | (OtherDstSel == BYTE_1) || | |||
| 790 | (OtherDstSel == BYTE_2) || | |||
| 791 | (OtherDstSel == WORD_0)); | |||
| 792 | break; | |||
| 793 | default: DstSelAgree = false; | |||
| 794 | } | |||
| 795 | ||||
| 796 | if (!DstSelAgree) | |||
| 797 | break; | |||
| 798 | ||||
| 799 | // Also OtherInst dst_unused should be UNUSED_PAD | |||
| 800 | DstUnused OtherDstUnused = static_cast<DstUnused>( | |||
| 801 | TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused)); | |||
| 802 | if (OtherDstUnused != DstUnused::UNUSED_PAD) | |||
| 803 | break; | |||
| 804 | ||||
| 805 | // Create DstPreserveOperand | |||
| 806 | MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
| 807 | assert(OrDst && OrDst->isReg())((void)0); | |||
| 808 | ||||
| 809 | return std::make_unique<SDWADstPreserveOperand>( | |||
| 810 | OrDst, OrSDWADef, OrOtherDef, DstSel); | |||
| 811 | ||||
| 812 | } | |||
| 813 | } | |||
| 814 | ||||
| 815 | return std::unique_ptr<SDWAOperand>(nullptr); | |||
| 816 | } | |||
| 817 | ||||
#if !defined(NDEBUG1)
// Stream-insertion operator for SDWAOperand so the LLVM_DEBUG traces in this
// pass can print operands with the usual "OS << Operand" idiom. Delegates to
// SDWAOperand::print. Only compiled into debug (assertions-enabled) builds.
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif
| 824 | ||||
| 825 | void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { | |||
| 826 | for (MachineInstr &MI : MBB) { | |||
| 827 | if (auto Operand = matchSDWAOperand(MI)) { | |||
| 828 | LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n')do { } while (false); | |||
| 829 | SDWAOperands[&MI] = std::move(Operand); | |||
| 830 | ++NumSDWAPatternsFound; | |||
| 831 | } | |||
| 832 | } | |||
| 833 | } | |||
| 834 | ||||
| 835 | // Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and | |||
| 836 | // V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA | |||
| 837 | // to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa. | |||
| 838 | // | |||
| 839 | // We are transforming from a VOP3 into a VOP2 form of the instruction. | |||
| 840 | // %19:vgpr_32 = V_AND_B32_e32 255, | |||
| 841 | // killed %16:vgpr_32, implicit $exec | |||
| 842 | // %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64 | |||
| 843 | // %26.sub0:vreg_64, %19:vgpr_32, implicit $exec | |||
| 844 | // %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 | |||
| 845 | // %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec | |||
| 846 | // | |||
| 847 | // becomes | |||
| 848 | // %47:vgpr_32 = V_ADD_CO_U32_sdwa | |||
| 849 | // 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, | |||
| 850 | // implicit-def $vcc, implicit $exec | |||
| 851 | // %48:vgpr_32 = V_ADDC_U32_e32 | |||
| 852 | // 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec | |||
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&((void)0)
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64")((void)0);

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  // The e64 carry-out goes through sdst; its single use identifies the
  // companion ADDC/SUBB instruction of the lowered 64-bit pair.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused.
  // Only if the carry chain ends here may it be retargeted to implicit VCC,
  // which is what the e32 forms use.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  // (25 is the neighborhood size passed to computeRegisterLiveness: the
  // number of instructions it may scan before answering.)
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in range of (MI,MISucc].
  // Any clobber in between would corrupt the carry we are about to route
  // through VCC.
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
    .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
    .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
  BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
    .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
    .setMIFlags(MISucc.getFlags());

  MISucc.eraseFromParent();
}
| 915 | ||||
| 916 | bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, | |||
| 917 | const GCNSubtarget &ST) const { | |||
| 918 | // Check if this is already an SDWA instruction | |||
| 919 | unsigned Opc = MI.getOpcode(); | |||
| 920 | if (TII->isSDWA(Opc)) | |||
| 921 | return true; | |||
| 922 | ||||
| 923 | // Check if this instruction has opcode that supports SDWA | |||
| 924 | if (AMDGPU::getSDWAOp(Opc) == -1) | |||
| 925 | Opc = AMDGPU::getVOPe32(Opc); | |||
| 926 | ||||
| 927 | if (AMDGPU::getSDWAOp(Opc) == -1) | |||
| 928 | return false; | |||
| 929 | ||||
| 930 | if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) | |||
| 931 | return false; | |||
| 932 | ||||
| 933 | if (TII->isVOPC(Opc)) { | |||
| 934 | if (!ST.hasSDWASdst()) { | |||
| 935 | const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); | |||
| 936 | if (SDst && (SDst->getReg() != AMDGPU::VCC && | |||
| 937 | SDst->getReg() != AMDGPU::VCC_LO)) | |||
| 938 | return false; | |||
| 939 | } | |||
| 940 | ||||
| 941 | if (!ST.hasSDWAOutModsVOPC() && | |||
| 942 | (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || | |||
| 943 | TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) | |||
| 944 | return false; | |||
| 945 | ||||
| 946 | } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || | |||
| 947 | !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { | |||
| 948 | return false; | |||
| 949 | } | |||
| 950 | ||||
| 951 | if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || | |||
| 952 | Opc == AMDGPU::V_FMAC_F32_e32 || | |||
| 953 | Opc == AMDGPU::V_MAC_F16_e32 || | |||
| 954 | Opc == AMDGPU::V_MAC_F32_e32)) | |||
| 955 | return false; | |||
| 956 | ||||
| 957 | // Check if target supports this SDWA opcode | |||
| 958 | if (TII->pseudoToMCOpcode(Opc) == -1) | |||
| 959 | return false; | |||
| 960 | ||||
| 961 | // FIXME: has SDWA but require handling of implicit VCC use | |||
| 962 | if (Opc == AMDGPU::V_CNDMASK_B32_e32) | |||
| 963 | return false; | |||
| 964 | ||||
| 965 | if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) { | |||
| 966 | if (!Src0->isReg() && !Src0->isImm()) | |||
| 967 | return false; | |||
| 968 | } | |||
| 969 | ||||
| 970 | if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) { | |||
| 971 | if (!Src1->isReg() && !Src1->isImm()) | |||
| 972 | return false; | |||
| 973 | } | |||
| 974 | ||||
| 975 | return true; | |||
| 976 | } | |||
| 977 | ||||
/// Rewrite \p MI into its SDWA form and try to fold every operand pattern in
/// \p SDWAOperands into the new instruction. Returns true — and erases \p MI
/// — if at least one operand was folded; otherwise the half-built SDWA
/// instruction is deleted again and \p MI is left untouched.
///
/// NOTE: operands are appended in the exact order the SDWA MCInstrDesc
/// expects (vdst/sdst, src0_modifiers, src0, src1_modifiers, src1, [src2],
/// clamp, [omod], dst_sel, dst_unused, src0_sel, [src1_sel]); the asserts
/// below check each named slot exists in the target opcode.
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI)do { } while (false);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    // Already SDWA (e.g. re-converted to fold additional operands).
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      // No direct SDWA form; go through the VOP2 (e32) encoding.
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1)((void)0);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst, if it is present in original then should also be present in SDWA
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1)((void)0);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&((void)0)
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1)((void)0);
    SDWAInst.add(*Dst);
  } else {
    // No explicit dst in MI (VOPC-style); the SDWA form defines VCC.
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1)((void)0);
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(((void)0)
      Src0 &&((void)0)
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&((void)0)
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1)((void)0);
  // Modifiers (neg/abs/sext) are appended before the source they apply to.
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(((void)0)
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&((void)0)
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1)((void)0);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2)((void)0);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1)((void)0);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  // Defaults below (DWORD / UNUSED_PAD) are the identity settings; the real
  // sub-dword selections are installed by the convertToSDWA calls on the
  // individual operands at the bottom of this function.
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1)((void)0);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1)((void)0);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in it's SDWA form,
    // with a tied operand.
    assert(Dst && Dst->isTied())((void)0);
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode))((void)0);
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1)((void)0);

    // Re-tie the preserved input to vdst on the rebuilt instruction.
    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand)do { } while (false);
    // There should be no intesection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into 3rd
    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
    // already destroyed). So if SDWAOperand is also a potential MI then do not
    // apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    // Remember the new instruction: its scalar operands still need to be
    // legalized by the caller (legalizeScalarOperands).
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    // Nothing folded; roll back by deleting the speculative SDWA copy.
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n')do { } while (false);
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
| 1160 | ||||
| 1161 | // If an instruction was converted to SDWA it should not have immediates or SGPR | |||
| 1162 | // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. | |||
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    // Only immediates and non-VGPR registers need legalization.
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    // Skip operand slots that have no register class or whose class cannot
    // hold VGPRs (a VGPR copy would be illegal there anyway).
    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    // Per the comment above: one SGPR is allowed on subtargets with SDWA
    // scalar support (GFX9); keep the first one on the constant bus.
    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    // Materialize the immediate/scalar into a fresh VGPR with a V_MOV_B32
    // placed just before MI, then rewrite the operand in place to use it.
    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    // ChangeToRegister only rewrites this operand; iteration over
    // explicit_uses() remains valid.
    Op.ChangeToRegister(VGPR, false);
  }
}
| 1193 | ||||
| 1194 | bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { | |||
| 1195 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
| 1196 | ||||
| 1197 | if (!ST.hasSDWA() || skipFunction(MF.getFunction())) | |||
| 1198 | return false; | |||
| 1199 | ||||
| 1200 | MRI = &MF.getRegInfo(); | |||
| 1201 | TRI = ST.getRegisterInfo(); | |||
| 1202 | TII = ST.getInstrInfo(); | |||
| 1203 | ||||
| 1204 | // Find all SDWA operands in MF. | |||
| 1205 | bool Ret = false; | |||
| 1206 | for (MachineBasicBlock &MBB : MF) { | |||
| 1207 | bool Changed = false; | |||
| 1208 | do { | |||
| 1209 | // Preprocess the ADD/SUB pairs so they could be SDWA'ed. | |||
| 1210 | // Look for a possible ADD or SUB that resulted from a previously lowered | |||
| 1211 | // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 | |||
| 1212 | // lowers the pair of instructions into e32 form. | |||
| 1213 | matchSDWAOperands(MBB); | |||
| 1214 | for (const auto &OperandPair : SDWAOperands) { | |||
| 1215 | const auto &Operand = OperandPair.second; | |||
| 1216 | MachineInstr *PotentialMI = Operand->potentialToConvert(TII); | |||
| 1217 | if (PotentialMI && | |||
| 1218 | (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || | |||
| 1219 | PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64)) | |||
| 1220 | pseudoOpConvertToVOP2(*PotentialMI, ST); | |||
| 1221 | } | |||
| 1222 | SDWAOperands.clear(); | |||
| 1223 | ||||
| 1224 | // Generate potential match list. | |||
| 1225 | matchSDWAOperands(MBB); | |||
| 1226 | ||||
| 1227 | for (const auto &OperandPair : SDWAOperands) { | |||
| 1228 | const auto &Operand = OperandPair.second; | |||
| 1229 | MachineInstr *PotentialMI = Operand->potentialToConvert(TII); | |||
| 1230 | if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { | |||
| 1231 | PotentialMatches[PotentialMI].push_back(Operand.get()); | |||
| 1232 | } | |||
| 1233 | } | |||
| 1234 | ||||
| 1235 | for (auto &PotentialPair : PotentialMatches) { | |||
| 1236 | MachineInstr &PotentialMI = *PotentialPair.first; | |||
| 1237 | convertToSDWA(PotentialMI, PotentialPair.second); | |||
| 1238 | } | |||
| 1239 | ||||
| 1240 | PotentialMatches.clear(); | |||
| 1241 | SDWAOperands.clear(); | |||
| 1242 | ||||
| 1243 | Changed = !ConvertedInstructions.empty(); | |||
| 1244 | ||||
| 1245 | if (Changed) | |||
| 1246 | Ret = true; | |||
| 1247 | while (!ConvertedInstructions.empty()) | |||
| 1248 | legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); | |||
| 1249 | } while (Changed); | |||
| 1250 | } | |||
| 1251 | ||||
| 1252 | return Ret; | |||
| 1253 | } |