Bug Summary

File: src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
Warning: line 1150, column 10
Called C++ object pointer is null
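
The warning reports a member function call made through a pointer that the analyzer has determined can be null at that point (SelectionDAGNodes.h, line 1150, column 10). A minimal, self-contained sketch of this defect class follows; the names are hypothetical and not taken from the LLVM sources:

#include <iostream>

struct Node {
  int value() const { return 42; }
};

// Hypothetical lookup that can legitimately return nullptr.
Node *findNode(bool present) { return present ? new Node() : nullptr; }

int main() {
  Node *N = findNode(false);
  // Calling N->value() without a prior null check is the pattern the
  // analyzer flags as "Called C++ object pointer is null".
  if (N)
    std::cout << N->value() << '\n';
  delete N;
  return 0;
}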

Annotated Source Code

clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -D PIC -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -D_RET_PROTECTOR -ret-protector -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86/X86ISelLowering.cpp

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86/X86ISelLowering.cpp

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/Instructions.h"
52#include "llvm/IR/Intrinsics.h"
53#include "llvm/IR/IRBuilder.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
58#include "llvm/Support/CommandLine.h"
59#include "llvm/Support/Debug.h"
60#include "llvm/Support/ErrorHandling.h"
61#include "llvm/Support/KnownBits.h"
62#include "llvm/Support/MathExtras.h"
63#include "llvm/Target/TargetOptions.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE"x86-isel" "x86-isel"
71
72STATISTIC(NumTailCalls, "Number of tail calls");
73
74static cl::opt<int> ExperimentalPrefLoopAlignment(
75 "x86-experimental-pref-loop-alignment", cl::init(4),
76 cl::desc(
77 "Sets the preferable loop alignment for experiments (as log2 bytes)"
78 "(the last x86-experimental-pref-loop-alignment bits"
79 " of the loop header PC will be 0)."),
80 cl::Hidden);
81
82static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
83 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
84 cl::desc(
85 "Sets the preferable loop alignment for experiments (as log2 bytes) "
86 "for innermost loops only. If specified, this option overrides "
87 "alignment set by x86-experimental-pref-loop-alignment."),
88 cl::Hidden);
89
90static cl::opt<bool> MulConstantOptimization(
91 "mul-constant-optimization", cl::init(true),
92 cl::desc("Replace 'mul x, Const' with more effective instructions like "
93 "SHIFT, LEA, etc."),
94 cl::Hidden);
95
96static cl::opt<bool> ExperimentalUnorderedISEL(
97 "x86-experimental-unordered-atomic-isel", cl::init(false),
98 cl::desc("Use LoadSDNode and StoreSDNode instead of "
99 "AtomicSDNode for unordered atomic loads and "
100 "stores respectively."),
101 cl::Hidden);
102
103/// Call this when the user attempts to do something unsupported, like
104/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
105/// report_fatal_error, so calling code should attempt to recover without
106/// crashing.
107static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
108 const char *Msg) {
109 MachineFunction &MF = DAG.getMachineFunction();
110 DAG.getContext()->diagnose(
111 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
112}
113
114X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
115 const X86Subtarget &STI)
116 : TargetLowering(TM), Subtarget(STI) {
117 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
118 X86ScalarSSEf64 = Subtarget.hasSSE2();
119 X86ScalarSSEf32 = Subtarget.hasSSE1();
120 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
121
122 // Set up the TargetLowering object.
123
124 // X86 is weird. It always uses i8 for shift amounts and setcc results.
125 setBooleanContents(ZeroOrOneBooleanContent);
126 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
127 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
128
129 // For 64-bit, since we have so many registers, use the ILP scheduler.
130 // For 32-bit, use the register pressure specific scheduling.
131 // For Atom, always use ILP scheduling.
132 if (Subtarget.isAtom())
133 setSchedulingPreference(Sched::ILP);
134 else if (Subtarget.is64Bit())
135 setSchedulingPreference(Sched::ILP);
136 else
137 setSchedulingPreference(Sched::RegPressure);
138 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
139 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
140
141 // Bypass expensive divides and use cheaper ones.
142 if (TM.getOptLevel() >= CodeGenOpt::Default) {
143 if (Subtarget.hasSlowDivide32())
144 addBypassSlowDiv(32, 8);
145 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
146 addBypassSlowDiv(64, 32);
147 }
148
149 // Setup Windows compiler runtime calls.
150 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
151 static const struct {
152 const RTLIB::Libcall Op;
153 const char * const Name;
154 const CallingConv::ID CC;
155 } LibraryCalls[] = {
156 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
157 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
158 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
159 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
160 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
161 };
162
163 for (const auto &LC : LibraryCalls) {
164 setLibcallName(LC.Op, LC.Name);
165 setLibcallCallingConv(LC.Op, LC.CC);
166 }
167 }
168
169 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
170 // MSVCRT doesn't have powi; fall back to pow
171 setLibcallName(RTLIB::POWI_F32, nullptr);
172 setLibcallName(RTLIB::POWI_F64, nullptr);
173 }
174
175 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
176 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
177 // FIXME: Should we be limiting the atomic size on other configs? Default is
178 // 1024.
179 if (!Subtarget.hasCmpxchg8b())
180 setMaxAtomicSizeInBitsSupported(32);
181
182 // Set up the register classes.
183 addRegisterClass(MVT::i8, &X86::GR8RegClass);
184 addRegisterClass(MVT::i16, &X86::GR16RegClass);
185 addRegisterClass(MVT::i32, &X86::GR32RegClass);
186 if (Subtarget.is64Bit())
187 addRegisterClass(MVT::i64, &X86::GR64RegClass);
188
189 for (MVT VT : MVT::integer_valuetypes())
190 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
191
192 // We don't accept any truncstore of integer registers.
193 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
194 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
197 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
198 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
199
200 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
201
202 // SETOEQ and SETUNE require checking two conditions.
203 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 setCondCodeAction(ISD::SETOEQ, VT, Expand);
205 setCondCodeAction(ISD::SETUNE, VT, Expand);
206 }
207
208 // Integer absolute.
209 if (Subtarget.hasCMov()) {
210 setOperationAction(ISD::ABS , MVT::i16 , Custom);
211 setOperationAction(ISD::ABS , MVT::i32 , Custom);
212 if (Subtarget.is64Bit())
213 setOperationAction(ISD::ABS , MVT::i64 , Custom);
214 }
215
216 // Funnel shifts.
217 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
218 // For slow shld targets we only lower for code size.
219 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
220
221 setOperationAction(ShiftOp , MVT::i8 , Custom);
222 setOperationAction(ShiftOp , MVT::i16 , Custom);
223 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
224 if (Subtarget.is64Bit())
225 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
226 }
227
228 if (!Subtarget.useSoftFloat()) {
229 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
230 // operation.
231 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
232 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
233 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
234 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
235 // We have an algorithm for SSE2, and we turn this into a 64-bit
236 // FILD or VCVTUSI2SS/SD for other targets.
237 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
238 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
239 // We have an algorithm for SSE2->double, and we turn this into a
240 // 64-bit FILD followed by conditional FADD for other targets.
241 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
242 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
243
244 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
245 // this operation.
246 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
247 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
248 // SSE has no i16 to fp conversion, only i32. We promote in the handler
249 // to allow f80 to use i16 and f64 to use i16 with sse1 only
250 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
251 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
252 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
253 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
254 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
255 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
256 // are Legal, f80 is custom lowered.
257 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
258 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
259
260 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
261 // this operation.
262 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
263 // FIXME: This doesn't generate invalid exception when it should. PR44019.
264 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
265 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
266 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
267 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
268 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
271 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
272 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
273
274 // Handle FP_TO_UINT by promoting the destination to a larger signed
275 // conversion.
276 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
278 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
279 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
280 // FIXME: This doesn't generate invalid exception when it should. PR44019.
281 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
282 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
283 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
284 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
285 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
286
287 setOperationAction(ISD::LRINT, MVT::f32, Custom);
288 setOperationAction(ISD::LRINT, MVT::f64, Custom);
289 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
290 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
291
292 if (!Subtarget.is64Bit()) {
293 setOperationAction(ISD::LRINT, MVT::i64, Custom);
294 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
295 }
296 }
297
298 if (Subtarget.hasSSE2()) {
299 // Custom lowering for saturating float to int conversions.
300 // We handle promotion to larger result types manually.
301 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
302 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
303 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
304 }
305 if (Subtarget.is64Bit()) {
306 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
307 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
308 }
309 }
310
311 // Handle address space casts between mixed sized pointers.
312 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
313 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
314
315 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
316 if (!X86ScalarSSEf64) {
317 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
318 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
319 if (Subtarget.is64Bit()) {
320 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
321 // Without SSE, i64->f64 goes through memory.
322 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
323 }
324 } else if (!Subtarget.is64Bit())
325 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
326
327 // Scalar integer divide and remainder are lowered to use operations that
328 // produce two results, to match the available instructions. This exposes
329 // the two-result form to trivial CSE, which is able to combine x/y and x%y
330 // into a single instruction.
331 //
332 // Scalar integer multiply-high is also lowered to use two-result
333 // operations, to match the available instructions. However, plain multiply
334 // (low) operations are left as Legal, as there are single-result
335 // instructions for this in x86. Using the two-result multiply instructions
336 // when both high and low results are needed must be arranged by dagcombine.
337 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
338 setOperationAction(ISD::MULHS, VT, Expand);
339 setOperationAction(ISD::MULHU, VT, Expand);
340 setOperationAction(ISD::SDIV, VT, Expand);
341 setOperationAction(ISD::UDIV, VT, Expand);
342 setOperationAction(ISD::SREM, VT, Expand);
343 setOperationAction(ISD::UREM, VT, Expand);
344 }
345
346 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
347 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
348 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
349 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
350 setOperationAction(ISD::BR_CC, VT, Expand);
351 setOperationAction(ISD::SELECT_CC, VT, Expand);
352 }
353 if (Subtarget.is64Bit())
354 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
355 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
356 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
357 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
358
359 setOperationAction(ISD::FREM , MVT::f32 , Expand);
360 setOperationAction(ISD::FREM , MVT::f64 , Expand);
361 setOperationAction(ISD::FREM , MVT::f80 , Expand);
362 setOperationAction(ISD::FREM , MVT::f128 , Expand);
363
364 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
365 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
366 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
367 }
368
369 // Promote the i8 variants and force them on up to i32 which has a shorter
370 // encoding.
371 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
372 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
373
374 if (Subtarget.hasBMI()) {
375 // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
376 // is enabled.
377 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
378 } else {
379 setOperationAction(ISD::CTTZ, MVT::i16, Custom);
380 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
381 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
382 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
383 if (Subtarget.is64Bit()) {
384 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
385 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
386 }
387 }
388
389 if (Subtarget.hasLZCNT()) {
390 // When promoting the i8 variants, force them to i32 for a shorter
391 // encoding.
392 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
393 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
394 } else {
395 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
396 if (VT == MVT::i64 && !Subtarget.is64Bit())
397 continue;
398 setOperationAction(ISD::CTLZ , VT, Custom);
399 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
400 }
401 }
402
403 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
404 ISD::STRICT_FP_TO_FP16}) {
405 // Special handling for half-precision floating point conversions.
406 // If we don't have F16C support, then lower half float conversions
407 // into library calls.
408 setOperationAction(
409 Op, MVT::f32,
410 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
411 // There's never any support for operations beyond MVT::f32.
412 setOperationAction(Op, MVT::f64, Expand);
413 setOperationAction(Op, MVT::f80, Expand);
414 setOperationAction(Op, MVT::f128, Expand);
415 }
416
417 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
418 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
419 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
420 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
421 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
422 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
423 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
424 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
425
426 setOperationAction(ISD::PARITY, MVT::i8, Custom);
427 if (Subtarget.hasPOPCNT()) {
428 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
429 } else {
430 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
431 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
432 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
433 if (Subtarget.is64Bit())
434 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
435 else
436 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
437
438 setOperationAction(ISD::PARITY, MVT::i16, Custom);
439 setOperationAction(ISD::PARITY, MVT::i32, Custom);
440 if (Subtarget.is64Bit())
441 setOperationAction(ISD::PARITY, MVT::i64, Custom);
442 }
443
444 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
445
446 if (!Subtarget.hasMOVBE())
447 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
448
449 // X86 wants to expand cmov itself.
450 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
451 setOperationAction(ISD::SELECT, VT, Custom);
452 setOperationAction(ISD::SETCC, VT, Custom);
453 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
454 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
455 }
456 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::SELECT, VT, Custom);
460 setOperationAction(ISD::SETCC, VT, Custom);
461 }
462
463 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
464 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
465 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
466
467 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
468 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
469 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
470 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
471 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
472 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
473 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
474 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
475
476 // Darwin ABI issue.
477 for (auto VT : { MVT::i32, MVT::i64 }) {
478 if (VT == MVT::i64 && !Subtarget.is64Bit())
479 continue;
480 setOperationAction(ISD::ConstantPool , VT, Custom);
481 setOperationAction(ISD::JumpTable , VT, Custom);
482 setOperationAction(ISD::GlobalAddress , VT, Custom);
483 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
484 setOperationAction(ISD::ExternalSymbol , VT, Custom);
485 setOperationAction(ISD::BlockAddress , VT, Custom);
486 }
487
488 // 64-bit shl, sra, srl (iff 32-bit x86)
489 for (auto VT : { MVT::i32, MVT::i64 }) {
490 if (VT == MVT::i64 && !Subtarget.is64Bit())
491 continue;
492 setOperationAction(ISD::SHL_PARTS, VT, Custom);
493 setOperationAction(ISD::SRA_PARTS, VT, Custom);
494 setOperationAction(ISD::SRL_PARTS, VT, Custom);
495 }
496
497 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
498 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
499
500 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
501
502 // Expand certain atomics
503 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
504 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
505 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
506 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
507 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
508 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
509 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
510 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
511 }
512
513 if (!Subtarget.is64Bit())
514 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
515
516 if (Subtarget.hasCmpxchg16b()) {
517 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
518 }
519
520 // FIXME - use subtarget debug flags
521 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
522 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
523 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
524 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
525 }
526
527 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
528 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
529
530 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
531 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
532
533 setOperationAction(ISD::TRAP, MVT::Other, Legal);
534 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
535 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
536
537 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
538 setOperationAction(ISD::VASTART , MVT::Other, Custom);
539 setOperationAction(ISD::VAEND , MVT::Other, Expand);
540 bool Is64Bit = Subtarget.is64Bit();
541 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
542 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
543
544 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
545 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
546
547 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
548
549 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
550 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
551 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
552
553 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
554 // f32 and f64 use SSE.
555 // Set up the FP register classes.
556 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
557 : &X86::FR32RegClass);
558 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
559 : &X86::FR64RegClass);
560
561 // Disable f32->f64 extload as we can only generate this in one instruction
562 // under optsize. So it's easier to pattern match (fpext (load)) for that
563 // case instead of needing to emit 2 instructions for extload in the
564 // non-optsize case.
565 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
566
567 for (auto VT : { MVT::f32, MVT::f64 }) {
568 // Use ANDPD to simulate FABS.
569 setOperationAction(ISD::FABS, VT, Custom);
570
571 // Use XORP to simulate FNEG.
572 setOperationAction(ISD::FNEG, VT, Custom);
573
574 // Use ANDPD and ORPD to simulate FCOPYSIGN.
575 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
576
577 // These might be better off as horizontal vector ops.
578 setOperationAction(ISD::FADD, VT, Custom);
579 setOperationAction(ISD::FSUB, VT, Custom);
580
581 // We don't support sin/cos/fmod
582 setOperationAction(ISD::FSIN , VT, Expand);
583 setOperationAction(ISD::FCOS , VT, Expand);
584 setOperationAction(ISD::FSINCOS, VT, Expand);
585 }
586
587 // Lower this to MOVMSK plus an AND.
588 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
589 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
590
591 } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
592 (UseX87 || Is64Bit)) {
593 // Use SSE for f32, x87 for f64.
594 // Set up the FP register classes.
595 addRegisterClass(MVT::f32, &X86::FR32RegClass);
596 if (UseX87)
597 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
598
599 // Use ANDPS to simulate FABS.
600 setOperationAction(ISD::FABS , MVT::f32, Custom);
601
602 // Use XORP to simulate FNEG.
603 setOperationAction(ISD::FNEG , MVT::f32, Custom);
604
605 if (UseX87)
606 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
607
608 // Use ANDPS and ORPS to simulate FCOPYSIGN.
609 if (UseX87)
610 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
611 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
612
613 // We don't support sin/cos/fmod
614 setOperationAction(ISD::FSIN , MVT::f32, Expand);
615 setOperationAction(ISD::FCOS , MVT::f32, Expand);
616 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
617
618 if (UseX87) {
619 // Always expand sin/cos functions even though x87 has an instruction.
620 setOperationAction(ISD::FSIN, MVT::f64, Expand);
621 setOperationAction(ISD::FCOS, MVT::f64, Expand);
622 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
623 }
624 } else if (UseX87) {
625 // f32 and f64 in x87.
626 // Set up the FP register classes.
627 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
628 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
629
630 for (auto VT : { MVT::f32, MVT::f64 }) {
631 setOperationAction(ISD::UNDEF, VT, Expand);
632 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
633
634 // Always expand sin/cos functions even though x87 has an instruction.
635 setOperationAction(ISD::FSIN , VT, Expand);
636 setOperationAction(ISD::FCOS , VT, Expand);
637 setOperationAction(ISD::FSINCOS, VT, Expand);
638 }
639 }
640
641 // Expand FP32 immediates into loads from the stack, save special cases.
642 if (isTypeLegal(MVT::f32)) {
643 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
644 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
645 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
646 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
647 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
648 } else // SSE immediates.
649 addLegalFPImmediate(APFloat(+0.0f)); // xorps
650 }
651 // Expand FP64 immediates into loads from the stack, save special cases.
652 if (isTypeLegal(MVT::f64)) {
653 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
654 addLegalFPImmediate(APFloat(+0.0)); // FLD0
655 addLegalFPImmediate(APFloat(+1.0)); // FLD1
656 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
657 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
658 } else // SSE immediates.
659 addLegalFPImmediate(APFloat(+0.0)); // xorpd
660 }
661 // Handle constrained floating-point operations of scalar.
662 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
663 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
664 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
665 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
666 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
667 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
668 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
669 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
670 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
671 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
672 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
673 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
674 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
675
676 // We don't support FMA.
677 setOperationAction(ISD::FMA, MVT::f64, Expand);
678 setOperationAction(ISD::FMA, MVT::f32, Expand);
679
680 // f80 always uses X87.
681 if (UseX87) {
682 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
683 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
684 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
685 {
686 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
687 addLegalFPImmediate(TmpFlt); // FLD0
688 TmpFlt.changeSign();
689 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
690
691 bool ignored;
692 APFloat TmpFlt2(+1.0);
693 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
694 &ignored);
695 addLegalFPImmediate(TmpFlt2); // FLD1
696 TmpFlt2.changeSign();
697 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
698 }
699
700 // Always expand sin/cos functions even though x87 has an instruction.
701 setOperationAction(ISD::FSIN , MVT::f80, Expand);
702 setOperationAction(ISD::FCOS , MVT::f80, Expand);
703 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
704
705 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
706 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
707 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
708 setOperationAction(ISD::FRINT, MVT::f80, Expand);
709 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
710 setOperationAction(ISD::FMA, MVT::f80, Expand);
711 setOperationAction(ISD::LROUND, MVT::f80, Expand);
712 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
713 setOperationAction(ISD::LRINT, MVT::f80, Custom);
714 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
715
716 // Handle constrained floating-point operations of scalar.
717 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
718 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
719 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
720 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
721 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
722 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
723 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
724 // as Custom.
725 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
726 }
727
728 // f128 uses xmm registers, but most operations require libcalls.
729 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
730 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
731 : &X86::VR128RegClass);
732
733 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
734
735 setOperationAction(ISD::FADD, MVT::f128, LibCall);
736 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
737 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
738 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
739 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
740 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
741 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
742 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
743 setOperationAction(ISD::FMA, MVT::f128, LibCall);
744 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
745
746 setOperationAction(ISD::FABS, MVT::f128, Custom);
747 setOperationAction(ISD::FNEG, MVT::f128, Custom);
748 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
749
750 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
751 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
752 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
753 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
754 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
755 // No STRICT_FSINCOS
756 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
757 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
758
759 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
760 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
761 // We need to custom handle any FP_ROUND with an f128 input, but
762 // LegalizeDAG uses the result type to know when to run a custom handler.
763 // So we have to list all legal floating point result types here.
764 if (isTypeLegal(MVT::f32)) {
765 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
766 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
767 }
768 if (isTypeLegal(MVT::f64)) {
769 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
770 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
771 }
772 if (isTypeLegal(MVT::f80)) {
773 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
774 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
775 }
776
777 setOperationAction(ISD::SETCC, MVT::f128, Custom);
778
779 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
780 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
781 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
782 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
783 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
784 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
785 }
786
787 // Always use a library call for pow.
788 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
789 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
790 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
791 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
792
793 setOperationAction(ISD::FLOG, MVT::f80, Expand);
794 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
795 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
796 setOperationAction(ISD::FEXP, MVT::f80, Expand);
797 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
798 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
799 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
800
801 // Some FP actions are always expanded for vector types.
802 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
803 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
804 setOperationAction(ISD::FSIN, VT, Expand);
805 setOperationAction(ISD::FSINCOS, VT, Expand);
806 setOperationAction(ISD::FCOS, VT, Expand);
807 setOperationAction(ISD::FREM, VT, Expand);
808 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
809 setOperationAction(ISD::FPOW, VT, Expand);
810 setOperationAction(ISD::FLOG, VT, Expand);
811 setOperationAction(ISD::FLOG2, VT, Expand);
812 setOperationAction(ISD::FLOG10, VT, Expand);
813 setOperationAction(ISD::FEXP, VT, Expand);
814 setOperationAction(ISD::FEXP2, VT, Expand);
815 }
816
817 // First set operation action for all vector types to either promote
818 // (for widening) or expand (for scalarization). Then we will selectively
819 // turn on ones that can be effectively codegen'd.
820 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
821 setOperationAction(ISD::SDIV, VT, Expand);
822 setOperationAction(ISD::UDIV, VT, Expand);
823 setOperationAction(ISD::SREM, VT, Expand);
824 setOperationAction(ISD::UREM, VT, Expand);
825 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
826 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
827 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
828 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
829 setOperationAction(ISD::FMA, VT, Expand);
830 setOperationAction(ISD::FFLOOR, VT, Expand);
831 setOperationAction(ISD::FCEIL, VT, Expand);
832 setOperationAction(ISD::FTRUNC, VT, Expand);
833 setOperationAction(ISD::FRINT, VT, Expand);
834 setOperationAction(ISD::FNEARBYINT, VT, Expand);
835 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
836 setOperationAction(ISD::MULHS, VT, Expand);
837 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
838 setOperationAction(ISD::MULHU, VT, Expand);
839 setOperationAction(ISD::SDIVREM, VT, Expand);
840 setOperationAction(ISD::UDIVREM, VT, Expand);
841 setOperationAction(ISD::CTPOP, VT, Expand);
842 setOperationAction(ISD::CTTZ, VT, Expand);
843 setOperationAction(ISD::CTLZ, VT, Expand);
844 setOperationAction(ISD::ROTL, VT, Expand);
845 setOperationAction(ISD::ROTR, VT, Expand);
846 setOperationAction(ISD::BSWAP, VT, Expand);
847 setOperationAction(ISD::SETCC, VT, Expand);
848 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
849 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
850 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
851 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
852 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
853 setOperationAction(ISD::TRUNCATE, VT, Expand);
854 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
855 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
856 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
857 setOperationAction(ISD::SELECT_CC, VT, Expand);
858 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
859 setTruncStoreAction(InnerVT, VT, Expand);
860
861 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
862 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
863
864 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
865 // types, we have to deal with them whether we ask for Expansion or not.
866 // Setting Expand causes its own optimisation problems though, so leave
867 // them legal.
868 if (VT.getVectorElementType() == MVT::i1)
869 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
870
871 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
872 // split/scalarized right now.
873 if (VT.getVectorElementType() == MVT::f16)
874 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
875 }
876 }
877
878 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
879 // with -msoft-float, disable use of MMX as well.
880 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
881 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
882 // No operations on x86mmx supported, everything uses intrinsics.
883 }
884
885 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
890 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
891 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
892 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
893 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
894 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
895 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
896 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
897
898 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
899 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
900
901 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
902 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
903 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
904 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
905 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
906 }
907
908 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
909 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
910 : &X86::VR128RegClass);
911
912 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
913 // registers cannot be used even for integer operations.
914 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
915 : &X86::VR128RegClass);
916 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
917 : &X86::VR128RegClass);
918 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
919 : &X86::VR128RegClass);
920 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
921 : &X86::VR128RegClass);
922
923 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
924 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
925 setOperationAction(ISD::SDIV, VT, Custom);
926 setOperationAction(ISD::SREM, VT, Custom);
927 setOperationAction(ISD::UDIV, VT, Custom);
928 setOperationAction(ISD::UREM, VT, Custom);
929 }
930
931 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
932 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
933 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
934
935 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
936 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
937 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
938 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
939 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
940 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
941 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
942 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
943 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
944 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
945
946 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
947 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
948
949 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
950 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
951 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
952
953 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
954 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
955 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
956 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
957 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
958 }
959
960 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
961 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
962 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
963 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
964 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
965 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
966 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
967 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
968 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
969 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
970
971 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
972 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
973 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
974
975 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
976 setOperationAction(ISD::SETCC, VT, Custom);
977 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
978 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
979 setOperationAction(ISD::CTPOP, VT, Custom);
980 setOperationAction(ISD::ABS, VT, Custom);
981
982 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
983 // setcc all the way to isel and prefer SETGT in some isel patterns.
984 setCondCodeAction(ISD::SETLT, VT, Custom);
985 setCondCodeAction(ISD::SETLE, VT, Custom);
986 }
987
988 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
989 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
990 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
991 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
992 setOperationAction(ISD::VSELECT, VT, Custom);
993 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
994 }
995
996 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
997 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
998 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
999 setOperationAction(ISD::VSELECT, VT, Custom);
1000
1001 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1002 continue;
1003
1004 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1005 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1006 }
1007
1008 // Custom lower v2i64 and v2f64 selects.
1009 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1010 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1011 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1013 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1014
1015 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1016 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1017 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1018 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1019 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
1020 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1021
1022 // Custom legalize these to avoid over promotion or custom promotion.
1023 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1024 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1025 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1026 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1027 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1028 }
1029
1030 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1031 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1032 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1033 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1034
1035 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1036 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1037
1038 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1039 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1040
1041 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1042 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1043 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1044 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1045 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1046
1047 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1048 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1049 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1050 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1051
1052 // We want to legalize this to an f64 load rather than an i64 load on
1053 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1054 // store.
1055 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1056 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1057 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1058 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1059 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1060 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1061
1062 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1063 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1064 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1065 if (!Subtarget.hasAVX512())
1066 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1067
1068 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1069 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1070 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1071
1072 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1073
1074 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1075 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1076 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1077 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1078 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1079 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1080
1081 // In the customized shift lowering, the legal v4i32/v2i64 cases
1082 // in AVX2 will be recognized.
1083 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1084 setOperationAction(ISD::SRL, VT, Custom);
1085 setOperationAction(ISD::SHL, VT, Custom);
1086 setOperationAction(ISD::SRA, VT, Custom);
1087 }
1088
1089 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1090 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1091
1092 // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1093 // shifts) is better.
1094 if (!Subtarget.useAVX512Regs() &&
1095 !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1096 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1097
1098 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1099 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1100 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1101 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1102 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1103 }
1104
1105 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1106 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1107 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1108 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1109 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1110 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1111 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1112 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1113 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1114
1115 // These might be better off as horizontal vector ops.
1116 setOperationAction(ISD::ADD, MVT::i16, Custom);
1117 setOperationAction(ISD::ADD, MVT::i32, Custom);
1118 setOperationAction(ISD::SUB, MVT::i16, Custom);
1119 setOperationAction(ISD::SUB, MVT::i32, Custom);
1120 }
1121
1122 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1123 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1124 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1125 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1126 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1127 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1128 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1129 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1130 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1131 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1132 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1133 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1134 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1135 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1136
1137 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1138 }
1139
1140 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1141 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1142 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1143 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1144 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1145 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1146 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1147 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1148
1149 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1150
1151 // FIXME: Do we need to handle scalar-to-vector here?
1152 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1153
1154 // We directly match byte blends in the backend as they match the VSELECT
1155 // condition form.
1156 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1157
1158 // SSE41 brings specific instructions for doing vector sign extend even in
1159 // cases where we don't have SRA.
1160 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1161 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1162 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1163 }
1164
1165 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1166 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1167 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1168 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1169 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1170 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1171 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1172 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1173 }
1174
1175 // i8 vectors are custom because the source register and source memory
1176 // operand types are not the same width.
1177 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1178
1179 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1180 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1181 // do the pre and post work in the vector domain.
1182 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1183 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1184 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1185 // so that DAG combine doesn't try to turn it into uint_to_fp.
1186 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1187 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1188 }
1189 }
1190
1191 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1192 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1193 }
1194
1195 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1196 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1197 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1198 setOperationAction(ISD::ROTL, VT, Custom);
1199
1200 // XOP can efficiently perform BITREVERSE with VPPERM.
1201 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1202 setOperationAction(ISD::BITREVERSE, VT, Custom);
1203
1204 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1205 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1206 setOperationAction(ISD::BITREVERSE, VT, Custom);
1207 }
1208
1209 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1210 bool HasInt256 = Subtarget.hasInt256();
1211
1212 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1213 : &X86::VR256RegClass);
1214 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1215 : &X86::VR256RegClass);
1216 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1217 : &X86::VR256RegClass);
1218 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1219 : &X86::VR256RegClass);
1220 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1221 : &X86::VR256RegClass);
1222 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1223 : &X86::VR256RegClass);
1224
1225 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1226 setOperationAction(ISD::FFLOOR, VT, Legal);
1227 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1228 setOperationAction(ISD::FCEIL, VT, Legal);
1229 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1230 setOperationAction(ISD::FTRUNC, VT, Legal);
1231 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1232 setOperationAction(ISD::FRINT, VT, Legal);
1233 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1234 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1235 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1236 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1237 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1238
1239 setOperationAction(ISD::FROUND, VT, Custom);
1240
1241 setOperationAction(ISD::FNEG, VT, Custom);
1242 setOperationAction(ISD::FABS, VT, Custom);
1243 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1244 }
1245
1246 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1247 // even though v8i16 is a legal type.
1248 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1249 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1250 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1251 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1252 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1253 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1254 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1255
1256 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1257 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1258
1259 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1260 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1261 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1262 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1263 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1264 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1265 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1266 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1267 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1268 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1269 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1270 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1271
1272 if (!Subtarget.hasAVX512())
1273 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1274
1275 // In the custom shift lowering, the v8i32/v4i64 cases that are legal
1276 // with AVX2 will be recognized.
1277 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1278 setOperationAction(ISD::SRL, VT, Custom);
1279 setOperationAction(ISD::SHL, VT, Custom);
1280 setOperationAction(ISD::SRA, VT, Custom);
1281 }
1282
1283 // These types need custom splitting if their input is a 128-bit vector.
1284 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1285 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1286 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1287 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1288
1289 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1290 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1291
1292 // With BWI, expanding (and promoting the shifts) is better.
1293 if (!Subtarget.useBWIRegs())
1294 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1295
1296 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1297 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1298 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1299 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1300 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1301 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1302
1303 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1304 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1305 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1306 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1307 }
1308
1309 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1310 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1311 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1312 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1313
1314 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1315 setOperationAction(ISD::SETCC, VT, Custom);
1316 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1317 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1318 setOperationAction(ISD::CTPOP, VT, Custom);
1319 setOperationAction(ISD::CTLZ, VT, Custom);
1320
1321 // These condition codes aren't legal in SSE/AVX; under AVX512 we use
1322 // setcc all the way to isel and prefer SETGT in some isel patterns.
1323 setCondCodeAction(ISD::SETLT, VT, Custom);
1324 setCondCodeAction(ISD::SETLE, VT, Custom);
1325 }
1326
1327 if (Subtarget.hasAnyFMA()) {
1328 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1329 MVT::v2f64, MVT::v4f64 }) {
1330 setOperationAction(ISD::FMA, VT, Legal);
1331 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1332 }
1333 }
1334
1335 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1336 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1337 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1338 }
1339
1340 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1341 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1342 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1343 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1344
1345 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1346 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1347 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1348 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1349 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1350 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1351
1352 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1353 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1354
1355 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1356 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1357 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1358 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1359 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1360
1361 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1362 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1363 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1364 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1365 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1366 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1367 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1368 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1369 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1370 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1371 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1372 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1373
1374 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1375 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1376 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1377 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1378 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1379 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1380 }
1381
1382 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1383 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1384 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1385 }
1386
1387 if (HasInt256) {
1388 // The custom lowering of UINT_TO_FP for v8i32 becomes interesting
1389 // when we have a 256-bit-wide blend with immediate.
1390 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1391 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1392
1393 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1394 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1395 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1396 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1397 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1398 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1399 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1401 }
1402 }
1403
1404 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1405 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1406 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1407 setOperationAction(ISD::MSTORE, VT, Legal);
1408 }
1409
1410 // Extract subvector is special because the value type
1411 // (result) is 128-bit but the source is 256-bit wide.
1412 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1413 MVT::v4f32, MVT::v2f64 }) {
1414 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1415 }
1416
1417 // Custom lower several nodes for 256-bit types.
1418 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1419 MVT::v8f32, MVT::v4f64 }) {
1420 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1421 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1422 setOperationAction(ISD::VSELECT, VT, Custom);
1423 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1424 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1425 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1426 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1427 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1428 setOperationAction(ISD::STORE, VT, Custom);
1429 }
1430
1431 if (HasInt256) {
1432 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1433
1434 // Custom legalize 2x32 to get a little better code.
1435 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1436 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1437
1438 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1439 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1440 setOperationAction(ISD::MGATHER, VT, Custom);
1441 }
1442 }
1443
1444 // This block controls legalization of the mask vector sizes that are
1445 // available with AVX512. 512-bit vectors are in a separate block controlled
1446 // by useAVX512Regs.
1447 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1448 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1449 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1450 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1451 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1452 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1453
1454 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1455 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1456 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1457
1458 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1459 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1460 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1461 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1462 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1463 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1464 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1465 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1466 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1467 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1468 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1469 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1470
1471 // There is no byte-sized k-register load or store without AVX512DQ.
1472 if (!Subtarget.hasDQI()) {
1473 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1474 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1475 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1476 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1477
1478 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1479 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1480 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1481 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1482 }
1483
1484 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1485 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1486 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1487 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1488 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1489 }
1490
1491 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1492 setOperationAction(ISD::VSELECT, VT, Expand);
1493
1494 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1495 setOperationAction(ISD::SETCC, VT, Custom);
1496 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1497 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1498 setOperationAction(ISD::SELECT, VT, Custom);
1499 setOperationAction(ISD::TRUNCATE, VT, Custom);
1500
1501 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1502 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1503 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1504 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1505 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1506 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1507 }
1508
1509 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1510 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1511 }
1512
1513 // This block controls legalization for 512-bit operations with 32/64-bit
1514 // elements. 512-bit operations can be disabled based on the prefer-vector-width
1515 // and required-vector-width function attributes.
1516 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1517 bool HasBWI = Subtarget.hasBWI();
1518
1519 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1520 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1521 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1522 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1523 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1524 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1525
1526 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1527 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1528 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1529 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1530 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1531 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1532 if (HasBWI)
1533 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1534 }
1535
1536 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1537 setOperationAction(ISD::FNEG, VT, Custom);
1538 setOperationAction(ISD::FABS, VT, Custom);
1539 setOperationAction(ISD::FMA, VT, Legal);
1540 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1541 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1542 }
1543
1544 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1545 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1546 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1547 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1548 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1549 }
1550 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1551 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1552 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1553 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1554 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1555 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1556 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1557 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1558
1559 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1560 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1561 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1562 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1563 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1564 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1565 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1566 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1567 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1568 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1569 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1570 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1571
1572 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1573 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1574 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1575 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1576 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1577 if (HasBWI)
1578 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1579
1580 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1581 // to 512-bit rather than use the AVX2 instructions so that we can use
1582 // k-masks.
1583 if (!Subtarget.hasVLX()) {
1584 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1585 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1586 setOperationAction(ISD::MLOAD, VT, Custom);
1587 setOperationAction(ISD::MSTORE, VT, Custom);
1588 }
1589 }
1590
1591 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1592 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1593 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1594 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1595 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1596 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1597 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1598 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1599 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1600 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1601 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1602 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1603 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1604
1605 if (HasBWI) {
1606 // Extends from v64i1 masks to 512-bit vectors.
1607 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1608 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1609 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1610 }
1611
1612 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1613 setOperationAction(ISD::FFLOOR, VT, Legal);
1614 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1615 setOperationAction(ISD::FCEIL, VT, Legal);
1616 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1617 setOperationAction(ISD::FTRUNC, VT, Legal);
1618 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1619 setOperationAction(ISD::FRINT, VT, Legal);
1620 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1621 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1622 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1623 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1624 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1625
1626 setOperationAction(ISD::FROUND, VT, Custom);
1627 }
1628
1629 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1630 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1631 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1632 }
1633
1634 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1635 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1636 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1637 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1638
1639 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1640 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1641 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1642 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1643
1644 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1645 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1646 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1647 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1648 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1649 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1650
1651 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1652 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1653
1654 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1655
1656 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1657 setOperationAction(ISD::SRL, VT, Custom);
1658 setOperationAction(ISD::SHL, VT, Custom);
1659 setOperationAction(ISD::SRA, VT, Custom);
1660 setOperationAction(ISD::SETCC, VT, Custom);
1661
1662 // These condition codes aren't legal in SSE/AVX; under AVX512 we use
1663 // setcc all the way to isel and prefer SETGT in some isel patterns.
1664 setCondCodeAction(ISD::SETLT, VT, Custom);
1665 setCondCodeAction(ISD::SETLE, VT, Custom);
1666 }
1667 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1668 setOperationAction(ISD::SMAX, VT, Legal);
1669 setOperationAction(ISD::UMAX, VT, Legal);
1670 setOperationAction(ISD::SMIN, VT, Legal);
1671 setOperationAction(ISD::UMIN, VT, Legal);
1672 setOperationAction(ISD::ABS, VT, Legal);
1673 setOperationAction(ISD::CTPOP, VT, Custom);
1674 setOperationAction(ISD::ROTL, VT, Custom);
1675 setOperationAction(ISD::ROTR, VT, Custom);
1676 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1677 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1678 }
1679
1680 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1681 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1682 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1683 setOperationAction(ISD::CTLZ, VT, Custom);
1684 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1685 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1686 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1687 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1688 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1689 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1690 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1691 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1692 }
1693
1694 if (Subtarget.hasDQI()) {
1695 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1696 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1697 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1698 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1699 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1700 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1701 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1702 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1703
1704 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1705 }
1706
1707 if (Subtarget.hasCDI()) {
1708 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1709 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1710 setOperationAction(ISD::CTLZ, VT, Legal);
1711 }
1712 } // Subtarget.hasCDI()
1713
1714 if (Subtarget.hasVPOPCNTDQ()) {
1715 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1716 setOperationAction(ISD::CTPOP, VT, Legal);
1717 }
1718
1719 // Extract subvector is special because the value type
1720 // (result) is 256-bit but the source is 512-bit wide.
1721 // 128-bit was made Legal under AVX1.
1722 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1723 MVT::v8f32, MVT::v4f64 })
1724 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1725
1726 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1727 MVT::v16f32, MVT::v8f64 }) {
1728 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1729 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1730 setOperationAction(ISD::SELECT, VT, Custom);
1731 setOperationAction(ISD::VSELECT, VT, Custom);
1732 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1733 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1734 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1735 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1736 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1737 }
1738
1739 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1740 setOperationAction(ISD::MLOAD, VT, Legal);
1741 setOperationAction(ISD::MSTORE, VT, Legal);
1742 setOperationAction(ISD::MGATHER, VT, Custom);
1743 setOperationAction(ISD::MSCATTER, VT, Custom);
1744 }
1745 if (HasBWI) {
1746 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1747 setOperationAction(ISD::MLOAD, VT, Legal);
1748 setOperationAction(ISD::MSTORE, VT, Legal);
1749 }
1750 } else {
1751 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1752 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1753 }
1754
1755 if (Subtarget.hasVBMI2()) {
1756 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1757 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1758 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1759 setOperationAction(ISD::FSHL, VT, Custom);
1760 setOperationAction(ISD::FSHR, VT, Custom);
1761 }
1762
1763 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1764 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1765 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1766 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1767 }
1768 } // useAVX512Regs
1769
1770 // This block controls legalization for operations that don't have
1771 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1772 // narrower widths.
1773 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1774 // These operations are handled on non-VLX by artificially widening in
1775 // isel patterns.
1776
1777 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1778 Subtarget.hasVLX() ? Legal : Custom);
1779 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1780 Subtarget.hasVLX() ? Legal : Custom);
1781 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1782 Subtarget.hasVLX() ? Legal : Custom);
1783 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1784 Subtarget.hasVLX() ? Legal : Custom);
1785 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1786 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1787 Subtarget.hasVLX() ? Legal : Custom);
1788 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1789 Subtarget.hasVLX() ? Legal : Custom);
1790 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1791 Subtarget.hasVLX() ? Legal : Custom);
1792 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1793 Subtarget.hasVLX() ? Legal : Custom);
1794
1795 if (Subtarget.hasDQI()) {
1796 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1797 // v2f32 UINT_TO_FP is already custom under SSE2.
1798 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&((void)0)
1799 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&((void)0)
1800 "Unexpected operation action!")((void)0);
1801 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1802 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1803 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1804 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1805 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1806 }
1807
1808 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1809 setOperationAction(ISD::SMAX, VT, Legal);
1810 setOperationAction(ISD::UMAX, VT, Legal);
1811 setOperationAction(ISD::SMIN, VT, Legal);
1812 setOperationAction(ISD::UMIN, VT, Legal);
1813 setOperationAction(ISD::ABS, VT, Legal);
1814 }
1815
1816 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1817 setOperationAction(ISD::ROTL, VT, Custom);
1818 setOperationAction(ISD::ROTR, VT, Custom);
1819 }
1820
1821 // Custom legalize 2x32 to get a little better code.
1822 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1823 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1824
1825 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1826 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1827 setOperationAction(ISD::MSCATTER, VT, Custom);
1828
1829 if (Subtarget.hasDQI()) {
1830 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1831 setOperationAction(ISD::SINT_TO_FP, VT,
1832 Subtarget.hasVLX() ? Legal : Custom);
1833 setOperationAction(ISD::UINT_TO_FP, VT,
1834 Subtarget.hasVLX() ? Legal : Custom);
1835 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1836 Subtarget.hasVLX() ? Legal : Custom);
1837 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1838 Subtarget.hasVLX() ? Legal : Custom);
1839 setOperationAction(ISD::FP_TO_SINT, VT,
1840 Subtarget.hasVLX() ? Legal : Custom);
1841 setOperationAction(ISD::FP_TO_UINT, VT,
1842 Subtarget.hasVLX() ? Legal : Custom);
1843 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1844 Subtarget.hasVLX() ? Legal : Custom);
1845 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1846 Subtarget.hasVLX() ? Legal : Custom);
1847 setOperationAction(ISD::MUL, VT, Legal);
1848 }
1849 }
1850
1851 if (Subtarget.hasCDI()) {
1852 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1853 setOperationAction(ISD::CTLZ, VT, Legal);
1854 }
1855 } // Subtarget.hasCDI()
1856
1857 if (Subtarget.hasVPOPCNTDQ()) {
1858 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1859 setOperationAction(ISD::CTPOP, VT, Legal);
1860 }
1861 }
1862
1863 // This block controls legalization of v32i1/v64i1, which are available with
1864 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1865 // useBWIRegs.
1866 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1867 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1868 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1869
1870 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1871 setOperationAction(ISD::VSELECT, VT, Expand);
1872 setOperationAction(ISD::TRUNCATE, VT, Custom);
1873 setOperationAction(ISD::SETCC, VT, Custom);
1874 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1875 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1876 setOperationAction(ISD::SELECT, VT, Custom);
1877 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1878 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1879 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1880 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1881 }
1882
1883 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1884 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1885
1886 // Extends from v32i1 masks to 256-bit vectors.
1887 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1888 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1889 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1890
1891 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1892 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1893 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1894 }
1895
1896 // These operations are handled on non-VLX by artificially widening in
1897 // isel patterns.
1898 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1899
1900 if (Subtarget.hasBITALG()) {
1901 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1902 setOperationAction(ISD::CTPOP, VT, Legal);
1903 }
1904 }
1905
1906 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1907 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1908 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1909 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1910 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1911 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1912
1913 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1914 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1915 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1916 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1917 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1918
1919 if (Subtarget.hasBWI()) {
1920 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1921 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1922 }
1923
1924 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1925 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1926 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1927 }
1928
1929 if (Subtarget.hasAMXTILE()) {
1930 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
1931 }
1932
1933 // We want to custom lower some of our intrinsics.
1934 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1935 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1936 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1937 if (!Subtarget.is64Bit()) {
1938 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1939 }
1940
1941 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1942 // handle type legalization for these operations here.
1943 //
1944 // FIXME: We really should do custom legalization for addition and
1945 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1946 // than generic legalization for 64-bit multiplication-with-overflow, though.
1947 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1948 if (VT == MVT::i64 && !Subtarget.is64Bit())
1949 continue;
1950 // Add/Sub/Mul with overflow operations are custom lowered.
1951 setOperationAction(ISD::SADDO, VT, Custom);
1952 setOperationAction(ISD::UADDO, VT, Custom);
1953 setOperationAction(ISD::SSUBO, VT, Custom);
1954 setOperationAction(ISD::USUBO, VT, Custom);
1955 setOperationAction(ISD::SMULO, VT, Custom);
1956 setOperationAction(ISD::UMULO, VT, Custom);
1957
1958 // Support carry-in as a value rather than glue.
1959 setOperationAction(ISD::ADDCARRY, VT, Custom);
1960 setOperationAction(ISD::SUBCARRY, VT, Custom);
1961 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1962 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
1963 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
1964 }
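For reference, a hedged sketch (not part of the analyzed file) of the source-level construct that typically produces these overflow nodes, assuming Clang's usual lowering of the overflow builtins through llvm.*.with.overflow to ISD::SADDO/UADDO; the function name is hypothetical:

// Illustrative sketch only: __builtin_add_overflow on i32 is a typical
// source of an ISD::SADDO node, which the custom lowering above turns into
// an ADD plus a flags-based set (overflow flag for signed, carry for unsigned).
#include <cstdint>

bool addWouldOverflow(int32_t A, int32_t B, int32_t &Out) {
  return __builtin_add_overflow(A, B, &Out);   // signed add with overflow check
}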
1965
1966 if (!Subtarget.is64Bit()) {
1967 // These libcalls are not available in 32-bit mode.
1968 setLibcallName(RTLIB::SHL_I128, nullptr);
1969 setLibcallName(RTLIB::SRL_I128, nullptr);
1970 setLibcallName(RTLIB::SRA_I128, nullptr);
1971 setLibcallName(RTLIB::MUL_I128, nullptr);
1972 }
1973
1974 // Combine sin / cos into _sincos_stret if it is available.
1975 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1976 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1977 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1978 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1979 }
1980
1981 if (Subtarget.isTargetWin64()) {
1982 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1983 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1984 setOperationAction(ISD::SREM, MVT::i128, Custom);
1985 setOperationAction(ISD::UREM, MVT::i128, Custom);
1986 }
1987
1988 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1989 // is. We should promote the value to 64 bits to solve this.
1990 // This is what the CRT headers do - `fmodf` is an inline header
1991 // function casting to f64 and calling `fmod`.
1992 if (Subtarget.is32Bit() &&
1993 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1994 for (ISD::NodeType Op :
1995 {ISD::FCEIL, ISD::STRICT_FCEIL,
1996 ISD::FCOS, ISD::STRICT_FCOS,
1997 ISD::FEXP, ISD::STRICT_FEXP,
1998 ISD::FFLOOR, ISD::STRICT_FFLOOR,
1999 ISD::FREM, ISD::STRICT_FREM,
2000 ISD::FLOG, ISD::STRICT_FLOG,
2001 ISD::FLOG10, ISD::STRICT_FLOG10,
2002 ISD::FPOW, ISD::STRICT_FPOW,
2003 ISD::FSIN, ISD::STRICT_FSIN})
2004 if (isOperationExpand(Op, MVT::f32))
2005 setOperationAction(Op, MVT::f32, Promote);
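As a rough source-level picture of the promotion described in the comment above (a minimal sketch, not part of the analyzed file; the helper name is hypothetical and not part of any CRT):

// Illustrative sketch only: promoting the f32 libcall to f64 is equivalent
// to evaluating the operation in double precision and truncating the result
// back to float, which is what the MSVC CRT's inline fmodf does.
#include <cmath>

static float fmodf_via_promote(float X, float Y) {
  return static_cast<float>(std::fmod(static_cast<double>(X),
                                      static_cast<double>(Y)));
}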
2006
2007 // We have target-specific DAG combine patterns for the following nodes:
2008 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2009 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2010 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2011 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2012 setTargetDAGCombine(ISD::CONCAT_VECTORS);
2013 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2014 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2015 setTargetDAGCombine(ISD::BITCAST);
2016 setTargetDAGCombine(ISD::VSELECT);
2017 setTargetDAGCombine(ISD::SELECT);
2018 setTargetDAGCombine(ISD::SHL);
2019 setTargetDAGCombine(ISD::SRA);
2020 setTargetDAGCombine(ISD::SRL);
2021 setTargetDAGCombine(ISD::OR);
2022 setTargetDAGCombine(ISD::AND);
2023 setTargetDAGCombine(ISD::ADD);
2024 setTargetDAGCombine(ISD::FADD);
2025 setTargetDAGCombine(ISD::FSUB);
2026 setTargetDAGCombine(ISD::FNEG);
2027 setTargetDAGCombine(ISD::FMA);
2028 setTargetDAGCombine(ISD::STRICT_FMA);
2029 setTargetDAGCombine(ISD::FMINNUM);
2030 setTargetDAGCombine(ISD::FMAXNUM);
2031 setTargetDAGCombine(ISD::SUB);
2032 setTargetDAGCombine(ISD::LOAD);
2033 setTargetDAGCombine(ISD::MLOAD);
2034 setTargetDAGCombine(ISD::STORE);
2035 setTargetDAGCombine(ISD::MSTORE);
2036 setTargetDAGCombine(ISD::TRUNCATE);
2037 setTargetDAGCombine(ISD::ZERO_EXTEND);
2038 setTargetDAGCombine(ISD::ANY_EXTEND);
2039 setTargetDAGCombine(ISD::SIGN_EXTEND);
2040 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2041 setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2042 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2043 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2044 setTargetDAGCombine(ISD::SINT_TO_FP);
2045 setTargetDAGCombine(ISD::UINT_TO_FP);
2046 setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2047 setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2048 setTargetDAGCombine(ISD::SETCC);
2049 setTargetDAGCombine(ISD::MUL);
2050 setTargetDAGCombine(ISD::XOR);
2051 setTargetDAGCombine(ISD::MSCATTER);
2052 setTargetDAGCombine(ISD::MGATHER);
2053 setTargetDAGCombine(ISD::FP16_TO_FP);
2054 setTargetDAGCombine(ISD::FP_EXTEND);
2055 setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2056 setTargetDAGCombine(ISD::FP_ROUND);
2057
2058 computeRegisterProperties(Subtarget.getRegisterInfo());
2059
2060 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2061 MaxStoresPerMemsetOptSize = 8;
2062 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2063 MaxStoresPerMemcpyOptSize = 4;
2064 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2065 MaxStoresPerMemmoveOptSize = 4;
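To make the thresholds above concrete, a hedged example of the kind of call they govern (illustrative only, not part of the analyzed file; the exact expansion depends on the store type chosen by getOptimalMemOpType further below):

// Illustrative sketch only: with MaxStoresPerMemset = 16, a small fixed-size
// memset such as this one is a candidate for inline expansion into a short
// sequence of wide stores instead of a call to the memset libcall.
#include <cstring>

void clearSmall(char (&Buf)[64]) {
  std::memset(Buf, 0, sizeof(Buf));
}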
2066
2067 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2068 // that needs to be benchmarked and balanced with the potential use of vector
2069 // load/store types (PR33329, PR33914).
2070 MaxLoadsPerMemcmp = 2;
2071 MaxLoadsPerMemcmpOptSize = 2;
2072
2073 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2074 setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2075
2076 // An out-of-order CPU can speculatively execute past a predictable branch,
2077 // but a conditional move could be stalled by an expensive earlier operation.
2078 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2079 EnableExtLdPromotion = true;
2080 setPrefFunctionAlignment(Align(16));
2081
2082 verifyIntrinsicTables();
2083
2084 // Default to having -disable-strictnode-mutation on
2085 IsStrictFPEnabled = true;
2086}
2087
2088// This has so far only been implemented for 64-bit MachO.
2089bool X86TargetLowering::useLoadStackGuardNode() const {
2090 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2091}
2092
2093bool X86TargetLowering::useStackGuardXorFP() const {
2094 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2095 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2096}
2097
2098SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2099 const SDLoc &DL) const {
2100 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2101 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2102 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2103 return SDValue(Node, 0);
2104}
2105
2106TargetLoweringBase::LegalizeTypeAction
2107X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2108 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2109 !Subtarget.hasBWI())
2110 return TypeSplitVector;
2111
2112 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2113 VT.getVectorElementType() != MVT::i1)
2114 return TypeWidenVector;
2115
2116 return TargetLoweringBase::getPreferredVectorAction(VT);
2117}
2118
2119static std::pair<MVT, unsigned>
2120handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2121 const X86Subtarget &Subtarget) {
2122 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2123 // convention is one that uses k registers.
2124 if (NumElts == 2)
2125 return {MVT::v2i64, 1};
2126 if (NumElts == 4)
2127 return {MVT::v4i32, 1};
2128 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2129 CC != CallingConv::Intel_OCL_BI)
2130 return {MVT::v8i16, 1};
2131 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2132 CC != CallingConv::Intel_OCL_BI)
2133 return {MVT::v16i8, 1};
2134 // v32i1 passes in ymm unless we have BWI and the calling convention is
2135 // regcall.
2136 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2137 return {MVT::v32i8, 1};
2138 // Split v64i1 vectors if we don't have v64i8 available.
2139 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2140 if (Subtarget.useAVX512Regs())
2141 return {MVT::v64i8, 1};
2142 return {MVT::v32i8, 2};
2143 }
2144
2145 // Break wide or odd vXi1 vectors into scalars to match AVX2 behavior.
2146 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2147 NumElts > 64)
2148 return {MVT::i8, NumElts};
2149
2150 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2151}
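A few worked expectations for the helper above (hypothetical values, not part of the analyzed file, assuming a plain C calling convention CC and an AVX512 subtarget ST without BWI):

// Illustrative expectations only, derived from the branches above:
//   handleMaskRegisterForCallingConv(4,  CC, ST) -> { MVT::v4i32, 1 }  // xmm
//   handleMaskRegisterForCallingConv(16, CC, ST) -> { MVT::v16i8, 1 }  // xmm
//   handleMaskRegisterForCallingConv(32, CC, ST) -> { MVT::v32i8, 1 }  // ymm
//   handleMaskRegisterForCallingConv(3,  CC, ST) -> { MVT::i8,    3 }  // scalarized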
2152
2153MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2154 CallingConv::ID CC,
2155 EVT VT) const {
2156 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2157 Subtarget.hasAVX512()) {
2158 unsigned NumElts = VT.getVectorNumElements();
2159
2160 MVT RegisterVT;
2161 unsigned NumRegisters;
2162 std::tie(RegisterVT, NumRegisters) =
2163 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2164 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2165 return RegisterVT;
2166 }
2167
2168 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2169}
2170
2171unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2172 CallingConv::ID CC,
2173 EVT VT) const {
2174 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2175 Subtarget.hasAVX512()) {
2176 unsigned NumElts = VT.getVectorNumElements();
2177
2178 MVT RegisterVT;
2179 unsigned NumRegisters;
2180 std::tie(RegisterVT, NumRegisters) =
2181 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2182 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2183 return NumRegisters;
2184 }
2185
2186 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2187}
2188
2189unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2190 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2191 unsigned &NumIntermediates, MVT &RegisterVT) const {
2192 // Break wide or odd vXi1 vectors into scalars to match AVX2 behavior.
2193 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2194 Subtarget.hasAVX512() &&
2195 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2196 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2197 VT.getVectorNumElements() > 64)) {
2198 RegisterVT = MVT::i8;
2199 IntermediateVT = MVT::i1;
2200 NumIntermediates = VT.getVectorNumElements();
2201 return NumIntermediates;
2202 }
2203
2204 // Split v64i1 vectors if we don't have v64i8 available.
2205 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2206 CC != CallingConv::X86_RegCall) {
2207 RegisterVT = MVT::v32i8;
2208 IntermediateVT = MVT::v32i1;
2209 NumIntermediates = 2;
2210 return 2;
2211 }
2212
2213 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2214 NumIntermediates, RegisterVT);
2215}
2216
2217EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2218 LLVMContext& Context,
2219 EVT VT) const {
2220 if (!VT.isVector())
2221 return MVT::i8;
2222
2223 if (Subtarget.hasAVX512()) {
2224 // Figure out what this type will be legalized to.
2225 EVT LegalVT = VT;
2226 while (getTypeAction(Context, LegalVT) != TypeLegal)
2227 LegalVT = getTypeToTransformTo(Context, LegalVT);
2228
2229 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2230 if (LegalVT.getSimpleVT().is512BitVector())
2231 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2232
2233 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2234 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2235 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2236 // vXi16/vXi8.
2237 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2238 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2239 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2240 }
2241 }
2242
2243 return VT.changeVectorElementTypeToInteger();
2244}
2245
2246/// Helper for getByValTypeAlignment to determine
2247/// the desired ByVal argument alignment.
2248static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2249 if (MaxAlign == 16)
2250 return;
2251 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2252 if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2253 MaxAlign = Align(16);
2254 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2255 Align EltAlign;
2256 getMaxByValAlign(ATy->getElementType(), EltAlign);
2257 if (EltAlign > MaxAlign)
2258 MaxAlign = EltAlign;
2259 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2260 for (auto *EltTy : STy->elements()) {
2261 Align EltAlign;
2262 getMaxByValAlign(EltTy, EltAlign);
2263 if (EltAlign > MaxAlign)
2264 MaxAlign = EltAlign;
2265 if (MaxAlign == 16)
2266 break;
2267 }
2268 }
2269}
2270
2271/// Return the desired alignment for ByVal aggregate
2272/// function arguments in the caller parameter area. For X86, aggregates
2273/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2274/// are at 4-byte boundaries.
2275unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2276 const DataLayout &DL) const {
2277 if (Subtarget.is64Bit()) {
2278 // Max of 8 and alignment of type.
2279 Align TyAlign = DL.getABITypeAlign(Ty);
2280 if (TyAlign > 8)
2281 return TyAlign.value();
2282 return 8;
2283 }
2284
2285 Align Alignment(4);
2286 if (Subtarget.hasSSE1())
2287 getMaxByValAlign(Ty, Alignment);
2288 return Alignment.value();
2289}
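For illustration (a minimal sketch with hypothetical type and function names, not part of the analyzed file), the distinction the 32-bit rule above draws between aggregates with and without SSE vectors:

// Illustrative sketch only: on 32-bit x86 with SSE1, a by-value aggregate
// containing a 128-bit vector is placed at a 16-byte boundary in the
// parameter area, while an aggregate without one stays at the 4-byte default.
#include <xmmintrin.h>

struct Plain  { int A; int B; };      // no SSE vector   -> 4-byte boundary
struct HasVec { __m128 V; int A; };   // contains v4f32  -> 16-byte boundary

void takeByVal(Plain P, HasVec Q);    // hypothetical declaration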
2290
2291/// It returns EVT::Other if the type should be determined using generic
2292/// target-independent logic.
2293/// For vector ops we check that the overall size isn't larger than our
2294/// preferred vector width.
2295EVT X86TargetLowering::getOptimalMemOpType(
2296 const MemOp &Op, const AttributeList &FuncAttributes) const {
2297 if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2298 if (Op.size() >= 16 &&
2299 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2300 // FIXME: Check if unaligned 64-byte accesses are slow.
2301 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2302 (Subtarget.getPreferVectorWidth() >= 512)) {
2303 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2304 }
2305 // FIXME: Check if unaligned 32-byte accesses are slow.
2306 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2307 (Subtarget.getPreferVectorWidth() >= 256)) {
2308 // Although this isn't a well-supported type for AVX1, we'll let
2309 // legalization and shuffle lowering produce the optimal codegen. If we
2310 // choose an optimal type with a vector element larger than a byte,
2311 // getMemsetStores() may create an intermediate splat (using an integer
2312 // multiply) before we splat as a vector.
2313 return MVT::v32i8;
2314 }
2315 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2316 return MVT::v16i8;
2317 // TODO: Can SSE1 handle a byte vector?
2318 // If we have SSE1 registers we should be able to use them.
2319 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2320 (Subtarget.getPreferVectorWidth() >= 128))
2321 return MVT::v4f32;
2322 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2323 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2324 // Do not use f64 to lower memcpy if the source is a string constant. It's
2325 // better to use i32 to avoid the loads.
2326 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2327 // The gymnastics of splatting a byte value into an XMM register and then
2328 // only using 8-byte stores (because this is a CPU with slow unaligned
2329 // 16-byte accesses) makes that a loser.
2330 return MVT::f64;
2331 }
2332 }
2333 // This is a compromise. If we reach here, unaligned accesses may be slow on
2334 // this target. However, creating smaller, aligned accesses could be even
2335 // slower and would certainly be a lot more code.
2336 if (Subtarget.is64Bit() && Op.size() >= 8)
2337 return MVT::i64;
2338 return MVT::i32;
2339}
2340
2341bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2342 if (VT == MVT::f32)
2343 return X86ScalarSSEf32;
2344 if (VT == MVT::f64)
2345 return X86ScalarSSEf64;
2346 return true;
2347}
2348
2349bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2350 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2351 bool *Fast) const {
2352 if (Fast) {
2353 switch (VT.getSizeInBits()) {
2354 default:
2355 // 8-byte and under are always assumed to be fast.
2356 *Fast = true;
2357 break;
2358 case 128:
2359 *Fast = !Subtarget.isUnalignedMem16Slow();
2360 break;
2361 case 256:
2362 *Fast = !Subtarget.isUnalignedMem32Slow();
2363 break;
2364 // TODO: What about AVX-512 (512-bit) accesses?
2365 }
2366 }
2367 // NonTemporal vector memory ops must be aligned.
2368 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2369 // NT loads can only be vector-aligned, so if it's less aligned than the
2370 // minimum vector size (which we can split the vector down to), we might as
2371 // well use a regular unaligned vector load.
2372 // We don't have any NT loads pre-SSE41.
2373 if (!!(Flags & MachineMemOperand::MOLoad))
2374 return (Alignment < 16 || !Subtarget.hasSSE41());
2375 return false;
2376 }
2377 // Misaligned accesses of any size are always allowed.
2378 return true;
2379}
2380
2381/// Return the entry encoding for a jump table in the
2382/// current function. The returned value is a member of the
2383/// MachineJumpTableInfo::JTEntryKind enum.
2384unsigned X86TargetLowering::getJumpTableEncoding() const {
2385 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2386 // symbol.
2387 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2388 return MachineJumpTableInfo::EK_Custom32;
2389
2390 // Otherwise, use the normal jump table encoding heuristics.
2391 return TargetLowering::getJumpTableEncoding();
2392}
2393
2394bool X86TargetLowering::useSoftFloat() const {
2395 return Subtarget.useSoftFloat();
2396}
2397
2398void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2399 ArgListTy &Args) const {
2400
2401 // Only relabel X86-32 for C / Stdcall CCs.
2402 if (Subtarget.is64Bit())
2403 return;
2404 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2405 return;
2406 unsigned ParamRegs = 0;
2407 if (auto *M = MF->getFunction().getParent())
2408 ParamRegs = M->getNumberRegisterParameters();
2409
2410 // Mark the first N integer arguments as being passed in registers.
2411 for (auto &Arg : Args) {
2412 Type *T = Arg.Ty;
2413 if (T->isIntOrPtrTy())
2414 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2415 unsigned numRegs = 1;
2416 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2417 numRegs = 2;
2418 if (ParamRegs < numRegs)
2419 return;
2420 ParamRegs -= numRegs;
2421 Arg.IsInReg = true;
2422 }
2423 }
2424}
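// Editor's note - a hedged example of the register budgeting above (the
// scenario is hypothetical, the behaviour is taken from the code): with the
// module flag "NumRegisterParameters" set to 3 (e.g. i386 built with
// -mregparm=3), a C or stdcall libcall gets its leading integer arguments
// marked IsInReg; an i64 argument consumes two of the three register slots,
// and marking stops as soon as the remaining budget is exhausted.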
2425
2426const MCExpr *
2427X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2428 const MachineBasicBlock *MBB,
2429 unsigned uid, MCContext &Ctx) const {
2430 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2431 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2432 // entries.
2433 return MCSymbolRefExpr::create(MBB->getSymbol(),
2434 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2435}
2436
2437/// Returns relocation base for the given PIC jumptable.
2438SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2439 SelectionDAG &DAG) const {
2440 if (!Subtarget.is64Bit())
2441 // This doesn't have SDLoc associated with it, but is not really the
2442 // same as a Register.
2443 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2444 getPointerTy(DAG.getDataLayout()));
2445 return Table;
2446}
2447
2448/// This returns the relocation base for the given PIC jumptable,
2449/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2450const MCExpr *X86TargetLowering::
2451getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2452 MCContext &Ctx) const {
2453 // X86-64 uses RIP relative addressing based on the jump table label.
2454 if (Subtarget.isPICStyleRIPRel())
2455 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2456
2457 // Otherwise, the reference is relative to the PIC base.
2458 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2459}
2460
2461std::pair<const TargetRegisterClass *, uint8_t>
2462X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2463 MVT VT) const {
2464 const TargetRegisterClass *RRC = nullptr;
2465 uint8_t Cost = 1;
2466 switch (VT.SimpleTy) {
2467 default:
2468 return TargetLowering::findRepresentativeClass(TRI, VT);
2469 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2470 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2471 break;
2472 case MVT::x86mmx:
2473 RRC = &X86::VR64RegClass;
2474 break;
2475 case MVT::f32: case MVT::f64:
2476 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2477 case MVT::v4f32: case MVT::v2f64:
2478 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2479 case MVT::v8f32: case MVT::v4f64:
2480 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2481 case MVT::v16f32: case MVT::v8f64:
2482 RRC = &X86::VR128XRegClass;
2483 break;
2484 }
2485 return std::make_pair(RRC, Cost);
2486}
2487
2488unsigned X86TargetLowering::getAddressSpace() const {
2489 if (Subtarget.is64Bit())
2490 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2491 return 256;
2492}
2493
2494static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2495 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2496 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2497}
2498
2499static Constant* SegmentOffset(IRBuilderBase &IRB,
2500 int Offset, unsigned AddressSpace) {
2501 return ConstantExpr::getIntToPtr(
2502 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2503 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2504}
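// Editor's note - an illustrative sketch of what SegmentOffset builds, under
// the assumptions of the surrounding code (64-bit, non-kernel code model, so
// getAddressSpace() == 257, i.e. an %fs-relative access):
//
//   Constant *Slot = SegmentOffset(IRB, 0x28, 257);
//   // Roughly equivalent IR: inttoptr (i32 40 to i8* addrspace(257)*),
//   // which getIRStackGuard below uses to address the %fs:0x28 guard slot.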
2505
2506Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2507 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2508 // tcbhead_t; use it instead of the usual global variable (see
2509 // sysdeps/{i386,x86_64}/nptl/tls.h)
2510 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2511 if (Subtarget.isTargetFuchsia()) {
2512 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2513 return SegmentOffset(IRB, 0x10, getAddressSpace());
2514 } else {
2515 unsigned AddressSpace = getAddressSpace();
2516 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2517 // In particular, users may customize the base register and offset.
2518 int Offset = M->getStackProtectorGuardOffset();
2519 // If -stack-protector-guard-offset was not given, the default is
2520 // %fs:0x28, unless we're using a Kernel code model, in which case
2521 // it's %gs:0x28; on i386 it's %gs:0x14.
2522 if (Offset == INT_MAX)
2523 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2524
2525 StringRef GuardReg = M->getStackProtectorGuardReg();
2526 if (GuardReg == "fs")
2527 AddressSpace = X86AS::FS;
2528 else if (GuardReg == "gs")
2529 AddressSpace = X86AS::GS;
2530 return SegmentOffset(IRB, Offset, AddressSpace);
2531 }
2532 }
2533 return TargetLowering::getIRStackGuard(IRB);
2534}
2535
2536void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2537 // MSVC CRT provides functionalities for stack protection.
2538 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2539 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2540 // MSVC CRT has a global variable holding security cookie.
2541 M.getOrInsertGlobal("__security_cookie",
2542 Type::getInt8PtrTy(M.getContext()));
2543
2544 // MSVC CRT has a function to validate security cookie.
2545 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2546 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2547 Type::getInt8PtrTy(M.getContext()));
2548 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2549 F->setCallingConv(CallingConv::X86_FastCall);
2550 F->addAttribute(1, Attribute::AttrKind::InReg);
2551 }
2552 return;
2553 }
2554
2555 StringRef GuardMode = M.getStackProtectorGuard();
2556
2557 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2558 if ((GuardMode == "tls" || GuardMode.empty()) &&
2559 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2560 return;
2561 TargetLowering::insertSSPDeclarations(M);
2562}
2563
2564Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2565 // MSVC CRT has a global variable holding security cookie.
2566 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2567 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2568 return M.getGlobalVariable("__security_cookie");
2569 }
2570 return TargetLowering::getSDagStackGuard(M);
2571}
2572
2573Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2574 // MSVC CRT has a function to validate security cookie.
2575 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2576 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2577 return M.getFunction("__security_check_cookie");
2578 }
2579 return TargetLowering::getSSPStackGuardCheck(M);
2580}
2581
2582Value *
2583X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2584 if (Subtarget.getTargetTriple().isOSContiki())
2585 return getDefaultSafeStackPointerLocation(IRB, false);
2586
2587 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2588 // definition of TLS_SLOT_SAFESTACK in
2589 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2590 if (Subtarget.isTargetAndroid()) {
2591 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2592 // %gs:0x48; %gs:0x24 on i386.
2593 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2594 return SegmentOffset(IRB, Offset, getAddressSpace());
2595 }
2596
2597 // Fuchsia is similar.
2598 if (Subtarget.isTargetFuchsia()) {
2599 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2600 return SegmentOffset(IRB, 0x18, getAddressSpace());
2601 }
2602
2603 return TargetLowering::getSafeStackPointerLocation(IRB);
2604}
2605
2606//===----------------------------------------------------------------------===//
2607// Return Value Calling Convention Implementation
2608//===----------------------------------------------------------------------===//
2609
2610bool X86TargetLowering::CanLowerReturn(
2611 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2612 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2613 SmallVector<CCValAssign, 16> RVLocs;
2614 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2615 return CCInfo.CheckReturn(Outs, RetCC_X86);
2616}
2617
2618const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2619 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2620 return ScratchRegs;
2621}
2622
2623 /// Lowers mask values (v*i1) to the local register values.
2624 /// \returns the DAG node after lowering to the register type.
2625static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2626 const SDLoc &Dl, SelectionDAG &DAG) {
2627 EVT ValVT = ValArg.getValueType();
2628
2629 if (ValVT == MVT::v1i1)
2630 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2631 DAG.getIntPtrConstant(0, Dl));
2632
2633 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2634 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2635 // Two stage lowering might be required
2636 // bitcast: v8i1 -> i8 / v16i1 -> i16
2637 // anyextend: i8 -> i32 / i16 -> i32
2638 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2639 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2640 if (ValLoc == MVT::i32)
2641 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2642 return ValToCopy;
2643 }
2644
2645 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2646 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2647 // One stage lowering is required
2648 // bitcast: v32i1 -> i32 / v64i1 -> i64
2649 return DAG.getBitcast(ValLoc, ValArg);
2650 }
2651
2652 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2653}
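// Editor's note - a hedged worked example of lowerMasksToReg (values chosen
// for illustration, behaviour taken from the code): a v16i1 mask headed for
// an i32 location takes the two stage path, DAG.getBitcast(MVT::i16, ValArg)
// followed by an ISD::ANY_EXTEND to i32; a v32i1 mask headed for an i32
// location is a single bitcast; and a v1i1 mask is lowered via an
// EXTRACT_VECTOR_ELT of element 0.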
2654
2655/// Breaks v64i1 value into two registers and adds the new node to the DAG
2656static void Passv64i1ArgInRegs(
2657 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2658 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2659 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2660 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2661 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2662 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2663 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2664 "The value should reside in two registers");
2665
2666 // Before splitting the value we cast it to i64
2667 Arg = DAG.getBitcast(MVT::i64, Arg);
2668
2669 // Splitting the value into two i32 types
2670 SDValue Lo, Hi;
2671 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2672 DAG.getConstant(0, Dl, MVT::i32));
2673 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2674 DAG.getConstant(1, Dl, MVT::i32));
2675
2676 // Attach the two i32 types into corresponding registers
2677 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2678 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2679}
2680
2681SDValue
2682X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2683 bool isVarArg,
2684 const SmallVectorImpl<ISD::OutputArg> &Outs,
2685 const SmallVectorImpl<SDValue> &OutVals,
2686 const SDLoc &dl, SelectionDAG &DAG) const {
2687 MachineFunction &MF = DAG.getMachineFunction();
2688 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2689
2690 // In some cases we need to disable registers from the default CSR list.
2691 // For example, when they are used for argument passing.
2692 bool ShouldDisableCalleeSavedRegister =
2693 CallConv == CallingConv::X86_RegCall ||
2694 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2695
2696 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2697 report_fatal_error("X86 interrupts may not return any value");
2698
2699 SmallVector<CCValAssign, 16> RVLocs;
2700 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2701 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2702
2703 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2704 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2705 ++I, ++OutsIndex) {
2706 CCValAssign &VA = RVLocs[I];
2707 assert(VA.isRegLoc() && "Can only return in registers!");
2708
2709 // Add the register to the CalleeSaveDisableRegs list.
2710 if (ShouldDisableCalleeSavedRegister)
2711 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2712
2713 SDValue ValToCopy = OutVals[OutsIndex];
2714 EVT ValVT = ValToCopy.getValueType();
2715
2716 // Promote values to the appropriate types.
2717 if (VA.getLocInfo() == CCValAssign::SExt)
2718 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2719 else if (VA.getLocInfo() == CCValAssign::ZExt)
2720 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2721 else if (VA.getLocInfo() == CCValAssign::AExt) {
2722 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2723 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2724 else
2725 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2726 }
2727 else if (VA.getLocInfo() == CCValAssign::BCvt)
2728 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2729
2730 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2731 "Unexpected FP-extend for return value.");
2732
2733 // Report an error if we have attempted to return a value via an XMM
2734 // register and SSE was disabled.
2735 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2736 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2737 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2738 } else if (!Subtarget.hasSSE2() &&
2739 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2740 ValVT == MVT::f64) {
2741 // When returning a double via an XMM register, report an error if SSE2 is
2742 // not enabled.
2743 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2744 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2745 }
2746
2747 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2748 // the RET instruction and handled by the FP Stackifier.
2749 if (VA.getLocReg() == X86::FP0 ||
2750 VA.getLocReg() == X86::FP1) {
2751 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2752 // change the value to the FP stack register class.
2753 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2754 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2755 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2756 // Don't emit a copytoreg.
2757 continue;
2758 }
2759
2760 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2761 // which is returned in RAX / RDX.
2762 if (Subtarget.is64Bit()) {
2763 if (ValVT == MVT::x86mmx) {
2764 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2765 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2766 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2767 ValToCopy);
2768 // If we don't have SSE2 available, convert to v4f32 so the generated
2769 // register is legal.
2770 if (!Subtarget.hasSSE2())
2771 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2772 }
2773 }
2774 }
2775
2776 if (VA.needsCustom()) {
2777 assert(VA.getValVT() == MVT::v64i1 &&
2778 "Currently the only custom case is when we split v64i1 to 2 regs");
2779
2780 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2781 Subtarget);
2782
2783 // Add the second register to the CalleeSaveDisableRegs list.
2784 if (ShouldDisableCalleeSavedRegister)
2785 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2786 } else {
2787 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2788 }
2789 }
2790
2791 SDValue Flag;
2792 SmallVector<SDValue, 6> RetOps;
2793 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2794 // Operand #1 = Bytes To Pop
2795 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2796 MVT::i32));
2797
2798 // Copy the result values into the output registers.
2799 for (auto &RetVal : RetVals) {
2800 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2801 RetOps.push_back(RetVal.second);
2802 continue; // Don't emit a copytoreg.
2803 }
2804
2805 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2806 Flag = Chain.getValue(1);
2807 RetOps.push_back(
2808 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2809 }
2810
2811 // Swift calling convention does not require we copy the sret argument
2812 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2813
2814 // All x86 ABIs require that for returning structs by value we copy
2815 // the sret argument into %rax/%eax (depending on ABI) for the return.
2816 // We saved the argument into a virtual register in the entry block,
2817 // so now we copy the value out and into %rax/%eax.
2818 //
2819 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2820 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2821 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2822 // either case FuncInfo->setSRetReturnReg() will have been called.
2823 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
2824 // When we have both sret and another return value, we should use the
2825 // original Chain stored in RetOps[0], instead of the current Chain updated
2826 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2827
2828 // For the case of sret and another return value, we have
2829 // Chain_0 at the function entry
2830 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2831 // If we use Chain_1 in getCopyFromReg, we will have
2832 // Val = getCopyFromReg(Chain_1)
2833 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2834
2835 // getCopyToReg(Chain_0) will be glued together with
2836 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2837 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2838 // Data dependency from Unit B to Unit A due to usage of Val in
2839 // getCopyToReg(Chain_1, Val)
2840 // Chain dependency from Unit A to Unit B
2841
2842 // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
2843 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2844 getPointerTy(MF.getDataLayout()));
2845
2846 Register RetValReg
2847 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2848 X86::RAX : X86::EAX;
2849 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2850 Flag = Chain.getValue(1);
2851
2852 // RAX/EAX now acts like a return value.
2853 RetOps.push_back(
2854 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2855
2856 // Add the returned register to the CalleeSaveDisableRegs list.
2857 if (ShouldDisableCalleeSavedRegister)
2858 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2859 }
2860
2861 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2862 const MCPhysReg *I =
2863 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2864 if (I) {
2865 for (; *I; ++I) {
2866 if (X86::GR64RegClass.contains(*I))
2867 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2868 else
2869 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2870 }
2871 }
2872
2873 RetOps[0] = Chain; // Update chain.
2874
2875 // Add the flag if we have it.
2876 if (Flag.getNode())
2877 RetOps.push_back(Flag);
2878
2879 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2880 if (CallConv == CallingConv::X86_INTR)
2881 opcode = X86ISD::IRET;
2882 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2883}
2884
2885bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2886 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2887 return false;
2888
2889 SDValue TCChain = Chain;
2890 SDNode *Copy = *N->use_begin();
2891 if (Copy->getOpcode() == ISD::CopyToReg) {
2892 // If the copy has a glue operand, we conservatively assume it isn't safe to
2893 // perform a tail call.
2894 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2895 return false;
2896 TCChain = Copy->getOperand(0);
2897 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2898 return false;
2899
2900 bool HasRet = false;
2901 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2902 UI != UE; ++UI) {
2903 if (UI->getOpcode() != X86ISD::RET_FLAG)
2904 return false;
2905 // If we are returning more than one value, we can definitely
2906 // not make a tail call; see PR19530.
2907 if (UI->getNumOperands() > 4)
2908 return false;
2909 if (UI->getNumOperands() == 4 &&
2910 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2911 return false;
2912 HasRet = true;
2913 }
2914
2915 if (!HasRet)
2916 return false;
2917
2918 Chain = TCChain;
2919 return true;
2920}
2921
2922EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2923 ISD::NodeType ExtendKind) const {
2924 MVT ReturnMVT = MVT::i32;
2925
2926 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2927 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2928 // The ABI does not require i1, i8 or i16 to be extended.
2929 //
2930 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2931 // always extending i8/i16 return values, so keep doing that for now.
2932 // (PR26665).
2933 ReturnMVT = MVT::i8;
2934 }
2935
2936 EVT MinVT = getRegisterType(Context, ReturnMVT);
2937 return VT.bitsLT(MinVT) ? MinVT : VT;
2938}
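// Editor's note - a short example of the rule above (scenario hypothetical,
// behaviour taken from the code): on a non-Darwin target, an i8 or i16 return
// value compares against MinVT == i8 and is left unextended, while on Darwin
// MinVT stays i32, so i8/i16 returns keep being extended for compatibility
// with older Clang output (PR26665).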
2939
2940 /// Reads two 32 bit registers and creates a 64 bit mask value.
2941 /// \param VA The current 32 bit value that needs to be assigned.
2942 /// \param NextVA The next 32 bit value that needs to be assigned.
2943 /// \param Root The parent DAG node.
2944 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2945 /// glue purposes. In case the DAG is already using a
2946 /// physical register instead of a virtual one, we should glue
2947 /// our new SDValue to the InFlag SDValue.
2948 /// \return a new 64 bit SDValue.
2949static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2950 SDValue &Root, SelectionDAG &DAG,
2951 const SDLoc &Dl, const X86Subtarget &Subtarget,
2952 SDValue *InFlag = nullptr) {
2953 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2954 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2955 assert(VA.getValVT() == MVT::v64i1 &&
2956 "Expecting first location of 64 bit width type");
2957 assert(NextVA.getValVT() == VA.getValVT() &&
2958 "The locations should have the same type");
2959 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2960 "The values should reside in two registers");
2961
2962 SDValue Lo, Hi;
2963 SDValue ArgValueLo, ArgValueHi;
2964
2965 MachineFunction &MF = DAG.getMachineFunction();
2966 const TargetRegisterClass *RC = &X86::GR32RegClass;
2967
2968 // Read a 32 bit value from the registers.
2969 if (nullptr == InFlag) {
2970 // When no physical register is present,
2971 // create an intermediate virtual register.
2972 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
2973 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2974 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2975 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2976 } else {
2977 // When a physical register is available read the value from it and glue
2978 // the reads together.
2979 ArgValueLo =
2980 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2981 *InFlag = ArgValueLo.getValue(2);
2982 ArgValueHi =
2983 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2984 *InFlag = ArgValueHi.getValue(2);
2985 }
2986
2987 // Convert the i32 type into v32i1 type.
2988 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2989
2990 // Convert the i32 type into v32i1 type.
2991 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2992
2993 // Concatenate the two values together.
2994 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2995}
2996
2997 /// The function lowers a register of various sizes (8/16/32/64)
2998 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2999 /// \returns a DAG node containing the operand after lowering to mask type.
3000static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3001 const EVT &ValLoc, const SDLoc &Dl,
3002 SelectionDAG &DAG) {
3003 SDValue ValReturned = ValArg;
3004
3005 if (ValVT == MVT::v1i1)
3006 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3007
3008 if (ValVT == MVT::v64i1) {
3009 // On a 32 bit machine, this case is handled by getv64i1Argument.
3010 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3011 // On a 64 bit machine, there is no need to truncate the value, only bitcast it.
3012 } else {
3013 MVT maskLen;
3014 switch (ValVT.getSimpleVT().SimpleTy) {
3015 case MVT::v8i1:
3016 maskLen = MVT::i8;
3017 break;
3018 case MVT::v16i1:
3019 maskLen = MVT::i16;
3020 break;
3021 case MVT::v32i1:
3022 maskLen = MVT::i32;
3023 break;
3024 default:
3025 llvm_unreachable("Expecting a vector of i1 types");
3026 }
3027
3028 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3029 }
3030 return DAG.getBitcast(ValVT, ValReturned);
3031}
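// Editor's note - a hedged example of lowerRegToMasks (illustrative only): a
// v8i1 mask that comes back in an i32 location is first truncated to i8
// (maskLen) and then bitcast to v8i1; a v64i1 value in an i64 location (the
// 64-bit path above) needs no truncation and is bitcast directly.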
3032
3033/// Lower the result values of a call into the
3034/// appropriate copies out of appropriate physical registers.
3035///
3036SDValue X86TargetLowering::LowerCallResult(
3037 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3038 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3039 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3040 uint32_t *RegMask) const {
3041
3042 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3043 // Assign locations to each value returned by this call.
3044 SmallVector<CCValAssign, 16> RVLocs;
3045 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3046 *DAG.getContext());
3047 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3048
3049 // Copy all of the result registers out of their specified physreg.
3050 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3051 ++I, ++InsIndex) {
3052 CCValAssign &VA = RVLocs[I];
3053 EVT CopyVT = VA.getLocVT();
3054
3055 // In some calling conventions we need to remove the used registers
3056 // from the register mask.
3057 if (RegMask) {
3058 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3059 SubRegs.isValid(); ++SubRegs)
3060 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3061 }
3062
3063 // Report an error if there was an attempt to return FP values via XMM
3064 // registers.
3065 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3066 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3067 if (VA.getLocReg() == X86::XMM1)
3068 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3069 else
3070 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3071 } else if (!Subtarget.hasSSE2() &&
3072 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3073 CopyVT == MVT::f64) {
3074 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3075 if (VA.getLocReg() == X86::XMM1)
3076 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3077 else
3078 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3079 }
3080
3081 // If we prefer to use the value in xmm registers, copy it out as f80 and
3082 // use a truncate to move it from fp stack reg to xmm reg.
3083 bool RoundAfterCopy = false;
3084 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3085 isScalarFPTypeInSSEReg(VA.getValVT())) {
3086 if (!Subtarget.hasX87())
3087 report_fatal_error("X87 register return with X87 disabled");
3088 CopyVT = MVT::f80;
3089 RoundAfterCopy = (CopyVT != VA.getLocVT());
3090 }
3091
3092 SDValue Val;
3093 if (VA.needsCustom()) {
3094 assert(VA.getValVT() == MVT::v64i1 &&
3095 "Currently the only custom case is when we split v64i1 to 2 regs");
3096 Val =
3097 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3098 } else {
3099 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3100 .getValue(1);
3101 Val = Chain.getValue(0);
3102 InFlag = Chain.getValue(2);
3103 }
3104
3105 if (RoundAfterCopy)
3106 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3107 // This truncation won't change the value.
3108 DAG.getIntPtrConstant(1, dl));
3109
3110 if (VA.isExtInLoc()) {
3111 if (VA.getValVT().isVector() &&
3112 VA.getValVT().getScalarType() == MVT::i1 &&
3113 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3114 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3115 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3116 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3117 } else
3118 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3119 }
3120
3121 if (VA.getLocInfo() == CCValAssign::BCvt)
3122 Val = DAG.getBitcast(VA.getValVT(), Val);
3123
3124 InVals.push_back(Val);
3125 }
3126
3127 return Chain;
3128}
3129
3130//===----------------------------------------------------------------------===//
3131// C & StdCall & Fast Calling Convention implementation
3132//===----------------------------------------------------------------------===//
3133 // The StdCall calling convention is standard for many Windows API
3134 // routines. It differs from the C calling convention only slightly: the
3135 // callee cleans up the stack instead of the caller, and symbol names are
3136 // decorated with an argument-byte-count suffix. It doesn't support any
3137 // vector arguments. For info on the fast calling convention see the Fast
3138 // Calling Convention (tail call) implementation, LowerX86_32FastCCCallTo.
3139
3140/// CallIsStructReturn - Determines whether a call uses struct return
3141/// semantics.
3142enum StructReturnType {
3143 NotStructReturn,
3144 RegStructReturn,
3145 StackStructReturn
3146};
3147static StructReturnType
3148callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3149 if (Outs.empty())
3150 return NotStructReturn;
3151
3152 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3153 if (!Flags.isSRet())
3154 return NotStructReturn;
3155 if (Flags.isInReg() || IsMCU)
3156 return RegStructReturn;
3157 return StackStructReturn;
3158}
3159
3160/// Determines whether a function uses struct return semantics.
3161static StructReturnType
3162argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3163 if (Ins.empty())
3164 return NotStructReturn;
3165
3166 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3167 if (!Flags.isSRet())
3168 return NotStructReturn;
3169 if (Flags.isInReg() || IsMCU)
3170 return RegStructReturn;
3171 return StackStructReturn;
3172}
3173
3174/// Make a copy of an aggregate at address specified by "Src" to address
3175/// "Dst" with size and alignment information specified by the specific
3176/// parameter attribute. The copy will be passed as a byval function parameter.
3177static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3178 SDValue Chain, ISD::ArgFlagsTy Flags,
3179 SelectionDAG &DAG, const SDLoc &dl) {
3180 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3181
3182 return DAG.getMemcpy(
3183 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3184 /*isVolatile*/ false, /*AlwaysInline=*/true,
3185 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3186}
3187
3188/// Return true if the calling convention is one that we can guarantee TCO for.
3189static bool canGuaranteeTCO(CallingConv::ID CC) {
3190 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3191 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3192 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3193 CC == CallingConv::SwiftTail);
3194}
3195
3196/// Return true if we might ever do TCO for calls with this calling convention.
3197static bool mayTailCallThisCC(CallingConv::ID CC) {
3198 switch (CC) {
3199 // C calling conventions:
3200 case CallingConv::C:
3201 case CallingConv::Win64:
3202 case CallingConv::X86_64_SysV:
3203 // Callee pop conventions:
3204 case CallingConv::X86_ThisCall:
3205 case CallingConv::X86_StdCall:
3206 case CallingConv::X86_VectorCall:
3207 case CallingConv::X86_FastCall:
3208 // Swift:
3209 case CallingConv::Swift:
3210 return true;
3211 default:
3212 return canGuaranteeTCO(CC);
3213 }
3214}
3215
3216/// Return true if the function is being made into a tailcall target by
3217/// changing its ABI.
3218static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3219 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3220 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3221}
3222
3223bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3224 if (!CI->isTailCall())
3225 return false;
3226
3227 CallingConv::ID CalleeCC = CI->getCallingConv();
3228 if (!mayTailCallThisCC(CalleeCC))
3229 return false;
3230
3231 return true;
3232}
3233
3234SDValue
3235X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3236 const SmallVectorImpl<ISD::InputArg> &Ins,
3237 const SDLoc &dl, SelectionDAG &DAG,
3238 const CCValAssign &VA,
3239 MachineFrameInfo &MFI, unsigned i) const {
3240 // Create the nodes corresponding to a load from this parameter slot.
3241 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3242 bool AlwaysUseMutable = shouldGuaranteeTCO(
3243 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3244 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3245 EVT ValVT;
3246 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3247
3248 // If the value is passed by pointer, we have the address instead of the
3249 // value itself. No need to extend if the mask value and location share the
3250 // same absolute size.
3251 bool ExtendedInMem =
3252 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3253 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3254
3255 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3256 ValVT = VA.getLocVT();
3257 else
3258 ValVT = VA.getValVT();
3259
3260 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3261 // changed with more analysis.
3262 // In case of tail call optimization, mark all arguments mutable, since they
3263 // could be overwritten by the lowering of arguments in case of a tail call.
3264 if (Flags.isByVal()) {
3265 unsigned Bytes = Flags.getByValSize();
3266 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3267
3268 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3269 // can be improved with deeper analysis.
3270 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3271 /*isAliased=*/true);
3272 return DAG.getFrameIndex(FI, PtrVT);
3273 }
3274
3275 EVT ArgVT = Ins[i].ArgVT;
3276
3277 // If this is a vector that has been split into multiple parts, and the
3278 // scalar size of the parts doesn't match the vector element size, then we can't
3279 // elide the copy. The parts will have padding between them instead of being
3280 // packed like a vector.
3281 bool ScalarizedAndExtendedVector =
3282 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3283 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3284
3285 // This is an argument in memory. We might be able to perform copy elision.
3286 // If the argument is passed directly in memory without any extension, then we
3287 // can perform copy elision. Large vector types, for example, may be passed
3288 // indirectly by pointer.
3289 if (Flags.isCopyElisionCandidate() &&
3290 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3291 !ScalarizedAndExtendedVector) {
3292 SDValue PartAddr;
3293 if (Ins[i].PartOffset == 0) {
3294 // If this is a one-part value or the first part of a multi-part value,
3295 // create a stack object for the entire argument value type and return a
3296 // load from our portion of it. This assumes that if the first part of an
3297 // argument is in memory, the rest will also be in memory.
3298 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3299 /*IsImmutable=*/false);
3300 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3301 return DAG.getLoad(
3302 ValVT, dl, Chain, PartAddr,
3303 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3304 } else {
3305 // This is not the first piece of an argument in memory. See if there is
3306 // already a fixed stack object including this offset. If so, assume it
3307 // was created by the PartOffset == 0 branch above and create a load from
3308 // the appropriate offset into it.
3309 int64_t PartBegin = VA.getLocMemOffset();
3310 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3311 int FI = MFI.getObjectIndexBegin();
3312 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3313 int64_t ObjBegin = MFI.getObjectOffset(FI);
3314 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3315 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3316 break;
3317 }
3318 if (MFI.isFixedObjectIndex(FI)) {
3319 SDValue Addr =
3320 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3321 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3322 return DAG.getLoad(
3323 ValVT, dl, Chain, Addr,
3324 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3325 Ins[i].PartOffset));
3326 }
3327 }
3328 }
3329
3330 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3331 VA.getLocMemOffset(), isImmutable);
3332
3333 // Set SExt or ZExt flag.
3334 if (VA.getLocInfo() == CCValAssign::ZExt) {
3335 MFI.setObjectZExt(FI, true);
3336 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3337 MFI.setObjectSExt(FI, true);
3338 }
3339
3340 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3341 SDValue Val = DAG.getLoad(
3342 ValVT, dl, Chain, FIN,
3343 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3344 return ExtendedInMem
3345 ? (VA.getValVT().isVector()
3346 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3347 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3348 : Val;
3349}
3350
3351// FIXME: Get this from tablegen.
3352static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3353 const X86Subtarget &Subtarget) {
3354 assert(Subtarget.is64Bit());
3355
3356 if (Subtarget.isCallingConvWin64(CallConv)) {
3357 static const MCPhysReg GPR64ArgRegsWin64[] = {
3358 X86::RCX, X86::RDX, X86::R8, X86::R9
3359 };
3360 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3361 }
3362
3363 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3364 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3365 };
3366 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3367}
3368
3369// FIXME: Get this from tablegen.
3370static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3371 CallingConv::ID CallConv,
3372 const X86Subtarget &Subtarget) {
3373 assert(Subtarget.is64Bit());
3374 if (Subtarget.isCallingConvWin64(CallConv)) {
3375 // The XMM registers which might contain var arg parameters are shadowed
3376 // in their paired GPRs, so we only need to save the GPRs to their home
3377 // slots.
3378 // TODO: __vectorcall will change this.
3379 return None;
3380 }
3381
3382 bool isSoftFloat = Subtarget.useSoftFloat();
3383 if (isSoftFloat || !Subtarget.hasSSE1())
3384 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3385 // registers.
3386 return None;
3387
3388 static const MCPhysReg XMMArgRegs64Bit[] = {
3389 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3390 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3391 };
3392 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3393}
3394
3395 #ifndef NDEBUG
3396static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3397 return llvm::is_sorted(
3398 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3399 return A.getValNo() < B.getValNo();
3400 });
3401}
3402#endif
3403
3404namespace {
3405 /// This is a helper class for lowering variable argument parameters.
3406class VarArgsLoweringHelper {
3407public:
3408 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3409 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3410 CallingConv::ID CallConv, CCState &CCInfo)
3411 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3412 TheMachineFunction(DAG.getMachineFunction()),
3413 TheFunction(TheMachineFunction.getFunction()),
3414 FrameInfo(TheMachineFunction.getFrameInfo()),
3415 FrameLowering(*Subtarget.getFrameLowering()),
3416 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3417 CCInfo(CCInfo) {}
3418
3419 // Lower variable argument parameters.
3420 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3421
3422private:
3423 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3424
3425 void forwardMustTailParameters(SDValue &Chain);
3426
3427 bool is64Bit() const { return Subtarget.is64Bit(); }
3428 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3429
3430 X86MachineFunctionInfo *FuncInfo;
3431 const SDLoc &DL;
3432 SelectionDAG &DAG;
3433 const X86Subtarget &Subtarget;
3434 MachineFunction &TheMachineFunction;
3435 const Function &TheFunction;
3436 MachineFrameInfo &FrameInfo;
3437 const TargetFrameLowering &FrameLowering;
3438 const TargetLowering &TargLowering;
3439 CallingConv::ID CallConv;
3440 CCState &CCInfo;
3441};
3442} // namespace
3443
3444void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3445 SDValue &Chain, unsigned StackSize) {
3446 // If the function takes a variable number of arguments, make a frame index for
3447 // the start of the first vararg value... for expansion of llvm.va_start. We
3448 // can skip this if there are no va_start calls.
3449 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3450 CallConv != CallingConv::X86_ThisCall)) {
3451 FuncInfo->setVarArgsFrameIndex(
3452 FrameInfo.CreateFixedObject(1, StackSize, true));
3453 }
3454
3455 // 64-bit calling conventions support varargs and register parameters, so we
3456 // have to do extra work to spill them in the prologue.
3457 if (is64Bit()) {
3458 // Find the first unallocated argument registers.
3459 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3460 ArrayRef<MCPhysReg> ArgXMMs =
3461 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3462 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3463 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3464
3465 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3466 "SSE register cannot be used when SSE is disabled!");
3467
3468 if (isWin64()) {
3469 // Get to the caller-allocated home save location. Add 8 to account
3470 // for the return address.
3471 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3472 FuncInfo->setRegSaveFrameIndex(
3473 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3474 // Fixup to set vararg frame on shadow area (4 x i64).
3475 if (NumIntRegs < 4)
3476 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3477 } else {
3478 // For X86-64, if there are vararg parameters that are passed via
3479 // registers, then we must store them to their spots on the stack so
3480 // they may be loaded by dereferencing the result of va_next.
3481 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3482 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3483 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3484 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3485 }
3486
3487 SmallVector<SDValue, 6>
3488 LiveGPRs; // List of SDValues for GPR registers holding live input values.
3489 SmallVector<SDValue, 8> LiveXMMRegs; // List of SDValues for XMM registers
3490 // holding live input values.
3491 SDValue ALVal; // If applicable, holds the SDValue for the %al register.
3492
3493 // Gather all the live in physical registers.
3494 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3495 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3496 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3497 }
3498 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3499 if (!AvailableXmms.empty()) {
3500 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3501 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3502 for (MCPhysReg Reg : AvailableXmms) {
3503 // FastRegisterAllocator spills virtual registers at basic
3504 // block boundaries. That leads to uses of XMM registers
3505 // outside of the check for %al. Pass physical registers to
3506 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3507 TheMachineFunction.getRegInfo().addLiveIn(Reg);
3508 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3509 }
3510 }
3511
3512 // Store the integer parameter registers.
3513 SmallVector<SDValue, 8> MemOps;
3514 SDValue RSFIN =
3515 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3516 TargLowering.getPointerTy(DAG.getDataLayout()));
3517 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3518 for (SDValue Val : LiveGPRs) {
3519 SDValue FIN = DAG.getNode(ISD::ADD, DL,
3520 TargLowering.getPointerTy(DAG.getDataLayout()),
3521 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3522 SDValue Store =
3523 DAG.getStore(Val.getValue(1), DL, Val, FIN,
3524 MachinePointerInfo::getFixedStack(
3525 DAG.getMachineFunction(),
3526 FuncInfo->getRegSaveFrameIndex(), Offset));
3527 MemOps.push_back(Store);
3528 Offset += 8;
3529 }
3530
3531 // Now store the XMM (fp + vector) parameter registers.
3532 if (!LiveXMMRegs.empty()) {
3533 SmallVector<SDValue, 12> SaveXMMOps;
3534 SaveXMMOps.push_back(Chain);
3535 SaveXMMOps.push_back(ALVal);
3536 SaveXMMOps.push_back(
3537 DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
3538 SaveXMMOps.push_back(
3539 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3540 llvm::append_range(SaveXMMOps, LiveXMMRegs);
3541 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
3542 MVT::Other, SaveXMMOps));
3543 }
3544
3545 if (!MemOps.empty())
3546 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3547 }
3548}
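// Editor's note - a sketch of the register save area laid out above for the
// SysV x86-64 case (sizes derived from the argument register arrays earlier
// in this file): 6 GPRs x 8 bytes followed by 8 XMM registers x 16 bytes,
// i.e. a 176-byte stack object, with VarArgsGPOffset / VarArgsFPOffset
// corresponding to va_list's gp_offset / fp_offset fields. On Win64 the four
// caller-allocated home slots double as the vararg save area instead.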
3549
3550void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3551 // Find the largest legal vector type.
3552 MVT VecVT = MVT::Other;
3553 // FIXME: Only some x86_32 calling conventions support AVX512.
3554 if (Subtarget.useAVX512Regs() &&
3555 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3556 CallConv == CallingConv::Intel_OCL_BI)))
3557 VecVT = MVT::v16f32;
3558 else if (Subtarget.hasAVX())
3559 VecVT = MVT::v8f32;
3560 else if (Subtarget.hasSSE2())
3561 VecVT = MVT::v4f32;
3562
3563 // We forward some GPRs and some vector types.
3564 SmallVector<MVT, 2> RegParmTypes;
3565 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3566 RegParmTypes.push_back(IntVT);
3567 if (VecVT != MVT::Other)
3568 RegParmTypes.push_back(VecVT);
3569
3570 // Compute the set of forwarded registers. The rest are scratch.
3571 SmallVectorImpl<ForwardedRegister> &Forwards =
3572 FuncInfo->getForwardedMustTailRegParms();
3573 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3574
3575 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3576 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3577 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3578 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3579 }
3580
3581 // Copy all forwards from physical to virtual registers.
3582 for (ForwardedRegister &FR : Forwards) {
3583 // FIXME: Can we use a less constrained schedule?
3584 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3585 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3586 TargLowering.getRegClassFor(FR.VT));
3587 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3588 }
3589}
3590
3591void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3592 unsigned StackSize) {
3593 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
3594 // If necessary, it will be set to the correct value later.
3595 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3596 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3597
3598 if (FrameInfo.hasVAStart())
3599 createVarArgAreaAndStoreRegisters(Chain, StackSize);
3600
3601 if (FrameInfo.hasMustTailInVarArgFunc())
3602 forwardMustTailParameters(Chain);
3603}
3604
3605SDValue X86TargetLowering::LowerFormalArguments(
3606 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3607 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3608 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3609 MachineFunction &MF = DAG.getMachineFunction();
3610 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3611
3612 const Function &F = MF.getFunction();
3613 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3614 F.getName() == "main")
3615 FuncInfo->setForceFramePointer(true);
3616
3617 MachineFrameInfo &MFI = MF.getFrameInfo();
3618 bool Is64Bit = Subtarget.is64Bit();
3619 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3620
3621 assert(
3622 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3623 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3624
3625 // Assign locations to all of the incoming arguments.
3626 SmallVector<CCValAssign, 16> ArgLocs;
3627 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3628
3629 // Allocate shadow area for Win64.
3630 if (IsWin64)
3631 CCInfo.AllocateStack(32, Align(8));
3632
3633 CCInfo.AnalyzeArguments(Ins, CC_X86);
3634
3635 // In vectorcall calling convention a second pass is required for the HVA
3636 // types.
3637 if (CallingConv::X86_VectorCall == CallConv) {
3638 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3639 }
3640
3641 // The next loop assumes that the locations are in the same order as the
3642 // input arguments.
3643 assert(isSortedByValueNo(ArgLocs) &&
3644 "Argument Location list must be sorted before lowering");
3645
3646 SDValue ArgValue;
3647 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3648 ++I, ++InsIndex) {
3649 assert(InsIndex < Ins.size() && "Invalid Ins index");
3650 CCValAssign &VA = ArgLocs[I];
3651
3652 if (VA.isRegLoc()) {
3653 EVT RegVT = VA.getLocVT();
3654 if (VA.needsCustom()) {
3655 assert(
3656 VA.getValVT() == MVT::v64i1 &&
3657 "Currently the only custom case is when we split v64i1 to 2 regs");
3658
3659 // In the regcall calling convention, v64i1 values that are compiled
3660 // for a 32 bit arch are split up into two registers.
3661 ArgValue =
3662 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3663 } else {
3664 const TargetRegisterClass *RC;
3665 if (RegVT == MVT::i8)
3666 RC = &X86::GR8RegClass;
3667 else if (RegVT == MVT::i16)
3668 RC = &X86::GR16RegClass;
3669 else if (RegVT == MVT::i32)
3670 RC = &X86::GR32RegClass;
3671 else if (Is64Bit && RegVT == MVT::i64)
3672 RC = &X86::GR64RegClass;
3673 else if (RegVT == MVT::f32)
3674 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3675 else if (RegVT == MVT::f64)
3676 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3677 else if (RegVT == MVT::f80)
3678 RC = &X86::RFP80RegClass;
3679 else if (RegVT == MVT::f128)
3680 RC = &X86::VR128RegClass;
3681 else if (RegVT.is512BitVector())
3682 RC = &X86::VR512RegClass;
3683 else if (RegVT.is256BitVector())
3684 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3685 else if (RegVT.is128BitVector())
3686 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3687 else if (RegVT == MVT::x86mmx)
3688 RC = &X86::VR64RegClass;
3689 else if (RegVT == MVT::v1i1)
3690 RC = &X86::VK1RegClass;
3691 else if (RegVT == MVT::v8i1)
3692 RC = &X86::VK8RegClass;
3693 else if (RegVT == MVT::v16i1)
3694 RC = &X86::VK16RegClass;
3695 else if (RegVT == MVT::v32i1)
3696 RC = &X86::VK32RegClass;
3697 else if (RegVT == MVT::v64i1)
3698 RC = &X86::VK64RegClass;
3699 else
3700 llvm_unreachable("Unknown argument type!");
3701
3702 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3703 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3704 }
3705
3706 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3707 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3708 // right size.
3709 if (VA.getLocInfo() == CCValAssign::SExt)
3710 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3711 DAG.getValueType(VA.getValVT()));
3712 else if (VA.getLocInfo() == CCValAssign::ZExt)
3713 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3714 DAG.getValueType(VA.getValVT()));
3715 else if (VA.getLocInfo() == CCValAssign::BCvt)
3716 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3717
3718 if (VA.isExtInLoc()) {
3719 // Handle MMX values passed in XMM regs.
3720 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3721 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3722 else if (VA.getValVT().isVector() &&
3723 VA.getValVT().getScalarType() == MVT::i1 &&
3724 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3725 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3726 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3727 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3728 } else
3729 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3730 }
3731 } else {
3732 assert(VA.isMemLoc());
3733 ArgValue =
3734 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3735 }
3736
3737 // If the value is passed via a pointer, do a load.
3738 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3739 ArgValue =
3740 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3741
3742 InVals.push_back(ArgValue);
3743 }
3744
3745 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3746 if (Ins[I].Flags.isSwiftAsync()) {
3747 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3748 if (Subtarget.is64Bit())
3749 X86FI->setHasSwiftAsyncContext(true);
3750 else {
3751 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3752 X86FI->setSwiftAsyncContextFrameIdx(FI);
3753 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3754 DAG.getFrameIndex(FI, MVT::i32),
3755 MachinePointerInfo::getFixedStack(MF, FI));
3756 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
3757 }
3758 }
3759
3760 // Swift calling convention does not require we copy the sret argument
3761 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3762 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
3763 continue;
3764
3765 // All x86 ABIs require that for returning structs by value we copy the
3766 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3767 // the argument into a virtual register so that we can access it from the
3768 // return points.
3769 if (Ins[I].Flags.isSRet()) {
3770 Register Reg = FuncInfo->getSRetReturnReg();
3771 if (!Reg) {
3772 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3773 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3774 FuncInfo->setSRetReturnReg(Reg);
3775 }
3776 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3777 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3778 break;
3779 }
3780 }
3781
3782 unsigned StackSize = CCInfo.getNextStackOffset();
3783 // Align stack specially for tail calls.
3784 if (shouldGuaranteeTCO(CallConv,
3785 MF.getTarget().Options.GuaranteedTailCallOpt))
3786 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3787
3788 if (IsVarArg)
3789 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
3790 .lowerVarArgsParameters(Chain, StackSize);
3791
3792 // Some CCs need callee pop.
3793 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
3794 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3795 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3796 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3797 // X86 interrupts must pop the error code (and the alignment padding) if
3798 // present.
3799 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3800 } else {
3801 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3802 // If this is an sret function, the return should pop the hidden pointer.
3803 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3804 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3805 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3806 FuncInfo->setBytesToPopOnReturn(4);
3807 }
3808
3809 if (!Is64Bit) {
3810 // RegSaveFrameIndex is X86-64 only.
3811 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3812 }
3813
3814 FuncInfo->setArgumentStackSize(StackSize);
3815
3816 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3817 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3818 if (Personality == EHPersonality::CoreCLR) {
3819 assert(Is64Bit);
3820 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3821 // that we'd prefer this slot be allocated towards the bottom of the frame
3822 // (i.e. near the stack pointer after allocating the frame). Every
3823 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3824 // offset from the bottom of this and each funclet's frame must be the
3825 // same, so the size of funclets' (mostly empty) frames is dictated by
3826 // how far this slot is from the bottom (since they allocate just enough
3827 // space to accommodate holding this slot at the correct offset).
3828 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
3829 EHInfo->PSPSymFrameIdx = PSPSymFI;
3830 }
3831 }
3832
3833 if (CallConv == CallingConv::X86_RegCall ||
3834 F.hasFnAttribute("no_caller_saved_registers")) {
3835 MachineRegisterInfo &MRI = MF.getRegInfo();
3836 for (std::pair<Register, Register> Pair : MRI.liveins())
3837 MRI.disableCalleeSavedRegister(Pair.first);
3838 }
3839
3840 return Chain;
3841}
3842
3843SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3844 SDValue Arg, const SDLoc &dl,
3845 SelectionDAG &DAG,
3846 const CCValAssign &VA,
3847 ISD::ArgFlagsTy Flags,
3848 bool isByVal) const {
3849 unsigned LocMemOffset = VA.getLocMemOffset();
3850 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3851 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3852 StackPtr, PtrOff);
3853 if (isByVal)
3854 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3855
3856 return DAG.getStore(
3857 Chain, dl, Arg, PtrOff,
3858 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3859}
3860
3861/// Emit a load of the return address if tail call
3862/// optimization is performed and it is required.
3863SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3864 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3865 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3866 // Adjust the Return address stack slot.
3867 EVT VT = getPointerTy(DAG.getDataLayout());
3868 OutRetAddr = getReturnAddressFrameIndex(DAG);
3869
3870 // Load the "old" Return address.
3871 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3872 return SDValue(OutRetAddr.getNode(), 1);
3873}
3874
3875/// Emit a store of the return address if tail call
3876/// optimization is performed and it is required (FPDiff!=0).
3877static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3878 SDValue Chain, SDValue RetAddrFrIdx,
3879 EVT PtrVT, unsigned SlotSize,
3880 int FPDiff, const SDLoc &dl) {
3881 // Store the return address to the appropriate stack slot.
3882 if (!FPDiff) return Chain;
3883 // Calculate the new stack slot for the return address.
3884 int NewReturnAddrFI =
3885 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3886 false);
3887 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3888 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3889 MachinePointerInfo::getFixedStack(
3890 DAG.getMachineFunction(), NewReturnAddrFI));
3891 return Chain;
3892}
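
To make the FPDiff arithmetic above concrete, here is a minimal standalone sketch
(illustrative, invented numbers; none of it comes from the analyzed source): the
relocated return-address slot created by EmitTailCallStoreRetAddr sits at offset
FPDiff - SlotSize, where FPDiff is the difference between the bytes the current
function pops on return and the bytes the tail-callee needs for its arguments.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t SlotSize = 8;        // return-address slot size on x86-64
      const int64_t CallerPopBytes = 48; // bytes the current function pops on return
      const int64_t CalleeArgBytes = 16; // bytes the tail-callee needs for arguments
      const int64_t FPDiff = CallerPopBytes - CalleeArgBytes; // 32
      // The return address is re-stored FPDiff - SlotSize bytes above the
      // original slot so the smaller outgoing argument area lines up.
      std::printf("FPDiff = %lld, new return-address offset = %lld\n",
                  (long long)FPDiff, (long long)(FPDiff - SlotSize));
      return 0;
    }
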
3893
3894/// Returns a vector_shuffle mask for a movs{s|d} or movd
3895/// operation of the specified width.
3896static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3897 SDValue V2) {
3898 unsigned NumElems = VT.getVectorNumElements();
3899 SmallVector<int, 8> Mask;
3900 Mask.push_back(NumElems);
3901 for (unsigned i = 1; i != NumElems; ++i)
3902 Mask.push_back(i);
3903 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3904}
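
As a quick illustration of getMOVL (a standalone sketch, not lowering code): for a
4-element vector type the mask comes out as {4, 1, 2, 3}, i.e. lane 0 is taken from
V2 and the remaining lanes from V1, which is exactly the movss/movsd behaviour.

    #include <cstdio>
    #include <vector>

    int main() {
      const unsigned NumElems = 4;       // e.g. a 4 x float shuffle
      std::vector<int> Mask;
      Mask.push_back(NumElems);          // lane 0 comes from the second vector
      for (unsigned i = 1; i != NumElems; ++i)
        Mask.push_back(i);               // remaining lanes come from the first vector
      for (int M : Mask)
        std::printf("%d ", M);           // prints: 4 1 2 3
      std::printf("\n");
      return 0;
    }
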
3905
3906SDValue
3907X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3908 SmallVectorImpl<SDValue> &InVals) const {
3909 SelectionDAG &DAG = CLI.DAG;
3910 SDLoc &dl = CLI.DL;
3911 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3912 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3913 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3914 SDValue Chain = CLI.Chain;
3915 SDValue Callee = CLI.Callee;
3916 CallingConv::ID CallConv = CLI.CallConv;
3917 bool &isTailCall = CLI.IsTailCall;
3918 bool isVarArg = CLI.IsVarArg;
3919 const auto *CB = CLI.CB;
3920
3921 MachineFunction &MF = DAG.getMachineFunction();
3922 bool Is64Bit = Subtarget.is64Bit();
3923 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3924 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3925 bool IsSibcall = false;
3926 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3927 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
3928 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3929 bool HasNCSR = (CB && isa<CallInst>(CB) &&
3930 CB->hasFnAttr("no_caller_saved_registers"));
3931 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
3932 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
3933 const Module *M = MF.getMMI().getModule();
3934 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3935
3936 MachineFunction::CallSiteInfo CSInfo;
3937 if (CallConv == CallingConv::X86_INTR)
3938 report_fatal_error("X86 interrupts may not be called directly");
3939
3940 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
3941 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
3942 // If we are using a GOT, disable tail calls to external symbols with
3943 // default visibility. Tail calling such a symbol requires using a GOT
3944 // relocation, which forces early binding of the symbol. This breaks code
3945 // that requires lazy function symbol resolution. Using musttail or
3946 // GuaranteedTailCallOpt will override this.
3947 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3948 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3949 G->getGlobal()->hasDefaultVisibility()))
3950 isTailCall = false;
3951 }
3952
3953
3954 if (isTailCall && !IsMustTail) {
3955 // Check if it's really possible to do a tail call.
3956 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3957 isVarArg, SR != NotStructReturn,
3958 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3959 Outs, OutVals, Ins, DAG);
3960
3961 // Sibcalls are automatically detected tailcalls which do not require
3962 // ABI changes.
3963 if (!IsGuaranteeTCO && isTailCall)
3964 IsSibcall = true;
3965
3966 if (isTailCall)
3967 ++NumTailCalls;
3968 }
3969
3970 if (IsMustTail && !isTailCall)
3971 report_fatal_error("failed to perform tail call elimination on a call "
3972 "site marked musttail");
3973
3974 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3975 "Var args not supported with calling convention fastcc, ghc or hipe");
3976
3977 // Analyze operands of the call, assigning locations to each operand.
3978 SmallVector<CCValAssign, 16> ArgLocs;
3979 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3980
3981 // Allocate shadow area for Win64.
3982 if (IsWin64)
3983 CCInfo.AllocateStack(32, Align(8));
3984
3985 CCInfo.AnalyzeArguments(Outs, CC_X86);
3986
3987 // In vectorcall calling convention a second pass is required for the HVA
3988 // types.
3989 if (CallingConv::X86_VectorCall == CallConv) {
3990 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3991 }
3992
3993 // Get a count of how many bytes are to be pushed on the stack.
3994 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3995 if (IsSibcall)
3996 // This is a sibcall. The memory operands are already available in the
3997 // caller's own caller's stack.
3998 NumBytes = 0;
3999 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4000 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4001
4002 int FPDiff = 0;
4003 if (isTailCall &&
4004 shouldGuaranteeTCO(CallConv,
4005 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4006 // Lower arguments at fp - stackoffset + fpdiff.
4007 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4008
4009 FPDiff = NumBytesCallerPushed - NumBytes;
4010
4011 // Set the delta of movement of the returnaddr stackslot.
4012 // But only set if delta is greater than previous delta.
4013 if (FPDiff < X86Info->getTCReturnAddrDelta())
4014 X86Info->setTCReturnAddrDelta(FPDiff);
4015 }
4016
4017 unsigned NumBytesToPush = NumBytes;
4018 unsigned NumBytesToPop = NumBytes;
4019
4020 // If we have an inalloca argument, all stack space has already been allocated
4021 // for us and is right at the top of the stack. We don't support multiple
4022 // arguments passed in memory when using inalloca.
4023 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4024 NumBytesToPush = 0;
4025 if (!ArgLocs.back().isMemLoc())
4026 report_fatal_error("cannot use inalloca attribute on a register "
4027 "parameter");
4028 if (ArgLocs.back().getLocMemOffset() != 0)
4029 report_fatal_error("any parameter with the inalloca attribute must be "
4030 "the only memory argument");
4031 } else if (CLI.IsPreallocated) {
4032 assert(ArgLocs.back().isMemLoc() &&
4033 "cannot use preallocated attribute on a register "
4034 "parameter");
4035 SmallVector<size_t, 4> PreallocatedOffsets;
4036 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4037 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4038 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4039 }
4040 }
4041 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4042 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4043 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4044 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4045 NumBytesToPush = 0;
4046 }
4047
4048 if (!IsSibcall && !IsMustTail)
4049 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4050 NumBytes - NumBytesToPush, dl);
4051
4052 SDValue RetAddrFrIdx;
4053 // Load return address for tail calls.
4054 if (isTailCall && FPDiff)
4055 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4056 Is64Bit, FPDiff, dl);
4057
4058 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4059 SmallVector<SDValue, 8> MemOpChains;
4060 SDValue StackPtr;
4061
4062 // The next loop assumes that the locations are in the same order as the
4063 // input arguments.
4064 assert(isSortedByValueNo(ArgLocs) &&
4065 "Argument Location list must be sorted before lowering");
4066
4067 // Walk the register/memloc assignments, inserting copies/loads. In the case
4068 // of tail call optimization, arguments are handled later.
4069 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4070 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4071 ++I, ++OutIndex) {
4072 assert(OutIndex < Outs.size() && "Invalid Out index");
4073 // Skip inalloca/preallocated arguments, they have already been written.
4074 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4075 if (Flags.isInAlloca() || Flags.isPreallocated())
4076 continue;
4077
4078 CCValAssign &VA = ArgLocs[I];
4079 EVT RegVT = VA.getLocVT();
4080 SDValue Arg = OutVals[OutIndex];
4081 bool isByVal = Flags.isByVal();
4082
4083 // Promote the value if needed.
4084 switch (VA.getLocInfo()) {
4085 default: llvm_unreachable("Unknown loc info!");
4086 case CCValAssign::Full: break;
4087 case CCValAssign::SExt:
4088 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4089 break;
4090 case CCValAssign::ZExt:
4091 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4092 break;
4093 case CCValAssign::AExt:
4094 if (Arg.getValueType().isVector() &&
4095 Arg.getValueType().getVectorElementType() == MVT::i1)
4096 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4097 else if (RegVT.is128BitVector()) {
4098 // Special case: passing MMX values in XMM registers.
4099 Arg = DAG.getBitcast(MVT::i64, Arg);
4100 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4101 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4102 } else
4103 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4104 break;
4105 case CCValAssign::BCvt:
4106 Arg = DAG.getBitcast(RegVT, Arg);
4107 break;
4108 case CCValAssign::Indirect: {
4109 if (isByVal) {
4110 // Memcpy the argument to a temporary stack slot to prevent
4111 // the caller from seeing any modifications the callee may make
4112 // as guaranteed by the `byval` attribute.
4113 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4114 Flags.getByValSize(),
4115 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4116 SDValue StackSlot =
4117 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4118 Chain =
4119 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4120 // From now on treat this as a regular pointer
4121 Arg = StackSlot;
4122 isByVal = false;
4123 } else {
4124 // Store the argument.
4125 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4126 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4127 Chain = DAG.getStore(
4128 Chain, dl, Arg, SpillSlot,
4129 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4130 Arg = SpillSlot;
4131 }
4132 break;
4133 }
4134 }
4135
4136 if (VA.needsCustom()) {
4137 assert(VA.getValVT() == MVT::v64i1 &&
4138 "Currently the only custom case is when we split v64i1 to 2 regs");
4139 // Split v64i1 value into two registers
4140 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4141 } else if (VA.isRegLoc()) {
4142 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4143 const TargetOptions &Options = DAG.getTarget().Options;
4144 if (Options.EmitCallSiteInfo)
4145 CSInfo.emplace_back(VA.getLocReg(), I);
4146 if (isVarArg && IsWin64) {
4147 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4148 // shadow reg if callee is a varargs function.
4149 Register ShadowReg;
4150 switch (VA.getLocReg()) {
4151 case X86::XMM0: ShadowReg = X86::RCX; break;
4152 case X86::XMM1: ShadowReg = X86::RDX; break;
4153 case X86::XMM2: ShadowReg = X86::R8; break;
4154 case X86::XMM3: ShadowReg = X86::R9; break;
4155 }
4156 if (ShadowReg)
4157 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4158 }
4159 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4160 assert(VA.isMemLoc());
4161 if (!StackPtr.getNode())
4162 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4163 getPointerTy(DAG.getDataLayout()));
4164 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4165 dl, DAG, VA, Flags, isByVal));
4166 }
4167 }
4168
4169 if (!MemOpChains.empty())
4170 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4171
4172 if (Subtarget.isPICStyleGOT()) {
4173 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4174 // GOT pointer (except regcall).
4175 if (!isTailCall) {
4176 // An indirect call with the RegCall calling convention may use up all the
4177 // general registers, so it is not suitable to bind the EBX register to the
4178 // GOT address; just let the register allocator handle it.
4179 if (CallConv != CallingConv::X86_RegCall)
4180 RegsToPass.push_back(std::make_pair(
4181 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4182 getPointerTy(DAG.getDataLayout()))));
4183 } else {
4184 // If we are tail calling and generating PIC/GOT style code load the
4185 // address of the callee into ECX. The value in ecx is used as target of
4186 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4187 // for tail calls on PIC/GOT architectures. Normally we would just put the
4188 // address of GOT into ebx and then call target@PLT. But for tail calls
4189 // ebx would be restored (since ebx is callee saved) before jumping to the
4190 // target@PLT.
4191
4192 // Note: The actual moving to ECX is done further down.
4193 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4194 if (G && !G->getGlobal()->hasLocalLinkage() &&
4195 G->getGlobal()->hasDefaultVisibility())
4196 Callee = LowerGlobalAddress(Callee, DAG);
4197 else if (isa<ExternalSymbolSDNode>(Callee))
4198 Callee = LowerExternalSymbol(Callee, DAG);
4199 }
4200 }
4201
4202 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4203 // From AMD64 ABI document:
4204 // For calls that may call functions that use varargs or stdargs
4205 // (prototype-less calls or calls to functions containing ellipsis (...) in
4206 // the declaration) %al is used as hidden argument to specify the number
4207 // of SSE registers used. The contents of %al do not need to match exactly
4208 // the number of registers, but must be an upper bound on the number of SSE
4209 // registers used and is in the range 0 - 8 inclusive.
4210
4211 // Count the number of XMM registers allocated.
4212 static const MCPhysReg XMMArgRegs[] = {
4213 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4214 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4215 };
4216 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4217 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4218 && "SSE registers cannot be used when SSE is disabled");
4219 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4220 DAG.getConstant(NumXMMRegs, dl,
4221 MVT::i8)));
4222 }
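
The %al rule above is easy to observe at the C level. The snippet below only
illustrates the ABI requirement for an ordinary variadic call; it says nothing
about the lowering code itself.

    #include <cstdio>

    int main() {
      double x = 3.5;
      // On x86-64 SysV, `x` travels in %xmm0, so before this call the compiler
      // must load %al with an upper bound (here >= 1) on the number of vector
      // registers used; the variadic callee uses that value to decide how many
      // vector registers to spill into its register save area.
      std::printf("%f\n", x);
      return 0;
    }
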
4223
4224 if (isVarArg && IsMustTail) {
4225 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4226 for (const auto &F : Forwards) {
4227 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4228 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4229 }
4230 }
4231
4232 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4233 // don't need this because the eligibility check rejects calls that require
4234 // shuffling arguments passed in memory.
4235 if (!IsSibcall && isTailCall) {
4236 // Force all the incoming stack arguments to be loaded from the stack
4237 // before any new outgoing arguments are stored to the stack, because the
4238 // outgoing stack slots may alias the incoming argument stack slots, and
4239 // the alias isn't otherwise explicit. This is slightly more conservative
4240 // than necessary, because it means that each store effectively depends
4241 // on every argument instead of just those arguments it would clobber.
4242 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4243
4244 SmallVector<SDValue, 8> MemOpChains2;
4245 SDValue FIN;
4246 int FI = 0;
4247 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4248 ++I, ++OutsIndex) {
4249 CCValAssign &VA = ArgLocs[I];
4250
4251 if (VA.isRegLoc()) {
4252 if (VA.needsCustom()) {
4253 assert((CallConv == CallingConv::X86_RegCall) &&
4254 "Expecting custom case only in regcall calling convention");
4255 // This means that we are in special case where one argument was
4256 // passed through two register locations - Skip the next location
4257 ++I;
4258 }
4259
4260 continue;
4261 }
4262
4263 assert(VA.isMemLoc());
4264 SDValue Arg = OutVals[OutsIndex];
4265 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4266 // Skip inalloca/preallocated arguments. They don't require any work.
4267 if (Flags.isInAlloca() || Flags.isPreallocated())
4268 continue;
4269 // Create frame index.
4270 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4271 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4272 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4273 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4274
4275 if (Flags.isByVal()) {
4276 // Copy relative to framepointer.
4277 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4278 if (!StackPtr.getNode())
4279 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4280 getPointerTy(DAG.getDataLayout()));
4281 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4282 StackPtr, Source);
4283
4284 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4285 ArgChain,
4286 Flags, DAG, dl));
4287 } else {
4288 // Store relative to framepointer.
4289 MemOpChains2.push_back(DAG.getStore(
4290 ArgChain, dl, Arg, FIN,
4291 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4292 }
4293 }
4294
4295 if (!MemOpChains2.empty())
4296 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4297
4298 // Store the return address to the appropriate stack slot.
4299 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4300 getPointerTy(DAG.getDataLayout()),
4301 RegInfo->getSlotSize(), FPDiff, dl);
4302 }
4303
4304 // Build a sequence of copy-to-reg nodes chained together with token chain
4305 // and flag operands which copy the outgoing args into registers.
4306 SDValue InFlag;
4307 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4308 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4309 RegsToPass[i].second, InFlag);
4310 InFlag = Chain.getValue(1);
4311 }
4312
4313 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4314 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4315 // In the 64-bit large code model, we have to make all calls
4316 // through a register, since the call instruction's 32-bit
4317 // pc-relative offset may not be large enough to hold the whole
4318 // address.
4319 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4320 Callee->getOpcode() == ISD::ExternalSymbol) {
4321 // Lower direct calls to global addresses and external symbols. Setting
4322 // ForCall to true here has the effect of removing WrapperRIP when possible
4323 // to allow direct calls to be selected without first materializing the
4324 // address into a register.
4325 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4326 } else if (Subtarget.isTarget64BitILP32() &&
4327 Callee->getValueType(0) == MVT::i32) {
4328 // Zero-extend the 32-bit Callee address into a 64-bit one, per the x32 ABI
4329 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4330 }
4331
4332 // Returns a chain & a flag for retval copy to use.
4333 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4334 SmallVector<SDValue, 8> Ops;
4335
4336 if (!IsSibcall && isTailCall && !IsMustTail) {
4337 Chain = DAG.getCALLSEQ_END(Chain,
4338 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4339 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4340 InFlag = Chain.getValue(1);
4341 }
4342
4343 Ops.push_back(Chain);
4344 Ops.push_back(Callee);
4345
4346 if (isTailCall)
4347 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4348
4349 // Add argument registers to the end of the list so that they are known live
4350 // into the call.
4351 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4352 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4353 RegsToPass[i].second.getValueType()));
4354
4355 // Add a register mask operand representing the call-preserved registers.
4356 const uint32_t *Mask = [&]() {
4357 auto AdaptedCC = CallConv;
4358 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4359 // use X86_INTR calling convention because it has the same CSR mask
4360 // (same preserved registers).
4361 if (HasNCSR)
4362 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4363 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4364 // to use the CSR_NoRegs_RegMask.
4365 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4366 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4367 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4368 }();
4369 assert(Mask && "Missing call preserved mask for calling convention");
4370
4371 // If this is an invoke in a 32-bit function using a funclet-based
4372 // personality, assume the function clobbers all registers. If an exception
4373 // is thrown, the runtime will not restore CSRs.
4374 // FIXME: Model this more precisely so that we can register allocate across
4375 // the normal edge and spill and fill across the exceptional edge.
4376 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4377 const Function &CallerFn = MF.getFunction();
4378 EHPersonality Pers =
4379 CallerFn.hasPersonalityFn()
4380 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4381 : EHPersonality::Unknown;
4382 if (isFuncletEHPersonality(Pers))
4383 Mask = RegInfo->getNoPreservedMask();
4384 }
4385
4386 // Define a new register mask from the existing mask.
4387 uint32_t *RegMask = nullptr;
4388
4389 // In some calling conventions we need to remove the used physical registers
4390 // from the reg mask.
4391 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4392 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4393
4394 // Allocate a new Reg Mask and copy Mask.
4395 RegMask = MF.allocateRegMask();
4396 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4397 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4398
4399 // Make sure all sub registers of the argument registers are reset
4400 // in the RegMask.
4401 for (auto const &RegPair : RegsToPass)
4402 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4403 SubRegs.isValid(); ++SubRegs)
4404 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4405
4406 // Create the RegMask Operand according to our updated mask.
4407 Ops.push_back(DAG.getRegisterMask(RegMask));
4408 } else {
4409 // Create the RegMask Operand according to the static mask.
4410 Ops.push_back(DAG.getRegisterMask(Mask));
4411 }
4412
4413 if (InFlag.getNode())
4414 Ops.push_back(InFlag);
4415
4416 if (isTailCall) {
4417 // We used to do:
4418 //// If this is the first return lowered for this function, add the regs
4419 //// to the liveout set for the function.
4420 // This isn't right, although it's probably harmless on x86; liveouts
4421 // should be computed from returns not tail calls. Consider a void
4422 // function making a tail call to a function returning int.
4423 MF.getFrameInfo().setHasTailCall();
4424 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4425 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4426 return Ret;
4427 }
4428
4429 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4430 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4431 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4432 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4433 // expanded to the call, directly followed by a special marker sequence and
4434 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4435 assert(!isTailCall &&
4436 "tail calls cannot be marked with clang.arc.attachedcall");
4437 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4438
4439 // Add target constant to select ObjC runtime call just before the call
4440 // target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue,
4441 // RuntimeCallType == 1 selects objc_unsafeClaimAutoreleasedReturnValue when
4442 // expanding the pseudo.
4443 unsigned RuntimeCallType =
4444 objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
4445 Ops.insert(Ops.begin() + 1,
4446 DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
4447 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4448 } else {
4449 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4450 }
4451
4452 InFlag = Chain.getValue(1);
4453 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4454 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4455
4456 // Save heapallocsite metadata.
4457 if (CLI.CB)
4458 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4459 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4460
4461 // Create the CALLSEQ_END node.
4462 unsigned NumBytesForCalleeToPop;
4463 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4464 DAG.getTarget().Options.GuaranteedTailCallOpt))
4465 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4466 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4467 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4468 SR == StackStructReturn)
4469 // If this is a call to a struct-return function, the callee
4470 // pops the hidden struct pointer, so we have to push it back.
4471 // This is common for Darwin/X86, Linux & Mingw32 targets.
4472 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4473 NumBytesForCalleeToPop = 4;
4474 else
4475 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4476
4477 // Returns a flag for retval copy to use.
4478 if (!IsSibcall) {
4479 Chain = DAG.getCALLSEQ_END(Chain,
4480 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4481 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4482 true),
4483 InFlag, dl);
4484 InFlag = Chain.getValue(1);
4485 }
4486
4487 // Handle result values, copying them out of physregs into vregs that we
4488 // return.
4489 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4490 InVals, RegMask);
4491}
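
The RegMask update near the end of LowerCall clears one bit per physical register
in a packed array of 32-bit words (register R lives at bit R % 32 of word R / 32).
A minimal standalone sketch of that bit manipulation, with made-up register numbers:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // A pretend mask covering 64 registers, all initially preserved (bits set).
      uint32_t RegMask[2] = {0xffffffffu, 0xffffffffu};
      const unsigned ArgRegs[] = {3, 35, 40};     // hypothetical register numbers
      for (unsigned R : ArgRegs)
        RegMask[R / 32] &= ~(1u << (R % 32));     // mark R as clobbered by the call
      std::printf("%08x %08x\n", (unsigned)RegMask[0], (unsigned)RegMask[1]);
      // prints: fffffff7 fffffef7
      return 0;
    }
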
4492
4493//===----------------------------------------------------------------------===//
4494// Fast Calling Convention (tail call) implementation
4495//===----------------------------------------------------------------------===//
4496
4497// Like stdcall, this is a callee-cleans-arguments convention, except that ECX
4498// is reserved for storing the tail-called function address. Only 2 registers are
4499// free for argument passing (inreg). Tail call optimization is performed
4500// provided:
4501// * tailcallopt is enabled
4502// * caller/callee are fastcc
4503// On X86_64 architecture with GOT-style position independent code only local
4504// (within module) calls are supported at the moment.
4505// To keep the stack aligned according to the platform ABI, the function
4506// GetAlignedArgumentStackSize ensures that the argument delta is always a
4507// multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
4508// If a tail-called callee has more arguments than the caller, the
4509// caller needs to make sure that there is room to move the RETADDR to. This is
4510// achieved by reserving an area the size of the argument delta right after the
4511// original RETADDR, but before the saved framepointer or the spilled registers
4512// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4513// stack layout:
4514// arg1
4515// arg2
4516// RETADDR
4517// [ new RETADDR
4518// move area ]
4519// (possible EBP)
4520// ESI
4521// EDI
4522// local1 ..
4523
4524/// Make the stack size e.g. 16n + 12 so that, together with the return-address
4525/// slot, it satisfies a 16-byte alignment requirement.
4526unsigned
4527X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4528 SelectionDAG &DAG) const {
4529 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4530 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4531 assert(StackSize % SlotSize == 0 &&
4532 "StackSize must be a multiple of SlotSize");
4533 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4534}
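
A small worked example of the formula above, using a 4-byte slot and 16-byte stack
alignment (the alignTo helper below is a simplified stand-in for LLVM's, written
just for this sketch): the result is always congruent to 12 mod 16, the "16n + 12"
shape from the comment, so pushing the 4-byte return address restores 16-byte
alignment.

    #include <cstdint>
    #include <cstdio>

    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;   // round up to a multiple of Align
    }

    int main() {
      const uint64_t StackAlignment = 16, SlotSize = 4;   // 32-bit example
      for (uint64_t StackSize = 0; StackSize <= 48; StackSize += 8) {
        uint64_t Aligned = alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
        std::printf("%2llu -> %2llu (mod 16 = %llu)\n",
                    (unsigned long long)StackSize, (unsigned long long)Aligned,
                    (unsigned long long)(Aligned % 16));
      }
      return 0;
    }
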
4535
4536/// Return true if the given stack call argument is already available in the
4537/// same (relative) position in the caller's incoming argument stack.
4538static
4539bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4540 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4541 const X86InstrInfo *TII, const CCValAssign &VA) {
4542 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4543
4544 for (;;) {
4545 // Look through nodes that don't alter the bits of the incoming value.
4546 unsigned Op = Arg.getOpcode();
4547 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4548 Arg = Arg.getOperand(0);
4549 continue;
4550 }
4551 if (Op == ISD::TRUNCATE) {
4552 const SDValue &TruncInput = Arg.getOperand(0);
4553 if (TruncInput.getOpcode() == ISD::AssertZext &&
4554 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4555 Arg.getValueType()) {
4556 Arg = TruncInput.getOperand(0);
4557 continue;
4558 }
4559 }
4560 break;
4561 }
4562
4563 int FI = INT_MAX;
4564 if (Arg.getOpcode() == ISD::CopyFromReg) {
4565 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4566 if (!VR.isVirtual())
4567 return false;
4568 MachineInstr *Def = MRI->getVRegDef(VR);
4569 if (!Def)
4570 return false;
4571 if (!Flags.isByVal()) {
4572 if (!TII->isLoadFromStackSlot(*Def, FI))
4573 return false;
4574 } else {
4575 unsigned Opcode = Def->getOpcode();
4576 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4577 Opcode == X86::LEA64_32r) &&
4578 Def->getOperand(1).isFI()) {
4579 FI = Def->getOperand(1).getIndex();
4580 Bytes = Flags.getByValSize();
4581 } else
4582 return false;
4583 }
4584 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4585 if (Flags.isByVal())
4586 // ByVal argument is passed in as a pointer but it's now being
4587 // dereferenced. e.g.
4588 // define @foo(%struct.X* %A) {
4589 // tail call @bar(%struct.X* byval %A)
4590 // }
4591 return false;
4592 SDValue Ptr = Ld->getBasePtr();
4593 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4594 if (!FINode)
4595 return false;
4596 FI = FINode->getIndex();
4597 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4598 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4599 FI = FINode->getIndex();
4600 Bytes = Flags.getByValSize();
4601 } else
4602 return false;
4603
4604 assert(FI != INT_MAX);
4605 if (!MFI.isFixedObjectIndex(FI))
4606 return false;
4607
4608 if (Offset != MFI.getObjectOffset(FI))
4609 return false;
4610
4611 // If this is not byval, check that the argument stack object is immutable.
4612 // inalloca and argument copy elision can create mutable argument stack
4613 // objects. Byval objects can be mutated, but a byval call intends to pass the
4614 // mutated memory.
4615 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4616 return false;
4617
4618 if (VA.getLocVT().getFixedSizeInBits() >
4619 Arg.getValueSizeInBits().getFixedSize()) {
4620 // If the argument location is wider than the argument type, check that any
4621 // extension flags match.
4622 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4623 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4624 return false;
4625 }
4626 }
4627
4628 return Bytes == MFI.getObjectSize(FI);
4629}
4630
4631/// Check whether the call is eligible for tail call optimization. Targets
4632/// that want to do tail call optimization should implement this function.
4633bool X86TargetLowering::IsEligibleForTailCallOptimization(
4634 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4635 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4636 const SmallVectorImpl<ISD::OutputArg> &Outs,
4637 const SmallVectorImpl<SDValue> &OutVals,
4638 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4639 if (!mayTailCallThisCC(CalleeCC))
4640 return false;
4641
4642 // If -tailcallopt is specified, make fastcc functions tail-callable.
4643 MachineFunction &MF = DAG.getMachineFunction();
4644 const Function &CallerF = MF.getFunction();
4645
4646 // If the function return type is x86_fp80 and the callee return type is not,
4647 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4648 // perform a tailcall optimization here.
4649 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4650 return false;
4651
4652 CallingConv::ID CallerCC = CallerF.getCallingConv();
4653 bool CCMatch = CallerCC == CalleeCC;
4654 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4655 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4656 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4657 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4658
4659 // Win64 functions have extra shadow space for argument homing. Don't do the
4660 // sibcall if the caller and callee have mismatched expectations for this
4661 // space.
4662 if (IsCalleeWin64 != IsCallerWin64)
4663 return false;
4664
4665 if (IsGuaranteeTCO) {
4666 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4667 return true;
4668 return false;
4669 }
4670
4671 // Look for obvious safe cases to perform tail call optimization that do not
4672 // require ABI changes. This is what gcc calls sibcall.
4673
4674 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4675 // emit a special epilogue.
4676 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4677 if (RegInfo->hasStackRealignment(MF))
4678 return false;
4679
4680 // Also avoid sibcall optimization if either caller or callee uses struct
4681 // return semantics.
4682 if (isCalleeStructRet || isCallerStructRet)
4683 return false;
4684
4685 // Do not sibcall optimize vararg calls unless all arguments are passed via
4686 // registers.
4687 LLVMContext &C = *DAG.getContext();
4688 if (isVarArg && !Outs.empty()) {
4689 // Optimizing for varargs on Win64 is unlikely to be safe without
4690 // additional testing.
4691 if (IsCalleeWin64 || IsCallerWin64)
4692 return false;
4693
4694 SmallVector<CCValAssign, 16> ArgLocs;
4695 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4696
4697 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4698 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4699 if (!ArgLocs[i].isRegLoc())
4700 return false;
4701 }
4702
4703 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4704 // stack. Therefore, if it's not used by the call it is not safe to optimize
4705 // this into a sibcall.
4706 bool Unused = false;
4707 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4708 if (!Ins[i].Used) {
4709 Unused = true;
4710 break;
4711 }
4712 }
4713 if (Unused) {
4714 SmallVector<CCValAssign, 16> RVLocs;
4715 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4716 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4717 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4718 CCValAssign &VA = RVLocs[i];
4719 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4720 return false;
4721 }
4722 }
4723
4724 // Check that the call results are passed in the same way.
4725 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4726 RetCC_X86, RetCC_X86))
4727 return false;
4728 // The callee has to preserve all registers the caller needs to preserve.
4729 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4730 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4731 if (!CCMatch) {
4732 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4733 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4734 return false;
4735 }
4736
4737 unsigned StackArgsSize = 0;
4738
4739 // If the callee takes no arguments then go on to check the results of the
4740 // call.
4741 if (!Outs.empty()) {
4742 // Check if stack adjustment is needed. For now, do not do this if any
4743 // argument is passed on the stack.
4744 SmallVector<CCValAssign, 16> ArgLocs;
4745 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4746
4747 // Allocate shadow area for Win64
4748 if (IsCalleeWin64)
4749 CCInfo.AllocateStack(32, Align(8));
4750
4751 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4752 StackArgsSize = CCInfo.getNextStackOffset();
4753
4754 if (CCInfo.getNextStackOffset()) {
4755 // Check if the arguments are already laid out in the right way as
4756 // the caller's fixed stack objects.
4757 MachineFrameInfo &MFI = MF.getFrameInfo();
4758 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4759 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4760 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4761 CCValAssign &VA = ArgLocs[i];
4762 SDValue Arg = OutVals[i];
4763 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4764 if (VA.getLocInfo() == CCValAssign::Indirect)
4765 return false;
4766 if (!VA.isRegLoc()) {
4767 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4768 MFI, MRI, TII, VA))
4769 return false;
4770 }
4771 }
4772 }
4773
4774 bool PositionIndependent = isPositionIndependent();
4775 // If the tailcall address may be in a register, then make sure it's
4776 // possible to register allocate for it. In 32-bit, the call address can
4777 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4778 // callee-saved registers are restored. These happen to be the same
4779 // registers used to pass 'inreg' arguments so watch out for those.
4780 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4781 !isa<ExternalSymbolSDNode>(Callee)) ||
4782 PositionIndependent)) {
4783 unsigned NumInRegs = 0;
4784 // In PIC we need an extra register to formulate the address computation
4785 // for the callee.
4786 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4787
4788 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4789 CCValAssign &VA = ArgLocs[i];
4790 if (!VA.isRegLoc())
4791 continue;
4792 Register Reg = VA.getLocReg();
4793 switch (Reg) {
4794 default: break;
4795 case X86::EAX: case X86::EDX: case X86::ECX:
4796 if (++NumInRegs == MaxInRegs)
4797 return false;
4798 break;
4799 }
4800 }
4801 }
4802
4803 const MachineRegisterInfo &MRI = MF.getRegInfo();
4804 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4805 return false;
4806 }
4807
4808 bool CalleeWillPop =
4809 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4810 MF.getTarget().Options.GuaranteedTailCallOpt);
4811
4812 if (unsigned BytesToPop =
4813 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4814 // If we have bytes to pop, the callee must pop them.
4815 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4816 if (!CalleePopMatches)
4817 return false;
4818 } else if (CalleeWillPop && StackArgsSize > 0) {
4819 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4820 return false;
4821 }
4822
4823 return true;
4824}
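
As a rough source-level illustration of what these checks allow (a sketch only;
whether a sibcall is actually emitted depends on the optimization level and on
every condition above): a call in tail position whose arguments all fit in
registers and whose calling convention matches the caller's can be lowered to a
plain jmp.

    #include <cstdio>

    int callee(int a, int b) { return a + b; }

    // Sibcall candidate: same calling convention, no struct return, and no
    // stack-passed arguments that would have to be shuffled.
    int caller(int a, int b) { return callee(b, a); }

    int main() { std::printf("%d\n", caller(1, 2)); return 0; }
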
4825
4826FastISel *
4827X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4828 const TargetLibraryInfo *libInfo) const {
4829 return X86::createFastISel(funcInfo, libInfo);
4830}
4831
4832//===----------------------------------------------------------------------===//
4833// Other Lowering Hooks
4834//===----------------------------------------------------------------------===//
4835
4836static bool MayFoldLoad(SDValue Op) {
4837 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4838}
4839
4840static bool MayFoldIntoStore(SDValue Op) {
4841 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4842}
4843
4844static bool MayFoldIntoZeroExtend(SDValue Op) {
4845 if (Op.hasOneUse()) {
4846 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4847 return (ISD::ZERO_EXTEND == Opcode);
4848 }
4849 return false;
4850}
4851
4852static bool isTargetShuffle(unsigned Opcode) {
4853 switch(Opcode) {
4854 default: return false;
4855 case X86ISD::BLENDI:
4856 case X86ISD::PSHUFB:
4857 case X86ISD::PSHUFD:
4858 case X86ISD::PSHUFHW:
4859 case X86ISD::PSHUFLW:
4860 case X86ISD::SHUFP:
4861 case X86ISD::INSERTPS:
4862 case X86ISD::EXTRQI:
4863 case X86ISD::INSERTQI:
4864 case X86ISD::VALIGN:
4865 case X86ISD::PALIGNR:
4866 case X86ISD::VSHLDQ:
4867 case X86ISD::VSRLDQ:
4868 case X86ISD::MOVLHPS:
4869 case X86ISD::MOVHLPS:
4870 case X86ISD::MOVSHDUP:
4871 case X86ISD::MOVSLDUP:
4872 case X86ISD::MOVDDUP:
4873 case X86ISD::MOVSS:
4874 case X86ISD::MOVSD:
4875 case X86ISD::UNPCKL:
4876 case X86ISD::UNPCKH:
4877 case X86ISD::VBROADCAST:
4878 case X86ISD::VPERMILPI:
4879 case X86ISD::VPERMILPV:
4880 case X86ISD::VPERM2X128:
4881 case X86ISD::SHUF128:
4882 case X86ISD::VPERMIL2:
4883 case X86ISD::VPERMI:
4884 case X86ISD::VPPERM:
4885 case X86ISD::VPERMV:
4886 case X86ISD::VPERMV3:
4887 case X86ISD::VZEXT_MOVL:
4888 return true;
4889 }
4890}
4891
4892static bool isTargetShuffleVariableMask(unsigned Opcode) {
4893 switch (Opcode) {
4894 default: return false;
4895 // Target Shuffles.
4896 case X86ISD::PSHUFB:
4897 case X86ISD::VPERMILPV:
4898 case X86ISD::VPERMIL2:
4899 case X86ISD::VPPERM:
4900 case X86ISD::VPERMV:
4901 case X86ISD::VPERMV3:
4902 return true;
4903 // 'Faux' Target Shuffles.
4904 case ISD::OR:
4905 case ISD::AND:
4906 case X86ISD::ANDNP:
4907 return true;
4908 }
4909}
4910
4911static bool isTargetShuffleSplat(SDValue Op) {
4912 unsigned Opcode = Op.getOpcode();
4913 if (Opcode == ISD::EXTRACT_SUBVECTOR)
4914 return isTargetShuffleSplat(Op.getOperand(0));
4915 return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
4916}
4917
4918SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4919 MachineFunction &MF = DAG.getMachineFunction();
4920 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4921 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4922 int ReturnAddrIndex = FuncInfo->getRAIndex();
4923
4924 if (ReturnAddrIndex == 0) {
4925 // Set up a frame object for the return address.
4926 unsigned SlotSize = RegInfo->getSlotSize();
4927 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4928 -(int64_t)SlotSize,
4929 false);
4930 FuncInfo->setRAIndex(ReturnAddrIndex);
4931 }
4932
4933 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4934}
4935
4936bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4937 bool hasSymbolicDisplacement) {
4938 // Offset should fit into 32 bit immediate field.
4939 if (!isInt<32>(Offset))
4940 return false;
4941
4942 // If we don't have a symbolic displacement - we don't have any extra
4943 // restrictions.
4944 if (!hasSymbolicDisplacement)
4945 return true;
4946
4947 // FIXME: Some tweaks might be needed for medium code model.
4948 if (M != CodeModel::Small && M != CodeModel::Kernel)
4949 return false;
4950
4951 // For the small code model we assume that the last object ends within 16MB of
4952 // the 31-bit boundary. We may also accept fairly large negative constants,
4953 // knowing that all objects are in the positive half of the address space.
4954 if (M == CodeModel::Small && Offset < 16*1024*1024)
4955 return true;
4956
4957 // For the kernel code model we know that all objects reside in the negative
4958 // half of the 32-bit address space. We must not accept negative offsets, since
4959 // they may fall out of range, but we may accept fairly large positive ones.
4960 if (M == CodeModel::Kernel && Offset >= 0)
4961 return true;
4962
4963 return false;
4964}
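
A few concrete data points for the rules above, via a simplified restatement of
the small-code-model check (the helper name is invented for this sketch and is not
the function the backend uses): an offset of 15 MiB with a symbolic displacement is
accepted, while 17 MiB is rejected.

    #include <cstdint>
    #include <cstdio>

    static bool offsetOkSmallModel(int64_t Offset, bool HasSymbol) {
      if (Offset < INT32_MIN || Offset > INT32_MAX) return false; // must fit in 32 bits
      if (!HasSymbol) return true;                                // no extra restriction
      return Offset < 16 * 1024 * 1024;                           // small code model rule
    }

    int main() {
      std::printf("%d %d\n",
                  offsetOkSmallModel(15 * 1024 * 1024, true),   // 1: within 16MB
                  offsetOkSmallModel(17 * 1024 * 1024, true));  // 0: too large
      return 0;
    }
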
4965
4966/// Determines whether the callee is required to pop its own arguments.
4967/// Callee pop is necessary to support tail calls.
4968bool X86::isCalleePop(CallingConv::ID CallingConv,
4969 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4970 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4971 // can guarantee TCO.
4972 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4973 return true;
4974
4975 switch (CallingConv) {
4976 default:
4977 return false;
4978 case CallingConv::X86_StdCall:
4979 case CallingConv::X86_FastCall:
4980 case CallingConv::X86_ThisCall:
4981 case CallingConv::X86_VectorCall:
4982 return !is64Bit;
4983 }
4984}
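
For reference, what "callee pop" means on 32-bit x86, sketched at the source level
(the attribute syntax is GCC/Clang specific and the function is invented for this
illustration): a stdcall callee pops its own argument bytes with `ret n`, so the
caller must not adjust the stack after the call.

    #include <cstdio>

    #if defined(__i386__)
    __attribute__((stdcall)) int add3(int a, int b, int c) { return a + b + c; }
    #else
    int add3(int a, int b, int c) { return a + b + c; }  // stdcall only matters on 32-bit
    #endif

    int main() {
      // On 32-bit x86, add3 returns with `ret 12`, popping its three 4-byte
      // arguments itself; a variadic call such as printf is caller-pop instead.
      std::printf("%d\n", add3(1, 2, 3));
      return 0;
    }
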
4985
4986/// Return true if the condition is a signed comparison operation.
4987static bool isX86CCSigned(unsigned X86CC) {
4988 switch (X86CC) {
4989 default:
4990 llvm_unreachable("Invalid integer condition!");
4991 case X86::COND_E:
4992 case X86::COND_NE:
4993 case X86::COND_B:
4994 case X86::COND_A:
4995 case X86::COND_BE:
4996 case X86::COND_AE:
4997 return false;
4998 case X86::COND_G:
4999 case X86::COND_GE:
5000 case X86::COND_L:
5001 case X86::COND_LE:
5002 return true;
5003 }
5004}
5005
5006static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5007 switch (SetCCOpcode) {
5008 default: llvm_unreachable("Invalid integer condition!");
5009 case ISD::SETEQ: return X86::COND_E;
5010 case ISD::SETGT: return X86::COND_G;
5011 case ISD::SETGE: return X86::COND_GE;
5012 case ISD::SETLT: return X86::COND_L;
5013 case ISD::SETLE: return X86::COND_LE;
5014 case ISD::SETNE: return X86::COND_NE;
5015 case ISD::SETULT: return X86::COND_B;
5016 case ISD::SETUGT: return X86::COND_A;
5017 case ISD::SETULE: return X86::COND_BE;
5018 case ISD::SETUGE: return X86::COND_AE;
5019 }
5020}
5021
5022/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5023/// condition code, returning the condition code and the LHS/RHS of the
5024/// comparison to make.
5025static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5026 bool isFP, SDValue &LHS, SDValue &RHS,
5027 SelectionDAG &DAG) {
5028 if (!isFP) {
5029 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5030 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
5031 // X > -1 -> X == 0, jump !sign.
5032 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5033 return X86::COND_NS;
5034 }
5035 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
5036 // X < 0 -> X == 0, jump on sign.
5037 return X86::COND_S;
5038 }
5039 if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
5040 // X >= 0 -> X == 0, jump on !sign.
5041 return X86::COND_NS;
5042 }
5043 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5044 // X < 1 -> X <= 0
5045 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5046 return X86::COND_LE;
5047 }
5048 }
5049
5050 return TranslateIntegerX86CC(SetCCOpcode);
5051 }
5052
5053 // First determine if it is required or is profitable to flip the operands.
5054
5055 // If LHS is a foldable load, but RHS is not, flip the condition.
5056 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5057 !ISD::isNON_EXTLoad(RHS.getNode())) {
5058 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5059 std::swap(LHS, RHS);
5060 }
5061
5062 switch (SetCCOpcode) {
5063 default: break;
5064 case ISD::SETOLT:
5065 case ISD::SETOLE:
5066 case ISD::SETUGT:
5067 case ISD::SETUGE:
5068 std::swap(LHS, RHS);
5069 break;
5070 }
5071
5072 // On a floating point condition, the flags are set as follows:
5073 // ZF PF CF op
5074 // 0 | 0 | 0 | X > Y
5075 // 0 | 0 | 1 | X < Y
5076 // 1 | 0 | 0 | X == Y
5077 // 1 | 1 | 1 | unordered
5078 switch (SetCCOpcode) {
5079 default: llvm_unreachable("Condcode should be pre-legalized away");
5080 case ISD::SETUEQ:
5081 case ISD::SETEQ: return X86::COND_E;
5082 case ISD::SETOLT: // flipped
5083 case ISD::SETOGT:
5084 case ISD::SETGT: return X86::COND_A;
5085 case ISD::SETOLE: // flipped
5086 case ISD::SETOGE:
5087 case ISD::SETGE: return X86::COND_AE;
5088 case ISD::SETUGT: // flipped
5089 case ISD::SETULT:
5090 case ISD::SETLT: return X86::COND_B;
5091 case ISD::SETUGE: // flipped
5092 case ISD::SETULE:
5093 case ISD::SETLE: return X86::COND_BE;
5094 case ISD::SETONE:
5095 case ISD::SETNE: return X86::COND_NE;
5096 case ISD::SETUO: return X86::COND_P;
5097 case ISD::SETO: return X86::COND_NP;
5098 case ISD::SETOEQ:
5099 case ISD::SETUNE: return X86::COND_INVALID;
5100 }
5101}
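
The integer special cases at the top of TranslateX86CC rest on simple identities
over signed integers; the sketch below merely checks them over a small range
(illustration only).

    #include <cassert>
    #include <cstdio>

    int main() {
      for (int x = -100; x <= 100; ++x) {
        assert((x > -1) == !(x < 0));  // X > -1  <=>  sign clear  (COND_NS)
        assert((x >= 0) == !(x < 0));  // X >= 0  <=>  sign clear  (COND_NS)
        assert((x < 1) == (x <= 0));   // X < 1   <=>  X <= 0      (COND_LE vs. 0)
      }
      std::printf("identities hold\n");
      return 0;
    }
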
5102
5103/// Is there a floating point cmov for the specific X86 condition code?
5104/// The current x86 ISA includes the following FP cmov instructions:
5105/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5106static bool hasFPCMov(unsigned X86CC) {
5107 switch (X86CC) {
5108 default:
5109 return false;
5110 case X86::COND_B:
5111 case X86::COND_BE:
5112 case X86::COND_E:
5113 case X86::COND_P:
5114 case X86::COND_A:
5115 case X86::COND_AE:
5116 case X86::COND_NE:
5117 case X86::COND_NP:
5118 return true;
5119 }
5120}
5121
5122
5123bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5124 const CallInst &I,
5125 MachineFunction &MF,
5126 unsigned Intrinsic) const {
5127 Info.flags = MachineMemOperand::MONone;
5128 Info.offset = 0;
5129
5130 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5131 if (!IntrData) {
5132 switch (Intrinsic) {
5133 case Intrinsic::x86_aesenc128kl:
5134 case Intrinsic::x86_aesdec128kl:
5135 Info.opc = ISD::INTRINSIC_W_CHAIN;
5136 Info.ptrVal = I.getArgOperand(1);
5137 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5138 Info.align = Align(1);
5139 Info.flags |= MachineMemOperand::MOLoad;
5140 return true;
5141 case Intrinsic::x86_aesenc256kl:
5142 case Intrinsic::x86_aesdec256kl:
5143 Info.opc = ISD::INTRINSIC_W_CHAIN;
5144 Info.ptrVal = I.getArgOperand(1);
5145 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5146 Info.align = Align(1);
5147 Info.flags |= MachineMemOperand::MOLoad;
5148 return true;
5149 case Intrinsic::x86_aesencwide128kl:
5150 case Intrinsic::x86_aesdecwide128kl:
5151 Info.opc = ISD::INTRINSIC_W_CHAIN;
5152 Info.ptrVal = I.getArgOperand(0);
5153 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5154 Info.align = Align(1);
5155 Info.flags |= MachineMemOperand::MOLoad;
5156 return true;
5157 case Intrinsic::x86_aesencwide256kl:
5158 case Intrinsic::x86_aesdecwide256kl:
5159 Info.opc = ISD::INTRINSIC_W_CHAIN;
5160 Info.ptrVal = I.getArgOperand(0);
5161 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5162 Info.align = Align(1);
5163 Info.flags |= MachineMemOperand::MOLoad;
5164 return true;
5165 }
5166 return false;
5167 }
5168
5169 switch (IntrData->Type) {
5170 case TRUNCATE_TO_MEM_VI8:
5171 case TRUNCATE_TO_MEM_VI16:
5172 case TRUNCATE_TO_MEM_VI32: {
5173 Info.opc = ISD::INTRINSIC_VOID;
5174 Info.ptrVal = I.getArgOperand(0);
5175 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5176 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5177 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5178 ScalarVT = MVT::i8;
5179 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5180 ScalarVT = MVT::i16;
5181 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5182 ScalarVT = MVT::i32;
5183
5184 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5185 Info.align = Align(1);
5186 Info.flags |= MachineMemOperand::MOStore;
5187 break;
5188 }
5189 case GATHER:
5190 case GATHER_AVX2: {
5191 Info.opc = ISD::INTRINSIC_W_CHAIN;
5192 Info.ptrVal = nullptr;
5193 MVT DataVT = MVT::getVT(I.getType());
5194 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5195 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5196 IndexVT.getVectorNumElements());
5197 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5198 Info.align = Align(1);
5199 Info.flags |= MachineMemOperand::MOLoad;
5200 break;
5201 }
5202 case SCATTER: {
5203 Info.opc = ISD::INTRINSIC_VOID;
5204 Info.ptrVal = nullptr;
5205 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5206 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5207 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5208 IndexVT.getVectorNumElements());
5209 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5210 Info.align = Align(1);
5211 Info.flags |= MachineMemOperand::MOStore;
5212 break;
5213 }
5214 default:
5215 return false;
5216 }
5217
5218 return true;
5219}
5220
5221/// Returns true if the target can instruction select the
5222/// specified FP immediate natively. If false, the legalizer will
5223/// materialize the FP immediate as a load from a constant pool.
5224bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5225 bool ForCodeSize) const {
5226 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5227 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5228 return true;
5229 }
5230 return false;
5231}
5232
5233bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5234 ISD::LoadExtType ExtTy,
5235 EVT NewVT) const {
5236 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow")((void)0);
5237
5238 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5239 // relocation targets a movq or addq instruction: don't let the load shrink.
5240 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5241 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5242 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5243 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5244
5245 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5246 // those uses are extracted directly into a store, then the extract + store
5247 // can be store-folded. Therefore, it's probably not worth splitting the load.
5248 EVT VT = Load->getValueType(0);
5249 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5250 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5251 // Skip uses of the chain value. Result 0 of the node is the load value.
5252 if (UI.getUse().getResNo() != 0)
5253 continue;
5254
5255 // If this use is not an extract + store, it's probably worth splitting.
5256 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5257 UI->use_begin()->getOpcode() != ISD::STORE)
5258 return true;
5259 }
5260 // All non-chain uses are extract + store.
5261 return false;
5262 }
5263
5264 return true;
5265}
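// Illustrative aside (not part of the original source): a minimal sketch of
// the heuristic above on a hypothetical DAG, assuming a 256-bit AVX load t0
// whose only non-chain uses are
//   t1 = extract_subvector t0, 0   -> store t1
//   t2 = extract_subvector t0, 4   -> store t2
// Every use is an extract feeding exactly one store, so the hook returns
// false and keeps the load whole (the extracts fold into the stores). Any
// other kind of non-chain use would make it return true and allow narrowing.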
5266
5267/// Returns true if it is beneficial to convert a load of a constant
5268/// to just the constant itself.
5269bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5270 Type *Ty) const {
5271 assert(Ty->isIntegerTy())((void)0);
5272
5273 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5274 if (BitSize == 0 || BitSize > 64)
5275 return false;
5276 return true;
5277}
5278
5279bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5280 // If we are using XMM registers in the ABI and the condition of the select is
5281 // a floating-point compare and we have blendv or conditional move, then it is
5282 // cheaper to select instead of doing a cross-register move and creating a
5283 // load that depends on the compare result.
5284 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5285 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5286}
5287
5288bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5289 // TODO: It might be a win to ease or lift this restriction, but the generic
5290 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5291 if (VT.isVector() && Subtarget.hasAVX512())
5292 return false;
5293
5294 return true;
5295}
5296
5297bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5298 SDValue C) const {
5299 // TODO: We handle scalars using custom code, but generic combining could make
5300 // that unnecessary.
5301 APInt MulC;
5302 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5303 return false;
5304
5305 // Find the type this will be legalized to. Otherwise we might prematurely
5306 // convert this to shl+add/sub and then still have to type legalize those ops.
5307 // Another choice would be to defer the decision for illegal types until
5308 // after type legalization. But constant splat vectors of i64 can't make it
5309 // through type legalization on 32-bit targets so we would need to special
5310 // case vXi64.
5311 while (getTypeAction(Context, VT) != TypeLegal)
5312 VT = getTypeToTransformTo(Context, VT);
5313
5314 // If vector multiply is legal, assume that's faster than shl + add/sub.
5315 // TODO: Multiply is a complex op with higher latency and lower throughput in
5316 // most implementations, so this check could be loosened based on type
5317 // and/or a CPU attribute.
5318 if (isOperationLegal(ISD::MUL, VT))
5319 return false;
5320
5321 // shl+add, shl+sub, shl+add+neg
5322 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5323 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5324}
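// Illustrative aside (not part of the original source): splat constants that
// pass the power-of-two checks above and the shl+add/sub/neg forms they
// decompose into (assuming vector MUL is not legal for the type):
//   MulC =  3   ((MulC + 1) is a power of 2)    ->  (x << 2) - x
//   MulC =  5   ((MulC - 1) is a power of 2)    ->  (x << 2) + x
//   MulC = -3   ((1 - MulC) is a power of 2)    ->  x - (x << 2)
//   MulC = -5   ((-(MulC + 1)) is a power of 2) ->  -((x << 2) + x)
// A splat of 6 fails all four checks and is left as a multiply.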
5325
5326bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5327 unsigned Index) const {
5328 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5329 return false;
5330
5331 // Mask vectors support all subregister combinations and operations that
5332 // extract half of a vector.
5333 if (ResVT.getVectorElementType() == MVT::i1)
5334 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5335 (Index == ResVT.getVectorNumElements()));
5336
5337 return (Index % ResVT.getVectorNumElements()) == 0;
5338}
5339
5340bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5341 unsigned Opc = VecOp.getOpcode();
5342
5343 // Assume target opcodes can't be scalarized.
5344 // TODO - do we have any exceptions?
5345 if (Opc >= ISD::BUILTIN_OP_END)
5346 return false;
5347
5348 // If the vector op is not supported, try to convert to scalar.
5349 EVT VecVT = VecOp.getValueType();
5350 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5351 return true;
5352
5353 // If the vector op is supported, but the scalar op is not, the transform may
5354 // not be worthwhile.
5355 EVT ScalarVT = VecVT.getScalarType();
5356 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5357}
5358
5359bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5360 bool) const {
5361 // TODO: Allow vectors?
5362 if (VT.isVector())
5363 return false;
5364 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5365}
5366
5367bool X86TargetLowering::isCheapToSpeculateCttz() const {
5368 // Speculate cttz only if we can directly use TZCNT.
5369 return Subtarget.hasBMI();
5370}
5371
5372bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5373 // Speculate ctlz only if we can directly use LZCNT.
5374 return Subtarget.hasLZCNT();
5375}
5376
5377bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5378 const SelectionDAG &DAG,
5379 const MachineMemOperand &MMO) const {
5380 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5381 BitcastVT.getVectorElementType() == MVT::i1)
5382 return false;
5383
5384 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5385 return false;
5386
5387 // If both types are legal vectors, it's always ok to convert them.
5388 if (LoadVT.isVector() && BitcastVT.isVector() &&
5389 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5390 return true;
5391
5392 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5393}
5394
5395bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5396 const SelectionDAG &DAG) const {
5397 // Do not merge to float value size (128 bits) if no implicit
5398 // float attribute is set.
5399 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5400 Attribute::NoImplicitFloat);
5401
5402 if (NoFloat) {
5403 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5404 return (MemVT.getSizeInBits() <= MaxIntSize);
5405 }
5406 // Make sure we don't merge greater than our preferred vector
5407 // width.
5408 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5409 return false;
5410
5411 return true;
5412}
5413
5414bool X86TargetLowering::isCtlzFast() const {
5415 return Subtarget.hasFastLZCNT();
5416}
5417
5418bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5419 const Instruction &AndI) const {
5420 return true;
5421}
5422
5423bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5424 EVT VT = Y.getValueType();
5425
5426 if (VT.isVector())
5427 return false;
5428
5429 if (!Subtarget.hasBMI())
5430 return false;
5431
5432 // There are only 32-bit and 64-bit forms for 'andn'.
5433 if (VT != MVT::i32 && VT != MVT::i64)
5434 return false;
5435
5436 return !isa<ConstantSDNode>(Y);
5437}
5438
5439bool X86TargetLowering::hasAndNot(SDValue Y) const {
5440 EVT VT = Y.getValueType();
5441
5442 if (!VT.isVector())
5443 return hasAndNotCompare(Y);
5444
5445 // Vector.
5446
5447 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5448 return false;
5449
5450 if (VT == MVT::v4i32)
5451 return true;
5452
5453 return Subtarget.hasSSE2();
5454}
5455
5456bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5457 return X.getValueType().isScalarInteger(); // 'bt'
5458}
5459
5460bool X86TargetLowering::
5461 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5462 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5463 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5464 SelectionDAG &DAG) const {
5465 // Does the baseline recommend not performing the fold by default?
5466 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5467 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5468 return false;
5469 // For scalars this transform is always beneficial.
5470 if (X.getValueType().isScalarInteger())
5471 return true;
5472 // If all the shift amounts are identical, then the transform is beneficial even
5473 // with rudimentary SSE2 shifts.
5474 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5475 return true;
5476 // If we have AVX2 with its powerful shift operations, then it's also good.
5477 if (Subtarget.hasAVX2())
5478 return true;
5479 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
5480 return NewShiftOpcode == ISD::SHL;
5481}
5482
5483bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5484 const SDNode *N, CombineLevel Level) const {
5485 assert(((N->getOpcode() == ISD::SHL &&((void)0)
5486 N->getOperand(0).getOpcode() == ISD::SRL) ||((void)0)
5487 (N->getOpcode() == ISD::SRL &&((void)0)
5488 N->getOperand(0).getOpcode() == ISD::SHL)) &&((void)0)
5489 "Expected shift-shift mask")((void)0);
5490 EVT VT = N->getValueType(0);
5491 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5492 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5493 // Only fold if the shift values are equal - so it folds to AND.
5494 // TODO - we should fold if either is a non-uniform vector but we don't do
5495 // the fold for non-splats yet.
5496 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5497 }
5498 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5499}
5500
5501bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5502 EVT VT = Y.getValueType();
5503
5504 // For vectors, we don't have a preference, but we probably want a mask.
5505 if (VT.isVector())
5506 return false;
5507
5508 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5509 if (VT == MVT::i64 && !Subtarget.is64Bit())
5510 return false;
5511
5512 return true;
5513}
5514
5515bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5516 SDNode *N) const {
5517 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5518 !Subtarget.isOSWindows())
5519 return false;
5520 return true;
5521}
5522
5523bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5524 // Any legal vector type can be splatted more efficiently than
5525 // loading/spilling from memory.
5526 return isTypeLegal(VT);
5527}
5528
5529MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5530 MVT VT = MVT::getIntegerVT(NumBits);
5531 if (isTypeLegal(VT))
5532 return VT;
5533
5534 // PMOVMSKB can handle this.
5535 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5536 return MVT::v16i8;
5537
5538 // VPMOVMSKB can handle this.
5539 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5540 return MVT::v32i8;
5541
5542 // TODO: Allow 64-bit type for 32-bit target.
5543 // TODO: 512-bit types should be allowed, but make sure that those
5544 // cases are handled in combineVectorSizedSetCCEquality().
5545
5546 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5547}
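// Illustrative aside (not part of the original source): what the hook above
// returns for a few widths, assuming a 64-bit target with AVX2:
//   NumBits =  64  ->  MVT::i64                    (plain integer compare)
//   NumBits = 128  ->  MVT::v16i8                  (compare + PMOVMSKB)
//   NumBits = 256  ->  MVT::v32i8                  (compare + VPMOVMSKB)
//   NumBits = 512  ->  INVALID_SIMPLE_VALUE_TYPE   (not handled yet, see TODO)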
5548
5549/// Val is the undef sentinel value or equal to the specified value.
5550static bool isUndefOrEqual(int Val, int CmpVal) {
5551 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5552}
5553
5554/// Return true if every element in Mask is the undef sentinel value or equal to
5555/// the specified value.
5556static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5557 return llvm::all_of(Mask, [CmpVal](int M) {
5558 return (M == SM_SentinelUndef) || (M == CmpVal);
5559 });
5560}
5561
5562/// Val is either the undef or zero sentinel value.
5563static bool isUndefOrZero(int Val) {
5564 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5565}
5566
5567/// Return true if every element in Mask, beginning from position Pos and ending
5568/// in Pos+Size, is the undef sentinel value.
5569static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5570 return llvm::all_of(Mask.slice(Pos, Size),
5571 [](int M) { return M == SM_SentinelUndef; });
5572}
5573
5574/// Return true if the mask creates a vector whose lower half is undefined.
5575static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5576 unsigned NumElts = Mask.size();
5577 return isUndefInRange(Mask, 0, NumElts / 2);
5578}
5579
5580/// Return true if the mask creates a vector whose upper half is undefined.
5581static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5582 unsigned NumElts = Mask.size();
5583 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5584}
5585
5586/// Return true if Val falls within the specified range [Low, Hi).
5587static bool isInRange(int Val, int Low, int Hi) {
5588 return (Val >= Low && Val < Hi);
5589}
5590
5591/// Return true if the value of any element in Mask falls within the specified
5592/// range [Low, Hi).
5593static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5594 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5595}
5596
5597/// Return true if the value of any element in Mask is the zero sentinel value.
5598static bool isAnyZero(ArrayRef<int> Mask) {
5599 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5600}
5601
5602/// Return true if the value of any element in Mask is the zero or undef
5603/// sentinel values.
5604static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5605 return llvm::any_of(Mask, [](int M) {
5606 return M == SM_SentinelZero || M == SM_SentinelUndef;
5607 });
5608}
5609
5610/// Return true if Val is undef or if its value falls within the
5611/// specified range [Low, Hi).
5612static bool isUndefOrInRange(int Val, int Low, int Hi) {
5613 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5614}
5615
5616/// Return true if every element in Mask is undef or if its value
5617/// falls within the specified range [Low, Hi).
5618static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5619 return llvm::all_of(
5620 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5621}
5622
5623/// Return true if Val is undef, zero or if its value falls within the
5624/// specified range [Low, Hi).
5625static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5626 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5627}
5628
5629/// Return true if every element in Mask is undef, zero or if its value
5630/// falls within the specified range [Low, Hi).
5631static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5632 return llvm::all_of(
5633 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5634}
5635
5636/// Return true if every element in Mask, beginning
5637/// from position Pos and ending in Pos + Size, falls within the specified
5638/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5639static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5640 unsigned Size, int Low, int Step = 1) {
5641 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5642 if (!isUndefOrEqual(Mask[i], Low))
5643 return false;
5644 return true;
5645}
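// Illustrative aside (not part of the original source), assuming the usual
// sentinel SM_SentinelUndef == -1:
//   Mask = <4, -1, 6, 7>, Pos = 0, Size = 4, Low = 4            -> true
//   Mask = <0, 2, 4, 6>,  Pos = 0, Size = 4, Low = 0, Step = 2  -> true
//   Mask = <4, 5, 7, 7>,  Pos = 0, Size = 4, Low = 4            -> false (7 != 6)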
5646
5647/// Return true if every element in Mask, beginning
5648/// from position Pos and ending in Pos+Size, falls within the specified
5649/// sequential range [Low, Low+Size), or is undef or is zero.
5650static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5651 unsigned Size, int Low,
5652 int Step = 1) {
5653 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5654 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5655 return false;
5656 return true;
5657}
5658
5659/// Return true if every element in Mask, beginning
5660/// from position Pos and ending in Pos+Size is undef or is zero.
5661static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5662 unsigned Size) {
5663 return llvm::all_of(Mask.slice(Pos, Size),
5664 [](int M) { return isUndefOrZero(M); });
5665}
5666
5667/// Helper function to test whether a shuffle mask could be
5668/// simplified by widening the elements being shuffled.
5669///
5670/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5671/// leaves it in an unspecified state.
5672///
5673/// NOTE: This must handle normal vector shuffle masks and *target* vector
5674/// shuffle masks. The latter have the special property of a '-2' representing
5675/// a zeroed lane of a vector.
5676static bool canWidenShuffleElements(ArrayRef<int> Mask,
5677 SmallVectorImpl<int> &WidenedMask) {
5678 WidenedMask.assign(Mask.size() / 2, 0);
5679 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5680 int M0 = Mask[i];
5681 int M1 = Mask[i + 1];
5682
5683 // If both elements are undef, it's trivial.
5684 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5685 WidenedMask[i / 2] = SM_SentinelUndef;
5686 continue;
5687 }
5688
5689 // Check for an undef mask and a mask value properly aligned to fit with
5690 // a pair of values. If we find such a case, use the non-undef mask's value.
5691 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5692 WidenedMask[i / 2] = M1 / 2;
5693 continue;
5694 }
5695 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5696 WidenedMask[i / 2] = M0 / 2;
5697 continue;
5698 }
5699
5700 // When zeroing, we need to spread the zeroing across both lanes to widen.
5701 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5702 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5703 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5704 WidenedMask[i / 2] = SM_SentinelZero;
5705 continue;
5706 }
5707 return false;
5708 }
5709
5710 // Finally check if the two mask values are adjacent and aligned with
5711 // a pair.
5712 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5713 WidenedMask[i / 2] = M0 / 2;
5714 continue;
5715 }
5716
5717 // Otherwise we can't safely widen the elements used in this shuffle.
5718 return false;
5719 }
5720 assert(WidenedMask.size() == Mask.size() / 2 &&((void)0)
5721 "Incorrect size of mask after widening the elements!")((void)0);
5722
5723 return true;
5724}
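// Illustrative aside (not part of the original source), assuming the usual
// sentinel values SM_SentinelUndef == -1 and SM_SentinelZero == -2:
//   < 0,  1,  2,  3>  ->  < 0, 1>   (adjacent, even-aligned pairs)
//   <-1,  3,  4,  5>  ->  < 1, 2>   (undef completed from the odd element)
//   <-2, -1,  6,  7>  ->  <-2, 3>   (zero spread across the whole pair)
//   < 0,  2,  4,  5>  ->  fails     (0 and 2 are not an adjacent pair)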
5725
5726static bool canWidenShuffleElements(ArrayRef<int> Mask,
5727 const APInt &Zeroable,
5728 bool V2IsZero,
5729 SmallVectorImpl<int> &WidenedMask) {
5730 // Create an alternative mask with info about zeroable elements.
5731 // Here we do not set undef elements as zeroable.
5732 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5733 if (V2IsZero) {
5734 assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!")((void)0);
5735 for (int i = 0, Size = Mask.size(); i != Size; ++i)
5736 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5737 ZeroableMask[i] = SM_SentinelZero;
5738 }
5739 return canWidenShuffleElements(ZeroableMask, WidenedMask);
5740}
5741
5742static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5743 SmallVector<int, 32> WidenedMask;
5744 return canWidenShuffleElements(Mask, WidenedMask);
5745}
5746
5747// Attempt to narrow/widen shuffle mask until it matches the target number of
5748// elements.
5749static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
5750 SmallVectorImpl<int> &ScaledMask) {
5751 unsigned NumSrcElts = Mask.size();
5752 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&((void)0)
5753 "Illegal shuffle scale factor")((void)0);
5754
5755 // Narrowing is guaranteed to work.
5756 if (NumDstElts >= NumSrcElts) {
5757 int Scale = NumDstElts / NumSrcElts;
5758 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
5759 return true;
5760 }
5761
5762 // We have to repeat the widening until we reach the target size, but we can
5763 // split out the first widening as it sets up ScaledMask for us.
5764 if (canWidenShuffleElements(Mask, ScaledMask)) {
5765 while (ScaledMask.size() > NumDstElts) {
5766 SmallVector<int, 16> WidenedMask;
5767 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
5768 return false;
5769 ScaledMask = std::move(WidenedMask);
5770 }
5771 return true;
5772 }
5773
5774 return false;
5775}
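// Illustrative aside (not part of the original source):
//   Mask = <0, 3>,        NumDstElts = 4  ->  <0, 1, 6, 7>  (narrowed 2x)
//   Mask = <0, 1, 6, 7>,  NumDstElts = 2  ->  <0, 3>        (widened 2x)
//   Mask = <0, 2, 1, 3>,  NumDstElts = 2  ->  fails (0 and 2 can't be paired)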
5776
5777/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5778bool X86::isZeroNode(SDValue Elt) {
5779 return isNullConstant(Elt) || isNullFPConstant(Elt);
5780}
5781
5782// Build a vector of constants.
5783// Use an UNDEF node if MaskElt == -1.
5784// Split 64-bit constants in 32-bit mode.
5785static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5786 const SDLoc &dl, bool IsMask = false) {
5787
5788 SmallVector<SDValue, 32> Ops;
5789 bool Split = false;
5790
5791 MVT ConstVecVT = VT;
5792 unsigned NumElts = VT.getVectorNumElements();
5793 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5794 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5795 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5796 Split = true;
5797 }
5798
5799 MVT EltVT = ConstVecVT.getVectorElementType();
5800 for (unsigned i = 0; i < NumElts; ++i) {
5801 bool IsUndef = Values[i] < 0 && IsMask;
5802 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5803 DAG.getConstant(Values[i], dl, EltVT);
5804 Ops.push_back(OpNode);
5805 if (Split)
5806 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5807 DAG.getConstant(0, dl, EltVT));
5808 }
5809 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5810 if (Split)
5811 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5812 return ConstsNode;
5813}
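// Illustrative aside (not part of the original source): on a 32-bit target
// (where i64 is not legal), Values = {3, -1} with VT = v2i64 and IsMask = true
// is built as the v4i32 vector <3, 0, undef, undef> (each 64-bit lane emitted
// as a lo/hi pair of 32-bit constants) and then bitcast back to v2i64.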
5814
5815static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5816 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5817 assert(Bits.size() == Undefs.getBitWidth() &&((void)0)
5818 "Unequal constant and undef arrays")((void)0);
5819 SmallVector<SDValue, 32> Ops;
5820 bool Split = false;
5821
5822 MVT ConstVecVT = VT;
5823 unsigned NumElts = VT.getVectorNumElements();
5824 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5825 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5826 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5827 Split = true;
5828 }
5829
5830 MVT EltVT = ConstVecVT.getVectorElementType();
5831 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5832 if (Undefs[i]) {
5833 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5834 continue;
5835 }
5836 const APInt &V = Bits[i];
5837 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes")((void)0);
5838 if (Split) {
5839 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5840 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5841 } else if (EltVT == MVT::f32) {
5842 APFloat FV(APFloat::IEEEsingle(), V);
5843 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5844 } else if (EltVT == MVT::f64) {
5845 APFloat FV(APFloat::IEEEdouble(), V);
5846 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5847 } else {
5848 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5849 }
5850 }
5851
5852 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5853 return DAG.getBitcast(VT, ConstsNode);
5854}
5855
5856/// Returns a vector of specified type with all zero elements.
5857static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5858 SelectionDAG &DAG, const SDLoc &dl) {
5859 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||((void)0)
5860 VT.getVectorElementType() == MVT::i1) &&((void)0)
5861 "Unexpected vector type")((void)0);
5862
5863 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5864 // type. This ensures they get CSE'd. But if the integer type is not
5865 // available, use a floating-point +0.0 instead.
5866 SDValue Vec;
5867 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5868 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5869 } else if (VT.isFloatingPoint()) {
5870 Vec = DAG.getConstantFP(+0.0, dl, VT);
5871 } else if (VT.getVectorElementType() == MVT::i1) {
5872 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&((void)0)
5873 "Unexpected vector type")((void)0);
5874 Vec = DAG.getConstant(0, dl, VT);
5875 } else {
5876 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5877 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5878 }
5879 return DAG.getBitcast(VT, Vec);
5880}
5881
5882static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5883 const SDLoc &dl, unsigned vectorWidth) {
5884 EVT VT = Vec.getValueType();
5885 EVT ElVT = VT.getVectorElementType();
5886 unsigned Factor = VT.getSizeInBits() / vectorWidth;
5887 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5888 VT.getVectorNumElements() / Factor);
5889
5890 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5891 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5892 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2")((void)0);
5893
5894 // This is the index of the first element of the vectorWidth-bit chunk
5895 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5896 IdxVal &= ~(ElemsPerChunk - 1);
5897
5898 // If the input is a buildvector just emit a smaller one.
5899 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5900 return DAG.getBuildVector(ResultVT, dl,
5901 Vec->ops().slice(IdxVal, ElemsPerChunk));
5902
5903 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5904 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5905}
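// Illustrative aside (not part of the original source): for Vec = v8i32 and
// vectorWidth = 128, ElemsPerChunk is 4, so IdxVal = 5 is rounded down to 4
// and the call returns elements 4..7 as a v4i32 (via EXTRACT_SUBVECTOR, or a
// smaller BUILD_VECTOR if Vec itself is a BUILD_VECTOR).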
5906
5907/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5908/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5909/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5910/// instructions or a simple subregister reference. Idx is an index in the
5911/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5912/// lowering EXTRACT_VECTOR_ELT operations easier.
5913static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5914 SelectionDAG &DAG, const SDLoc &dl) {
5915 assert((Vec.getValueType().is256BitVector() ||((void)0)
5916 Vec.getValueType().is512BitVector()) && "Unexpected vector size!")((void)0);
5917 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5918}
5919
5920/// Generate a DAG to grab 256-bits from a 512-bit vector.
5921static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5922 SelectionDAG &DAG, const SDLoc &dl) {
5923 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!")((void)0);
5924 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5925}
5926
5927static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5928 SelectionDAG &DAG, const SDLoc &dl,
5929 unsigned vectorWidth) {
5930 assert((vectorWidth == 128 || vectorWidth == 256) &&((void)0)
5931 "Unsupported vector width")((void)0);
5932 // Inserting UNDEF just returns Result.
5933 if (Vec.isUndef())
5934 return Result;
5935 EVT VT = Vec.getValueType();
5936 EVT ElVT = VT.getVectorElementType();
5937 EVT ResultVT = Result.getValueType();
5938
5939 // Insert the relevant vectorWidth bits.
5940 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5941 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2")((void)0);
5942
5943 // This is the index of the first element of the vectorWidth-bit chunk
5944 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5945 IdxVal &= ~(ElemsPerChunk - 1);
5946
5947 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5948 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5949}
5950
5951/// Generate a DAG to put 128-bits into a vector > 128 bits. This
5952/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5953/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5954/// simple superregister reference. Idx is an index in the 128 bits
5955/// we want. It need not be aligned to a 128-bit boundary. That makes
5956/// lowering INSERT_VECTOR_ELT operations easier.
5957static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5958 SelectionDAG &DAG, const SDLoc &dl) {
5959 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!")((void)0);
5960 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5961}
5962
5963/// Widen a vector to a larger size with the same scalar type, with the new
5964/// elements either zero or undef.
5965static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5966 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5967 const SDLoc &dl) {
5968 assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&((void)0)
5969 Vec.getValueType().getScalarType() == VT.getScalarType() &&((void)0)
5970 "Unsupported vector widening type")((void)0);
5971 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5972 : DAG.getUNDEF(VT);
5973 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5974 DAG.getIntPtrConstant(0, dl));
5975}
5976
5977/// Widen a vector to a larger size with the same scalar type, with the new
5978/// elements either zero or undef.
5979static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5980 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5981 const SDLoc &dl, unsigned WideSizeInBits) {
5982 assert(Vec.getValueSizeInBits() < WideSizeInBits &&((void)0)
5983 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&((void)0)
5984 "Unsupported vector widening type")((void)0);
5985 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5986 MVT SVT = Vec.getSimpleValueType().getScalarType();
5987 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5988 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5989}
5990
5991// Helper function to collect subvector ops that are concatenated together,
5992// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
5993// The subvectors in Ops are guaranteed to be the same type.
5994static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
5995 assert(Ops.empty() && "Expected an empty ops vector")((void)0);
5996
5997 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
5998 Ops.append(N->op_begin(), N->op_end());
5999 return true;
6000 }
6001
6002 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6003 SDValue Src = N->getOperand(0);
6004 SDValue Sub = N->getOperand(1);
6005 const APInt &Idx = N->getConstantOperandAPInt(2);
6006 EVT VT = Src.getValueType();
6007 EVT SubVT = Sub.getValueType();
6008
6009 // TODO - Handle more general insert_subvector chains.
6010 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6011 Idx == (VT.getVectorNumElements() / 2)) {
6012 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6013 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6014 Src.getOperand(1).getValueType() == SubVT &&
6015 isNullConstant(Src.getOperand(2))) {
6016 Ops.push_back(Src.getOperand(1));
6017 Ops.push_back(Sub);
6018 return true;
6019 }
6020 // insert_subvector(x, extract_subvector(x, lo), hi)
6021 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6022 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6023 Ops.append(2, Sub);
6024 return true;
6025 }
6026 }
6027 }
6028
6029 return false;
6030}
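// Illustrative aside (not part of the original source): the patterns collected
// above, shown for a v8i32 node built from v4i32 halves:
//   concat_vectors(x, y)                                   -> Ops = {x, y}
//   insert_subvector(insert_subvector(undef, x, 0), y, 4)  -> Ops = {x, y}
//   insert_subvector(x, extract_subvector(x, 0), 4)        -> Ops = {lo, lo},
//                                        where lo = extract_subvector(x, 0)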
6031
6032static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6033 const SDLoc &dl) {
6034 EVT VT = Op.getValueType();
6035 unsigned NumElems = VT.getVectorNumElements();
6036 unsigned SizeInBits = VT.getSizeInBits();
6037 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&((void)0)
6038 "Can't split odd sized vector")((void)0);
6039
6040 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6041 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6042 return std::make_pair(Lo, Hi);
6043}
6044
6045// Split a unary integer op into 2 half-sized ops.
6046static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6047 EVT VT = Op.getValueType();
6048
6049 // Make sure we only try to split 256/512-bit types to avoid creating
6050 // narrow vectors.
6051 assert((Op.getOperand(0).getValueType().is256BitVector() ||((void)0)
6052 Op.getOperand(0).getValueType().is512BitVector()) &&((void)0)
6053 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!")((void)0);
6054 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==((void)0)
6055 VT.getVectorNumElements() &&((void)0)
6056 "Unexpected VTs!")((void)0);
6057
6058 SDLoc dl(Op);
6059
6060 // Extract the Lo/Hi vectors
6061 SDValue Lo, Hi;
6062 std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
6063
6064 EVT LoVT, HiVT;
6065 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6066 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6067 DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
6068 DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
6069}
6070
6071/// Break a binary integer operation into 2 half-sized ops and then
6072/// concatenate the result back.
6073static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6074 EVT VT = Op.getValueType();
6075
6076 // Sanity check that all the types match.
6077 assert(Op.getOperand(0).getValueType() == VT &&((void)0)
6078 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!")((void)0);
6079 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!")((void)0);
6080
6081 SDLoc dl(Op);
6082
6083 // Extract the LHS Lo/Hi vectors
6084 SDValue LHS1, LHS2;
6085 std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
6086
6087 // Extract the RHS Lo/Hi vectors
6088 SDValue RHS1, RHS2;
6089 std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
6090
6091 EVT LoVT, HiVT;
6092 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6093 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6094 DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
6095 DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
6096}
6097
6098// Helper for splitting operands of an operation to legal target size and
6099// applying a function on each part.
6100// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6101// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6102// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6103// The argument Builder is a function that will be applied on each split part:
6104// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6105template <typename F>
6106SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6107 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6108 F Builder, bool CheckBWI = true) {
6109 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2")((void)0);
6110 unsigned NumSubs = 1;
6111 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6112 (!CheckBWI && Subtarget.useAVX512Regs())) {
6113 if (VT.getSizeInBits() > 512) {
6114 NumSubs = VT.getSizeInBits() / 512;
6115 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size")((void)0);
6116 }
6117 } else if (Subtarget.hasAVX2()) {
6118 if (VT.getSizeInBits() > 256) {
6119 NumSubs = VT.getSizeInBits() / 256;
6120 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size")((void)0);
6121 }
6122 } else {
6123 if (VT.getSizeInBits() > 128) {
6124 NumSubs = VT.getSizeInBits() / 128;
6125 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size")((void)0);
6126 }
6127 }
6128
6129 if (NumSubs == 1)
6130 return Builder(DAG, DL, Ops);
6131
6132 SmallVector<SDValue, 4> Subs;
6133 for (unsigned i = 0; i != NumSubs; ++i) {
6134 SmallVector<SDValue, 2> SubOps;
6135 for (SDValue Op : Ops) {
6136 EVT OpVT = Op.getValueType();
6137 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6138 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6139 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6140 }
6141 Subs.push_back(Builder(DAG, DL, SubOps));
6142 }
6143 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6144}
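// Illustrative aside (not part of the original source): with CheckBWI = true
// on an AVX2-only target, a 512-bit VT gives NumSubs = 2, so every operand is
// split into its two 256-bit halves, Builder is invoked once per half, and the
// two results are reassembled with CONCAT_VECTORS. On an AVX512BW target the
// same call keeps the 512-bit operands intact (NumSubs = 1).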
6145
6146/// Insert an i1 subvector into an i1 vector.
6147static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6148 const X86Subtarget &Subtarget) {
6149
6150 SDLoc dl(Op);
6151 SDValue Vec = Op.getOperand(0);
6152 SDValue SubVec = Op.getOperand(1);
6153 SDValue Idx = Op.getOperand(2);
6154 unsigned IdxVal = Op.getConstantOperandVal(2);
6155
6156 // Inserting undef is a nop. We can just return the original vector.
6157 if (SubVec.isUndef())
6158 return Vec;
6159
6160 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6161 return Op;
6162
6163 MVT OpVT = Op.getSimpleValueType();
6164 unsigned NumElems = OpVT.getVectorNumElements();
6165 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6166
6167 // Extend to natively supported kshift.
6168 MVT WideOpVT = OpVT;
6169 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6170 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6171
6172 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6173 // if necessary.
6174 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6175 // May need to promote to a legal type.
6176 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6177 DAG.getConstant(0, dl, WideOpVT),
6178 SubVec, Idx);
6179 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6180 }
6181
6182 MVT SubVecVT = SubVec.getSimpleValueType();
6183 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6184 assert(IdxVal + SubVecNumElems <= NumElems &&((void)0)
6185 IdxVal % SubVecVT.getSizeInBits() == 0 &&((void)0)
6186 "Unexpected index value in INSERT_SUBVECTOR")((void)0);
6187
6188 SDValue Undef = DAG.getUNDEF(WideOpVT);
6189
6190 if (IdxVal == 0) {
6191 // Zero lower bits of the Vec
6192 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6193 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6194 ZeroIdx);
6195 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6196 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6197 // Merge them together, SubVec should be zero extended.
6198 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6199 DAG.getConstant(0, dl, WideOpVT),
6200 SubVec, ZeroIdx);
6201 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6202 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6203 }
6204
6205 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6206 Undef, SubVec, ZeroIdx);
6207
6208 if (Vec.isUndef()) {
6209 assert(IdxVal != 0 && "Unexpected index")((void)0);
6210 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6211 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6212 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6213 }
6214
6215 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6216 assert(IdxVal != 0 && "Unexpected index")((void)0);
6217 NumElems = WideOpVT.getVectorNumElements();
6218 unsigned ShiftLeft = NumElems - SubVecNumElems;
6219 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6220 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6221 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6222 if (ShiftRight != 0)
6223 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6224 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6225 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6226 }
6227
6228 // Simple case when we put the subvector in the upper part.
6229 if (IdxVal + SubVecNumElems == NumElems) {
6230 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6231 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6232 if (SubVecNumElems * 2 == NumElems) {
6233 // Special case, use legal zero extending insert_subvector. This allows
6234 // isel to optimize when bits are known zero.
6235 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6236 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6237 DAG.getConstant(0, dl, WideOpVT),
6238 Vec, ZeroIdx);
6239 } else {
6240 // Otherwise use explicit shifts to zero the bits.
6241 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6242 Undef, Vec, ZeroIdx);
6243 NumElems = WideOpVT.getVectorNumElements();
6244 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6245 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6246 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6247 }
6248 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6249 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6250 }
6251
6252 // Inserting into the middle is more complicated.
6253
6254 NumElems = WideOpVT.getVectorNumElements();
6255
6256 // Widen the vector if needed.
6257 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6258
6259 unsigned ShiftLeft = NumElems - SubVecNumElems;
6260 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6261
6262 // Do an optimization for the most frequently used types.
6263 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6264 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6265 Mask0.flipAllBits();
6266 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6267 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6268 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6269 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6270 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6271 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6272 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6273 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6274
6275 // Reduce to original width if needed.
6276 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6277 }
6278
6279 // Clear the upper bits of the subvector and move it to its insert position.
6280 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6281 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6282 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6283 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6284
6285 // Isolate the bits below the insertion point.
6286 unsigned LowShift = NumElems - IdxVal;
6287 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6288 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6289 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6290 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6291
6292 // Isolate the bits after the last inserted bit.
6293 unsigned HighShift = IdxVal + SubVecNumElems;
6294 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6295 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6296 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6297 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6298
6299 // Now OR all 3 pieces together.
6300 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6301 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6302
6303 // Reduce to original width if needed.
6304 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6305}
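// Illustrative aside (not part of the original source): inserting a v8i1
// subvector into the upper half of a v16i1 vector (IdxVal = 8) takes the
// "upper part" path above: the subvector is widened to v16i1 and shifted left
// by 8 with KSHIFTL, the low half of the destination is re-inserted into a
// zero v16i1 (the zero-extending insert that isel can optimize), and the two
// halves are combined with OR.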
6306
6307static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6308 const SDLoc &dl) {
6309 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch")((void)0);
6310 EVT SubVT = V1.getValueType();
6311 EVT SubSVT = SubVT.getScalarType();
6312 unsigned SubNumElts = SubVT.getVectorNumElements();
6313 unsigned SubVectorWidth = SubVT.getSizeInBits();
6314 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6315 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6316 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6317}
6318
6319/// Returns a vector of specified type with all bits set.
6320/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6321/// Then bitcast to their original type, ensuring they get CSE'd.
6322static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6323 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&((void)0)
6324 "Expected a 128/256/512-bit vector type")((void)0);
6325
6326 APInt Ones = APInt::getAllOnesValue(32);
6327 unsigned NumElts = VT.getSizeInBits() / 32;
6328 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6329 return DAG.getBitcast(VT, Vec);
6330}
6331
6332// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6333static unsigned getOpcode_EXTEND(unsigned Opcode) {
6334 switch (Opcode) {
6335 case ISD::ANY_EXTEND:
6336 case ISD::ANY_EXTEND_VECTOR_INREG:
6337 return ISD::ANY_EXTEND;
6338 case ISD::ZERO_EXTEND:
6339 case ISD::ZERO_EXTEND_VECTOR_INREG:
6340 return ISD::ZERO_EXTEND;
6341 case ISD::SIGN_EXTEND:
6342 case ISD::SIGN_EXTEND_VECTOR_INREG:
6343 return ISD::SIGN_EXTEND;
6344 }
6345 llvm_unreachable("Unknown opcode")__builtin_unreachable();
6346}
6347
6348// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6349static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6350 switch (Opcode) {
6351 case ISD::ANY_EXTEND:
6352 case ISD::ANY_EXTEND_VECTOR_INREG:
6353 return ISD::ANY_EXTEND_VECTOR_INREG;
6354 case ISD::ZERO_EXTEND:
6355 case ISD::ZERO_EXTEND_VECTOR_INREG:
6356 return ISD::ZERO_EXTEND_VECTOR_INREG;
6357 case ISD::SIGN_EXTEND:
6358 case ISD::SIGN_EXTEND_VECTOR_INREG:
6359 return ISD::SIGN_EXTEND_VECTOR_INREG;
6360 }
6361 llvm_unreachable("Unknown opcode")__builtin_unreachable();
6362}
6363
6364static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6365 SDValue In, SelectionDAG &DAG) {
6366 EVT InVT = In.getValueType();
6367 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.")((void)0);
6368 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||((void)0)
6369 ISD::ZERO_EXTEND == Opcode) &&((void)0)
6370 "Unknown extension opcode")((void)0);
6371
6372 // For 256-bit vectors, we only need the lower (128-bit) input half.
6373 // For 512-bit vectors, we only need the lower input half or quarter.
6374 if (InVT.getSizeInBits() > 128) {
6375 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&((void)0)
6376 "Expected VTs to be the same size!")((void)0);
6377 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6378 In = extractSubVector(In, 0, DAG, DL,
6379 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6380 InVT = In.getValueType();
6381 }
6382
6383 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6384 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6385
6386 return DAG.getNode(Opcode, DL, VT, In);
6387}
6388
6389// Match (xor X, -1) -> X.
6390// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6391// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6392static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6393 V = peekThroughBitcasts(V);
6394 if (V.getOpcode() == ISD::XOR &&
6395 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6396 return V.getOperand(0);
6397 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6398 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6399 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6400 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6401 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6402 Not, V.getOperand(1));
6403 }
6404 }
6405 SmallVector<SDValue, 2> CatOps;
6406 if (collectConcatOps(V.getNode(), CatOps)) {
6407 for (SDValue &CatOp : CatOps) {
6408 SDValue NotCat = IsNOT(CatOp, DAG);
6409 if (!NotCat) return SDValue();
6410 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6411 }
6412 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6413 }
6414 return SDValue();
6415}
6416
6417void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6418 bool Lo, bool Unary) {
6419 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&((void)0)
6420 "Illegal vector type to unpack")((void)0);
6421 assert(Mask.empty() && "Expected an empty shuffle mask vector")((void)0);
6422 int NumElts = VT.getVectorNumElements();
6423 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6424 for (int i = 0; i < NumElts; ++i) {
6425 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6426 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6427 Pos += (Unary ? 0 : NumElts * (i % 2));
6428 Pos += (Lo ? 0 : NumEltsInLane / 2);
6429 Mask.push_back(Pos);
6430 }
6431}
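// Illustrative aside (not part of the original source): for VT = v8i16 (a
// single 128-bit lane) this produces the classic punpck masks:
//   Lo, binary (Unary = false):  <0, 8, 1, 9, 2, 10, 3, 11>
//   Hi, binary (Unary = false):  <4, 12, 5, 13, 6, 14, 7, 15>
//   Lo, unary  (Unary = true):   <0, 0, 1, 1, 2, 2, 3, 3>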
6432
6433/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6434/// imposed by AVX and specific to the unary pattern. Example:
6435/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6436/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6437void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6438 bool Lo) {
6439 assert(Mask.empty() && "Expected an empty shuffle mask vector")((void)0);
6440 int NumElts = VT.getVectorNumElements();
6441 for (int i = 0; i < NumElts; ++i) {
6442 int Pos = i / 2;
6443 Pos += (Lo ? 0 : NumElts / 2);
6444 Mask.push_back(Pos);
6445 }
6446}
6447
6448/// Returns a vector_shuffle node for an unpackl operation.
6449static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6450 SDValue V1, SDValue V2) {
6451 SmallVector<int, 8> Mask;
6452 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6453 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6454}
6455
6456/// Returns a vector_shuffle node for an unpackh operation.
6457static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6458 SDValue V1, SDValue V2) {
6459 SmallVector<int, 8> Mask;
6460 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6461 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6462}
6463
6464/// Return a vector_shuffle of the specified vector and a zero or undef vector.
6465/// This produces a shuffle where the low element of V2 is swizzled into the
6466/// zero/undef vector, landing at element Idx.
6467/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6468static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6469 bool IsZero,
6470 const X86Subtarget &Subtarget,
6471 SelectionDAG &DAG) {
6472 MVT VT = V2.getSimpleValueType();
6473 SDValue V1 = IsZero
6474 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6475 int NumElems = VT.getVectorNumElements();
6476 SmallVector<int, 16> MaskVec(NumElems);
6477 for (int i = 0; i != NumElems; ++i)
6478 // If this is the insertion idx, put the low elt of V2 here.
6479 MaskVec[i] = (i == Idx) ? NumElems : i;
6480 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6481}
6482
6483static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6484 if (Ptr.getOpcode() == X86ISD::Wrapper ||
6485 Ptr.getOpcode() == X86ISD::WrapperRIP)
6486 Ptr = Ptr.getOperand(0);
6487
6488 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6489 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6490 return nullptr;
6491
6492 return CNode->getConstVal();
6493}
6494
6495static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6496 if (!Load || !ISD::isNormalLoad(Load))
6497 return nullptr;
6498 return getTargetConstantFromBasePtr(Load->getBasePtr());
6499}
6500
6501static const Constant *getTargetConstantFromNode(SDValue Op) {
6502 Op = peekThroughBitcasts(Op);
6503 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6504}
6505
6506const Constant *
6507X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6508 assert(LD && "Unexpected null LoadSDNode")((void)0);
6509 return getTargetConstantFromNode(LD);
6510}
6511
6512// Extract raw constant bits from constant pools.
6513static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6514 APInt &UndefElts,
6515 SmallVectorImpl<APInt> &EltBits,
6516 bool AllowWholeUndefs = true,
6517 bool AllowPartialUndefs = true) {
6518 assert(EltBits.empty() && "Expected an empty EltBits vector")((void)0);
6519
6520 Op = peekThroughBitcasts(Op);
6521
6522 EVT VT = Op.getValueType();
6523 unsigned SizeInBits = VT.getSizeInBits();
6524 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!")((void)0);
6525 unsigned NumElts = SizeInBits / EltSizeInBits;
6526
6527 // Bitcast a source array of element bits to the target size.
6528 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6529 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6530 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6531 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&((void)0)
6532 "Constant bit sizes don't match")((void)0);
6533
6534 // Don't split if we don't allow undef bits.
6535 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6536 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6537 return false;
6538
6539 // If we're already the right size, don't bother bitcasting.
6540 if (NumSrcElts == NumElts) {
6541 UndefElts = UndefSrcElts;
6542 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6543 return true;
6544 }
6545
6546 // Extract all the undef/constant element data and pack into single bitsets.
6547 APInt UndefBits(SizeInBits, 0);
6548 APInt MaskBits(SizeInBits, 0);
6549
6550 for (unsigned i = 0; i != NumSrcElts; ++i) {
6551 unsigned BitOffset = i * SrcEltSizeInBits;
6552 if (UndefSrcElts[i])
6553 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6554 MaskBits.insertBits(SrcEltBits[i], BitOffset);
6555 }
6556
6557 // Split the undef/constant single bitset data into the target elements.
6558 UndefElts = APInt(NumElts, 0);
6559 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6560
6561 for (unsigned i = 0; i != NumElts; ++i) {
6562 unsigned BitOffset = i * EltSizeInBits;
6563 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6564
6565 // Only treat an element as UNDEF if all bits are UNDEF.
6566 if (UndefEltBits.isAllOnesValue()) {
6567 if (!AllowWholeUndefs)
6568 return false;
6569 UndefElts.setBit(i);
6570 continue;
6571 }
6572
6573 // If only some bits are UNDEF then treat them as zero (or bail if not
6574 // supported).
6575 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6576 return false;
6577
6578 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6579 }
6580 return true;
6581 };
6582
6583 // Collect constant bits and insert into mask/undef bit masks.
6584 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6585 unsigned UndefBitIndex) {
6586 if (!Cst)
6587 return false;
6588 if (isa<UndefValue>(Cst)) {
6589 Undefs.setBit(UndefBitIndex);
6590 return true;
6591 }
6592 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6593 Mask = CInt->getValue();
6594 return true;
6595 }
6596 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6597 Mask = CFP->getValueAPF().bitcastToAPInt();
6598 return true;
6599 }
6600 return false;
6601 };
6602
6603 // Handle UNDEFs.
6604 if (Op.isUndef()) {
6605 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6606 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6607 return CastBitData(UndefSrcElts, SrcEltBits);
6608 }
6609
6610 // Extract scalar constant bits.
6611 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6612 APInt UndefSrcElts = APInt::getNullValue(1);
6613 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6614 return CastBitData(UndefSrcElts, SrcEltBits);
6615 }
6616 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6617 APInt UndefSrcElts = APInt::getNullValue(1);
6618 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6619 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6620 return CastBitData(UndefSrcElts, SrcEltBits);
6621 }
6622
6623 // Extract constant bits from build vector.
6624 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6625 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6626 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6627
6628 APInt UndefSrcElts(NumSrcElts, 0);
6629 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6630 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6631 const SDValue &Src = Op.getOperand(i);
6632 if (Src.isUndef()) {
6633 UndefSrcElts.setBit(i);
6634 continue;
6635 }
6636 auto *Cst = cast<ConstantSDNode>(Src);
6637 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6638 }
6639 return CastBitData(UndefSrcElts, SrcEltBits);
6640 }
6641 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6642 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6643 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6644
6645 APInt UndefSrcElts(NumSrcElts, 0);
6646 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6647 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6648 const SDValue &Src = Op.getOperand(i);
6649 if (Src.isUndef()) {
6650 UndefSrcElts.setBit(i);
6651 continue;
6652 }
6653 auto *Cst = cast<ConstantFPSDNode>(Src);
6654 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6655 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6656 }
6657 return CastBitData(UndefSrcElts, SrcEltBits);
6658 }
6659
6660 // Extract constant bits from constant pool vector.
6661 if (auto *Cst = getTargetConstantFromNode(Op)) {
6662 Type *CstTy = Cst->getType();
6663 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6664 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6665 return false;
6666
6667 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6668 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6669
6670 APInt UndefSrcElts(NumSrcElts, 0);
6671 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6672 for (unsigned i = 0; i != NumSrcElts; ++i)
6673 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6674 UndefSrcElts, i))
6675 return false;
6676
6677 return CastBitData(UndefSrcElts, SrcEltBits);
6678 }
6679
6680 // Extract constant bits from a broadcasted constant pool scalar.
6681 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6682 EltSizeInBits <= VT.getScalarSizeInBits()) {
6683 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6684 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6685 return false;
6686
6687 SDValue Ptr = MemIntr->getBasePtr();
6688 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
6689 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6690 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6691
6692 APInt UndefSrcElts(NumSrcElts, 0);
6693 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6694 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6695 if (UndefSrcElts[0])
6696 UndefSrcElts.setBits(0, NumSrcElts);
6697 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6698 return CastBitData(UndefSrcElts, SrcEltBits);
6699 }
6700 }
6701 }
6702
6703 // Extract constant bits from a subvector broadcast.
6704 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
6705 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6706 SDValue Ptr = MemIntr->getBasePtr();
6707 // The source constant may be larger than the subvector broadcast, so
6708 // ensure we extract the correct subvector constants.
6709 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
6710 Type *CstTy = Cst->getType();
6711 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6712 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
6713 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
6714 (SizeInBits % SubVecSizeInBits) != 0)
6715 return false;
6716 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
6717 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
6718 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
6719 APInt UndefSubElts(NumSubElts, 0);
6720 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
6721 APInt(CstEltSizeInBits, 0));
6722 for (unsigned i = 0; i != NumSubElts; ++i) {
6723 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
6724 UndefSubElts, i))
6725 return false;
6726 for (unsigned j = 1; j != NumSubVecs; ++j)
6727 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
6728 }
6729 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
6730 UndefSubElts);
6731 return CastBitData(UndefSubElts, SubEltBits);
6732 }
6733 }
6734
6735 // Extract a rematerialized scalar constant insertion.
6736 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6737 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6738 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6739 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6740 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6741
6742 APInt UndefSrcElts(NumSrcElts, 0);
6743 SmallVector<APInt, 64> SrcEltBits;
6744 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6745 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6746 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6747 return CastBitData(UndefSrcElts, SrcEltBits);
6748 }
6749
6750 // Insert constant bits from base and sub vector sources.
6751 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
6752 // If this is bitcast to larger elements we might lose track of undefs - don't
6753 // allow any, to be safe.
6754 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6755 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
6756
6757 APInt UndefSrcElts, UndefSubElts;
6758 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
6759 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
6760 UndefSubElts, EltSubBits,
6761 AllowWholeUndefs && AllowUndefs,
6762 AllowPartialUndefs && AllowUndefs) &&
6763 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
6764 UndefSrcElts, EltSrcBits,
6765 AllowWholeUndefs && AllowUndefs,
6766 AllowPartialUndefs && AllowUndefs)) {
6767 unsigned BaseIdx = Op.getConstantOperandVal(2);
6768 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
6769 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6770 EltSrcBits[BaseIdx + i] = EltSubBits[i];
6771 return CastBitData(UndefSrcElts, EltSrcBits);
6772 }
6773 }
6774
6775 // Extract constant bits from a subvector's source.
6776 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6777 // TODO - support extract_subvector through bitcasts.
6778 if (EltSizeInBits != VT.getScalarSizeInBits())
6779 return false;
6780
6781 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6782 UndefElts, EltBits, AllowWholeUndefs,
6783 AllowPartialUndefs)) {
6784 EVT SrcVT = Op.getOperand(0).getValueType();
6785 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6786 unsigned NumSubElts = VT.getVectorNumElements();
6787 unsigned BaseIdx = Op.getConstantOperandVal(1);
6788 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6789 if ((BaseIdx + NumSubElts) != NumSrcElts)
6790 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6791 if (BaseIdx != 0)
6792 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6793 return true;
6794 }
6795 }
6796
6797 // Extract constant bits from shuffle node sources.
6798 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6799 // TODO - support shuffle through bitcasts.
6800 if (EltSizeInBits != VT.getScalarSizeInBits())
6801 return false;
6802
6803 ArrayRef<int> Mask = SVN->getMask();
6804 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6805 llvm::any_of(Mask, [](int M) { return M < 0; }))
6806 return false;
6807
6808 APInt UndefElts0, UndefElts1;
6809 SmallVector<APInt, 32> EltBits0, EltBits1;
6810 if (isAnyInRange(Mask, 0, NumElts) &&
6811 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6812 UndefElts0, EltBits0, AllowWholeUndefs,
6813 AllowPartialUndefs))
6814 return false;
6815 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6816 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6817 UndefElts1, EltBits1, AllowWholeUndefs,
6818 AllowPartialUndefs))
6819 return false;
6820
6821 UndefElts = APInt::getNullValue(NumElts);
6822 for (int i = 0; i != (int)NumElts; ++i) {
6823 int M = Mask[i];
6824 if (M < 0) {
6825 UndefElts.setBit(i);
6826 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6827 } else if (M < (int)NumElts) {
6828 if (UndefElts0[M])
6829 UndefElts.setBit(i);
6830 EltBits.push_back(EltBits0[M]);
6831 } else {
6832 if (UndefElts1[M - NumElts])
6833 UndefElts.setBit(i);
6834 EltBits.push_back(EltBits1[M - NumElts]);
6835 }
6836 }
6837 return true;
6838 }
6839
6840 return false;
6841}
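// ---- [Editor's sketch - not part of X86ISelLowering.cpp] --------------------
// The CastBitData lambda above packs every source element into one wide bitset
// and then re-splits it at the requested element width. A minimal standalone
// illustration of that round trip using plain 64-bit integers instead of APInt
// (all names below are illustrative only):
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint16_t> repackBytesToWords(const std::vector<uint8_t> &Src) {
  assert(Src.size() <= 8 && (Src.size() % 2) == 0 && "sketch assumes <= 64 bits");
  // Pack: the equivalent of MaskBits.insertBits(SrcEltBits[i], i * 8).
  uint64_t Bits = 0;
  for (unsigned i = 0; i != Src.size(); ++i)
    Bits |= uint64_t(Src[i]) << (i * 8);
  // Split: the equivalent of MaskBits.extractBits(16, i * 16).
  std::vector<uint16_t> Dst(Src.size() / 2);
  for (unsigned i = 0; i != Dst.size(); ++i)
    Dst[i] = uint16_t(Bits >> (i * 16));
  return Dst;
}
// e.g. {0x11, 0x22, 0x33, 0x44} repacks to {0x2211, 0x4433} (little endian).
// ---- [End editor's sketch] --------------------------------------------------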
6842
6843namespace llvm {
6844namespace X86 {
6845bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
6846 APInt UndefElts;
6847 SmallVector<APInt, 16> EltBits;
6848 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6849 UndefElts, EltBits, true,
6850 AllowPartialUndefs)) {
6851 int SplatIndex = -1;
6852 for (int i = 0, e = EltBits.size(); i != e; ++i) {
6853 if (UndefElts[i])
6854 continue;
6855 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6856 SplatIndex = -1;
6857 break;
6858 }
6859 SplatIndex = i;
6860 }
6861 if (0 <= SplatIndex) {
6862 SplatVal = EltBits[SplatIndex];
6863 return true;
6864 }
6865 }
6866
6867 return false;
6868}
6869} // namespace X86
6870} // namespace llvm
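// ---- [Editor's sketch - not part of X86ISelLowering.cpp] --------------------
// X86::isConstantSplat above accepts a splat even when some elements are undef:
// the first defined element becomes the candidate and every later defined
// element must match it. A standalone model of that loop, with undef elements
// encoded as empty optionals (illustrative names only):
#include <cstdint>
#include <optional>
#include <vector>

static bool isSplatIgnoringUndef(const std::vector<std::optional<uint64_t>> &Elts,
                                 uint64_t &SplatVal) {
  int SplatIndex = -1;
  for (int i = 0, e = (int)Elts.size(); i != e; ++i) {
    if (!Elts[i])
      continue;                             // undef element - ignore it.
    if (0 <= SplatIndex && *Elts[i] != *Elts[SplatIndex])
      return false;                         // two different defined values.
    SplatIndex = i;
  }
  if (SplatIndex < 0)
    return false;                           // all elements undef - no splat value.
  SplatVal = *Elts[SplatIndex];
  return true;
}
// ---- [End editor's sketch] --------------------------------------------------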
6871
6872static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6873 unsigned MaskEltSizeInBits,
6874 SmallVectorImpl<uint64_t> &RawMask,
6875 APInt &UndefElts) {
6876 // Extract the raw target constant bits.
6877 SmallVector<APInt, 64> EltBits;
6878 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6879 EltBits, /* AllowWholeUndefs */ true,
6880 /* AllowPartialUndefs */ false))
6881 return false;
6882
6883 // Insert the extracted elements into the mask.
6884 for (const APInt &Elt : EltBits)
6885 RawMask.push_back(Elt.getZExtValue());
6886
6887 return true;
6888}
6889
6890/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6891/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
6892/// Note: This ignores saturation, so inputs must be checked first.
6893static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6894 bool Unary, unsigned NumStages = 1) {
6895 assert(Mask.empty() && "Expected an empty shuffle mask vector")((void)0);
6896 unsigned NumElts = VT.getVectorNumElements();
6897 unsigned NumLanes = VT.getSizeInBits() / 128;
6898 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6899 unsigned Offset = Unary ? 0 : NumElts;
6900 unsigned Repetitions = 1u << (NumStages - 1);
6901 unsigned Increment = 1u << NumStages;
6902 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction")((void)0);
6903
6904 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6905 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
6906 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6907 Mask.push_back(Elt + (Lane * NumEltsPerLane));
6908 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6909 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6910 }
6911 }
6912}
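// ---- [Editor's sketch - not part of X86ISelLowering.cpp] --------------------
// For a single-stage, non-unary 128-bit pack (two v8i16 inputs -> v16i8) the
// loops above emit 0,2,...,14,16,18,...,30: the low byte of each word of the
// first input, then the low byte of each word of the second (little endian,
// saturation ignored as noted). Standalone reproduction of that one case, with
// the lane loop kept for shape (illustrative names only):
#include <vector>

static std::vector<int> packMask_v16i8_binary() {
  const unsigned NumLanes = 1, NumEltsPerLane = 16;
  const unsigned Offset = 16;               // non-unary: second input follows.
  const unsigned Increment = 2;             // one stage: every other element.
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
      Mask.push_back(int(Elt + Lane * NumEltsPerLane));
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
      Mask.push_back(int(Elt + Lane * NumEltsPerLane + Offset));
  }
  return Mask;                              // {0,2,...,14,16,18,...,30}
}
// ---- [End editor's sketch] --------------------------------------------------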
6913
6914// Split the demanded elts of a PACKSS/PACKUS node between its operands.
6915static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6916 APInt &DemandedLHS, APInt &DemandedRHS) {
6917 int NumLanes = VT.getSizeInBits() / 128;
6918 int NumElts = DemandedElts.getBitWidth();
6919 int NumInnerElts = NumElts / 2;
6920 int NumEltsPerLane = NumElts / NumLanes;
6921 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6922
6923 DemandedLHS = APInt::getNullValue(NumInnerElts);
6924 DemandedRHS = APInt::getNullValue(NumInnerElts);
6925
6926 // Map DemandedElts to the packed operands.
6927 for (int Lane = 0; Lane != NumLanes; ++Lane) {
6928 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6929 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6930 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6931 if (DemandedElts[OuterIdx])
6932 DemandedLHS.setBit(InnerIdx);
6933 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6934 DemandedRHS.setBit(InnerIdx);
6935 }
6936 }
6937}
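// ---- [Editor's sketch - not part of X86ISelLowering.cpp] --------------------
// Concrete instance of the mapping above for a 128-bit v16i8 pack result: the
// first 8 result elements come from the LHS words and the last 8 from the RHS
// words, so demanding result elements 3 and 11 demands word 3 of each operand.
// Standalone check using plain bitmasks in place of APInt (illustrative names):
#include <cstdint>

static void splitPackDemanded_v16i8(uint16_t DemandedElts, uint8_t &DemandedLHS,
                                    uint8_t &DemandedRHS) {
  DemandedLHS = DemandedRHS = 0;
  for (int Elt = 0; Elt != 8; ++Elt) {
    if (DemandedElts & (1u << Elt))         // first half of the lane -> LHS.
      DemandedLHS |= uint8_t(1u << Elt);
    if (DemandedElts & (1u << (Elt + 8)))   // second half of the lane -> RHS.
      DemandedRHS |= uint8_t(1u << Elt);
  }
}
// splitPackDemanded_v16i8((1u << 3) | (1u << 11), L, R) sets L == R == 1u << 3.
// ---- [End editor's sketch] --------------------------------------------------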
6938
6939// Split the demanded elts of a HADD/HSUB node between its operands.
6940static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6941 APInt &DemandedLHS, APInt &DemandedRHS) {
6942 int NumLanes = VT.getSizeInBits() / 128;
6943 int NumElts = DemandedElts.getBitWidth();
6944 int NumEltsPerLane = NumElts / NumLanes;
6945 int HalfEltsPerLane = NumEltsPerLane / 2;
6946
6947 DemandedLHS = APInt::getNullValue(NumElts);
6948 DemandedRHS = APInt::getNullValue(NumElts);
6949
6950 // Map DemandedElts to the horizontal operands.
6951 for (int Idx = 0; Idx != NumElts; ++Idx) {
6952 if (!DemandedElts[Idx])
6953 continue;
6954 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6955 int LocalIdx = Idx % NumEltsPerLane;
6956 if (LocalIdx < HalfEltsPerLane) {
6957 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6958 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6959 } else {
6960 LocalIdx -= HalfEltsPerLane;
6961 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6962 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6963 }
6964 }
6965}
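// ---- [Editor's sketch - not part of X86ISelLowering.cpp] --------------------
// Each HADD/HSUB result element consumes an adjacent pair from one operand: in
// a single 128-bit lane of v8i16, result elements 0..3 read LHS pairs (0,1)..
// (6,7) and result elements 4..7 read the same RHS pairs. Standalone model of
// that single-lane case with plain bitmasks (illustrative names only):
#include <cstdint>

static void splitHorizDemanded_v8i16(uint8_t DemandedElts, uint8_t &DemandedLHS,
                                     uint8_t &DemandedRHS) {
  DemandedLHS = DemandedRHS = 0;
  for (int Idx = 0; Idx != 8; ++Idx) {
    if (!(DemandedElts & (1u << Idx)))
      continue;
    int LocalIdx = Idx % 4;                 // pair index within the half-lane.
    uint8_t Pair = uint8_t(0x3u << (2 * LocalIdx));
    if (Idx < 4)
      DemandedLHS |= Pair;                  // first half reads operand 0.
    else
      DemandedRHS |= Pair;                  // second half reads operand 1.
  }
}
// e.g. demanding only result element 5 demands RHS elements 2 and 3.
// ---- [End editor's sketch] --------------------------------------------------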
6966
6967/// Calculates the shuffle mask corresponding to the target-specific opcode.
6968/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6969/// operands in \p Ops, and returns true.
6970/// Sets \p IsUnary to true if only one source is used. Note that this will set
6971/// IsUnary for shuffles which use a single input multiple times, and in those
6972/// cases it will adjust the mask to only have indices within that single input.
6973/// It is an error to call this with non-empty Mask/Ops vectors.
6974static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6975 SmallVectorImpl<SDValue> &Ops,
6976 SmallVectorImpl<int> &Mask, bool &IsUnary) {
6977 unsigned NumElems = VT.getVectorNumElements();
6978 unsigned MaskEltSize = VT.getScalarSizeInBits();
6979 SmallVector<uint64_t, 32> RawMask;
6980 APInt RawUndefs;
6981 uint64_t ImmN;
6982
6983 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector")((void)0);
6984 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector")((void)0);
6985
6986 IsUnary = false;
6987 bool IsFakeUnary = false;
6988 switch (N->getOpcode()) {
6989 case X86ISD::BLENDI:
6990 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
6991 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
6992 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6993 DecodeBLENDMask(NumElems, ImmN, Mask);
6994 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6995 break;
6996 case X86ISD::SHUFP:
6997 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
6998 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
6999 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7000 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7001 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7002 break;
7003 case X86ISD::INSERTPS:
7004 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7005 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7006 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7007 DecodeINSERTPSMask(ImmN, Mask);
7008 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7009 break;
7010 case X86ISD::EXTRQI:
7011 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7012 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7013 isa<ConstantSDNode>(N->getOperand(2))) {
7014 int BitLen = N->getConstantOperandVal(1);
7015 int BitIdx = N->getConstantOperandVal(2);
7016 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7017 IsUnary = true;
7018 }
7019 break;
7020 case X86ISD::INSERTQI:
7021 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7022 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7023 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7024 isa<ConstantSDNode>(N->getOperand(3))) {
7025 int BitLen = N->getConstantOperandVal(2);
7026 int BitIdx = N->getConstantOperandVal(3);
7027 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7028 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7029 }
7030 break;
7031 case X86ISD::UNPCKH:
7032 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7033 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7034 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7035 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7036 break;
7037 case X86ISD::UNPCKL:
7038 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7039 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7040 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7041 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7042 break;
7043 case X86ISD::MOVHLPS:
7044 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7045 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7046 DecodeMOVHLPSMask(NumElems, Mask);
7047 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7048 break;
7049 case X86ISD::MOVLHPS:
7050 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7051 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7052 DecodeMOVLHPSMask(NumElems, Mask);
7053 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7054 break;
7055 case X86ISD::VALIGN:
7056 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&((void)0)
7057 "Only 32-bit and 64-bit elements are supported!")((void)0);
7058 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7059 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7060 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7061 DecodeVALIGNMask(NumElems, ImmN, Mask);
7062 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7063 Ops.push_back(N->getOperand(1));
7064 Ops.push_back(N->getOperand(0));
7065 break;
7066 case X86ISD::PALIGNR:
7067 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((void)0);
7068 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7069 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7070 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7071 DecodePALIGNRMask(NumElems, ImmN, Mask);
7072 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7073 Ops.push_back(N->getOperand(1));
7074 Ops.push_back(N->getOperand(0));
7075 break;
7076 case X86ISD::VSHLDQ:
7077 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((void)0);
7078 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7079 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7080 DecodePSLLDQMask(NumElems, ImmN, Mask);
7081 IsUnary = true;
7082 break;
7083 case X86ISD::VSRLDQ:
7084 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((void)0);
7085 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7086 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7087 DecodePSRLDQMask(NumElems, ImmN, Mask);
7088 IsUnary = true;
7089 break;
7090 case X86ISD::PSHUFD:
7091 case X86ISD::VPERMILPI:
7092 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7093 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7094 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7095 IsUnary = true;
7096 break;
7097 case X86ISD::PSHUFHW:
7098 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7099 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7100 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7101 IsUnary = true;
7102 break;
7103 case X86ISD::PSHUFLW:
7104 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7105 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7106 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7107 IsUnary = true;
7108 break;
7109 case X86ISD::VZEXT_MOVL:
7110 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7111 DecodeZeroMoveLowMask(NumElems, Mask);
7112 IsUnary = true;
7113 break;
7114 case X86ISD::VBROADCAST:
7115 // We only decode broadcasts of same-sized vectors; peeking through to
7116 // extracted subvectors is likely to cause hasOneUse issues with
7117 // SimplifyDemandedBits etc.
7118 if (N->getOperand(0).getValueType() == VT) {
7119 DecodeVectorBroadcast(NumElems, Mask);
7120 IsUnary = true;
7121 break;
7122 }
7123 return false;
7124 case X86ISD::VPERMILPV: {
7125 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7126 IsUnary = true;
7127 SDValue MaskNode = N->getOperand(1);
7128 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7129 RawUndefs)) {
7130 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7131 break;
7132 }
7133 return false;
7134 }
7135 case X86ISD::PSHUFB: {
7136 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((void)0);
7137 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7138 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7139 IsUnary = true;
7140 SDValue MaskNode = N->getOperand(1);
7141 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7142 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7143 break;
7144 }
7145 return false;
7146 }
7147 case X86ISD::VPERMI:
7148 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7149 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7150 DecodeVPERMMask(NumElems, ImmN, Mask);
7151 IsUnary = true;
7152 break;
7153 case X86ISD::MOVSS:
7154 case X86ISD::MOVSD:
7155 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7156 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7157 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7158 break;
7159 case X86ISD::VPERM2X128:
7160 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7161 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7162 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7163 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7164 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7165 break;
7166 case X86ISD::SHUF128:
7167 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7168 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7169 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7170 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7171 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7172 break;
7173 case X86ISD::MOVSLDUP:
7174 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7175 DecodeMOVSLDUPMask(NumElems, Mask);
7176 IsUnary = true;
7177 break;
7178 case X86ISD::MOVSHDUP:
7179 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7180 DecodeMOVSHDUPMask(NumElems, Mask);
7181 IsUnary = true;
7182 break;
7183 case X86ISD::MOVDDUP:
7184 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7185 DecodeMOVDDUPMask(NumElems, Mask);
7186 IsUnary = true;
7187 break;
7188 case X86ISD::VPERMIL2: {
7189 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7190 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7191 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7192 SDValue MaskNode = N->getOperand(2);
7193 SDValue CtrlNode = N->getOperand(3);
7194 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7195 unsigned CtrlImm = CtrlOp->getZExtValue();
7196 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7197 RawUndefs)) {
7198 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7199 Mask);
7200 break;
7201 }
7202 }
7203 return false;
7204 }
7205 case X86ISD::VPPERM: {
7206 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7207 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7208 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7209 SDValue MaskNode = N->getOperand(2);
7210 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7211 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7212 break;
7213 }
7214 return false;
7215 }
7216 case X86ISD::VPERMV: {
7217 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
7218 IsUnary = true;
7219 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7220 Ops.push_back(N->getOperand(1));
7221 SDValue MaskNode = N->getOperand(0);
7222 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7223 RawUndefs)) {
7224 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7225 break;
7226 }
7227 return false;
7228 }
7229 case X86ISD::VPERMV3: {
7230 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
7231 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type")((void)0);
7232 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7233 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7234 Ops.push_back(N->getOperand(0));
7235 Ops.push_back(N->getOperand(2));
7236 SDValue MaskNode = N->getOperand(1);
7237 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7238 RawUndefs)) {
7239 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7240 break;
7241 }
7242 return false;
7243 }
7244 default: llvm_unreachable("unknown target shuffle node")__builtin_unreachable();
7245 }
7246
7247 // Empty mask indicates the decode failed.
7248 if (Mask.empty())
7249 return false;
7250
7251 // Check if we're getting a shuffle mask with zero'd elements.
7252 if (!AllowSentinelZero && isAnyZero(Mask))
7253 return false;
7254
7255 // If we have a fake unary shuffle, the shuffle mask is spread across two
7256 // inputs that are actually the same node. Re-map the mask to always point
7257 // into the first input.
7258 if (IsFakeUnary)
7259 for (int &M : Mask)
7260 if (M >= (int)Mask.size())
7261 M -= Mask.size();
7262
7263 // If we didn't already add operands in the opcode-specific code, default to
7264 // adding 1 or 2 operands starting at 0.
7265 if (Ops.empty()) {
7266 Ops.push_back(N->getOperand(0));
7267 if (!IsUnary || IsFakeUnary)
7268 Ops.push_back(N->getOperand(1));
7269 }
7270
7271 return true;
7272}
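// ---- [Editor's sketch - not part of X86ISelLowering.cpp] --------------------
// The "fake unary" fix-up above folds indices that point at the second
// (identical) operand back into the first. Standalone form of that remap
// (illustrative names only):
#include <vector>

static void remapFakeUnaryMask(std::vector<int> &Mask) {
  const int Size = (int)Mask.size();
  for (int &M : Mask)
    if (M >= Size)                          // index into operand 1 == operand 0.
      M -= Size;                            // negative sentinels are untouched.
}
// e.g. with identical inputs, a 4-element mask {0, 5, 2, 7} becomes {0, 1, 2, 3}.
// ---- [End editor's sketch] --------------------------------------------------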
7273
7274 // Wrapper for getTargetShuffleMask with IsUnary.
7275static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7276 SmallVectorImpl<SDValue> &Ops,
7277 SmallVectorImpl<int> &Mask) {
7278 bool IsUnary;
7279 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7280}
7281
7282/// Compute whether each element of a shuffle is zeroable.
7283///
7284/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7285/// Either it is an undef element in the shuffle mask, the element of the input
7286/// referenced is undef, or the element of the input referenced is known to be
7287/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7288/// as many lanes with this technique as possible to simplify the remaining
7289/// shuffle.
7290static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7291 SDValue V1, SDValue V2,
7292 APInt &KnownUndef, APInt &KnownZero) {
7293 int Size = Mask.size();
7294 KnownUndef = KnownZero = APInt::getNullValue(Size);
7295
7296 V1 = peekThroughBitcasts(V1);
7297 V2 = peekThroughBitcasts(V2);
7298
7299 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7300 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7301
7302 int VectorSizeInBits = V1.getValueSizeInBits();
7303 int ScalarSizeInBits = VectorSizeInBits / Size;
7304 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size")((void)0);
7305
7306 for (int i = 0; i < Size; ++i) {
7307 int M = Mask[i];
7308 // Handle the easy cases.
7309 if (M < 0) {
7310 KnownUndef.setBit(i);
7311 continue;
7312 }
7313 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7314 KnownZero.setBit(i);
7315 continue;
7316 }
7317
7318 // Determine shuffle input and normalize the mask.
7319 SDValue V = M < Size ? V1 : V2;
7320 M %= Size;
7321
7322 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7323 if (V.getOpcode() != ISD::BUILD_VECTOR)
7324 continue;
7325
7326 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
7327 // the (larger) source element must be UNDEF/ZERO.
7328 if ((Size % V.getNumOperands()) == 0) {
7329 int Scale = Size / V->getNumOperands();
7330 SDValue Op = V.getOperand(M / Scale);
7331 if (Op.isUndef())
7332 KnownUndef.setBit(i);
7333 if (X86::isZeroNode(Op))
7334 KnownZero.setBit(i);
7335 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7336 APInt Val = Cst->getAPIntValue();
7337 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7338 if (Val == 0)
7339 KnownZero.setBit(i);
7340 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7341 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7342 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7343 if (Val == 0)
7344 KnownZero.setBit(i);
7345 }
7346 continue;
7347 }
7348
7349 // If the BUILD_VECTOR has more elements, then all the (smaller) source
7350 // elements must be UNDEF or ZERO.
7351 if ((V.getNumOperands() % Size) == 0) {
7352 int Scale = V->getNumOperands() / Size;
7353 bool AllUndef = true;
7354 bool AllZero = true;
7355 for (int j = 0; j < Scale; ++j) {
7356 SDValue Op = V.getOperand((M * Scale) + j);
7357 AllUndef &= Op.isUndef();
7358 AllZero &= X86::isZeroNode(Op);
7359 }
7360 if (AllUndef)
7361 KnownUndef.setBit(i);
7362 if (AllZero)
7363 KnownZero.setBit(i);
7364 continue;
7365 }
7366 }
7367}
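// ---- [Editor's sketch - not part of X86ISelLowering.cpp] --------------------
// The two scaling branches above reconcile mask granularity with BUILD_VECTOR
// granularity. For the "fewer (wider) operands" branch: a 16-element byte mask
// over a bitcast 2-operand 64-bit BUILD_VECTOR uses Scale = 8, so mask element
// 5 inspects bits [40, 48) of operand 0. Standalone helper for that index math
// (illustrative names only):
#include <utility>

static std::pair<int, int> mapMaskEltToWideOperand(int M, int MaskSize,
                                                   int NumBVOps,
                                                   int ScalarSizeInBits) {
  int Scale = MaskSize / NumBVOps;          // mask elements per BUILD_VECTOR op.
  int OpIdx = M / Scale;                    // which (wider) operand to inspect.
  int BitOffset = (M % Scale) * ScalarSizeInBits; // which bits inside it.
  return {OpIdx, BitOffset};
}
// mapMaskEltToWideOperand(5, 16, 2, 8) == {0, 40}.
// ---- [End editor's sketch] --------------------------------------------------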
7368
7369/// Decode a target shuffle mask and inputs and see if any values are
7370/// known to be undef or zero from their inputs.
7371/// Returns true if the target shuffle mask was decoded.
7372/// FIXME: Merge this with computeZeroableShuffleElements?
7373static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7374 SmallVectorImpl<SDValue> &Ops,
7375 APInt &KnownUndef, APInt &KnownZero) {
7376 bool IsUnary;
7377 if (!isTargetShuffle(N.getOpcode()))
7378 return false;
7379
7380 MVT VT = N.getSimpleValueType();
7381 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7382 return false;
7383
7384 int Size = Mask.size();
7385 SDValue V1 = Ops[0];
7386 SDValue V2 = IsUnary ? V1 : Ops[1];
7387 KnownUndef = KnownZero = APInt::getNullValue(Size);
7388
7389 V1 = peekThroughBitcasts(V1);
7390 V2 = peekThroughBitcasts(V2);
7391
7392 assert((VT.getSizeInBits() % Size) == 0 &&((void)0)
7393 "Illegal split of shuffle value type")((void)0);
7394 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7395
7396 // Extract known constant input data.
7397 APInt UndefSrcElts[2];
7398 SmallVector<APInt, 32> SrcEltBits[2];
7399 bool IsSrcConstant[2] = {
7400 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7401 SrcEltBits[0], true, false),
7402 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7403 SrcEltBits[1], true, false)};
7404
7405 for (int i = 0; i < Size; ++i) {
7406 int M = Mask[i];
7407
7408 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7409 if (M < 0) {
7410 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!")((void)0);
7411 if (SM_SentinelUndef == M)
7412 KnownUndef.setBit(i);
7413 if (SM_SentinelZero == M)
7414 KnownZero.setBit(i);
7415 continue;
7416 }
7417
7418 // Determine shuffle input and normalize the mask.
7419 unsigned SrcIdx = M / Size;
7420 SDValue V = M < Size ? V1 : V2;
7421 M %= Size;
7422
7423 // We are referencing an UNDEF input.
7424 if (V.isUndef()) {
7425 KnownUndef.setBit(i);
7426 continue;
7427 }
7428
7429 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7430 // TODO: We currently only set UNDEF for integer types - floats use the same
7431 // registers as vectors and many of the scalar folded loads rely on the
7432 // SCALAR_TO_VECTOR pattern.
7433 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7434 (Size % V.getValueType().getVectorNumElements()) == 0) {
7435 int Scale = Size / V.getValueType().getVectorNumElements();
7436 int Idx = M / Scale;
7437 if (Idx != 0 && !VT.isFloatingPoint())
7438 KnownUndef.setBit(i);
7439 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7440 KnownZero.setBit(i);
7441 continue;
7442 }
7443
7444 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7445 // base vectors.
7446 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7447 SDValue Vec = V.getOperand(0);
7448 int NumVecElts = Vec.getValueType().getVectorNumElements();
7449 if (Vec.isUndef() && Size == NumVecElts) {
7450 int Idx = V.getConstantOperandVal(2);
7451 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7452 if (M < Idx || (Idx + NumSubElts) <= M)
7453 KnownUndef.setBit(i);
7454 }
7455 continue;
7456 }
7457
7458 // Attempt to extract from the source's constant bits.
7459 if (IsSrcConstant[SrcIdx]) {
7460 if (UndefSrcElts[SrcIdx][M])
7461 KnownUndef.setBit(i);
7462 else if (SrcEltBits[SrcIdx][M] == 0)
7463 KnownZero.setBit(i);
7464 }
7465 }
7466
7467 assert(VT.getVectorNumElements() == (unsigned)Size &&((void)0)
7468 "Different mask size from vector size!")((void)0);
7469 return true;
7470}
7471
7472// Replace target shuffle mask elements with known undef/zero sentinels.
7473static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7474 const APInt &KnownUndef,
7475 const APInt &KnownZero,
7476 bool ResolveKnownZeros = true) {
7477 unsigned NumElts = Mask.size();
7478 assert(KnownUndef.getBitWidth() == NumElts &&((void)0)
7479 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch")((void)0);
7480
7481 for (unsigned i = 0; i != NumElts; ++i) {
7482 if (KnownUndef[i])
7483 Mask[i] = SM_SentinelUndef;
7484 else if (ResolveKnownZeros && KnownZero[i])
7485 Mask[i] = SM_SentinelZero;
7486 }
7487}
7488
7489// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7490static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7491 APInt &KnownUndef,
7492 APInt &KnownZero) {
7493 unsigned NumElts = Mask.size();
7494 KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7495
7496 for (unsigned i = 0; i != NumElts; ++i) {
7497 int M = Mask[i];
7498 if (SM_SentinelUndef == M)
7499 KnownUndef.setBit(i);
7500 if (SM_SentinelZero == M)
7501 KnownZero.setBit(i);
7502 }
7503}
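// ---- [Editor's sketch - not part of X86ISelLowering.cpp] --------------------
// The two helpers above are inverses: one pushes known undef/zero bits into the
// mask as sentinels, the other recovers the bitmasks from the sentinels.
// Standalone version of the second direction, using plain bitmasks and -1 / -2
// as stand-ins for the undef / zero sentinels (illustrative names only):
#include <cstddef>
#include <cstdint>
#include <vector>

static void sentinelsToBitmasks(const std::vector<int> &Mask,
                                uint64_t &KnownUndef, uint64_t &KnownZero) {
  KnownUndef = KnownZero = 0;
  for (size_t i = 0, e = Mask.size(); i != e; ++i) {
    if (Mask[i] == -1)                      // undef sentinel.
      KnownUndef |= uint64_t(1) << i;
    else if (Mask[i] == -2)                 // zero sentinel.
      KnownZero |= uint64_t(1) << i;
  }
}
// ---- [End editor's sketch] --------------------------------------------------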
7504
7505// Forward declaration (for getFauxShuffleMask recursive check).
7506// TODO: Use DemandedElts variant.
7507static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7508 SmallVectorImpl<int> &Mask,
7509 const SelectionDAG &DAG, unsigned Depth,
7510 bool ResolveKnownElts);
7511
7512// Attempt to decode ops that could be represented as a shuffle mask.
7513 // The decoded shuffle mask may contain a different number of elements than the
7514// destination value type.
7515static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7516 SmallVectorImpl<int> &Mask,
7517 SmallVectorImpl<SDValue> &Ops,
7518 const SelectionDAG &DAG, unsigned Depth,
7519 bool ResolveKnownElts) {
7520 Mask.clear();
7521 Ops.clear();
7522
7523 MVT VT = N.getSimpleValueType();
7524 unsigned NumElts = VT.getVectorNumElements();
7525 unsigned NumSizeInBits = VT.getSizeInBits();
7526 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7527 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7528 return false;
7529 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size")((void)0);
7530 unsigned NumSizeInBytes = NumSizeInBits / 8;
7531 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7532
7533 unsigned Opcode = N.getOpcode();
7534 switch (Opcode) {
7535 case ISD::VECTOR_SHUFFLE: {
7536 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
7537 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7538 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7539 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7540 Ops.push_back(N.getOperand(0));
7541 Ops.push_back(N.getOperand(1));
7542 return true;
7543 }
7544 return false;
7545 }
7546 case ISD::AND:
7547 case X86ISD::ANDNP: {
7548 // Attempt to decode as a per-byte mask.
7549 APInt UndefElts;
7550 SmallVector<APInt, 32> EltBits;
7551 SDValue N0 = N.getOperand(0);
7552 SDValue N1 = N.getOperand(1);
7553 bool IsAndN = (X86ISD::ANDNP == Opcode);
7554 uint64_t ZeroMask = IsAndN ? 255 : 0;
7555 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7556 return false;
7557 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7558 if (UndefElts[i]) {
7559 Mask.push_back(SM_SentinelUndef);
7560 continue;
7561 }
7562 const APInt &ByteBits = EltBits[i];
7563 if (ByteBits != 0 && ByteBits != 255)
7564 return false;
7565 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7566 }
7567 Ops.push_back(IsAndN ? N1 : N0);
7568 return true;
7569 }
7570 case ISD::OR: {
7571 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7572 // is a valid shuffle index.
7573 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
7574 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
7575 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7576 return false;
7577 SmallVector<int, 64> SrcMask0, SrcMask1;
7578 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7579 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7580 true) ||
7581 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7582 true))
7583 return false;
7584
7585 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7586 SmallVector<int, 64> Mask0, Mask1;
7587 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7588 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7589 for (int i = 0; i != (int)MaskSize; ++i) {
7590 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
7591 // loops converting between OR and BLEND shuffles due to
7592 // canWidenShuffleElements merging away undef elements, meaning we
7593 // fail to recognise the OR as the undef element isn't known zero.
7594 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7595 Mask.push_back(SM_SentinelZero);
7596 else if (Mask1[i] == SM_SentinelZero)
7597 Mask.push_back(i);
7598 else if (Mask0[i] == SM_SentinelZero)
7599 Mask.push_back(i + MaskSize);
7600 else
7601 return false;
7602 }
7603 Ops.push_back(N0);
7604 Ops.push_back(N1);
7605 return true;
7606 }
7607 case ISD::INSERT_SUBVECTOR: {
7608 SDValue Src = N.getOperand(0);
7609 SDValue Sub = N.getOperand(1);
7610 EVT SubVT = Sub.getValueType();
7611 unsigned NumSubElts = SubVT.getVectorNumElements();
7612 if (!N->isOnlyUserOf(Sub.getNode()))
7613 return false;
7614 uint64_t InsertIdx = N.getConstantOperandVal(2);
7615 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7616 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7617 Sub.getOperand(0).getValueType() == VT) {
7618 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7619 for (int i = 0; i != (int)NumElts; ++i)
7620 Mask.push_back(i);
7621 for (int i = 0; i != (int)NumSubElts; ++i)
7622 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7623 Ops.push_back(Src);
7624 Ops.push_back(Sub.getOperand(0));
7625 return true;
7626 }
7627 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7628 SmallVector<int, 64> SubMask;
7629 SmallVector<SDValue, 2> SubInputs;
7630 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7631 SubMask, DAG, Depth + 1, ResolveKnownElts))
7632 return false;
7633
7634 // Subvector shuffle inputs must not be larger than the subvector.
7635 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7636 return SubVT.getFixedSizeInBits() <
7637 SubInput.getValueSizeInBits().getFixedSize();
7638 }))
7639 return false;
7640
7641 if (SubMask.size() != NumSubElts) {
7642 assert(((SubMask.size() % NumSubElts) == 0 ||((void)0)
7643 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale")((void)0);
7644 if ((NumSubElts % SubMask.size()) == 0) {
7645 int Scale = NumSubElts / SubMask.size();
7646 SmallVector<int, 64> ScaledSubMask;
7647 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
7648 SubMask = ScaledSubMask;
7649 } else {
7650 int Scale = SubMask.size() / NumSubElts;
7651 NumSubElts = SubMask.size();
7652 NumElts *= Scale;
7653 InsertIdx *= Scale;
7654 }
7655 }
7656 Ops.push_back(Src);
7657 Ops.append(SubInputs.begin(), SubInputs.end());
7658 if (ISD::isBuildVectorAllZeros(Src.getNode()))
7659 Mask.append(NumElts, SM_SentinelZero);
7660 else
7661 for (int i = 0; i != (int)NumElts; ++i)
7662 Mask.push_back(i);
7663 for (int i = 0; i != (int)NumSubElts; ++i) {
7664 int M = SubMask[i];
7665 if (0 <= M) {
7666 int InputIdx = M / NumSubElts;
7667 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7668 }
7669 Mask[i + InsertIdx] = M;
7670 }
7671 return true;
7672 }
7673 case X86ISD::PINSRB:
7674 case X86ISD::PINSRW:
7675 case ISD::SCALAR_TO_VECTOR:
7676 case ISD::INSERT_VECTOR_ELT: {
7677 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
7678 // vector, for matching src/dst vector types.
7679 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
7680
7681 unsigned DstIdx = 0;
7682 if (Opcode != ISD::SCALAR_TO_VECTOR) {
7683 // Check we have an in-range constant insertion index.
7684 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7685 N.getConstantOperandAPInt(2).uge(NumElts))
7686 return false;
7687 DstIdx = N.getConstantOperandVal(2);
7688
7689 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
7690 if (X86::isZeroNode(Scl)) {
7691 Ops.push_back(N.getOperand(0));
7692 for (unsigned i = 0; i != NumElts; ++i)
7693 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
7694 return true;
7695 }
7696 }
7697
7698 // Peek through trunc/aext/zext.
7699 // TODO: aext shouldn't require SM_SentinelZero padding.
7700 // TODO: handle shift of scalars.
7701 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
7702 while (Scl.getOpcode() == ISD::TRUNCATE ||
7703 Scl.getOpcode() == ISD::ANY_EXTEND ||
7704 Scl.getOpcode() == ISD::ZERO_EXTEND) {
7705 Scl = Scl.getOperand(0);
7706 MinBitsPerElt =
7707 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
7708 }
7709 if ((MinBitsPerElt % 8) != 0)
7710 return false;
7711
7712 // Attempt to find the source vector the scalar was extracted from.
7713 SDValue SrcExtract;
7714 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
7715 Scl.getOpcode() == X86ISD::PEXTRW ||
7716 Scl.getOpcode() == X86ISD::PEXTRB) &&
7717 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
7718 SrcExtract = Scl;
7719 }
7720 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7721 return false;
7722
7723 SDValue SrcVec = SrcExtract.getOperand(0);
7724 EVT SrcVT = SrcVec.getValueType();
7725 if (!SrcVT.getScalarType().isByteSized())
7726 return false;
7727 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7728 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
7729 unsigned DstByte = DstIdx * NumBytesPerElt;
7730 MinBitsPerElt =
7731 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
7732
7733 // Create 'identity' byte level shuffle mask and then add inserted bytes.
7734 if (Opcode == ISD::SCALAR_TO_VECTOR) {
7735 Ops.push_back(SrcVec);
7736 Mask.append(NumSizeInBytes, SM_SentinelUndef);
7737 } else {
7738 Ops.push_back(SrcVec);
7739 Ops.push_back(N.getOperand(0));
7740 for (int i = 0; i != (int)NumSizeInBytes; ++i)
7741 Mask.push_back(NumSizeInBytes + i);
7742 }
7743
7744 unsigned MinBytesPerElts = MinBitsPerElt / 8;
7745 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
7746 for (unsigned i = 0; i != MinBytesPerElts; ++i)
7747 Mask[DstByte + i] = SrcByte + i;
7748 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
7749 Mask[DstByte + i] = SM_SentinelZero;
7750 return true;
7751 }
7752 case X86ISD::PACKSS:
7753 case X86ISD::PACKUS: {
7754 SDValue N0 = N.getOperand(0);
7755 SDValue N1 = N.getOperand(1);
7756 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&((void)0)
7757 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&((void)0)
7758 "Unexpected input value type")((void)0);
7759
7760 APInt EltsLHS, EltsRHS;
7761 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7762
7763 // If we know input saturation won't happen (or we don't care about particular
7764 // lanes), we can treat this as a truncation shuffle.
7765 bool Offset0 = false, Offset1 = false;
7766 if (Opcode == X86ISD::PACKSS) {
7767 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7768 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7769 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7770 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7771 return false;
7772 // We can't easily fold ASHR into a shuffle, but if it was feeding a
7773 // PACKSS then it was likely being used for sign-extension for a
7774 // truncation, so just peek through and adjust the mask accordingly.
7775 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
7776 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
7777 Offset0 = true;
7778 N0 = N0.getOperand(0);
7779 }
7780 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
7781 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
7782 Offset1 = true;
7783 N1 = N1.getOperand(0);
7784 }
7785 } else {
7786 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7787 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7788 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7789 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7790 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7791 return false;
7792 }
7793
7794 bool IsUnary = (N0 == N1);
7795
7796 Ops.push_back(N0);
7797 if (!IsUnary)
7798 Ops.push_back(N1);
7799
7800 createPackShuffleMask(VT, Mask, IsUnary);
7801
7802 if (Offset0 || Offset1) {
7803 for (int &M : Mask)
7804 if ((Offset0 && isInRange(M, 0, NumElts)) ||
7805 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
7806 ++M;
7807 }
7808 return true;
7809 }
7810 case X86ISD::VTRUNC: {
7811 SDValue Src = N.getOperand(0);
7812 EVT SrcVT = Src.getValueType();
7813 // Truncated source must be a simple vector.
7814 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7815 (SrcVT.getScalarSizeInBits() % 8) != 0)
7816 return false;
7817 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7818 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7819 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
7820 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation")((void)0);
7821 for (unsigned i = 0; i != NumSrcElts; ++i)
7822 Mask.push_back(i * Scale);
7823 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
7824 Ops.push_back(Src);
7825 return true;
7826 }
7827 case X86ISD::VSHLI:
7828 case X86ISD::VSRLI: {
7829 uint64_t ShiftVal = N.getConstantOperandVal(1);
7830 // Out of range bit shifts are guaranteed to be zero.
7831 if (NumBitsPerElt <= ShiftVal) {
7832 Mask.append(NumElts, SM_SentinelZero);
7833 return true;
7834 }
7835
7836 // We can only decode 'whole byte' bit shifts as shuffles.
7837 if ((ShiftVal % 8) != 0)
7838 break;
7839
7840 uint64_t ByteShift = ShiftVal / 8;
7841 Ops.push_back(N.getOperand(0));
7842
7843 // Clear mask to all zeros and insert the shifted byte indices.
7844 Mask.append(NumSizeInBytes, SM_SentinelZero);
7845
7846 if (X86ISD::VSHLI == Opcode) {
7847 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7848 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7849 Mask[i + j] = i + j - ByteShift;
7850 } else {
7851 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7852 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7853 Mask[i + j - ByteShift] = i + j;
7854 }
7855 return true;
7856 }
7857 case X86ISD::VROTLI:
7858 case X86ISD::VROTRI: {
7859 // We can only decode 'whole byte' bit rotates as shuffles.
7860 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7861 if ((RotateVal % 8) != 0)
7862 return false;
7863 Ops.push_back(N.getOperand(0));
7864 int Offset = RotateVal / 8;
7865 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
7866 for (int i = 0; i != (int)NumElts; ++i) {
7867 int BaseIdx = i * NumBytesPerElt;
7868 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
7869 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7870 }
7871 }
7872 return true;
7873 }
7874 case X86ISD::VBROADCAST: {
7875 SDValue Src = N.getOperand(0);
7876 if (!Src.getSimpleValueType().isVector()) {
7877 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7878 !isNullConstant(Src.getOperand(1)) ||
7879 Src.getOperand(0).getValueType().getScalarType() !=
7880 VT.getScalarType())
7881 return false;
7882 Src = Src.getOperand(0);
7883 }
7884 Ops.push_back(Src);
7885 Mask.append(NumElts, 0);
7886 return true;
7887 }
7888 case ISD::ZERO_EXTEND:
7889 case ISD::ANY_EXTEND:
7890 case ISD::ZERO_EXTEND_VECTOR_INREG:
7891 case ISD::ANY_EXTEND_VECTOR_INREG: {
7892 SDValue Src = N.getOperand(0);
7893 EVT SrcVT = Src.getValueType();
7894
7895 // Extended source must be a simple vector.
7896 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7897 (SrcVT.getScalarSizeInBits() % 8) != 0)
7898 return false;
7899
7900 bool IsAnyExtend =
7901 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7902 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7903 IsAnyExtend, Mask);
7904 Ops.push_back(Src);
7905 return true;
7906 }
7907 }
7908
7909 return false;
7910}
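// ---- [Editor's sketch - not part of X86ISelLowering.cpp] --------------------
// One faux-shuffle case above turns a whole-byte VSHLI into a byte shuffle:
// every element keeps its surviving low bytes, moved up by the shift, and the
// shifted-in bytes become zero. Standalone reproduction of that branch, using
// -2 as the zero sentinel (illustrative names only):
#include <vector>

static std::vector<int> byteMaskForVSHLI(unsigned NumBytesPerElt,
                                         unsigned NumElts, unsigned ByteShift) {
  const unsigned NumSizeInBytes = NumBytesPerElt * NumElts;
  std::vector<int> Mask(NumSizeInBytes, -2);        // start as all-zero bytes.
  for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
      Mask[i + j] = int(i + j - ByteShift);         // surviving source byte.
  return Mask;
}
// byteMaskForVSHLI(4, 1, 1) == {-2, 0, 1, 2}: a 32-bit element shifted left by
// 8 bits zeroes byte 0 and moves source bytes 0..2 into byte positions 1..3.
// ---- [End editor's sketch] --------------------------------------------------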
7911
7912/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
7913static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7914 SmallVectorImpl<int> &Mask) {
7915 int MaskWidth = Mask.size();
7916 SmallVector<SDValue, 16> UsedInputs;
7917 for (int i = 0, e = Inputs.size(); i < e; ++i) {
7918 int lo = UsedInputs.size() * MaskWidth;
7919 int hi = lo + MaskWidth;
7920
7921 // Strip UNDEF input usage.
7922 if (Inputs[i].isUndef())
7923 for (int &M : Mask)
7924 if ((lo <= M) && (M < hi))
7925 M = SM_SentinelUndef;
7926
7927 // Check for unused inputs.
7928 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7929 for (int &M : Mask)
7930 if (lo <= M)
7931 M -= MaskWidth;
7932 continue;
7933 }
7934
7935 // Check for repeated inputs.
7936 bool IsRepeat = false;
7937 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7938 if (UsedInputs[j] != Inputs[i])
7939 continue;
7940 for (int &M : Mask)
7941 if (lo <= M)
7942 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7943 IsRepeat = true;
7944 break;
7945 }
7946 if (IsRepeat)
7947 continue;
7948
7949 UsedInputs.push_back(Inputs[i]);
7950 }
7951 Inputs = UsedInputs;
7952}
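// ---- [Editor's note - worked example, not part of X86ISelLowering.cpp] ------
// Trace of the repeated-input case above: with Inputs = {A, B, A} and a mask
// width of 4, the second copy of A (indices 8..11) is remapped onto the first
// (indices 0..3), so the mask {0, 9, 5, 11} becomes {0, 1, 5, 3} and Inputs
// shrinks to {A, B}. (Derived by hand from the loop above.)
// ---- [End editor's note] ----------------------------------------------------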
7953
7954/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7955/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7956/// Returns true if the target shuffle mask was decoded.
7957static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7958 SmallVectorImpl<SDValue> &Inputs,
7959 SmallVectorImpl<int> &Mask,
7960 APInt &KnownUndef, APInt &KnownZero,
7961 const SelectionDAG &DAG, unsigned Depth,
7962 bool ResolveKnownElts) {
7963 EVT VT = Op.getValueType();
7964 if (!VT.isSimple() || !VT.isVector())
7965 return false;
7966
7967 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7968 if (ResolveKnownElts)
7969 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7970 return true;
7971 }
7972 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7973 ResolveKnownElts)) {
7974 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7975 return true;
7976 }
7977 return false;
7978}
7979
7980static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7981 SmallVectorImpl<int> &Mask,
7982 const SelectionDAG &DAG, unsigned Depth = 0,
7983 bool ResolveKnownElts = true) {
7984 EVT VT = Op.getValueType();
7985 if (!VT.isSimple() || !VT.isVector())
7986 return false;
7987
7988 APInt KnownUndef, KnownZero;
7989 unsigned NumElts = Op.getValueType().getVectorNumElements();
7990 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7991 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7992 KnownZero, DAG, Depth, ResolveKnownElts);
7993}
7994
7995// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
7996static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
7997 EVT MemVT, MemSDNode *Mem, unsigned Offset,
7998 SelectionDAG &DAG) {
7999 assert((Opcode == X86ISD::VBROADCAST_LOAD ||((void)0)
8000 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&((void)0)
8001 "Unknown broadcast load type")((void)0);
8002
8003 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8004 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8005 return SDValue();
8006
8007 SDValue Ptr =
8008 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8009 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8010 SDValue Ops[] = {Mem->getChain(), Ptr};
8011 SDValue BcstLd = DAG.getMemIntrinsicNode(
8012 Opcode, DL, Tys, Ops, MemVT,
8013 DAG.getMachineFunction().getMachineMemOperand(
8014 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8015 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8016 return BcstLd;
8017}
8018
8019/// Returns the scalar element that will make up the i'th
8020/// element of the result of the vector shuffle.
8021static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8022 SelectionDAG &DAG, unsigned Depth) {
8023 if (Depth >= SelectionDAG::MaxRecursionDepth)
8024 return SDValue(); // Limit search depth.
8025
8026 EVT VT = Op.getValueType();
8027 unsigned Opcode = Op.getOpcode();
8028 unsigned NumElems = VT.getVectorNumElements();
8029
8030 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8031 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8032 int Elt = SV->getMaskElt(Index);
8033
8034 if (Elt < 0)
8035 return DAG.getUNDEF(VT.getVectorElementType());
8036
8037 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8038 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8039 }
8040
8041 // Recurse into target specific vector shuffles to find scalars.
8042 if (isTargetShuffle(Opcode)) {
8043 MVT ShufVT = VT.getSimpleVT();
8044 MVT ShufSVT = ShufVT.getVectorElementType();
8045 int NumElems = (int)ShufVT.getVectorNumElements();
8046 SmallVector<int, 16> ShuffleMask;
8047 SmallVector<SDValue, 16> ShuffleOps;
8048 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8049 ShuffleMask))
8050 return SDValue();
8051
8052 int Elt = ShuffleMask[Index];
8053 if (Elt == SM_SentinelZero)
8054 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8055 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8056 if (Elt == SM_SentinelUndef)
8057 return DAG.getUNDEF(ShufSVT);
8058
8059 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range")((void)0);
8060 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8061 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8062 }
8063
8064 // Recurse into insert_subvector base/sub vector to find scalars.
8065 if (Opcode == ISD::INSERT_SUBVECTOR) {
8066 SDValue Vec = Op.getOperand(0);
8067 SDValue Sub = Op.getOperand(1);
8068 uint64_t SubIdx = Op.getConstantOperandVal(2);
8069 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8070
8071 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8072 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8073 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8074 }
8075
8076 // Recurse into concat_vectors sub vector to find scalars.
8077 if (Opcode == ISD::CONCAT_VECTORS) {
8078 EVT SubVT = Op.getOperand(0).getValueType();
8079 unsigned NumSubElts = SubVT.getVectorNumElements();
8080 uint64_t SubIdx = Index / NumSubElts;
8081 uint64_t SubElt = Index % NumSubElts;
8082 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8083 }
8084
8085 // Recurse into extract_subvector src vector to find scalars.
8086 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8087 SDValue Src = Op.getOperand(0);
8088 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8089 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8090 }
8091
8092 // We only peek through bitcasts of the same vector width.
8093 if (Opcode == ISD::BITCAST) {
8094 SDValue Src = Op.getOperand(0);
8095 EVT SrcVT = Src.getValueType();
8096 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8097 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8098 return SDValue();
8099 }
8100
8101 // Actual nodes that may contain scalar elements
8102
8103 // For insert_vector_elt - either return the index matching scalar or recurse
8104 // into the base vector.
8105 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8106 isa<ConstantSDNode>(Op.getOperand(2))) {
8107 if (Op.getConstantOperandAPInt(2) == Index)
8108 return Op.getOperand(1);
8109 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8110 }
8111
8112 if (Opcode == ISD::SCALAR_TO_VECTOR)
8113 return (Index == 0) ? Op.getOperand(0)
8114 : DAG.getUNDEF(VT.getVectorElementType());
8115
8116 if (Opcode == ISD::BUILD_VECTOR)
8117 return Op.getOperand(Index);
8118
8119 return SDValue();
8120}
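
// Editor's sketch (not part of X86ISelLowering.cpp): the recursion above keeps
// mapping a result lane back to a (source operand, source lane) pair. The helper
// below models just that index math on a plain mask array; the names
// ShuffleSource and resolveShuffleElt are hypothetical.
struct ShuffleSource { int Operand; int Elt; }; // Elt == -1 means undef
static ShuffleSource resolveShuffleElt(const int *Mask, int NumElems, int Index) {
  int M = Mask[Index];
  if (M < 0)
    return {0, -1};           // sentinel mask value: undef lane
  if (M < NumElems)
    return {0, M};            // taken from the first operand
  return {1, M - NumElems};   // taken from the second operand (== M % NumElems)
}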
8121
8122// Use PINSRB/PINSRW/PINSRD to create a build vector.
8123static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8124 unsigned NumNonZero, unsigned NumZero,
8125 SelectionDAG &DAG,
8126 const X86Subtarget &Subtarget) {
8127 MVT VT = Op.getSimpleValueType();
8128 unsigned NumElts = VT.getVectorNumElements();
8129 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8130 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8131 "Illegal vector insertion");
8132
8133 SDLoc dl(Op);
8134 SDValue V;
8135 bool First = true;
8136
8137 for (unsigned i = 0; i < NumElts; ++i) {
8138 bool IsNonZero = NonZeroMask[i];
8139 if (!IsNonZero)
8140 continue;
8141
8142 // If the build vector contains zeros or our first insertion is not the
8143 // first index, then insert into a zero vector to break any register
8144 // dependency; otherwise use SCALAR_TO_VECTOR.
8145 if (First) {
8146 First = false;
8147 if (NumZero || 0 != i)
8148 V = getZeroVector(VT, Subtarget, DAG, dl);
8149 else {
8150 assert(0 == i && "Expected insertion into zero-index");
8151 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8152 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8153 V = DAG.getBitcast(VT, V);
8154 continue;
8155 }
8156 }
8157 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8158 DAG.getIntPtrConstant(i, dl));
8159 }
8160
8161 return V;
8162}
8163
8164/// Custom lower build_vector of v16i8.
8165static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8166 unsigned NumNonZero, unsigned NumZero,
8167 SelectionDAG &DAG,
8168 const X86Subtarget &Subtarget) {
8169 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8170 return SDValue();
8171
8172 // SSE4.1 - use PINSRB to insert each byte directly.
8173 if (Subtarget.hasSSE41())
8174 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8175 Subtarget);
8176
8177 SDLoc dl(Op);
8178 SDValue V;
8179
8180 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8181 for (unsigned i = 0; i < 16; i += 2) {
8182 bool ThisIsNonZero = NonZeroMask[i];
8183 bool NextIsNonZero = NonZeroMask[i + 1];
8184 if (!ThisIsNonZero && !NextIsNonZero)
8185 continue;
8186
8187 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
8188 SDValue Elt;
8189 if (ThisIsNonZero) {
8190 if (NumZero || NextIsNonZero)
8191 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8192 else
8193 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8194 }
8195
8196 if (NextIsNonZero) {
8197 SDValue NextElt = Op.getOperand(i + 1);
8198 if (i == 0 && NumZero)
8199 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8200 else
8201 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8202 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8203 DAG.getConstant(8, dl, MVT::i8));
8204 if (ThisIsNonZero)
8205 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8206 else
8207 Elt = NextElt;
8208 }
8209
8210 // If our first insertion is not the first index or zeros are needed, then
8211 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8212 // elements undefined).
8213 if (!V) {
8214 if (i != 0 || NumZero)
8215 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8216 else {
8217 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8218 V = DAG.getBitcast(MVT::v8i16, V);
8219 continue;
8220 }
8221 }
8222 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8223 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8224 DAG.getIntPtrConstant(i / 2, dl));
8225 }
8226
8227 return DAG.getBitcast(MVT::v16i8, V);
8228}
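
// Editor's sketch (illustrative only, not LLVM code): on the pre-SSE4.1 path
// above, bytes 2*k and 2*k+1 of the v16i8 build vector are merged into word
// lane k of a v8i16, exactly as the ISD::SHL/ISD::OR nodes do.
static unsigned mergeBytePair(unsigned char Lo, unsigned char Hi) {
  return (unsigned)Lo | ((unsigned)Hi << 8); // Lo -> bits [7:0], Hi -> bits [15:8]
}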
8229
8230/// Custom lower build_vector of v8i16.
8231static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8232 unsigned NumNonZero, unsigned NumZero,
8233 SelectionDAG &DAG,
8234 const X86Subtarget &Subtarget) {
8235 if (NumNonZero > 4 && !Subtarget.hasSSE41())
8236 return SDValue();
8237
8238 // Use PINSRW to insert each 16-bit element directly.
8239 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8240 Subtarget);
8241}
8242
8243/// Custom lower build_vector of v4i32 or v4f32.
8244static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8245 const X86Subtarget &Subtarget) {
8246 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8247 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8248 // Because we're creating a less complicated build vector here, we may enable
8249 // further folding of the MOVDDUP via shuffle transforms.
8250 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8251 Op.getOperand(0) == Op.getOperand(2) &&
8252 Op.getOperand(1) == Op.getOperand(3) &&
8253 Op.getOperand(0) != Op.getOperand(1)) {
8254 SDLoc DL(Op);
8255 MVT VT = Op.getSimpleValueType();
8256 MVT EltVT = VT.getVectorElementType();
8257 // Create a new build vector with the first 2 elements followed by undef
8258 // padding, bitcast to v2f64, duplicate, and bitcast back.
8259 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8260 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8261 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8262 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8263 return DAG.getBitcast(VT, Dup);
8264 }
8265
8266 // Find all zeroable elements.
8267 std::bitset<4> Zeroable, Undefs;
8268 for (int i = 0; i < 4; ++i) {
8269 SDValue Elt = Op.getOperand(i);
8270 Undefs[i] = Elt.isUndef();
8271 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8272 }
8273 assert(Zeroable.size() - Zeroable.count() > 1 &&
8274 "We expect at least two non-zero elements!");
8275
8276 // We only know how to deal with build_vector nodes where elements are either
8277 // zeroable or extract_vector_elt with constant index.
8278 SDValue FirstNonZero;
8279 unsigned FirstNonZeroIdx;
8280 for (unsigned i = 0; i < 4; ++i) {
8281 if (Zeroable[i])
8282 continue;
8283 SDValue Elt = Op.getOperand(i);
8284 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8285 !isa<ConstantSDNode>(Elt.getOperand(1)))
8286 return SDValue();
8287 // Make sure that this node is extracting from a 128-bit vector.
8288 MVT VT = Elt.getOperand(0).getSimpleValueType();
8289 if (!VT.is128BitVector())
8290 return SDValue();
8291 if (!FirstNonZero.getNode()) {
8292 FirstNonZero = Elt;
8293 FirstNonZeroIdx = i;
8294 }
8295 }
8296
8297 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8298 SDValue V1 = FirstNonZero.getOperand(0);
8299 MVT VT = V1.getSimpleValueType();
8300
8301 // See if this build_vector can be lowered as a blend with zero.
8302 SDValue Elt;
8303 unsigned EltMaskIdx, EltIdx;
8304 int Mask[4];
8305 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8306 if (Zeroable[EltIdx]) {
8307 // The zero vector will be on the right hand side.
8308 Mask[EltIdx] = EltIdx+4;
8309 continue;
8310 }
8311
8312 Elt = Op->getOperand(EltIdx);
8313 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
8314 EltMaskIdx = Elt.getConstantOperandVal(1);
8315 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8316 break;
8317 Mask[EltIdx] = EltIdx;
8318 }
8319
8320 if (EltIdx == 4) {
8321 // Let the shuffle legalizer deal with blend operations.
8322 SDValue VZeroOrUndef = (Zeroable == Undefs)
8323 ? DAG.getUNDEF(VT)
8324 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8325 if (V1.getSimpleValueType() != VT)
8326 V1 = DAG.getBitcast(VT, V1);
8327 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8328 }
8329
8330 // See if we can lower this build_vector to an INSERTPS.
8331 if (!Subtarget.hasSSE41())
8332 return SDValue();
8333
8334 SDValue V2 = Elt.getOperand(0);
8335 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8336 V1 = SDValue();
8337
8338 bool CanFold = true;
8339 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8340 if (Zeroable[i])
8341 continue;
8342
8343 SDValue Current = Op->getOperand(i);
8344 SDValue SrcVector = Current->getOperand(0);
8345 if (!V1.getNode())
8346 V1 = SrcVector;
8347 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8348 }
8349
8350 if (!CanFold)
8351 return SDValue();
8352
8353 assert(V1.getNode() && "Expected at least two non-zero elements!");
8354 if (V1.getSimpleValueType() != MVT::v4f32)
8355 V1 = DAG.getBitcast(MVT::v4f32, V1);
8356 if (V2.getSimpleValueType() != MVT::v4f32)
8357 V2 = DAG.getBitcast(MVT::v4f32, V2);
8358
8359 // Ok, we can emit an INSERTPS instruction.
8360 unsigned ZMask = Zeroable.to_ulong();
8361
8362 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8363 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8364 SDLoc DL(Op);
8365 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8366 DAG.getIntPtrConstant(InsertPSMask, DL, true));
8367 return DAG.getBitcast(VT, Result);
8368}
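
// Editor's sketch (illustrative only): the INSERTPS immediate built above packs
// the source lane into bits [7:6], the destination lane into bits [5:4] and the
// zero mask into bits [3:0]; encodeInsertPSImm is a hypothetical name.
static unsigned encodeInsertPSImm(unsigned SrcLane, unsigned DstLane,
                                  unsigned ZeroMask4) {
  return ((SrcLane & 3) << 6) | ((DstLane & 3) << 4) | (ZeroMask4 & 0xF);
}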
8369
8370/// Return a vector logical shift node.
8371static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8372 SelectionDAG &DAG, const TargetLowering &TLI,
8373 const SDLoc &dl) {
8374 assert(VT.is128BitVector() && "Unknown type for VShift");
8375 MVT ShVT = MVT::v16i8;
8376 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8377 SrcOp = DAG.getBitcast(ShVT, SrcOp);
8378 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8379 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8380 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8381}
8382
8383static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8384 SelectionDAG &DAG) {
8385
8386 // Check if the scalar load can be widened into a vector load, and if
8387 // the address is "base + cst", see if the cst can be "absorbed" into
8388 // the shuffle mask.
8389 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8390 SDValue Ptr = LD->getBasePtr();
8391 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8392 return SDValue();
8393 EVT PVT = LD->getValueType(0);
8394 if (PVT != MVT::i32 && PVT != MVT::f32)
8395 return SDValue();
8396
8397 int FI = -1;
8398 int64_t Offset = 0;
8399 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8400 FI = FINode->getIndex();
8401 Offset = 0;
8402 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8403 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8404 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8405 Offset = Ptr.getConstantOperandVal(1);
8406 Ptr = Ptr.getOperand(0);
8407 } else {
8408 return SDValue();
8409 }
8410
8411 // FIXME: 256-bit vector instructions don't require a strict alignment,
8412 // improve this code to support it better.
8413 Align RequiredAlign(VT.getSizeInBits() / 8);
8414 SDValue Chain = LD->getChain();
8415 // Make sure the stack object alignment is at least 16 or 32.
8416 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8417 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8418 if (!InferredAlign || *InferredAlign < RequiredAlign) {
8419 if (MFI.isFixedObjectIndex(FI)) {
8420 // Can't change the alignment. FIXME: It's possible to compute
8421 // the exact stack offset and reference FI + adjust offset instead.
8422 // If someone *really* cares about this, that's the way to implement it.
8423 return SDValue();
8424 } else {
8425 MFI.setObjectAlignment(FI, RequiredAlign);
8426 }
8427 }
8428
8429 // (Offset % 16 or 32) must be a multiple of 4. The address is then
8430 // Ptr + (Offset & ~15).
8431 if (Offset < 0)
8432 return SDValue();
8433 if ((Offset % RequiredAlign.value()) & 3)
8434 return SDValue();
8435 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8436 if (StartOffset) {
8437 SDLoc DL(Ptr);
8438 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8439 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8440 }
8441
8442 int EltNo = (Offset - StartOffset) >> 2;
8443 unsigned NumElems = VT.getVectorNumElements();
8444
8445 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8446 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8447 LD->getPointerInfo().getWithOffset(StartOffset));
8448
8449 SmallVector<int, 8> Mask(NumElems, EltNo);
8450
8451 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8452 }
8453
8454 return SDValue();
8455}
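
// Editor's sketch (illustrative only, assumes the 4-byte i32/f32 element case
// handled above): the constant offset is split into an aligned start offset for
// the widened load plus the element index that is then splatted.
static void splitSplatLoadOffset(long long Offset, unsigned RequiredAlign,
                                 long long &StartOffset, int &EltNo) {
  StartOffset = Offset & ~(long long)(RequiredAlign - 1); // round down to alignment
  EltNo = (int)((Offset - StartOffset) >> 2);             // 4-byte elements
}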
8456
8457 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8458static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8459 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8460 auto *BaseLd = cast<LoadSDNode>(Elt);
8461 if (!BaseLd->isSimple())
8462 return false;
8463 Ld = BaseLd;
8464 ByteOffset = 0;
8465 return true;
8466 }
8467
8468 switch (Elt.getOpcode()) {
8469 case ISD::BITCAST:
8470 case ISD::TRUNCATE:
8471 case ISD::SCALAR_TO_VECTOR:
8472 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8473 case ISD::SRL:
8474 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8475 uint64_t Idx = IdxC->getZExtValue();
8476 if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8477 ByteOffset += Idx / 8;
8478 return true;
8479 }
8480 }
8481 break;
8482 case ISD::EXTRACT_VECTOR_ELT:
8483 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8484 SDValue Src = Elt.getOperand(0);
8485 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8486 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8487 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8488 findEltLoadSrc(Src, Ld, ByteOffset)) {
8489 uint64_t Idx = IdxC->getZExtValue();
8490 ByteOffset += Idx * (SrcSizeInBits / 8);
8491 return true;
8492 }
8493 }
8494 break;
8495 }
8496
8497 return false;
8498}
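
// Editor's sketch (illustrative only): in the ISD::SRL case above, a shift by a
// multiple of 8 bits simply moves the accumulated byte offset forward, since
// srl(load, N) reads the loaded value starting N/8 bytes in (little-endian).
static bool addShiftByteOffset(unsigned long long ShiftBits, long long &ByteOffset) {
  if (ShiftBits % 8 != 0)
    return false;                           // non-byte-aligned shifts don't fold
  ByteOffset += (long long)(ShiftBits / 8);
  return true;
}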
8499
8500/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8501/// elements can be replaced by a single large load which has the same value as
8502/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8503///
8504/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8505static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8506 const SDLoc &DL, SelectionDAG &DAG,
8507 const X86Subtarget &Subtarget,
8508 bool IsAfterLegalize) {
8509 if ((VT.getScalarSizeInBits() % 8) != 0)
8510 return SDValue();
8511
8512 unsigned NumElems = Elts.size();
8513
8514 int LastLoadedElt = -1;
8515 APInt LoadMask = APInt::getNullValue(NumElems);
8516 APInt ZeroMask = APInt::getNullValue(NumElems);
8517 APInt UndefMask = APInt::getNullValue(NumElems);
8518
8519 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8520 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8521
8522 // For each element in the initializer, see if we've found a load, zero or an
8523 // undef.
8524 for (unsigned i = 0; i < NumElems; ++i) {
8525 SDValue Elt = peekThroughBitcasts(Elts[i]);
8526 if (!Elt.getNode())
8527 return SDValue();
8528 if (Elt.isUndef()) {
8529 UndefMask.setBit(i);
8530 continue;
8531 }
8532 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8533 ZeroMask.setBit(i);
8534 continue;
8535 }
8536
8537 // Each loaded element must be the correct fractional portion of the
8538 // requested vector load.
8539 unsigned EltSizeInBits = Elt.getValueSizeInBits();
8540 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8541 return SDValue();
8542
8543 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8544 return SDValue();
8545 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8546 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8547 return SDValue();
8548
8549 LoadMask.setBit(i);
8550 LastLoadedElt = i;
8551 }
8552 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8553 LoadMask.countPopulation()) == NumElems &&
8554 "Incomplete element masks");
8555
8556 // Handle Special Cases - all undef or undef/zero.
8557 if (UndefMask.countPopulation() == NumElems)
8558 return DAG.getUNDEF(VT);
8559 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8560 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8561 : DAG.getConstantFP(0.0, DL, VT);
8562
8563 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8564 int FirstLoadedElt = LoadMask.countTrailingZeros();
8565 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8566 EVT EltBaseVT = EltBase.getValueType();
8567 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8568 "Register/Memory size mismatch");
8569 LoadSDNode *LDBase = Loads[FirstLoadedElt];
8570 assert(LDBase && "Did not find base load for merging consecutive loads");
8571 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8572 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8573 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
8574 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
8575 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8576
8577 // TODO: Support offsetting the base load.
8578 if (ByteOffsets[FirstLoadedElt] != 0)
8579 return SDValue();
8580
8581 // Check to see if the element's load is consecutive to the base load
8582 // or offset from a previous (already checked) load.
8583 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8584 LoadSDNode *Ld = Loads[EltIdx];
8585 int64_t ByteOffset = ByteOffsets[EltIdx];
8586 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8587 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8588 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8589 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8590 }
8591 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8592 EltIdx - FirstLoadedElt);
8593 };
8594
8595 // Consecutive loads can contain UNDEFs but not ZERO elements.
8596 // Consecutive loads with UNDEF and ZERO elements require an
8597 // additional shuffle stage to clear the ZERO elements.
8598 bool IsConsecutiveLoad = true;
8599 bool IsConsecutiveLoadWithZeros = true;
8600 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8601 if (LoadMask[i]) {
8602 if (!CheckConsecutiveLoad(LDBase, i)) {
8603 IsConsecutiveLoad = false;
8604 IsConsecutiveLoadWithZeros = false;
8605 break;
8606 }
8607 } else if (ZeroMask[i]) {
8608 IsConsecutiveLoad = false;
8609 }
8610 }
8611
8612 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8613 auto MMOFlags = LDBase->getMemOperand()->getFlags();
8614 assert(LDBase->isSimple() &&
8615 "Cannot merge volatile or atomic loads.");
8616 SDValue NewLd =
8617 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8618 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
8619 MMOFlags);
8620 for (auto *LD : Loads)
8621 if (LD)
8622 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8623 return NewLd;
8624 };
8625
8626 // Check if the base load is entirely dereferenceable.
8627 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8628 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8629
8630 // LOAD - all consecutive load/undefs (must start/end with a load or be
8631 // entirely dereferenceable). If we have found an entire vector of loads and
8632 // undefs, then return a large load of the entire vector width starting at the
8633 // base pointer. If the vector contains zeros, then attempt to shuffle those
8634 // elements.
8635 if (FirstLoadedElt == 0 &&
8636 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
8637 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8638 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8639 return SDValue();
8640
8641 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8642 // will lower to regular temporal loads and use the cache.
8643 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8644 VT.is256BitVector() && !Subtarget.hasInt256())
8645 return SDValue();
8646
8647 if (NumElems == 1)
8648 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8649
8650 if (!ZeroMask)
8651 return CreateLoad(VT, LDBase);
8652
8653 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8654 // vector and a zero vector to clear out the zero elements.
8655 if (!IsAfterLegalize && VT.isVector()) {
8656 unsigned NumMaskElts = VT.getVectorNumElements();
8657 if ((NumMaskElts % NumElems) == 0) {
8658 unsigned Scale = NumMaskElts / NumElems;
8659 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8660 for (unsigned i = 0; i < NumElems; ++i) {
8661 if (UndefMask[i])
8662 continue;
8663 int Offset = ZeroMask[i] ? NumMaskElts : 0;
8664 for (unsigned j = 0; j != Scale; ++j)
8665 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8666 }
8667 SDValue V = CreateLoad(VT, LDBase);
8668 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8669 : DAG.getConstantFP(0.0, DL, VT);
8670 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8671 }
8672 }
8673 }
8674
8675 // If the upper half of a ymm/zmm load is undef then just load the lower half.
8676 if (VT.is256BitVector() || VT.is512BitVector()) {
8677 unsigned HalfNumElems = NumElems / 2;
8678 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8679 EVT HalfVT =
8680 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8681 SDValue HalfLD =
8682 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8683 DAG, Subtarget, IsAfterLegalize);
8684 if (HalfLD)
8685 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8686 HalfLD, DAG.getIntPtrConstant(0, DL));
8687 }
8688 }
8689
8690 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
8691 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8692 (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8693 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8694 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8695 : MVT::getIntegerVT(LoadSizeInBits);
8696 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8697 // Allow v4f32 on SSE1 only targets.
8698 // FIXME: Add more isel patterns so we can just use VT directly.
8699 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8700 VecVT = MVT::v4f32;
8701 if (TLI.isTypeLegal(VecVT)) {
8702 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8703 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8704 SDValue ResNode = DAG.getMemIntrinsicNode(
8705 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
8706 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
8707 for (auto *LD : Loads)
8708 if (LD)
8709 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8710 return DAG.getBitcast(VT, ResNode);
8711 }
8712 }
8713
8714 // BROADCAST - match the smallest possible repetition pattern, load that
8715 // scalar/subvector element and then broadcast to the entire vector.
8716 if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8717 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8718 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8719 unsigned RepeatSize = SubElems * BaseSizeInBits;
8720 unsigned ScalarSize = std::min(RepeatSize, 64u);
8721 if (!Subtarget.hasAVX2() && ScalarSize < 32)
8722 continue;
8723
8724 // Don't attempt a 1:N subvector broadcast - it should be caught by
8725 // combineConcatVectorOps, else will cause infinite loops.
8726 if (RepeatSize > ScalarSize && SubElems == 1)
8727 continue;
8728
8729 bool Match = true;
8730 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8731 for (unsigned i = 0; i != NumElems && Match; ++i) {
8732 if (!LoadMask[i])
8733 continue;
8734 SDValue Elt = peekThroughBitcasts(Elts[i]);
8735 if (RepeatedLoads[i % SubElems].isUndef())
8736 RepeatedLoads[i % SubElems] = Elt;
8737 else
8738 Match &= (RepeatedLoads[i % SubElems] == Elt);
8739 }
8740
8741 // We must have loads at both ends of the repetition.
8742 Match &= !RepeatedLoads.front().isUndef();
8743 Match &= !RepeatedLoads.back().isUndef();
8744 if (!Match)
8745 continue;
8746
8747 EVT RepeatVT =
8748 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8749 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8750 : EVT::getFloatingPointVT(ScalarSize);
8751 if (RepeatSize > ScalarSize)
8752 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8753 RepeatSize / ScalarSize);
8754 EVT BroadcastVT =
8755 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8756 VT.getSizeInBits() / ScalarSize);
8757 if (TLI.isTypeLegal(BroadcastVT)) {
8758 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8759 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
8760 SDValue Broadcast = RepeatLoad;
8761 if (RepeatSize > ScalarSize) {
8762 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
8763 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
8764 } else {
8765 Broadcast =
8766 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
8767 }
8768 return DAG.getBitcast(VT, Broadcast);
8769 }
8770 }
8771 }
8772 }
8773
8774 return SDValue();
8775}
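
// Editor's sketch (illustrative only, ignoring the Scale/undef handling above):
// when consecutive loads are mixed with zero elements, the wide load is kept
// and a shuffle mask selects lane i from the load or from an all-zero vector.
static void buildClearMask(const bool *IsZeroLane, int NumElems, int *ClearMask) {
  for (int i = 0; i != NumElems; ++i)
    ClearMask[i] = IsZeroLane[i] ? i + NumElems : i; // second operand is the zero vector
}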
8776
8777 // Combine a vector op (shuffle etc.) that is equal to build_vector load1,
8778 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8779 // are consecutive, non-overlapping, and in the right order.
8780static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
8781 SelectionDAG &DAG,
8782 const X86Subtarget &Subtarget,
8783 bool IsAfterLegalize) {
8784 SmallVector<SDValue, 64> Elts;
8785 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8786 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
8787 Elts.push_back(Elt);
8788 continue;
8789 }
8790 return SDValue();
8791 }
8792 assert(Elts.size() == VT.getVectorNumElements());
8793 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8794 IsAfterLegalize);
8795}
8796
8797static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8798 unsigned SplatBitSize, LLVMContext &C) {
8799 unsigned ScalarSize = VT.getScalarSizeInBits();
8800 unsigned NumElm = SplatBitSize / ScalarSize;
8801
8802 SmallVector<Constant *, 32> ConstantVec;
8803 for (unsigned i = 0; i < NumElm; i++) {
8804 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8805 Constant *Const;
8806 if (VT.isFloatingPoint()) {
8807 if (ScalarSize == 32) {
8808 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8809 } else {
8810 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8811 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8812 }
8813 } else
8814 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8815 ConstantVec.push_back(Const);
8816 }
8817 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8818}
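
// Editor's sketch (illustrative only, handles splats of at most 64 bits): the
// loop above slices a SplatBitSize-wide constant into SplatBitSize/ScalarSize
// chunks of ScalarSize bits each, lowest chunk first, mirroring APInt::extractBits.
static void sliceSplatValue(unsigned long long Splat, unsigned ScalarSize,
                            unsigned SplatBitSize, unsigned long long *Out) {
  unsigned long long Mask = ScalarSize >= 64 ? ~0ULL : ((1ULL << ScalarSize) - 1);
  for (unsigned i = 0; i != SplatBitSize / ScalarSize; ++i)
    Out[i] = (Splat >> (ScalarSize * i)) & Mask;
}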
8819
8820static bool isFoldableUseOfShuffle(SDNode *N) {
8821 for (auto *U : N->uses()) {
8822 unsigned Opc = U->getOpcode();
8823 // VPERMV/VPERMV3 shuffles can never fold their index operands.
8824 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8825 return false;
8826 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8827 return false;
8828 if (isTargetShuffle(Opc))
8829 return true;
8830 if (Opc == ISD::BITCAST) // Ignore bitcasts
8831 return isFoldableUseOfShuffle(U);
8832 if (N->hasOneUse())
8833 return true;
8834 }
8835 return false;
8836}
8837
8838/// Attempt to use the vbroadcast instruction to generate a splat value
8839/// from a splat BUILD_VECTOR which uses:
8840/// a. A single scalar load, or a constant.
8841/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8842///
8843/// The VBROADCAST node is returned when a pattern is found,
8844/// or SDValue() otherwise.
8845static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8846 const X86Subtarget &Subtarget,
8847 SelectionDAG &DAG) {
8848 // VBROADCAST requires AVX.
8849 // TODO: Splats could be generated for non-AVX CPUs using SSE
8850 // instructions, but there's less potential gain for only 128-bit vectors.
8851 if (!Subtarget.hasAVX())
8852 return SDValue();
8853
8854 MVT VT = BVOp->getSimpleValueType(0);
8855 unsigned NumElts = VT.getVectorNumElements();
8856 SDLoc dl(BVOp);
8857
8858 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8859 "Unsupported vector type for broadcast.");
8860
8861 // See if the build vector is a repeating sequence of scalars (inc. splat).
8862 SDValue Ld;
8863 BitVector UndefElements;
8864 SmallVector<SDValue, 16> Sequence;
8865 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8866 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8867 if (Sequence.size() == 1)
8868 Ld = Sequence[0];
8869 }
8870
8871 // Attempt to use VBROADCASTM
8872 // From this pattern:
8873 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8874 // b. t1 = (build_vector t0 t0)
8875 //
8876 // Create (VBROADCASTM v2i1 X)
8877 if (!Sequence.empty() && Subtarget.hasCDI()) {
8878 // If not a splat, are the upper sequence values zeroable?
8879 unsigned SeqLen = Sequence.size();
8880 bool UpperZeroOrUndef =
8881 SeqLen == 1 ||
8882 llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
8883 return !V || V.isUndef() || isNullConstant(V);
8884 });
8885 SDValue Op0 = Sequence[0];
8886 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8887 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8888 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
8889 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8890 ? Op0.getOperand(0)
8891 : Op0.getOperand(0).getOperand(0);
8892 MVT MaskVT = BOperand.getSimpleValueType();
8893 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8894 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
8895 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8896 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
8897 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
8898 unsigned Scale = 512 / VT.getSizeInBits();
8899 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8900 }
8901 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8902 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8903 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8904 return DAG.getBitcast(VT, Bcst);
8905 }
8906 }
8907 }
8908
8909 unsigned NumUndefElts = UndefElements.count();
8910 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8911 APInt SplatValue, Undef;
8912 unsigned SplatBitSize;
8913 bool HasUndef;
8914 // Check if this is a repeated constant pattern suitable for broadcasting.
8915 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8916 SplatBitSize > VT.getScalarSizeInBits() &&
8917 SplatBitSize < VT.getSizeInBits()) {
8918 // Avoid replacing with a broadcast when the build_vector is used by a
8919 // shuffle instruction, to preserve the present custom lowering of shuffles.
8920 if (isFoldableUseOfShuffle(BVOp))
8921 return SDValue();
8922 // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
8923 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8924 LLVMContext *Ctx = DAG.getContext();
8925 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8926 if (Subtarget.hasAVX()) {
8927 if (SplatBitSize == 32 || SplatBitSize == 64 ||
8928 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8929 // Splatted value can fit in one INTEGER constant in constant pool.
8930 // Load the constant and broadcast it.
8931 MVT CVT = MVT::getIntegerVT(SplatBitSize);
8932 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8933 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8934 SDValue CP = DAG.getConstantPool(C, PVT);
8935 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8936
8937 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8938 SDVTList Tys =
8939 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8940 SDValue Ops[] = {DAG.getEntryNode(), CP};
8941 MachinePointerInfo MPI =
8942 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8943 SDValue Brdcst = DAG.getMemIntrinsicNode(
8944 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
8945 MachineMemOperand::MOLoad);
8946 return DAG.getBitcast(VT, Brdcst);
8947 }
8948 if (SplatBitSize > 64) {
8949 // Load the vector of constants and broadcast it.
8950 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8951 *Ctx);
8952 SDValue VCP = DAG.getConstantPool(VecC, PVT);
8953 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8954 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
8955 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8956 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8957 SDValue Ops[] = {DAG.getEntryNode(), VCP};
8958 MachinePointerInfo MPI =
8959 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8960 return DAG.getMemIntrinsicNode(
8961 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
8962 MachineMemOperand::MOLoad);
8963 }
8964 }
8965 }
8966
8967 // If we are moving a scalar into a vector (Ld must be set and all elements
8968 // but 1 are undef) and that operation is not obviously supported by
8969 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8970 // That's better than general shuffling and may eliminate a load to GPR and
8971 // move from scalar to vector register.
8972 if (!Ld || NumElts - NumUndefElts != 1)
8973 return SDValue();
8974 unsigned ScalarSize = Ld.getValueSizeInBits();
8975 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8976 return SDValue();
8977 }
8978
8979 bool ConstSplatVal =
8980 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8981 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8982
8983 // TODO: Handle broadcasts of non-constant sequences.
8984
8985 // Make sure that all of the users of a non-constant load are from the
8986 // BUILD_VECTOR node.
8987 // FIXME: Is the use count needed for non-constant, non-load case?
8988 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8989 return SDValue();
8990
8991 unsigned ScalarSize = Ld.getValueSizeInBits();
8992 bool IsGE256 = (VT.getSizeInBits() >= 256);
8993
8994 // When optimizing for size, generate up to 5 extra bytes for a broadcast
8995 // instruction to save 8 or more bytes of constant pool data.
8996 // TODO: If multiple splats are generated to load the same constant,
8997 // it may be detrimental to overall size. There needs to be a way to detect
8998 // that condition to know if this is truly a size win.
8999 bool OptForSize = DAG.shouldOptForSize();
9000
9001 // Handle broadcasting a single constant scalar from the constant pool
9002 // into a vector.
9003 // On Sandybridge (no AVX2), it is still better to load a constant vector
9004 // from the constant pool and not to broadcast it from a scalar.
9005 // But override that restriction when optimizing for size.
9006 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9007 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9008 EVT CVT = Ld.getValueType();
9009 assert(!CVT.isVector() && "Must not broadcast a vector type");
9010
9011 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
9012 // For size optimization, also splat v2f64 and v2i64, and for size opt
9013 // with AVX2, also splat i8 and i16.
9014 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9015 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9016 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9017 const Constant *C = nullptr;
9018 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9019 C = CI->getConstantIntValue();
9020 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9021 C = CF->getConstantFPValue();
9022
9023 assert(C && "Invalid constant type");
9024
9025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9026 SDValue CP =
9027 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9028 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9029
9030 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9031 SDValue Ops[] = {DAG.getEntryNode(), CP};
9032 MachinePointerInfo MPI =
9033 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9034 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9035 MPI, Alignment, MachineMemOperand::MOLoad);
9036 }
9037 }
9038
9039 // Handle AVX2 in-register broadcasts.
9040 if (!IsLoad && Subtarget.hasInt256() &&
9041 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9042 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9043
9044 // The scalar source must be a normal load.
9045 if (!IsLoad)
9046 return SDValue();
9047
9048 // Make sure the non-chain result is only used by this build vector.
9049 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9050 return SDValue();
9051
9052 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9053 (Subtarget.hasVLX() && ScalarSize == 64)) {
9054 auto *LN = cast<LoadSDNode>(Ld);
9055 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9056 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9057 SDValue BCast =
9058 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9059 LN->getMemoryVT(), LN->getMemOperand());
9060 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9061 return BCast;
9062 }
9063
9064 // The integer check is needed for the 64-bit-into-128-bit case so it doesn't
9065 // match double, since there is no vbroadcastsd xmm instruction.
9066 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9067 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9068 auto *LN = cast<LoadSDNode>(Ld);
9069 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9070 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9071 SDValue BCast =
9072 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9073 LN->getMemoryVT(), LN->getMemOperand());
9074 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9075 return BCast;
9076 }
9077
9078 // Unsupported broadcast.
9079 return SDValue();
9080}
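
// Editor's sketch (illustrative only, undef lanes ignored): the broadcast
// lowering above relies on finding the smallest repeating prefix of the build
// vector; findRepeatPeriod models that check on a plain array of element ids.
static unsigned findRepeatPeriod(const int *Elts, unsigned NumElts) {
  for (unsigned P = 1; P < NumElts; ++P) {
    if (NumElts % P != 0)
      continue;
    bool Repeats = true;
    for (unsigned i = P; i != NumElts && Repeats; ++i)
      Repeats = (Elts[i] == Elts[i % P]);
    if (Repeats)
      return P;
  }
  return NumElts; // no shorter repetition found
}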
9081
9082/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9083/// underlying vector and index.
9084///
9085/// Modifies \p ExtractedFromVec to the real vector and returns the real
9086/// index.
9087static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9088 SDValue ExtIdx) {
9089 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9090 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9091 return Idx;
9092
9093 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9094 // lowered this:
9095 // (extract_vector_elt (v8f32 %1), Constant<6>)
9096 // to:
9097 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9098 // (extract_subvector (v8f32 %0), Constant<4>),
9099 // undef)
9100 // Constant<0>)
9101 // In this case the vector is the extract_subvector expression and the index
9102 // is 2, as specified by the shuffle.
9103 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9104 SDValue ShuffleVec = SVOp->getOperand(0);
9105 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9106 assert(ShuffleVecVT.getVectorElementType() ==
9107 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9108
9109 int ShuffleIdx = SVOp->getMaskElt(Idx);
9110 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9111 ExtractedFromVec = ShuffleVec;
9112 return ShuffleIdx;
9113 }
9114 return Idx;
9115}
9116
9117static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9118 MVT VT = Op.getSimpleValueType();
9119
9120 // Skip if insert_vec_elt is not supported.
9121 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9122 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9123 return SDValue();
9124
9125 SDLoc DL(Op);
9126 unsigned NumElems = Op.getNumOperands();
9127
9128 SDValue VecIn1;
9129 SDValue VecIn2;
9130 SmallVector<unsigned, 4> InsertIndices;
9131 SmallVector<int, 8> Mask(NumElems, -1);
9132
9133 for (unsigned i = 0; i != NumElems; ++i) {
9134 unsigned Opc = Op.getOperand(i).getOpcode();
9135
9136 if (Opc == ISD::UNDEF)
9137 continue;
9138
9139 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9140 // Quit if more than one element needs inserting.
9141 if (InsertIndices.size() > 1)
9142 return SDValue();
9143
9144 InsertIndices.push_back(i);
9145 continue;
9146 }
9147
9148 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9149 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9150
9151 // Quit if non-constant index.
9152 if (!isa<ConstantSDNode>(ExtIdx))
9153 return SDValue();
9154 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9155
9156 // Quit if extracted from vector of different type.
9157 if (ExtractedFromVec.getValueType() != VT)
9158 return SDValue();
9159
9160 if (!VecIn1.getNode())
9161 VecIn1 = ExtractedFromVec;
9162 else if (VecIn1 != ExtractedFromVec) {
9163 if (!VecIn2.getNode())
9164 VecIn2 = ExtractedFromVec;
9165 else if (VecIn2 != ExtractedFromVec)
9166 // Quit if more than 2 vectors to shuffle
9167 return SDValue();
9168 }
9169
9170 if (ExtractedFromVec == VecIn1)
9171 Mask[i] = Idx;
9172 else if (ExtractedFromVec == VecIn2)
9173 Mask[i] = Idx + NumElems;
9174 }
9175
9176 if (!VecIn1.getNode())
9177 return SDValue();
9178
9179 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9180 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9181
9182 for (unsigned Idx : InsertIndices)
9183 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9184 DAG.getIntPtrConstant(Idx, DL));
9185
9186 return NV;
9187}
9188
9189// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9190static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9191 const X86Subtarget &Subtarget) {
9192
9193 MVT VT = Op.getSimpleValueType();
9194 assert((VT.getVectorElementType() == MVT::i1) &&
9195 "Unexpected type in LowerBUILD_VECTORvXi1!");
9196
9197 SDLoc dl(Op);
9198 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9199 ISD::isBuildVectorAllOnes(Op.getNode()))
9200 return Op;
9201
9202 uint64_t Immediate = 0;
9203 SmallVector<unsigned, 16> NonConstIdx;
9204 bool IsSplat = true;
9205 bool HasConstElts = false;
9206 int SplatIdx = -1;
9207 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9208 SDValue In = Op.getOperand(idx);
9209 if (In.isUndef())
9210 continue;
9211 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9212 Immediate |= (InC->getZExtValue() & 0x1) << idx;
9213 HasConstElts = true;
9214 } else {
9215 NonConstIdx.push_back(idx);
9216 }
9217 if (SplatIdx < 0)
9218 SplatIdx = idx;
9219 else if (In != Op.getOperand(SplatIdx))
9220 IsSplat = false;
9221 }
9222
9223 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
9224 if (IsSplat) {
9225 // The build_vector allows the scalar element to be larger than the vector
9226 // element type. We need to mask it to use as a condition unless we know
9227 // the upper bits are zero.
9228 // FIXME: Use computeKnownBits instead of checking specific opcode?
9229 SDValue Cond = Op.getOperand(SplatIdx);
9230 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9231 if (Cond.getOpcode() != ISD::SETCC)
9232 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9233 DAG.getConstant(1, dl, MVT::i8));
9234
9235 // Perform the select in the scalar domain so we can use cmov.
9236 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9237 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9238 DAG.getAllOnesConstant(dl, MVT::i32),
9239 DAG.getConstant(0, dl, MVT::i32));
9240 Select = DAG.getBitcast(MVT::v32i1, Select);
9241 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9242 } else {
9243 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9244 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9245 DAG.getAllOnesConstant(dl, ImmVT),
9246 DAG.getConstant(0, dl, ImmVT));
9247 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9248 Select = DAG.getBitcast(VecVT, Select);
9249 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9250 DAG.getIntPtrConstant(0, dl));
9251 }
9252 }
9253
9254 // insert elements one by one
9255 SDValue DstVec;
9256 if (HasConstElts) {
9257 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9258 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9259 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9260 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9261 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9262 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9263 } else {
9264 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9265 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9266 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9267 DstVec = DAG.getBitcast(VecVT, Imm);
9268 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9269 DAG.getIntPtrConstant(0, dl));
9270 }
9271 } else
9272 DstVec = DAG.getUNDEF(VT);
9273
9274 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9275 unsigned InsertIdx = NonConstIdx[i];
9276 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9277 Op.getOperand(InsertIdx),
9278 DAG.getIntPtrConstant(InsertIdx, dl));
9279 }
9280 return DstVec;
9281}
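
// Editor's sketch (illustrative only): the constant lanes above are packed into
// an immediate bitmask, one bit per element index; non-constant lanes (modelled
// here as Bits[i] < 0) are inserted individually afterwards.
static unsigned long long packMaskImmediate(const int *Bits, unsigned NumElts) {
  unsigned long long Imm = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    if (Bits[i] > 0)
      Imm |= 1ULL << i; // constant-one lane contributes bit i
  return Imm;
}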
9282
9283 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9284 switch (Opcode) {
9285 case X86ISD::PACKSS:
9286 case X86ISD::PACKUS:
9287 case X86ISD::FHADD:
9288 case X86ISD::FHSUB:
9289 case X86ISD::HADD:
9290 case X86ISD::HSUB:
9291 return true;
9292 }
9293 return false;
9294}
9295
9296/// This is a helper function of LowerToHorizontalOp().
9297 /// This function checks that the input build_vector \p N implements a
9298/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9299/// may not match the layout of an x86 256-bit horizontal instruction.
9300/// In other words, if this returns true, then some extraction/insertion will
9301/// be required to produce a valid horizontal instruction.
9302///
9303/// Parameter \p Opcode defines the kind of horizontal operation to match.
9304/// For example, if \p Opcode is equal to ISD::ADD, then this function
9305/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9306/// is equal to ISD::SUB, then this function checks if this is a horizontal
9307/// arithmetic sub.
9308///
9309/// This function only analyzes elements of \p N whose indices are
9310/// in range [BaseIdx, LastIdx).
9311///
9312/// TODO: This function was originally used to match both real and fake partial
9313/// horizontal operations, but the index-matching logic is incorrect for that.
9314/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9315/// code because it is only used for partial h-op matching now?
9316static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9317 SelectionDAG &DAG,
9318 unsigned BaseIdx, unsigned LastIdx,
9319 SDValue &V0, SDValue &V1) {
9320 EVT VT = N->getValueType(0);
9321 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9322 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9323 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9324 "Invalid Vector in input!");
9325
9326 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9327 bool CanFold = true;
9328 unsigned ExpectedVExtractIdx = BaseIdx;
9329 unsigned NumElts = LastIdx - BaseIdx;
9330 V0 = DAG.getUNDEF(VT);
9331 V1 = DAG.getUNDEF(VT);
9332
9333 // Check if N implements a horizontal binop.
9334 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9335 SDValue Op = N->getOperand(i + BaseIdx);
9336
9337 // Skip UNDEFs.
9338 if (Op->isUndef()) {
9339 // Update the expected vector extract index.
9340 if (i * 2 == NumElts)
9341 ExpectedVExtractIdx = BaseIdx;
9342 ExpectedVExtractIdx += 2;
9343 continue;
9344 }
9345
9346 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9347
9348 if (!CanFold)
9349 break;
9350
9351 SDValue Op0 = Op.getOperand(0);
9352 SDValue Op1 = Op.getOperand(1);
9353
9354 // Try to match the following pattern:
9355 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9356 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9357 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9358 Op0.getOperand(0) == Op1.getOperand(0) &&
9359 isa<ConstantSDNode>(Op0.getOperand(1)) &&
9360 isa<ConstantSDNode>(Op1.getOperand(1)));
9361 if (!CanFold)
9362 break;
9363
9364 unsigned I0 = Op0.getConstantOperandVal(1);
9365 unsigned I1 = Op1.getConstantOperandVal(1);
9366
9367 if (i * 2 < NumElts) {
9368 if (V0.isUndef()) {
9369 V0 = Op0.getOperand(0);
9370 if (V0.getValueType() != VT)
9371 return false;
9372 }
9373 } else {
9374 if (V1.isUndef()) {
9375 V1 = Op0.getOperand(0);
9376 if (V1.getValueType() != VT)
9377 return false;
9378 }
9379 if (i * 2 == NumElts)
9380 ExpectedVExtractIdx = BaseIdx;
9381 }
9382
9383 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9384 if (I0 == ExpectedVExtractIdx)
9385 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9386 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9387 // Try to match the following dag sequence:
9388 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9389 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9390 } else
9391 CanFold = false;
9392
9393 ExpectedVExtractIdx += 2;
9394 }
9395
9396 return CanFold;
9397}
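
// Editor's sketch (illustrative scalar model, within a single 128-bit half):
// the matcher above expects result lane i of a horizontal add to be built from
// source lanes 2*i and 2*i+1 of the same vector.
static int expectedHAddLane(const int *A, unsigned i) {
  return A[2 * i] + A[2 * i + 1];
}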
9398
9399/// Emit a sequence of two 128-bit horizontal add/sub followed by
9400/// a concat_vector.
9401///
9402/// This is a helper function of LowerToHorizontalOp().
9403/// This function expects two 256-bit vectors called V0 and V1.
9404/// At first, each vector is split into two separate 128-bit vectors.
9405/// Then, the resulting 128-bit vectors are used to implement two
9406/// horizontal binary operations.
9407///
9408/// The kind of horizontal binary operation is defined by \p X86Opcode.
9409///
9410/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
9411/// the two new horizontal binop.
9412/// When Mode is set, the first horizontal binop dag node would take as input
9413/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
9414/// horizontal binop dag node would take as input the lower 128-bit of V1
9415/// and the upper 128-bit of V1.
9416/// Example:
9417/// HADD V0_LO, V0_HI
9418/// HADD V1_LO, V1_HI
9419///
9420/// Otherwise, the first horizontal binop dag node takes as input the lower
9421/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
9422/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
9423/// Example:
9424/// HADD V0_LO, V1_LO
9425/// HADD V0_HI, V1_HI
9426///
9427/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9428/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9429/// the upper 128-bits of the result.
9430static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9431 const SDLoc &DL, SelectionDAG &DAG,
9432 unsigned X86Opcode, bool Mode,
9433 bool isUndefLO, bool isUndefHI) {
9434 MVT VT = V0.getSimpleValueType();
9435 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9436 "Invalid nodes in input!");
9437
9438 unsigned NumElts = VT.getVectorNumElements();
9439 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9440 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9441 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9442 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9443 MVT NewVT = V0_LO.getSimpleValueType();
9444
9445 SDValue LO = DAG.getUNDEF(NewVT);
9446 SDValue HI = DAG.getUNDEF(NewVT);
9447
9448 if (Mode) {
9449 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9450 if (!isUndefLO && !V0->isUndef())
9451 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9452 if (!isUndefHI && !V1->isUndef())
9453 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9454 } else {
9455 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9456 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9457 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9458
9459 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9460 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9461 }
9462
9463 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9464}
9465
9466/// Returns true iff \p BV builds a vector with the result equivalent to
9467/// the result of ADDSUB/SUBADD operation.
9468/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9469/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9470/// \p Opnd0 and \p Opnd1.
9471static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9472 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9473 SDValue &Opnd0, SDValue &Opnd1,
9474 unsigned &NumExtracts,
9475 bool &IsSubAdd) {
9476
9477 MVT VT = BV->getSimpleValueType(0);
9478 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9479 return false;
9480
9481 unsigned NumElts = VT.getVectorNumElements();
9482 SDValue InVec0 = DAG.getUNDEF(VT);
9483 SDValue InVec1 = DAG.getUNDEF(VT);
9484
9485 NumExtracts = 0;
9486
9487 // Odd-numbered elements in the input build vector are obtained from
9488 // adding/subtracting two integer/float elements.
9489 // Even-numbered elements in the input build vector are obtained from
9490 // subtracting/adding two integer/float elements.
9491 unsigned Opc[2] = {0, 0};
9492 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9493 SDValue Op = BV->getOperand(i);
9494
9495 // Skip 'undef' values.
9496 unsigned Opcode = Op.getOpcode();
9497 if (Opcode == ISD::UNDEF)
9498 continue;
9499
9500 // Early exit if we found an unexpected opcode.
9501 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9502 return false;
9503
9504 SDValue Op0 = Op.getOperand(0);
9505 SDValue Op1 = Op.getOperand(1);
9506
9507 // Try to match the following pattern:
9508 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9509 // Early exit if we cannot match that sequence.
9510 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9511 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9512 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9513 Op0.getOperand(1) != Op1.getOperand(1))
9514 return false;
9515
9516 unsigned I0 = Op0.getConstantOperandVal(1);
9517 if (I0 != i)
9518 return false;
9519
9520 // We found a valid add/sub node; make sure it's the same opcode as previous
9521 // elements for this parity.
9522 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9523 return false;
9524 Opc[i % 2] = Opcode;
9525
9526 // Update InVec0 and InVec1.
9527 if (InVec0.isUndef()) {
9528 InVec0 = Op0.getOperand(0);
9529 if (InVec0.getSimpleValueType() != VT)
9530 return false;
9531 }
9532 if (InVec1.isUndef()) {
9533 InVec1 = Op1.getOperand(0);
9534 if (InVec1.getSimpleValueType() != VT)
9535 return false;
9536 }
9537
9539 // Make sure that the operands of each add/sub node always
9540 // come from the same pair of vectors.
9540 if (InVec0 != Op0.getOperand(0)) {
9541 if (Opcode == ISD::FSUB)
9542 return false;
9543
9544 // FADD is commutable. Try to commute the operands
9545 // and then test again.
9546 std::swap(Op0, Op1);
9547 if (InVec0 != Op0.getOperand(0))
9548 return false;
9549 }
9550
9551 if (InVec1 != Op1.getOperand(0))
9552 return false;
9553
9554 // Increment the number of extractions done.
9555 ++NumExtracts;
9556 }
9557
9558 // Ensure we have found an opcode for both parities and that they are
9559 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9560 // inputs are undef.
9561 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9562 InVec0.isUndef() || InVec1.isUndef())
9563 return false;
9564
9565 IsSubAdd = Opc[0] == ISD::FADD;
9566
9567 Opnd0 = InVec0;
9568 Opnd1 = InVec1;
9569 return true;
9570}
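// A minimal standalone sketch (not part of the original source) of the
// per-parity opcode bookkeeping performed above, assuming the element opcodes
// have been summarized as plain unsigned values with 0 standing for an undef
// element; the helper and parameter names below are illustrative only.
static bool sketchMatchAddSubParity(const unsigned *Opcodes, unsigned N,
                                    unsigned AddOpc, bool &IsSubAdd) {
  unsigned Opc[2] = {0, 0}; // opcode seen at even/odd element positions
  for (unsigned i = 0; i != N; ++i) {
    if (Opcodes[i] == 0)
      continue; // undef element, imposes no constraint
    if (Opc[i % 2] != 0 && Opc[i % 2] != Opcodes[i])
      return false; // conflicting opcode within the same parity
    Opc[i % 2] = Opcodes[i];
  }
  // ADDSUB/SUBADD needs both parities populated with two different opcodes.
  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1])
    return false;
  IsSubAdd = (Opc[0] == AddOpc); // adds in the even lanes mean SUBADD
  return true;
}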
9571
9572/// Returns true if it is possible to fold MUL and an idiom that has already been
9573/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9574/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9575/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
9576///
9577/// Prior to calling this function it should be known that there is some
9578/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9579/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9580/// before replacement of such SDNode with ADDSUB operation. Thus the number
9581/// of \p Opnd0 uses is expected to be equal to 2.
9582/// For example, this function may be called for the following IR:
9583/// %AB = fmul fast <2 x double> %A, %B
9584/// %Sub = fsub fast <2 x double> %AB, %C
9585/// %Add = fadd fast <2 x double> %AB, %C
9586/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9587/// <2 x i32> <i32 0, i32 3>
9588/// There is a def for %Addsub here, which potentially can be replaced by
9589/// X86ISD::ADDSUB operation:
9590/// %Addsub = X86ISD::ADDSUB %AB, %C
9591/// and such ADDSUB can further be replaced with FMADDSUB:
9592/// %Addsub = FMADDSUB %A, %B, %C.
9593///
9594/// The main reason why this method is called before the replacement of the
9595/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9596/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9597/// FMADDSUB is.
9598static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9599 SelectionDAG &DAG,
9600 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9601 unsigned ExpectedUses) {
9602 if (Opnd0.getOpcode() != ISD::FMUL ||
9603 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9604 return false;
9605
9606 // FIXME: These checks must match the similar ones in
9607 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9608 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9609 // or MUL + ADDSUB to FMADDSUB.
9610 const TargetOptions &Options = DAG.getTarget().Options;
9611 bool AllowFusion =
9612 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9613 if (!AllowFusion)
9614 return false;
9615
9616 Opnd2 = Opnd1;
9617 Opnd1 = Opnd0.getOperand(1);
9618 Opnd0 = Opnd0.getOperand(0);
9619
9620 return true;
9621}
9622
9623/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
9624/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
9625/// X86ISD::FMSUBADD node.
9626static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9627 const X86Subtarget &Subtarget,
9628 SelectionDAG &DAG) {
9629 SDValue Opnd0, Opnd1;
9630 unsigned NumExtracts;
9631 bool IsSubAdd;
9632 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9633 IsSubAdd))
9634 return SDValue();
9635
9636 MVT VT = BV->getSimpleValueType(0);
9637 SDLoc DL(BV);
9638
9639 // Try to generate X86ISD::FMADDSUB node here.
9640 SDValue Opnd2;
9641 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9642 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9643 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9644 }
9645
9646 // We only support ADDSUB.
9647 if (IsSubAdd)
9648 return SDValue();
9649
9650 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9651 // the ADDSUB idiom has been successfully recognized. There are no known
9652 // X86 targets with 512-bit ADDSUB instructions!
9653 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9654 // recognition.
9655 if (VT.is512BitVector())
9656 return SDValue();
9657
9658 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9659}
9660
9661static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9662 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9663 // Initialize outputs to known values.
9664 MVT VT = BV->getSimpleValueType(0);
9665 HOpcode = ISD::DELETED_NODE;
9666 V0 = DAG.getUNDEF(VT);
9667 V1 = DAG.getUNDEF(VT);
9668
9669 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9670 // half of the result is calculated independently from the 128-bit halves of
9671 // the inputs, so that makes the index-checking logic below more complicated.
9672 unsigned NumElts = VT.getVectorNumElements();
9673 unsigned GenericOpcode = ISD::DELETED_NODE;
9674 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9675 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9676 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9677 for (unsigned i = 0; i != Num128BitChunks; ++i) {
9678 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9679 // Ignore undef elements.
9680 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9681 if (Op.isUndef())
9682 continue;
9683
9684 // If there's an opcode mismatch, we're done.
9685 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9686 return false;
9687
9688 // Initialize horizontal opcode.
9689 if (HOpcode == ISD::DELETED_NODE) {
9690 GenericOpcode = Op.getOpcode();
9691 switch (GenericOpcode) {
9692 case ISD::ADD: HOpcode = X86ISD::HADD; break;
9693 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9694 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9695 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9696 default: return false;
9697 }
9698 }
9699
9700 SDValue Op0 = Op.getOperand(0);
9701 SDValue Op1 = Op.getOperand(1);
9702 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9703 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9704 Op0.getOperand(0) != Op1.getOperand(0) ||
9705 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9706 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9707 return false;
9708
9709 // The source vector is chosen based on which 64-bit half of the
9710 // destination vector is being calculated.
9711 if (j < NumEltsIn64Bits) {
9712 if (V0.isUndef())
9713 V0 = Op0.getOperand(0);
9714 } else {
9715 if (V1.isUndef())
9716 V1 = Op0.getOperand(0);
9717 }
9718
9719 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9720 if (SourceVec != Op0.getOperand(0))
9721 return false;
9722
9723 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9724 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9725 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9726 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9727 (j % NumEltsIn64Bits) * 2;
9728 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9729 continue;
9730
9731 // If this is not a commutative op, this does not match.
9732 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9733 return false;
9734
9735 // Addition is commutative, so try swapping the extract indexes.
9736 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9737 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9738 continue;
9739
9740 // Extract indexes do not match horizontal requirement.
9741 return false;
9742 }
9743 }
9744 // We matched. Opcode and operands are returned by reference as arguments.
9745 return true;
9746}
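// A small self-contained illustration (not part of the original source) of the
// index pattern the loop above enforces, specialized to a v8f32 build_vector
// matched as a 256-bit FHADD: result element (chunk i, slot j) must consume
// source elements ExpectedIndex and ExpectedIndex + 1, taken from V0 when j
// addresses the low 64 bits of the chunk and from V1 otherwise.
#include <cstdio>
static void sketchPrintV8F32HAddPattern() {
  const unsigned NumEltsIn128Bits = 4, NumEltsIn64Bits = 2;
  for (unsigned i = 0; i != 2; ++i) {   // two independent 128-bit result chunks
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      const char *Src = (j < NumEltsIn64Bits) ? "V0" : "V1";
      // Prints lines such as "result[2] = V1[0] + V1[1]".
      std::printf("result[%u] = %s[%u] + %s[%u]\n", i * NumEltsIn128Bits + j,
                  Src, ExpectedIndex, Src, ExpectedIndex + 1);
    }
  }
}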
9747
9748static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9749 SelectionDAG &DAG, unsigned HOpcode,
9750 SDValue V0, SDValue V1) {
9751 // If either input vector is not the same size as the build vector,
9752 // extract/insert the low bits to the correct size.
9753 // This is free (examples: zmm --> xmm, xmm --> ymm).
9754 MVT VT = BV->getSimpleValueType(0);
9755 unsigned Width = VT.getSizeInBits();
9756 if (V0.getValueSizeInBits() > Width)
9757 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9758 else if (V0.getValueSizeInBits() < Width)
9759 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9760
9761 if (V1.getValueSizeInBits() > Width)
9762 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9763 else if (V1.getValueSizeInBits() < Width)
9764 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9765
9766 unsigned NumElts = VT.getVectorNumElements();
9767 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9768 for (unsigned i = 0; i != NumElts; ++i)
9769 if (BV->getOperand(i).isUndef())
9770 DemandedElts.clearBit(i);
9771
9772 // If we don't need the upper xmm, then perform as a xmm hop.
9773 unsigned HalfNumElts = NumElts / 2;
9774 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9775 MVT HalfVT = VT.getHalfNumVectorElementsVT();
9776 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9777 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9778 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9779 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9780 }
9781
9782 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9783}
9784
9785/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9786static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9787 const X86Subtarget &Subtarget,
9788 SelectionDAG &DAG) {
9789 // We need at least 2 non-undef elements to make this worthwhile by default.
9790 unsigned NumNonUndefs =
9791 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9792 if (NumNonUndefs < 2)
9793 return SDValue();
9794
9795 // There are 4 sets of horizontal math operations distinguished by type:
9796 // int/FP at 128-bit/256-bit. Each type was introduced with a different
9797 // subtarget feature. Try to match those "native" patterns first.
9798 MVT VT = BV->getSimpleValueType(0);
9799 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9800 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9801 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9802 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9803 unsigned HOpcode;
9804 SDValue V0, V1;
9805 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9806 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9807 }
9808
9809 // Try harder to match 256-bit ops by using extract/concat.
9810 if (!Subtarget.hasAVX() || !VT.is256BitVector())
9811 return SDValue();
9812
9813 // Count the number of UNDEF operands in the input build_vector.
9814 unsigned NumElts = VT.getVectorNumElements();
9815 unsigned Half = NumElts / 2;
9816 unsigned NumUndefsLO = 0;
9817 unsigned NumUndefsHI = 0;
9818 for (unsigned i = 0, e = Half; i != e; ++i)
9819 if (BV->getOperand(i)->isUndef())
9820 NumUndefsLO++;
9821
9822 for (unsigned i = Half, e = NumElts; i != e; ++i)
9823 if (BV->getOperand(i)->isUndef())
9824 NumUndefsHI++;
9825
9826 SDLoc DL(BV);
9827 SDValue InVec0, InVec1;
9828 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9829 SDValue InVec2, InVec3;
9830 unsigned X86Opcode;
9831 bool CanFold = true;
9832
9833 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9834 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9835 InVec3) &&
9836 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9837 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9838 X86Opcode = X86ISD::HADD;
9839 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9840 InVec1) &&
9841 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9842 InVec3) &&
9843 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9844 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9845 X86Opcode = X86ISD::HSUB;
9846 else
9847 CanFold = false;
9848
9849 if (CanFold) {
9850 // Do not try to expand this build_vector into a pair of horizontal
9851 // add/sub if we can emit a pair of scalar add/sub.
9852 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9853 return SDValue();
9854
9855 // Convert this build_vector into a pair of horizontal binops followed by
9856 // a concat vector. We must adjust the outputs from the partial horizontal
9857 // matching calls above to account for undefined vector halves.
9858 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9859 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9860 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?")((void)0);
9861 bool isUndefLO = NumUndefsLO == Half;
9862 bool isUndefHI = NumUndefsHI == Half;
9863 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9864 isUndefHI);
9865 }
9866 }
9867
9868 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9869 VT == MVT::v16i16) {
9870 unsigned X86Opcode;
9871 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9872 X86Opcode = X86ISD::HADD;
9873 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9874 InVec1))
9875 X86Opcode = X86ISD::HSUB;
9876 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9877 InVec1))
9878 X86Opcode = X86ISD::FHADD;
9879 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9880 InVec1))
9881 X86Opcode = X86ISD::FHSUB;
9882 else
9883 return SDValue();
9884
9885 // Don't try to expand this build_vector into a pair of horizontal add/sub
9886 // if we can simply emit a pair of scalar add/sub.
9887 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9888 return SDValue();
9889
9890 // Convert this build_vector into two horizontal add/sub followed by
9891 // a concat vector.
9892 bool isUndefLO = NumUndefsLO == Half;
9893 bool isUndefHI = NumUndefsHI == Half;
9894 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9895 isUndefLO, isUndefHI);
9896 }
9897
9898 return SDValue();
9899}
9900
9901static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9902 SelectionDAG &DAG);
9903
9904/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9905/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9906/// just apply the bit operation to the vectors.
9907/// NOTE: It's not in our interest to start making a general-purpose vectorizer
9908/// from this, but enough scalar bit operations are created by the later
9909/// legalization + scalarization stages to need basic support.
9910static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9911 const X86Subtarget &Subtarget,
9912 SelectionDAG &DAG) {
9913 SDLoc DL(Op);
9914 MVT VT = Op->getSimpleValueType(0);
9915 unsigned NumElems = VT.getVectorNumElements();
9916 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9917
9918 // Check that all elements have the same opcode.
9919 // TODO: Should we allow UNDEFS and if so how many?
9920 unsigned Opcode = Op->getOperand(0).getOpcode();
9921 for (unsigned i = 1; i < NumElems; ++i)
9922 if (Opcode != Op->getOperand(i).getOpcode())
9923 return SDValue();
9924
9925 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9926 bool IsShift = false;
9927 switch (Opcode) {
9928 default:
9929 return SDValue();
9930 case ISD::SHL:
9931 case ISD::SRL:
9932 case ISD::SRA:
9933 IsShift = true;
9934 break;
9935 case ISD::AND:
9936 case ISD::XOR:
9937 case ISD::OR:
9938 // Don't do this if the buildvector is a splat - we'd replace one
9939 // constant with an entire vector.
9940 if (Op->getSplatValue())
9941 return SDValue();
9942 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9943 return SDValue();
9944 break;
9945 }
9946
9947 SmallVector<SDValue, 4> LHSElts, RHSElts;
9948 for (SDValue Elt : Op->ops()) {
9949 SDValue LHS = Elt.getOperand(0);
9950 SDValue RHS = Elt.getOperand(1);
9951
9952 // We expect the canonicalized RHS operand to be the constant.
9953 if (!isa<ConstantSDNode>(RHS))
9954 return SDValue();
9955
9956 // Extend shift amounts.
9957 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9958 if (!IsShift)
9959 return SDValue();
9960 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9961 }
9962
9963 LHSElts.push_back(LHS);
9964 RHSElts.push_back(RHS);
9965 }
9966
9967 // Limit to shifts by uniform immediates.
9968 // TODO: Only accept vXi8/vXi64 special cases?
9969 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9970 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9971 return SDValue();
9972
9973 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9974 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9975 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9976
9977 if (!IsShift)
9978 return Res;
9979
9980 // Immediately lower the shift to ensure the constant build vector doesn't
9981 // get converted to a constant pool before the shift is lowered.
9982 return LowerShift(Res, Subtarget, DAG);
9983}
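// A minimal standalone sketch (not part of the original source) of the
// matching rules above, expressed over a pre-digested per-element view of the
// build_vector (opcode plus constant-RHS information); the struct, function,
// and parameter names are illustrative only.
struct SketchBitOpElt {
  unsigned Opcode;   // scalar opcode of this element
  bool RHSIsConst;   // true if the element's RHS operand is a constant
  unsigned RHSVal;   // the constant value (only meaningful for shifts here)
};
static bool sketchCanLowerToBitOp(const SketchBitOpElt *Elts, unsigned N,
                                  bool OpcodeIsShift) {
  for (unsigned i = 1; i < N; ++i)
    if (Elts[i].Opcode != Elts[0].Opcode)
      return false; // every element must use the same scalar opcode
  for (unsigned i = 0; i < N; ++i) {
    if (!Elts[i].RHSIsConst)
      return false; // the canonicalized constant must sit on the RHS
    if (OpcodeIsShift && Elts[i].RHSVal != Elts[0].RHSVal)
      return false; // shifts are only folded for a uniform immediate amount
  }
  return true;
}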
9984
9985/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9986/// functionality to do this, so it's all zeros, all ones, or some derivation
9987/// that is cheap to calculate.
9988static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9989 const X86Subtarget &Subtarget) {
9990 SDLoc DL(Op);
9991 MVT VT = Op.getSimpleValueType();
9992
9993 // Vectors containing all zeros can be matched by pxor and xorps.
9994 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9995 return Op;
9996
9997 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9998 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9999 // vpcmpeqd on 256-bit vectors.
10000 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10001 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10002 return Op;
10003
10004 return getOnesVector(VT, DAG, DL);
10005 }
10006
10007 return SDValue();
10008}
10009
10010/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10011/// from a vector of source values and a vector of extraction indices.
10012/// The vectors might be manipulated to match the type of the permute op.
10013static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10014 SDLoc &DL, SelectionDAG &DAG,
10015 const X86Subtarget &Subtarget) {
10016 MVT ShuffleVT = VT;
10017 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10018 unsigned NumElts = VT.getVectorNumElements();
10019 unsigned SizeInBits = VT.getSizeInBits();
10020
10021 // Adjust IndicesVec to match VT size.
10022 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&((void)0)
10023 "Illegal variable permute mask size")((void)0);
10024 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10025 // Narrow/widen the indices vector to the correct size.
10026 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10027 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10028 NumElts * VT.getScalarSizeInBits());
10029 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10030 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10031 SDLoc(IndicesVec), SizeInBits);
10032 // Zero-extend the index elements within the vector.
10033 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10034 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10035 IndicesVT, IndicesVec);
10036 }
10037 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10038
10039 // Handle a SrcVec whose size doesn't match VT.
10040 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10041 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10042 // Handle larger SrcVec by treating it as a larger permute.
10043 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10044 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10045 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10046 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10047 Subtarget, DAG, SDLoc(IndicesVec));
10048 SDValue NewSrcVec =
10049 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10050 if (NewSrcVec)
10051 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10052 return SDValue();
10053 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10054 // Widen smaller SrcVec to match VT.
10055 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10056 } else
10057 return SDValue();
10058 }
10059
10060 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10061 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale")((void)0);
10062 EVT SrcVT = Idx.getValueType();
10063 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10064 uint64_t IndexScale = 0;
10065 uint64_t IndexOffset = 0;
10066
10067 // If we're scaling a smaller permute op, then we need to repeat the
10068 // indices, scaling and offsetting them as well.
10069 // e.g. v4i32 -> v16i8 (Scale = 4)
10070 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10071 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10072 for (uint64_t i = 0; i != Scale; ++i) {
10073 IndexScale |= Scale << (i * NumDstBits);
10074 IndexOffset |= i << (i * NumDstBits);
10075 }
10076
10077 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10078 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10079 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10080 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10081 return Idx;
10082 };
10083
10084 unsigned Opcode = 0;
10085 switch (VT.SimpleTy) {
10086 default:
10087 break;
10088 case MVT::v16i8:
10089 if (Subtarget.hasSSSE3())
10090 Opcode = X86ISD::PSHUFB;
10091 break;
10092 case MVT::v8i16:
10093 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10094 Opcode = X86ISD::VPERMV;
10095 else if (Subtarget.hasSSSE3()) {
10096 Opcode = X86ISD::PSHUFB;
10097 ShuffleVT = MVT::v16i8;
10098 }
10099 break;
10100 case MVT::v4f32:
10101 case MVT::v4i32:
10102 if (Subtarget.hasAVX()) {
10103 Opcode = X86ISD::VPERMILPV;
10104 ShuffleVT = MVT::v4f32;
10105 } else if (Subtarget.hasSSSE3()) {
10106 Opcode = X86ISD::PSHUFB;
10107 ShuffleVT = MVT::v16i8;
10108 }
10109 break;
10110 case MVT::v2f64:
10111 case MVT::v2i64:
10112 if (Subtarget.hasAVX()) {
10113 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10114 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10115 Opcode = X86ISD::VPERMILPV;
10116 ShuffleVT = MVT::v2f64;
10117 } else if (Subtarget.hasSSE41()) {
10118 // SSE41 can compare v2i64 - select between indices 0 and 1.
10119 return DAG.getSelectCC(
10120 DL, IndicesVec,
10121 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10122 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10123 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10124 ISD::CondCode::SETEQ);
10125 }
10126 break;
10127 case MVT::v32i8:
10128 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10129 Opcode = X86ISD::VPERMV;
10130 else if (Subtarget.hasXOP()) {
10131 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10132 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10133 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10134 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10135 return DAG.getNode(
10136 ISD::CONCAT_VECTORS, DL, VT,
10137 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10138 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10139 } else if (Subtarget.hasAVX()) {
10140 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10141 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10142 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10143 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10144 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10145 ArrayRef<SDValue> Ops) {
10146 // Permute Lo and Hi and then select based on index range.
10147 // This works because PSHUFB uses bits[3:0] to permute elements and we don't
10148 // care about bit[7], since it's just an index vector.
10149 SDValue Idx = Ops[2];
10150 EVT VT = Idx.getValueType();
10151 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10152 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10153 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10154 ISD::CondCode::SETGT);
10155 };
10156 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10157 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10158 PSHUFBBuilder);
10159 }
10160 break;
10161 case MVT::v16i16:
10162 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10163 Opcode = X86ISD::VPERMV;
10164 else if (Subtarget.hasAVX()) {
10165 // Scale to v32i8 and perform as v32i8.
10166 IndicesVec = ScaleIndices(IndicesVec, 2);
10167 return DAG.getBitcast(
10168 VT, createVariablePermute(
10169 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10170 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10171 }
10172 break;
10173 case MVT::v8f32:
10174 case MVT::v8i32:
10175 if (Subtarget.hasAVX2())
10176 Opcode = X86ISD::VPERMV;
10177 else if (Subtarget.hasAVX()) {
10178 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10179 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10180 {0, 1, 2, 3, 0, 1, 2, 3});
10181 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10182 {4, 5, 6, 7, 4, 5, 6, 7});
10183 if (Subtarget.hasXOP())
10184 return DAG.getBitcast(
10185 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10186 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10187 // Permute Lo and Hi and then select based on index range.
10188 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10189 SDValue Res = DAG.getSelectCC(
10190 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10191 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10192 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10193 ISD::CondCode::SETGT);
10194 return DAG.getBitcast(VT, Res);
10195 }
10196 break;
10197 case MVT::v4i64:
10198 case MVT::v4f64:
10199 if (Subtarget.hasAVX512()) {
10200 if (!Subtarget.hasVLX()) {
10201 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10202 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10203 SDLoc(SrcVec));
10204 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10205 DAG, SDLoc(IndicesVec));
10206 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10207 DAG, Subtarget);
10208 return extract256BitVector(Res, 0, DAG, DL);
10209 }
10210 Opcode = X86ISD::VPERMV;
10211 } else if (Subtarget.hasAVX()) {
10212 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10213 SDValue LoLo =
10214 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10215 SDValue HiHi =
10216 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10217 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10218 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10219 if (Subtarget.hasXOP())
10220 return DAG.getBitcast(
10221 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10222 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10223 // Permute Lo and Hi and then select based on index range.
10224 // This works as VPERMILPD only uses index bit[1] to permute elements.
10225 SDValue Res = DAG.getSelectCC(
10226 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10227 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10228 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10229 ISD::CondCode::SETGT);
10230 return DAG.getBitcast(VT, Res);
10231 }
10232 break;
10233 case MVT::v64i8:
10234 if (Subtarget.hasVBMI())
10235 Opcode = X86ISD::VPERMV;
10236 break;
10237 case MVT::v32i16:
10238 if (Subtarget.hasBWI())
10239 Opcode = X86ISD::VPERMV;
10240 break;
10241 case MVT::v16f32:
10242 case MVT::v16i32:
10243 case MVT::v8f64:
10244 case MVT::v8i64:
10245 if (Subtarget.hasAVX512())
10246 Opcode = X86ISD::VPERMV;
10247 break;
10248 }
10249 if (!Opcode)
10250 return SDValue();
10251
10252 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&((void)0)
10253 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&((void)0)
10254 "Illegal variable permute shuffle type")((void)0);
10255
10256 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10257 if (Scale > 1)
10258 IndicesVec = ScaleIndices(IndicesVec, Scale);
10259
10260 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10261 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10262
10263 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10264 SDValue Res = Opcode == X86ISD::VPERMV
10265 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10266 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10267 return DAG.getBitcast(VT, Res);
10268}
10269
10270// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10271// reasoned to be a permutation of a vector by indices in a non-constant vector.
10272// (build_vector (extract_elt V, (extract_elt I, 0)),
10273// (extract_elt V, (extract_elt I, 1)),
10274// ...
10275// ->
10276// (vpermv I, V)
10277//
10278// TODO: Handle undefs
10279// TODO: Utilize pshufb and zero mask blending to support more efficient
10280// construction of vectors with constant-0 elements.
10281static SDValue
10282LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10283 const X86Subtarget &Subtarget) {
10284 SDValue SrcVec, IndicesVec;
10285 // Check for a match of the permute source vector and permute index elements.
10286 // This is done by checking that the i-th build_vector operand is of the form:
10287 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10288 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10289 SDValue Op = V.getOperand(Idx);
10290 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10291 return SDValue();
10292
10293 // If this is the first extract encountered in V, set the source vector,
10294 // otherwise verify the extract is from the previously defined source
10295 // vector.
10296 if (!SrcVec)
10297 SrcVec = Op.getOperand(0);
10298 else if (SrcVec != Op.getOperand(0))
10299 return SDValue();
10300 SDValue ExtractedIndex = Op->getOperand(1);
10301 // Peek through extends.
10302 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10303 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10304 ExtractedIndex = ExtractedIndex.getOperand(0);
10305 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10306 return SDValue();
10307
10308 // If this is the first extract from the index vector candidate, set the
10309 // indices vector, otherwise verify the extract is from the previously
10310 // defined indices vector.
10311 if (!IndicesVec)
10312 IndicesVec = ExtractedIndex.getOperand(0);
10313 else if (IndicesVec != ExtractedIndex.getOperand(0))
10314 return SDValue();
10315
10316 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10317 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10318 return SDValue();
10319 }
10320
10321 SDLoc DL(V);
10322 MVT VT = V.getSimpleValueType();
10323 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10324}
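// A short self-contained sketch (not part of the original source) of the
// scalar semantics being recognized above: every result element is a source
// element selected by the matching entry of a non-constant index vector,
// which VPERMV/VPERMILPV/PSHUFB then compute in a single instruction. The
// template below is illustrative and assumes the indices are already in range
// modulo the vector length.
#include <array>
#include <cstddef>
#include <cstdint>
template <typename T, std::size_t N>
static std::array<T, N>
sketchVariablePermute(const std::array<T, N> &Src,
                      const std::array<std::uint32_t, N> &Indices) {
  std::array<T, N> Res{};
  for (std::size_t i = 0; i != N; ++i)
    Res[i] = Src[Indices[i] % N]; // Res[i] = Src[Indices[i]]
  return Res;
}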
10325
10326SDValue
10327X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10328 SDLoc dl(Op);
10329
10330 MVT VT = Op.getSimpleValueType();
10331 MVT EltVT = VT.getVectorElementType();
10332 unsigned NumElems = Op.getNumOperands();
10333
10334 // Generate vectors for predicate vectors.
10335 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10336 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10337
10338 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10339 return VectorConstant;
10340
10341 unsigned EVTBits = EltVT.getSizeInBits();
10342 APInt UndefMask = APInt::getNullValue(NumElems);
10343 APInt ZeroMask = APInt::getNullValue(NumElems);
10344 APInt NonZeroMask = APInt::getNullValue(NumElems);
10345 bool IsAllConstants = true;
10346 SmallSet<SDValue, 8> Values;
10347 unsigned NumConstants = NumElems;
10348 for (unsigned i = 0; i < NumElems; ++i) {
10349 SDValue Elt = Op.getOperand(i);
10350 if (Elt.isUndef()) {
10351 UndefMask.setBit(i);
10352 continue;
10353 }
10354 Values.insert(Elt);
10355 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10356 IsAllConstants = false;
10357 NumConstants--;
10358 }
10359 if (X86::isZeroNode(Elt)) {
10360 ZeroMask.setBit(i);
10361 } else {
10362 NonZeroMask.setBit(i);
10363 }
10364 }
10365
10366 // All undef vector. Return an UNDEF. All zero vectors were handled above.
10367 if (NonZeroMask == 0) {
10368 assert(UndefMask.isAllOnesValue() && "Fully undef mask expected")((void)0);
10369 return DAG.getUNDEF(VT);
10370 }
10371
10372 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10373
10374 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10375 // lowering to a smaller build vector and padding with undef/zero.
10376 if ((VT.is256BitVector() || VT.is512BitVector()) &&
10377 !isFoldableUseOfShuffle(BV)) {
10378 unsigned UpperElems = NumElems / 2;
10379 APInt UndefOrZeroMask = UndefMask | ZeroMask;
10380 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10381 if (NumUpperUndefsOrZeros >= UpperElems) {
10382 if (VT.is512BitVector() &&
10383 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10384 UpperElems = NumElems - (NumElems / 4);
10385 bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10386 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10387 SDValue NewBV =
10388 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10389 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10390 }
10391 }
10392
10393 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10394 return AddSub;
10395 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10396 return HorizontalOp;
10397 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10398 return Broadcast;
10399 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10400 return BitOp;
10401
10402 unsigned NumZero = ZeroMask.countPopulation();
10403 unsigned NumNonZero = NonZeroMask.countPopulation();
10404
10405 // If we are inserting one variable into a vector of non-zero constants, try
10406 // to avoid loading each constant element as a scalar. Load the constants as a
10407 // vector and then insert the variable scalar element. If insertion is not
10408 // supported, fall back to a shuffle to get the scalar blended with the
10409 // constants. Insertion into a zero vector is handled as a special-case
10410 // somewhere below here.
10411 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10412 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10413 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10414 // Create an all-constant vector. The variable element in the old
10415 // build vector is replaced by undef in the constant vector. Save the
10416 // variable scalar element and its index for use in the insertelement.
10417 LLVMContext &Context = *DAG.getContext();
10418 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10419 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10420 SDValue VarElt;
10421 SDValue InsIndex;
10422 for (unsigned i = 0; i != NumElems; ++i) {
10423 SDValue Elt = Op.getOperand(i);
10424 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10425 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10426 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10427 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10428 else if (!Elt.isUndef()) {
10429 assert(!VarElt.getNode() && !InsIndex.getNode() &&((void)0)
10430 "Expected one variable element in this vector")((void)0);
10431 VarElt = Elt;
10432 InsIndex = DAG.getVectorIdxConstant(i, dl);
10433 }
10434 }
10435 Constant *CV = ConstantVector::get(ConstVecOps);
10436 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10437
10438 // The constants we just created may not be legal (eg, floating point). We
10439 // must lower the vector right here because we can not guarantee that we'll
10440 // legalize it before loading it. This is also why we could not just create
10441 // a new build vector here. If the build vector contains illegal constants,
10442 // it could get split back up into a series of insert elements.
10443 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10444 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10445 MachineFunction &MF = DAG.getMachineFunction();
10446 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10447 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10448 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10449 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10450 if (InsertC < NumEltsInLow128Bits)
10451 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10452
10453 // There's no good way to insert into the high elements of a >128-bit
10454 // vector, so use shuffles to avoid an extract/insert sequence.
10455 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?")((void)0);
10456 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector")((void)0);
10457 SmallVector<int, 8> ShuffleMask;
10458 unsigned NumElts = VT.getVectorNumElements();
10459 for (unsigned i = 0; i != NumElts; ++i)
10460 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10461 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10462 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10463 }
10464
10465 // Special case for single non-zero, non-undef, element.
10466 if (NumNonZero == 1) {
10467 unsigned Idx = NonZeroMask.countTrailingZeros();
10468 SDValue Item = Op.getOperand(Idx);
10469
10470 // If we have a constant or non-constant insertion into the low element of
10471 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10472 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10473 // depending on what the source datatype is.
10474 if (Idx == 0) {
10475 if (NumZero == 0)
10476 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10477
10478 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
10479 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
10480 assert((VT.is128BitVector() || VT.is256BitVector() ||((void)0)
10481 VT.is512BitVector()) &&((void)0)
10482 "Expected an SSE value type!")((void)0);
10483 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10484 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
10485 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10486 }
10487
10488 // We can't directly insert an i8 or i16 into a vector, so zero extend
10489 // it to i32 first.
10490 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10491 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10492 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10493 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10494 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10495 return DAG.getBitcast(VT, Item);
10496 }
10497 }
10498
10499 // Is it a vector logical left shift?
10500 if (NumElems == 2 && Idx == 1 &&
10501 X86::isZeroNode(Op.getOperand(0)) &&
10502 !X86::isZeroNode(Op.getOperand(1))) {
10503 unsigned NumBits = VT.getSizeInBits();
10504 return getVShift(true, VT,
10505 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10506 VT, Op.getOperand(1)),
10507 NumBits/2, DAG, *this, dl);
10508 }
10509
10510 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10511 return SDValue();
10512
10513 // Otherwise, if this is a vector with i32 or f32 elements, and the element
10514 // is a non-constant being inserted into an element other than the low one,
10515 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
10516 // movd/movss) to move this into the low element, then shuffle it into
10517 // place.
10518 if (EVTBits == 32) {
10519 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10520 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10521 }
10522 }
10523
10524 // Splat is obviously ok. Let legalizer expand it to a shuffle.
10525 if (Values.size() == 1) {
10526 if (EVTBits == 32) {
10527 // Instead of a shuffle like this:
10528 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10529 // Check if it's possible to issue this instead.
10530 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
10531 unsigned Idx = NonZeroMask.countTrailingZeros();
10532 SDValue Item = Op.getOperand(Idx);
10533 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10534 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10535 }
10536 return SDValue();
10537 }
10538
10539 // A vector full of immediates; various special cases are already
10540 // handled, so this is best done with a single constant-pool load.
10541 if (IsAllConstants)
10542 return SDValue();
10543
10544 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10545 return V;
10546
10547 // See if we can use a vector load to get all of the elements.
10548 {
10549 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10550 if (SDValue LD =
10551 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10552 return LD;
10553 }
10554
10555 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10556 // build_vector and broadcast it.
10557 // TODO: We could probably generalize this more.
10558 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10559 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10560 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10561 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10562 // Make sure all the even/odd operands match.
10563 for (unsigned i = 2; i != NumElems; ++i)
10564 if (Ops[i % 2] != Op.getOperand(i))
10565 return false;
10566 return true;
10567 };
10568 if (CanSplat(Op, NumElems, Ops)) {
10569 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10570 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10571 // Create a new build vector and cast to v2i64/v2f64.
10572 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10573 DAG.getBuildVector(NarrowVT, dl, Ops));
10574 // Broadcast from v2i64/v2f64 and cast to final VT.
10575 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10576 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10577 NewBV));
10578 }
10579 }
10580
10581 // For AVX-length vectors, build the individual 128-bit pieces and use
10582 // shuffles to put them in place.
10583 if (VT.getSizeInBits() > 128) {
10584 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10585
10586 // Build both the lower and upper subvector.
10587 SDValue Lower =
10588 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10589 SDValue Upper = DAG.getBuildVector(
10590 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10591
10592 // Recreate the wider vector with the lower and upper part.
10593 return concatSubVectors(Lower, Upper, DAG, dl);
10594 }
10595
10596 // Let legalizer expand 2-wide build_vectors.
10597 if (EVTBits == 64) {
10598 if (NumNonZero == 1) {
10599 // One half is zero or undef.
10600 unsigned Idx = NonZeroMask.countTrailingZeros();
10601 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10602 Op.getOperand(Idx));
10603 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10604 }
10605 return SDValue();
10606 }
10607
10608 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10609 if (EVTBits == 8 && NumElems == 16)
10610 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
10611 DAG, Subtarget))
10612 return V;
10613
10614 if (EVTBits == 16 && NumElems == 8)
10615 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
10616 DAG, Subtarget))
10617 return V;
10618
10619 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10620 if (EVTBits == 32 && NumElems == 4)
10621 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10622 return V;
10623
10624 // If element VT is == 32 bits, turn it into a number of shuffles.
10625 if (NumElems == 4 && NumZero > 0) {
10626 SmallVector<SDValue, 8> Ops(NumElems);
10627 for (unsigned i = 0; i < 4; ++i) {
10628 bool isZero = !NonZeroMask[i];
10629 if (isZero)
10630 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10631 else
10632 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10633 }
10634
10635 for (unsigned i = 0; i < 2; ++i) {
10636 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10637 default: llvm_unreachable("Unexpected NonZero count")__builtin_unreachable();
10638 case 0:
10639 Ops[i] = Ops[i*2]; // Must be a zero vector.
10640 break;
10641 case 1:
10642 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10643 break;
10644 case 2:
10645 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10646 break;
10647 case 3:
10648 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10649 break;
10650 }
10651 }
10652
10653 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10654 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10655 int MaskVec[] = {
10656 Reverse1 ? 1 : 0,
10657 Reverse1 ? 0 : 1,
10658 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10659 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10660 };
10661 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10662 }
10663
10664 assert(Values.size() > 1 && "Expected non-undef and non-splat vector")((void)0);
10665
10666 // Check for a build vector that is mostly a shuffle plus a few insertions.
10667 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10668 return Sh;
10669
10670 // For SSE 4.1, use insertps to put the high elements into the low element.
10671 if (Subtarget.hasSSE41()) {
10672 SDValue Result;
10673 if (!Op.getOperand(0).isUndef())
10674 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10675 else
10676 Result = DAG.getUNDEF(VT);
10677
10678 for (unsigned i = 1; i < NumElems; ++i) {
10679 if (Op.getOperand(i).isUndef()) continue;
10680 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10681 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10682 }
10683 return Result;
10684 }
10685
10686 // Otherwise, expand into a number of unpckl*; start by extending each of
10687 // our (non-undef) elements to the full vector width with the element in the
10688 // bottom slot of the vector (which generates no code for SSE).
10689 SmallVector<SDValue, 8> Ops(NumElems);
10690 for (unsigned i = 0; i < NumElems; ++i) {
10691 if (!Op.getOperand(i).isUndef())
10692 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10693 else
10694 Ops[i] = DAG.getUNDEF(VT);
10695 }
10696
10697 // Next, we iteratively mix elements, e.g. for v4f32:
10698 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10699 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10700 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10701 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10702 // Generate scaled UNPCKL shuffle mask.
10703 SmallVector<int, 16> Mask;
10704 for(unsigned i = 0; i != Scale; ++i)
10705 Mask.push_back(i);
10706 for (unsigned i = 0; i != Scale; ++i)
10707 Mask.push_back(NumElems+i);
10708 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10709
10710 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10711 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10712 }
10713 return Ops[0];
10714}
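// A compact standalone model (not part of the original source) of the
// log2(NumElems) unpack-merge loop above, written over plain std::vector
// values: after the round for a given Scale, entry i holds the 2 * Scale
// consecutive elements that started out spread across Ops[2*i] and
// Ops[2*i+1], padded with -1 for the undef tail.
#include <vector>
static std::vector<int> sketchUnpackMerge(std::vector<std::vector<int>> Ops) {
  const unsigned NumElems = static_cast<unsigned>(Ops.size());
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) {
      // Model of the scaled UNPCKL mask built above: the low Scale elements
      // of Ops[2*i], then the low Scale elements of Ops[2*i+1], rest undef.
      std::vector<int> Merged(Ops[2 * i].begin(), Ops[2 * i].begin() + Scale);
      Merged.insert(Merged.end(), Ops[2 * i + 1].begin(),
                    Ops[2 * i + 1].begin() + Scale);
      Merged.resize(NumElems, -1);
      Ops[i] = Merged;
    }
  }
  return Ops[0]; // with Ops[i] = {i, -1, ...} on entry, this is {0, 1, ..., N-1}
}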
10715
10716// 256-bit AVX can use the vinsertf128 instruction
10717// to create 256-bit vectors from two other 128-bit ones.
10718// TODO: Detect subvector broadcast here instead of DAG combine?
10719static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10720 const X86Subtarget &Subtarget) {
10721 SDLoc dl(Op);
10722 MVT ResVT = Op.getSimpleValueType();
10723
10724 assert((ResVT.is256BitVector() ||((void)0)
10725 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide")((void)0);
10726
10727 unsigned NumOperands = Op.getNumOperands();
10728 unsigned NumZero = 0;
10729 unsigned NumNonZero = 0;
10730 unsigned NonZeros = 0;
10731 for (unsigned i = 0; i != NumOperands; ++i) {
10732 SDValue SubVec = Op.getOperand(i);
10733 if (SubVec.isUndef())
10734 continue;
10735 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10736 ++NumZero;
10737 else {
10738 assert(i < sizeof(NonZeros) * CHAR_BIT)((void)0); // Ensure the shift is in range.
10739 NonZeros |= 1 << i;
10740 ++NumNonZero;
10741 }
10742 }
10743
10744 // If we have more than 2 non-zeros, build each half separately.
10745 if (NumNonZero > 2) {
10746 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10747 ArrayRef<SDUse> Ops = Op->ops();
10748 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10749 Ops.slice(0, NumOperands/2));
10750 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10751 Ops.slice(NumOperands/2));
10752 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10753 }
10754
10755 // Otherwise, build it up through insert_subvectors.
10756 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10757 : DAG.getUNDEF(ResVT);
10758
10759 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10760 unsigned NumSubElems = SubVT.getVectorNumElements();
10761 for (unsigned i = 0; i != NumOperands; ++i) {
10762 if ((NonZeros & (1 << i)) == 0)
10763 continue;
10764
10765 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10766 Op.getOperand(i),
10767 DAG.getIntPtrConstant(i * NumSubElems, dl));
10768 }
10769
10770 return Vec;
10771}
10772
10773// Returns true if the given node is a type promotion (by concatenating i1
10774// zeros) of the result of a node that already zeros all upper bits of
10775// k-register.
10776// TODO: Merge this with LowerAVXCONCAT_VECTORS?
10777static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10778 const X86Subtarget &Subtarget,
10779 SelectionDAG & DAG) {
10780 SDLoc dl(Op);
10781 MVT ResVT = Op.getSimpleValueType();
10782 unsigned NumOperands = Op.getNumOperands();
10783
10784 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&((void)0)
10785 "Unexpected number of operands in CONCAT_VECTORS")((void)0);
10786
10787 uint64_t Zeros = 0;
10788 uint64_t NonZeros = 0;
10789 for (unsigned i = 0; i != NumOperands; ++i) {
10790 SDValue SubVec = Op.getOperand(i);
10791 if (SubVec.isUndef())
10792 continue;
10793 assert(i < sizeof(NonZeros) * CHAR_BIT)((void)0); // Ensure the shift is in range.
10794 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10795 Zeros |= (uint64_t)1 << i;
10796 else
10797 NonZeros |= (uint64_t)1 << i;
10798 }
10799
10800 unsigned NumElems = ResVT.getVectorNumElements();
10801
10802 // If we are inserting a non-zero vector and there are zeros in the LSBs and
10803 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
10804 // insert_subvector will give us two kshifts.
10805 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10806 Log2_64(NonZeros) != NumOperands - 1) {
10807 MVT ShiftVT = ResVT;
10808 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10809 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10810 unsigned Idx = Log2_64(NonZeros);
10811 SDValue SubVec = Op.getOperand(Idx);
10812 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10813 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10814 DAG.getUNDEF(ShiftVT), SubVec,
10815 DAG.getIntPtrConstant(0, dl));
10816 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10817 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10818 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10819 DAG.getIntPtrConstant(0, dl));
10820 }
10821
10822 // If there are zero or one non-zeros we can handle this very simply.
10823 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10824 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10825 if (!NonZeros)
10826 return Vec;
10827 unsigned Idx = Log2_64(NonZeros);
10828 SDValue SubVec = Op.getOperand(Idx);
10829 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10830 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10831 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10832 }
10833
10834 if (NumOperands > 2) {
10835 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10836 ArrayRef<SDUse> Ops = Op->ops();
10837 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10838 Ops.slice(0, NumOperands/2));
10839 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10840 Ops.slice(NumOperands/2));
10841 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10842 }
10843
10844 assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?")((void)0);
10845
10846 if (ResVT.getVectorNumElements() >= 16)
10847 return Op; // The operation is legal with KUNPCK
10848
10849 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10850 DAG.getUNDEF(ResVT), Op.getOperand(0),
10851 DAG.getIntPtrConstant(0, dl));
10852 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10853 DAG.getIntPtrConstant(NumElems/2, dl));
10854}
10855
10856static SDValue LowerCONCAT_VECTORS(SDValue Op,
10857 const X86Subtarget &Subtarget,
10858 SelectionDAG &DAG) {
10859 MVT VT = Op.getSimpleValueType();
10860 if (VT.getVectorElementType() == MVT::i1)
10861 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10862
10863 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||((void)0)
10864 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||((void)0)
10865 Op.getNumOperands() == 4)))((void)0);
10866
10867 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10868 // from two other 128-bit ones.
10869
10870 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10871 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10872}
10873
10874//===----------------------------------------------------------------------===//
10875// Vector shuffle lowering
10876//
10877// This is an experimental code path for lowering vector shuffles on x86. It is
10878// designed to handle arbitrary vector shuffles and blends, gracefully
10879// degrading performance as necessary. It works hard to recognize idiomatic
10880// shuffles and lower them to optimal instruction patterns without leaving
10881// a framework that allows reasonably efficient handling of all vector shuffle
10882// patterns.
10883//===----------------------------------------------------------------------===//
10884
10885/// Tiny helper function to identify a no-op mask.
10886///
10887/// This is a somewhat boring predicate function. It checks whether the mask
10888/// array input, which is assumed to be a single-input shuffle mask of the kind
10889/// used by the X86 shuffle instructions (not a fully general
10890/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10891/// in-place shuffle are 'no-op's.
10892static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10893 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10894 assert(Mask[i] >= -1 && "Out of bound mask element!")((void)0);
10895 if (Mask[i] >= 0 && Mask[i] != i)
10896 return false;
10897 }
10898 return true;
10899}
10900
10901/// Test whether there are elements crossing LaneSizeInBits lanes in this
10902/// shuffle mask.
10903///
10904/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10905/// and we routinely test for these.
10906static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10907 unsigned ScalarSizeInBits,
10908 ArrayRef<int> Mask) {
10909 assert(LaneSizeInBits && ScalarSizeInBits &&((void)0)
10910 (LaneSizeInBits % ScalarSizeInBits) == 0 &&((void)0)
10911 "Illegal shuffle lane size")((void)0);
10912 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10913 int Size = Mask.size();
10914 for (int i = 0; i < Size; ++i)
10915 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10916 return true;
10917 return false;
10918}
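// A concrete illustration (not part of the original source): for v8f32 with
// 128-bit lanes (LaneSize == 4), the mask {4, 5, 6, 7, 0, 1, 2, 3} crosses
// lanes because every element is pulled from the other 128-bit half, while
// {1, 0, 3, 2, 5, 4, 7, 6} keeps each element inside its own lane and is not
// flagged by the predicate above.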
10919
10920/// Test whether there are elements crossing 128-bit lanes in this
10921/// shuffle mask.
10922static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10923 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10924}
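// Worked example of the lane-crossing test (illustrative sketch only). For
// 32-bit elements a 128-bit lane holds 128/32 = 4 elements, so with Size = 8
// and LaneSize = 4 (e.g. v8f32):
//   {0, 1, 2, 3, 4, 5, 6, 7} -> no crossing (in-lane identity)
//   {4, 5, 6, 7, 0, 1, 2, 3} -> crossing   (swaps the two 128-bit halves)
// Indices into the second source behave the same way because of the "% Size"
// wrap, e.g. element 8 still lands in lane 0.
static bool crossesLanesSketch(const int *Mask, int Size, int LaneSize) {
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}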
10925
10926/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10927/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
10928/// better support 'repeated mask + lane permute' style shuffles.
10929static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10930 unsigned ScalarSizeInBits,
10931 ArrayRef<int> Mask) {
10932 assert(LaneSizeInBits && ScalarSizeInBits &&
10933        (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10934        "Illegal shuffle lane size");
10935 int NumElts = Mask.size();
10936 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10937 int NumLanes = NumElts / NumEltsPerLane;
10938 if (NumLanes > 1) {
10939 for (int i = 0; i != NumLanes; ++i) {
10940 int SrcLane = -1;
10941 for (int j = 0; j != NumEltsPerLane; ++j) {
10942 int M = Mask[(i * NumEltsPerLane) + j];
10943 if (M < 0)
10944 continue;
10945 int Lane = (M % NumElts) / NumEltsPerLane;
10946 if (SrcLane >= 0 && SrcLane != Lane)
10947 return true;
10948 SrcLane = Lane;
10949 }
10950 }
10951 }
10952 return false;
10953}
10954
10955/// Test whether a shuffle mask is equivalent within each sub-lane.
10956///
10957/// This checks a shuffle mask to see if it is performing the same
10958/// lane-relative shuffle in each sub-lane. This trivially implies
10959/// that it is also not lane-crossing. It may however involve a blend from the
10960/// same lane of a second vector.
10961///
10962/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10963/// non-trivial to compute in the face of undef lanes. The representation is
10964/// suitable for use with existing 128-bit shuffles as entries from the second
10965/// vector have been remapped to [LaneSize, 2*LaneSize).
10966static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10967 ArrayRef<int> Mask,
10968 SmallVectorImpl<int> &RepeatedMask) {
10969 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10970 RepeatedMask.assign(LaneSize, -1);
10971 int Size = Mask.size();
10972 for (int i = 0; i < Size; ++i) {
10973 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10974 if (Mask[i] < 0)
10975 continue;
10976 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10977 // This entry crosses lanes, so there is no way to model this shuffle.
10978 return false;
10979
10980 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10981 // Adjust second vector indices to start at LaneSize instead of Size.
10982 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10983 : Mask[i] % LaneSize + LaneSize;
10984 if (RepeatedMask[i % LaneSize] < 0)
10985 // This is the first non-undef entry in this slot of a 128-bit lane.
10986 RepeatedMask[i % LaneSize] = LocalM;
10987 else if (RepeatedMask[i % LaneSize] != LocalM)
10988 // Found a mismatch with the repeated mask.
10989 return false;
10990 }
10991 return true;
10992}
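// A worked example of the repeated-mask extraction above (illustrative sketch
// only; -1 keeps meaning undef). Second-source indices are remapped into
// [LaneSize, 2*LaneSize) exactly as the comment describes.
static bool repeatedMaskSketch(const int *Mask, int Size, int LaneSize,
                               int *Repeated /* LaneSize slots, all -1 */) {
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false;                     // crosses a lane, cannot repeat
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;  // first definition of this slot
    else if (Repeated[i % LaneSize] != LocalM)
      return false;                     // the lanes disagree
  }
  return true;
}
// For v8f32 (Size = 8, LaneSize = 4) and Mask = {0, 9, 2, 11, 4, 13, 6, 15}
// both lanes reduce to {0, 5, 2, 7}: a per-lane blend taking elements 0 and 2
// from the first vector and elements 1 and 3 from the second.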
10993
10994/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10995static bool
10996is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10997 SmallVectorImpl<int> &RepeatedMask) {
10998 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10999}
11000
11001static bool
11002is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11003 SmallVector<int, 32> RepeatedMask;
11004 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11005}
11006
11007/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11008static bool
11009is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11010 SmallVectorImpl<int> &RepeatedMask) {
11011 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11012}
11013
11014/// Test whether a target shuffle mask is equivalent within each sub-lane.
11015/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11016static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11017 unsigned EltSizeInBits,
11018 ArrayRef<int> Mask,
11019 SmallVectorImpl<int> &RepeatedMask) {
11020 int LaneSize = LaneSizeInBits / EltSizeInBits;
11021 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11022 int Size = Mask.size();
11023 for (int i = 0; i < Size; ++i) {
11024 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11025 if (Mask[i] == SM_SentinelUndef)
11026 continue;
11027 if (Mask[i] == SM_SentinelZero) {
11028 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11029 return false;
11030 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11031 continue;
11032 }
11033 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11034 // This entry crosses lanes, so there is no way to model this shuffle.
11035 return false;
11036
11037 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11038 // later vector indices to start at multiples of LaneSize instead of Size.
11039 int LaneM = Mask[i] / Size;
11040 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11041 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11042 // This is the first non-undef entry in this slot of a 128-bit lane.
11043 RepeatedMask[i % LaneSize] = LocalM;
11044 else if (RepeatedMask[i % LaneSize] != LocalM)
11045 // Found a mismatch with the repeated mask.
11046 return false;
11047 }
11048 return true;
11049}
11050
11051/// Test whether a target shuffle mask is equivalent within each sub-lane.
11052/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11053static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11054 ArrayRef<int> Mask,
11055 SmallVectorImpl<int> &RepeatedMask) {
11056 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11057 Mask, RepeatedMask);
11058}
11059
11060/// Checks whether the vector elements referenced by two shuffle masks are
11061/// equivalent.
11062static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11063 int Idx, int ExpectedIdx) {
11064 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11065        ExpectedIdx < MaskSize && "Out of range element index");
11066 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11067 return false;
11068
11069 switch (Op.getOpcode()) {
11070 case ISD::BUILD_VECTOR:
11071 // If the values are build vectors, we can look through them to find
11072 // equivalent inputs that make the shuffles equivalent.
11073 // TODO: Handle MaskSize != Op.getNumOperands()?
11074 if (MaskSize == (int)Op.getNumOperands() &&
11075 MaskSize == (int)ExpectedOp.getNumOperands())
11076 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11077 break;
11078 case X86ISD::VBROADCAST:
11079 case X86ISD::VBROADCAST_LOAD:
11080 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11081 return (Op == ExpectedOp &&
11082 (int)Op.getValueType().getVectorNumElements() == MaskSize);
11083 case X86ISD::HADD:
11084 case X86ISD::HSUB:
11085 case X86ISD::FHADD:
11086 case X86ISD::FHSUB:
11087 case X86ISD::PACKSS:
11088 case X86ISD::PACKUS:
11089 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11090 // TODO: Handle MaskSize != NumElts?
11091 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11092 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11093 MVT VT = Op.getSimpleValueType();
11094 int NumElts = VT.getVectorNumElements();
11095 if (MaskSize == NumElts) {
11096 int NumLanes = VT.getSizeInBits() / 128;
11097 int NumEltsPerLane = NumElts / NumLanes;
11098 int NumHalfEltsPerLane = NumEltsPerLane / 2;
11099 bool SameLane =
11100 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11101 bool SameElt =
11102 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11103 return SameLane && SameElt;
11104 }
11105 }
11106 break;
11107 }
11108
11109 return false;
11110}
11111
11112/// Checks whether a shuffle mask is equivalent to an explicit list of
11113/// arguments.
11114///
11115/// This is a fast way to test a shuffle mask against a fixed pattern:
11116///
11117 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11118 ///
11119 /// It returns true if the mask is exactly as wide as ExpectedMask, and each
11120 /// element of the mask is either -1 (signifying undef) or the value given
11121 /// in ExpectedMask.
11122static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11123 SDValue V1 = SDValue(),
11124 SDValue V2 = SDValue()) {
11125 int Size = Mask.size();
11126 if (Size != (int)ExpectedMask.size())
11127 return false;
11128
11129 for (int i = 0; i < Size; ++i) {
11130 assert(Mask[i] >= -1 && "Out of bound mask element!");
11131 int MaskIdx = Mask[i];
11132 int ExpectedIdx = ExpectedMask[i];
11133 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11134 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11135 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11136 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11137 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11138 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11139 return false;
11140 }
11141 }
11142 return true;
11143}
11144
11145/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11146///
11147/// The masks must be exactly the same width.
11148///
11149/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11150/// value in ExpectedMask is always accepted. Otherwise the indices must match.
11151///
11152/// SM_SentinelZero is accepted as a valid negative index but must match in
11153/// both.
11154static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11155 ArrayRef<int> ExpectedMask,
11156 SDValue V1 = SDValue(),
11157 SDValue V2 = SDValue()) {
11158 int Size = Mask.size();
11159 if (Size != (int)ExpectedMask.size())
11160 return false;
11161 assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11162        "Illegal target shuffle mask");
11163
11164 // Check for out-of-range target shuffle mask indices.
11165 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11166 return false;
11167
11168 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11169 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11170 V1 = SDValue();
11171 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11172 V2 = SDValue();
11173
11174 for (int i = 0; i < Size; ++i) {
11175 int MaskIdx = Mask[i];
11176 int ExpectedIdx = ExpectedMask[i];
11177 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11178 continue;
11179 if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11180 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11181 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11182 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11183 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11184 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11185 continue;
11186 }
11187 // TODO - handle SM_Sentinel equivalences.
11188 return false;
11189 }
11190 return true;
11191}
11192
11193// Attempt to create a shuffle mask from a VSELECT condition mask.
11194static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11195 SDValue Cond) {
11196 EVT CondVT = Cond.getValueType();
11197 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11198 unsigned NumElts = CondVT.getVectorNumElements();
11199
11200 APInt UndefElts;
11201 SmallVector<APInt, 32> EltBits;
11202 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11203 true, false))
11204 return false;
11205
11206 Mask.resize(NumElts, SM_SentinelUndef);
11207
11208 for (int i = 0; i != (int)NumElts; ++i) {
11209 Mask[i] = i;
11210 // Arbitrarily choose from the 2nd operand if the select condition element
11211 // is undef.
11212 // TODO: Can we do better by matching patterns such as even/odd?
11213 if (UndefElts[i] || EltBits[i].isNullValue())
11214 Mask[i] += NumElts;
11215 }
11216
11217 return true;
11218}
11219
11220// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11221// instructions.
11222static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11223 if (VT != MVT::v8i32 && VT != MVT::v8f32)
11224 return false;
11225
11226 SmallVector<int, 8> Unpcklwd;
11227 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11228 /* Unary = */ false);
11229 SmallVector<int, 8> Unpckhwd;
11230 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11231 /* Unary = */ false);
11232 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11233 isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11234 return IsUnpackwdMask;
11235}
11236
11237static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11238 // Create 128-bit vector type based on mask size.
11239 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11240 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11241
11242 // We can't assume a canonical shuffle mask, so try the commuted version too.
11243 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11244 ShuffleVectorSDNode::commuteMask(CommutedMask);
11245
11246 // Match any of unary/binary or low/high.
11247 for (unsigned i = 0; i != 4; ++i) {
11248 SmallVector<int, 16> UnpackMask;
11249 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11250 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11251 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11252 return true;
11253 }
11254 return false;
11255}
11256
11257/// Return true if a shuffle mask chooses elements identically in its top and
11258/// bottom halves. For example, any splat mask has the same top and bottom
11259/// halves. If an element is undefined in only one half of the mask, the halves
11260/// are not considered identical.
11261static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11262 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11263 unsigned HalfSize = Mask.size() / 2;
11264 for (unsigned i = 0; i != HalfSize; ++i) {
11265 if (Mask[i] != Mask[i + HalfSize])
11266 return false;
11267 }
11268 return true;
11269}
11270
11271/// Get a 4-lane 8-bit shuffle immediate for a mask.
11272///
11273/// This helper function produces an 8-bit shuffle immediate corresponding to
11274/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11275/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11276/// example.
11277///
11278/// NB: We rely heavily on "undef" masks preserving the input lane.
11279static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11280 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11281 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11282 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11283 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11284 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11285
11286 // If the mask only uses one non-undef element, then fully 'splat' it to
11287 // improve later broadcast matching.
11288 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11289 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11290
11291 int FirstElt = Mask[FirstIndex];
11292 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11293 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11294
11295 unsigned Imm = 0;
11296 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11297 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11298 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11299 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11300 return Imm;
11301}
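// Worked example of the immediate encoding above (illustrative sketch only;
// it omits the splat fast path of the real routine). Each of the four result
// lanes gets two bits selecting its source lane, lane 0 in the low bits:
static unsigned v4ShuffleImmSketch(const int M[4]) {
  unsigned Imm = 0;
  Imm |= (M[0] < 0 ? 0u : (unsigned)M[0]) << 0; // undef defaults to lane 0
  Imm |= (M[1] < 0 ? 1u : (unsigned)M[1]) << 2; // undef defaults to lane 1
  Imm |= (M[2] < 0 ? 2u : (unsigned)M[2]) << 4;
  Imm |= (M[3] < 0 ? 3u : (unsigned)M[3]) << 6;
  return Imm;
}
// {1, 0, 3, 2} encodes as 0b10110001 = 0xB1, the familiar "swap adjacent
// pairs" PSHUFD/SHUFPS immediate. With the splat rule above, {-1, 2, -1, -1}
// is widened to {2, 2, 2, 2} and encodes as 0xAA, which helps later broadcast
// matching.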
11302
11303static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11304 SelectionDAG &DAG) {
11305 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11306}
11307
11308 // The shuffle result has the form:
11309 // 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements appear in
11310 // ascending order. Each element of Zeroable corresponds to a particular
11311 // element of Mask, as described in the computeZeroableShuffleElements function.
11312 //
11313 // The function looks for a sub-mask whose non-zero elements are in
11314 // increasing order; if such a sub-mask exists, it returns true.
11315static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11316 ArrayRef<int> Mask, const EVT &VectorType,
11317 bool &IsZeroSideLeft) {
11318 int NextElement = -1;
11319 // Check if the Mask's nonzero elements are in increasing order.
11320 for (int i = 0, e = Mask.size(); i < e; i++) {
11321 // Checks if the mask's zeros elements are built from only zeros.
11322 assert(Mask[i] >= -1 && "Out of bound mask element!");
11323 if (Mask[i] < 0)
11324 return false;
11325 if (Zeroable[i])
11326 continue;
11327 // Find the lowest non zero element
11328 if (NextElement < 0) {
11329 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11330 IsZeroSideLeft = NextElement != 0;
11331 }
11332 // Exit if the mask's non zero elements are not in increasing order.
11333 if (NextElement != Mask[i])
11334 return false;
11335 NextElement++;
11336 }
11337 return true;
11338}
11339
11340/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11341static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11342 ArrayRef<int> Mask, SDValue V1,
11343 SDValue V2, const APInt &Zeroable,
11344 const X86Subtarget &Subtarget,
11345 SelectionDAG &DAG) {
11346 int Size = Mask.size();
11347 int LaneSize = 128 / VT.getScalarSizeInBits();
11348 const int NumBytes = VT.getSizeInBits() / 8;
11349 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11350
11351 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11352        (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11353        (Subtarget.hasBWI() && VT.is512BitVector()));
11354
11355 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11356 // Sign bit set in i8 mask means zero element.
11357 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11358
11359 SDValue V;
11360 for (int i = 0; i < NumBytes; ++i) {
11361 int M = Mask[i / NumEltBytes];
11362 if (M < 0) {
11363 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11364 continue;
11365 }
11366 if (Zeroable[i / NumEltBytes]) {
11367 PSHUFBMask[i] = ZeroMask;
11368 continue;
11369 }
11370
11371 // We can only use a single input of V1 or V2.
11372 SDValue SrcV = (M >= Size ? V2 : V1);
11373 if (V && V != SrcV)
11374 return SDValue();
11375 V = SrcV;
11376 M %= Size;
11377
11378 // PSHUFB can't cross lanes, ensure this doesn't happen.
11379 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11380 return SDValue();
11381
11382 M = M % LaneSize;
11383 M = M * NumEltBytes + (i % NumEltBytes);
11384 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11385 }
11386 assert(V && "Failed to find a source input");
11387
11388 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11389 return DAG.getBitcast(
11390 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11391 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11392}
11393
11394static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11395 const X86Subtarget &Subtarget, SelectionDAG &DAG,
11396 const SDLoc &dl);
11397
11398 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
11399static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11400 const APInt &Zeroable,
11401 ArrayRef<int> Mask, SDValue &V1,
11402 SDValue &V2, SelectionDAG &DAG,
11403 const X86Subtarget &Subtarget) {
11404 bool IsLeftZeroSide = true;
11405 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11406 IsLeftZeroSide))
11407 return SDValue();
11408 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11409 MVT IntegerType =
11410 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11411 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11412 unsigned NumElts = VT.getVectorNumElements();
11413 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11414        "Unexpected number of vector elements");
11415 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11416 Subtarget, DAG, DL);
11417 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11418 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11419 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11420}
11421
11422static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11423 unsigned &UnpackOpcode, bool IsUnary,
11424 ArrayRef<int> TargetMask, const SDLoc &DL,
11425 SelectionDAG &DAG,
11426 const X86Subtarget &Subtarget) {
11427 int NumElts = VT.getVectorNumElements();
11428
11429 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11430 for (int i = 0; i != NumElts; i += 2) {
11431 int M1 = TargetMask[i + 0];
11432 int M2 = TargetMask[i + 1];
11433 Undef1 &= (SM_SentinelUndef == M1);
11434 Undef2 &= (SM_SentinelUndef == M2);
11435 Zero1 &= isUndefOrZero(M1);
11436 Zero2 &= isUndefOrZero(M2);
11437 }
11438 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11439        "Zeroable shuffle detected");
11440
11441 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11442 SmallVector<int, 64> Unpckl, Unpckh;
11443 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11444 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11445 (IsUnary ? V1 : V2))) {
11446 UnpackOpcode = X86ISD::UNPCKL;
11447 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11448 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11449 return true;
11450 }
11451
11452 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11453 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11454 (IsUnary ? V1 : V2))) {
11455 UnpackOpcode = X86ISD::UNPCKH;
11456 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11457 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11458 return true;
11459 }
11460
11461 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11462 if (IsUnary && (Zero1 || Zero2)) {
11463 // Don't bother if we can blend instead.
11464 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11465 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11466 return false;
11467
11468 bool MatchLo = true, MatchHi = true;
11469 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11470 int M = TargetMask[i];
11471
11472 // Ignore if the input is known to be zero or the index is undef.
11473 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11474 (M == SM_SentinelUndef))
11475 continue;
11476
11477 MatchLo &= (M == Unpckl[i]);
11478 MatchHi &= (M == Unpckh[i]);
11479 }
11480
11481 if (MatchLo || MatchHi) {
11482 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11483 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11484 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11485 return true;
11486 }
11487 }
11488
11489 // If a binary shuffle, commute and try again.
11490 if (!IsUnary) {
11491 ShuffleVectorSDNode::commuteMask(Unpckl);
11492 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11493 UnpackOpcode = X86ISD::UNPCKL;
11494 std::swap(V1, V2);
11495 return true;
11496 }
11497
11498 ShuffleVectorSDNode::commuteMask(Unpckh);
11499 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11500 UnpackOpcode = X86ISD::UNPCKH;
11501 std::swap(V1, V2);
11502 return true;
11503 }
11504 }
11505
11506 return false;
11507}
11508
11509// X86 has dedicated unpack instructions that can handle specific blend
11510// operations: UNPCKH and UNPCKL.
11511static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11512 ArrayRef<int> Mask, SDValue V1, SDValue V2,
11513 SelectionDAG &DAG) {
11514 SmallVector<int, 8> Unpckl;
11515 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11516 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11517 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11518
11519 SmallVector<int, 8> Unpckh;
11520 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11521 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11522 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11523
11524 // Commute and try again.
11525 ShuffleVectorSDNode::commuteMask(Unpckl);
11526 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11527 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11528
11529 ShuffleVectorSDNode::commuteMask(Unpckh);
11530 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11531 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11532
11533 return SDValue();
11534}
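// For reference, a standalone sketch of the lane-wise unpack masks being
// matched here (illustrative only; it is assumed to mirror what
// createUnpackShuffleMask produces for the binary case). UNPCKL interleaves
// the low half of each 128-bit lane of V1 with the same half of V2, and
// UNPCKH does the same for the high halves.
static void unpackMaskSketch(int NumElts, int EltsPerLane, bool Lo,
                             int *MaskOut /* NumElts slots */) {
  int Half = EltsPerLane / 2;
  for (int Lane = 0; Lane < NumElts; Lane += EltsPerLane)
    for (int i = 0; i < Half; ++i) {
      int Src = Lane + i + (Lo ? 0 : Half);
      MaskOut[Lane + 2 * i + 0] = Src;           // element from V1
      MaskOut[Lane + 2 * i + 1] = Src + NumElts; // matching element from V2
    }
}
// v4i32: UNPCKL = {0, 4, 1, 5} and UNPCKH = {2, 6, 3, 7}.
// v8i32: UNPCKL = {0, 8, 1, 9, 4, 12, 5, 13}, i.e. each 128-bit lane is
// interleaved separately. The commuted forms above simply account for the
// operands arriving in swapped order.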
11535
11536/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11537/// followed by unpack 256-bit.
11538static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11539 ArrayRef<int> Mask, SDValue V1,
11540 SDValue V2, SelectionDAG &DAG) {
11541 SmallVector<int, 32> Unpckl, Unpckh;
11542 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11543 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11544
11545 unsigned UnpackOpcode;
11546 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11547 UnpackOpcode = X86ISD::UNPCKL;
11548 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11549 UnpackOpcode = X86ISD::UNPCKH;
11550 else
11551 return SDValue();
11552
11553 // This is a "natural" unpack operation (rather than the 128-bit sectored
11554 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11555 // input in order to use the x86 instruction.
11556 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11557 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11558 V1 = DAG.getBitcast(VT, V1);
11559 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11560}
11561
11562// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11563// source into the lower elements and zeroing the upper elements.
11564static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11565 ArrayRef<int> Mask, const APInt &Zeroable,
11566 const X86Subtarget &Subtarget) {
11567 if (!VT.is512BitVector() && !Subtarget.hasVLX())
11568 return false;
11569
11570 unsigned NumElts = Mask.size();
11571 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11572 unsigned MaxScale = 64 / EltSizeInBits;
11573
11574 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11575 unsigned SrcEltBits = EltSizeInBits * Scale;
11576 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11577 continue;
11578 unsigned NumSrcElts = NumElts / Scale;
11579 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11580 continue;
11581 unsigned UpperElts = NumElts - NumSrcElts;
11582 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11583 continue;
11584 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11585 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11586 DstVT = MVT::getIntegerVT(EltSizeInBits);
11587 if ((NumSrcElts * EltSizeInBits) >= 128) {
11588 // ISD::TRUNCATE
11589 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11590 } else {
11591 // X86ISD::VTRUNC
11592 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11593 }
11594 return true;
11595 }
11596
11597 return false;
11598}
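// Worked example of the scale scan above (illustrative sketch only). A v16i8
// shuffle whose mask begins <0, 2, 4, 6, 8, 10, 12, 14> and whose upper eight
// elements are all zeroable matches Scale = 2: a truncation of a v8i16 source
// into the low half with the rest zeroed. A mask beginning <0, 4, 8, 12> with
// the upper twelve elements zeroable would instead match Scale = 4, i.e. a
// truncation from v4i32.
static bool matchesTruncScaleSketch(const int *Mask, int NumElts, int Scale) {
  int NumSrcElts = NumElts / Scale;
  for (int i = 0; i < NumSrcElts; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * Scale)
      return false; // the low part must step through the source elements
  return true;      // the caller still has to prove the tail is zeroable
}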
11599
11600// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11601// element padding to the final DstVT.
11602static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11603 const X86Subtarget &Subtarget,
11604 SelectionDAG &DAG, bool ZeroUppers) {
11605 MVT SrcVT = Src.getSimpleValueType();
11606 MVT DstSVT = DstVT.getScalarType();
11607 unsigned NumDstElts = DstVT.getVectorNumElements();
11608 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11609 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11610
11611 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11612 return SDValue();
11613
11614 // Perform a direct ISD::TRUNCATE if possible.
11615 if (NumSrcElts == NumDstElts)
11616 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11617
11618 if (NumSrcElts > NumDstElts) {
11619 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11620 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11621 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11622 }
11623
11624 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11625 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11626 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11627 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11628 DstVT.getSizeInBits());
11629 }
11630
11631 // Non-VLX targets must truncate from a 512-bit type, so we need to
11632 // widen, truncate and then possibly extract the original subvector.
11633 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11634 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11635 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11636 }
11637
11638 // Fallback to a X86ISD::VTRUNC, padding if necessary.
11639 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11640 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11641 if (DstVT != TruncVT)
11642 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11643 DstVT.getSizeInBits());
11644 return Trunc;
11645}
11646
11647// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11648//
11649// An example is the following:
11650//
11651// t0: ch = EntryToken
11652// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11653// t25: v4i32 = truncate t2
11654// t41: v8i16 = bitcast t25
11655// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11656// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11657// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11658// t18: v2i64 = bitcast t51
11659//
11660 // One can just use a single vpmovdw instruction; without avx512vl we need to
11661 // use the zmm variant and extract the lower subvector, padding with zeroes.
11662// TODO: Merge with lowerShuffleAsVTRUNC.
11663static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11664 SDValue V2, ArrayRef<int> Mask,
11665 const APInt &Zeroable,
11666 const X86Subtarget &Subtarget,
11667 SelectionDAG &DAG) {
11668 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11669 if (!Subtarget.hasAVX512())
11670 return SDValue();
11671
11672 unsigned NumElts = VT.getVectorNumElements();
11673 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11674 unsigned MaxScale = 64 / EltSizeInBits;
11675 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11676 unsigned NumSrcElts = NumElts / Scale;
11677 unsigned UpperElts = NumElts - NumSrcElts;
11678 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11679 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11680 continue;
11681
11682 SDValue Src = V1;
11683 if (!Src.hasOneUse())
11684 return SDValue();
11685
11686 Src = peekThroughOneUseBitcasts(Src);
11687 if (Src.getOpcode() != ISD::TRUNCATE ||
11688 Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11689 return SDValue();
11690 Src = Src.getOperand(0);
11691
11692 // VPMOVWB is only available with avx512bw.
11693 MVT SrcVT = Src.getSimpleValueType();
11694 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11695 !Subtarget.hasBWI())
11696 return SDValue();
11697
11698 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11699 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11700 }
11701
11702 return SDValue();
11703}
11704
11705// Attempt to match binary shuffle patterns as a truncate.
11706static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11707 SDValue V2, ArrayRef<int> Mask,
11708 const APInt &Zeroable,
11709 const X86Subtarget &Subtarget,
11710 SelectionDAG &DAG) {
11711 assert((VT.is128BitVector() || VT.is256BitVector()) &&
11712        "Unexpected VTRUNC type");
11713 if (!Subtarget.hasAVX512())
11714 return SDValue();
11715
11716 unsigned NumElts = VT.getVectorNumElements();
11717 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11718 unsigned MaxScale = 64 / EltSizeInBits;
11719 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11720 // TODO: Support non-BWI VPMOVWB truncations?
11721 unsigned SrcEltBits = EltSizeInBits * Scale;
11722 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11723 continue;
11724
11725 // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11726 // Bail if the V2 elements are undef.
11727 unsigned NumHalfSrcElts = NumElts / Scale;
11728 unsigned NumSrcElts = 2 * NumHalfSrcElts;
11729 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11730 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11731 continue;
11732
11733 // The elements beyond the truncation must be undef/zero.
11734 unsigned UpperElts = NumElts - NumSrcElts;
11735 if (UpperElts > 0 &&
11736 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11737 continue;
11738 bool UndefUppers =
11739 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11740
11741 // As we're using both sources, we need to concatenate them and
11742 // truncate from the double-sized source.
11743 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11744 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11745
11746 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11747 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11748 Src = DAG.getBitcast(SrcVT, Src);
11749 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11750 }
11751
11752 return SDValue();
11753}
11754
11755/// Check whether a compaction lowering can be done by dropping even
11756/// elements and compute how many times even elements must be dropped.
11757///
11758/// This handles shuffles which take every Nth element where N is a power of
11759/// two. Example shuffle masks:
11760///
11761/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11762/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11763/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11764/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11765/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11766/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11767///
11768/// Any of these lanes can of course be undef.
11769///
11770/// This routine only supports N <= 3.
11771/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11772/// for larger N.
11773///
11774/// \returns N above, or the number of times even elements must be dropped if
11775/// there is such a number. Otherwise returns zero.
11776static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11777 bool IsSingleInput) {
11778 // The modulus for the shuffle vector entries is based on whether this is
11779 // a single input or not.
11780 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11781 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11782        "We should only be called with masks with a power-of-2 size!");
11783
11784 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11785
11786 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11787 // and 2^3 simultaneously. This is because we may have ambiguity with
11788 // partially undef inputs.
11789 bool ViableForN[3] = {true, true, true};
11790
11791 for (int i = 0, e = Mask.size(); i < e; ++i) {
11792 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11793 // want.
11794 if (Mask[i] < 0)
11795 continue;
11796
11797 bool IsAnyViable = false;
11798 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11799 if (ViableForN[j]) {
11800 uint64_t N = j + 1;
11801
11802 // The shuffle mask must be equal to (i * 2^N) % M.
11803 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11804 IsAnyViable = true;
11805 else
11806 ViableForN[j] = false;
11807 }
11808 // Early exit if we exhaust the possible powers of two.
11809 if (!IsAnyViable)
11810 break;
11811 }
11812
11813 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11814 if (ViableForN[j])
11815 return j + 1;
11816
11817 // Return 0 as there is no viable power of two.
11818 return 0;
11819}
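// Worked example for the routine above (illustrative sketch only). With a
// single-input v16i8 mask the modulus is 16, so
//   {0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14}
// satisfies Mask[i] == (i << 1) & 15 and reports N = 1, while
//   {0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12}
// satisfies Mask[i] == (i << 2) & 15 and reports N = 2. A compact equivalent
// of the check, without the simultaneous-viability bookkeeping:
static int droppingEvenElementsSketch(const int *Mask, int Size, int Modulus) {
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (int i = 0; i < Size && Viable; ++i)
      Viable = Mask[i] < 0 || Mask[i] == ((i << N) & (Modulus - 1));
    if (Viable)
      return N; // smallest viable stride, matching the loops above
  }
  return 0;     // no viable power of two
}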
11820
11821// X86 has dedicated pack instructions that can handle specific truncation
11822// operations: PACKSS and PACKUS.
11823// Checks for compaction shuffle masks if MaxStages > 1.
11824// TODO: Add support for matching multiple PACKSS/PACKUS stages.
11825static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11826 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11827 const SelectionDAG &DAG,
11828 const X86Subtarget &Subtarget,
11829 unsigned MaxStages = 1) {
11830 unsigned NumElts = VT.getVectorNumElements();
11831 unsigned BitSize = VT.getScalarSizeInBits();
11832 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11833        "Illegal maximum compaction");
11834
11835 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11836 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11837 unsigned NumPackedBits = NumSrcBits - BitSize;
11838 N1 = peekThroughBitcasts(N1);
11839 N2 = peekThroughBitcasts(N2);
11840 unsigned NumBits1 = N1.getScalarValueSizeInBits();
11841 unsigned NumBits2 = N2.getScalarValueSizeInBits();
11842 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
11843 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
11844 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
11845 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
11846 return false;
11847 if (Subtarget.hasSSE41() || BitSize == 8) {
11848 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11849 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
11850 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
11851 V1 = N1;
11852 V2 = N2;
11853 SrcVT = PackVT;
11854 PackOpcode = X86ISD::PACKUS;
11855 return true;
11856 }
11857 }
11858 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
11859 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
11860 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
11861 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
11862 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
11863 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
11864 V1 = N1;
11865 V2 = N2;
11866 SrcVT = PackVT;
11867 PackOpcode = X86ISD::PACKSS;
11868 return true;
11869 }
11870 return false;
11871 };
11872
11873 // Attempt to match against wider and wider compaction patterns.
11874 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11875 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11876 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11877
11878 // Try binary shuffle.
11879 SmallVector<int, 32> BinaryMask;
11880 createPackShuffleMask(VT, BinaryMask, false, NumStages);
11881 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
11882 if (MatchPACK(V1, V2, PackVT))
11883 return true;
11884
11885 // Try unary shuffle.
11886 SmallVector<int, 32> UnaryMask;
11887 createPackShuffleMask(VT, UnaryMask, true, NumStages);
11888 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
11889 if (MatchPACK(V1, V1, PackVT))
11890 return true;
11891 }
11892
11893 return false;
11894}
11895
11896static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11897 SDValue V1, SDValue V2, SelectionDAG &DAG,
11898 const X86Subtarget &Subtarget) {
11899 MVT PackVT;
11900 unsigned PackOpcode;
11901 unsigned SizeBits = VT.getSizeInBits();
11902 unsigned EltBits = VT.getScalarSizeInBits();
11903 unsigned MaxStages = Log2_32(64 / EltBits);
11904 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11905 Subtarget, MaxStages))
11906 return SDValue();
11907
11908 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11909 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11910
11911 // Don't lower multi-stage packs on AVX512, truncation is better.
11912 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11913 return SDValue();
11914
11915 // Pack to the largest type possible:
11916 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11917 unsigned MaxPackBits = 16;
11918 if (CurrentEltBits > 16 &&
11919 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11920 MaxPackBits = 32;
11921
11922 // Repeatedly pack down to the target size.
11923 SDValue Res;
11924 for (unsigned i = 0; i != NumStages; ++i) {
11925 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11926 unsigned NumSrcElts = SizeBits / SrcEltBits;
11927 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11928 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11929 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11930 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11931 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11932 DAG.getBitcast(SrcVT, V2));
11933 V1 = V2 = Res;
11934 CurrentEltBits /= 2;
11935 }
11936 assert(Res && Res.getValueType() == VT &&
11937        "Failed to lower compaction shuffle");
11938 return Res;
11939}
11940
11941/// Try to emit a bitmask instruction for a shuffle.
11942///
11943/// This handles cases where we can model a blend exactly as a bitmask due to
11944/// one of the inputs being zeroable.
11945static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11946 SDValue V2, ArrayRef<int> Mask,
11947 const APInt &Zeroable,
11948 const X86Subtarget &Subtarget,
11949 SelectionDAG &DAG) {
11950 MVT MaskVT = VT;
11951 MVT EltVT = VT.getVectorElementType();
11952 SDValue Zero, AllOnes;
11953 // Use f64 if i64 isn't legal.
11954 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11955 EltVT = MVT::f64;
11956 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11957 }
11958
11959 MVT LogicVT = VT;
11960 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11961 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11962 APFloat AllOnesValue = APFloat::getAllOnesValue(
11963 SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
11964 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11965 LogicVT =
11966 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11967 } else {
11968 Zero = DAG.getConstant(0, DL, EltVT);
11969 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11970 }
11971
11972 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11973 SDValue V;
11974 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11975 if (Zeroable[i])
11976 continue;
11977 if (Mask[i] % Size != i)
11978 return SDValue(); // Not a blend.
11979 if (!V)
11980 V = Mask[i] < Size ? V1 : V2;
11981 else if (V != (Mask[i] < Size ? V1 : V2))
11982 return SDValue(); // Can only let one input through the mask.
11983
11984 VMaskOps[i] = AllOnes;
11985 }
11986 if (!V)
11987 return SDValue(); // No non-zeroable elements!
11988
11989 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11990 VMask = DAG.getBitcast(LogicVT, VMask);
11991 V = DAG.getBitcast(LogicVT, V);
11992 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11993 return DAG.getBitcast(VT, And);
11994}
11995
11996/// Try to emit a blend instruction for a shuffle using bit math.
11997///
11998/// This is used as a fallback approach when first class blend instructions are
11999/// unavailable. Currently it is only suitable for integer vectors, but could
12000/// be generalized for floating point vectors if desirable.
12001static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12002 SDValue V2, ArrayRef<int> Mask,
12003 SelectionDAG &DAG) {
12004 assert(VT.isInteger() && "Only supports integer vector types!");
12005 MVT EltVT = VT.getVectorElementType();
12006 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12007 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12008 SmallVector<SDValue, 16> MaskOps;
12009 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12010 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12011 return SDValue(); // Shuffled input!
12012 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12013 }
12014
12015 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12016 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12017 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12018 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12019}
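// The bit-math blend above is the classic (V1 & M) | (V2 & ~M) select, with
// M built per element from the shuffle mask (all-ones picks V1, zero picks
// V2). The same identity on plain integers, as an illustrative sketch:
static unsigned bitBlendSketch(unsigned V1, unsigned V2, unsigned M) {
  return (V1 & M) | (V2 & ~M); // bits set in M take V1, clear bits take V2
}
// e.g. bitBlendSketch(0xAAAAAAAA, 0x55555555, 0xFFFF0000) == 0xAAAA5555.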
12020
12021static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12022 SDValue PreservedSrc,
12023 const X86Subtarget &Subtarget,
12024 SelectionDAG &DAG);
12025
12026static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12027 MutableArrayRef<int> Mask,
12028 const APInt &Zeroable, bool &ForceV1Zero,
12029 bool &ForceV2Zero, uint64_t &BlendMask) {
12030 bool V1IsZeroOrUndef =
12031 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12032 bool V2IsZeroOrUndef =
12033 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12034
12035 BlendMask = 0;
12036 ForceV1Zero = false, ForceV2Zero = false;
12037 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12038
12039 // Attempt to generate the binary blend mask. If an input is zero then
12040 // we can use any lane.
12041 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12042 int M = Mask[i];
12043 if (M == SM_SentinelUndef)
12044 continue;
12045 if (M == i)
12046 continue;
12047 if (M == i + Size) {
12048 BlendMask |= 1ull << i;
12049 continue;
12050 }
12051 if (Zeroable[i]) {
12052 if (V1IsZeroOrUndef) {
12053 ForceV1Zero = true;
12054 Mask[i] = i;
12055 continue;
12056 }
12057 if (V2IsZeroOrUndef) {
12058 ForceV2Zero = true;
12059 BlendMask |= 1ull << i;
12060 Mask[i] = i + Size;
12061 continue;
12062 }
12063 }
12064 return false;
12065 }
12066 return true;
12067}
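// Worked example of the blend-mask construction above (illustrative sketch
// only). For a v4i32 shuffle with Mask = {0, 5, 2, 7}, elements 0 and 2 come
// from V1 (their bits stay clear) and elements 1 and 3 come from V2 (bits 1
// and 3 are set), giving BlendMask = 0b1010. A zeroable element is folded
// into whichever side is already known zero/undef, which is what the
// ForceV1Zero/ForceV2Zero flags record.
static unsigned long long blendMaskSketch(const int *Mask, int Size) {
  unsigned long long BlendMask = 0;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= Size)     // element taken from the second source
      BlendMask |= 1ull << i;
  return BlendMask;          // assumes the mask is already a plain blend
}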
12068
12069static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12070 int Scale) {
12071 uint64_t ScaledMask = 0;
12072 for (int i = 0; i != Size; ++i)
12073 if (BlendMask & (1ull << i))
12074 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12075 return ScaledMask;
12076}
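// Worked example for the scaling helper above (illustrative only): each set
// bit of the element-level blend mask is widened to Scale consecutive bits.
//   scaleVectorShuffleBlendMask(0b0101, /*Size=*/4, /*Scale=*/2) == 0b00110011
//   scaleVectorShuffleBlendMask(0b1000, /*Size=*/4, /*Scale=*/4) == 0xF000
// This is the shape of mask needed when a blend has to be expressed at a
// finer element granularity than the original shuffle.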
12077
12078/// Try to emit a blend instruction for a shuffle.
12079///
12080/// This doesn't do any checks for the availability of instructions for blending
12081/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12082/// be matched in the backend with the type given. What it does check for is
12083/// that the shuffle mask is a blend, or convertible into a blend with zero.
12084static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12085 SDValue V2, ArrayRef<int> Original,
12086 const APInt &Zeroable,
12087 const X86Subtarget &Subtarget,
12088 SelectionDAG &DAG) {
12089 uint64_t BlendMask = 0;
12090 bool ForceV1Zero = false, ForceV2Zero = false;
12091 SmallVector<int, 64> Mask(Original.begin(), Original.end());
12092 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12093 BlendMask))
12094 return SDValue();
12095
12096 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12097 if (ForceV1Zero)
12098 V1 = getZeroVector(VT, Subtarget, DAG, DL);
12099 if (ForceV2Zero)
12100 V2 = getZeroVector(VT, Subtarget, DAG, DL);
12101
12102 switch (VT.SimpleTy) {
12103 case MVT::v4i64:
12104 case MVT::v8i32:
12105 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12106 LLVM_FALLTHROUGH;
12107 case MVT::v4f64:
12108 case MVT::v8f32:
12109 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12110 LLVM_FALLTHROUGH;
12111 case MVT::v2f64:
12112 case MVT::v2i64:
12113 case MVT::v4f32:
12114 case MVT::v4i32:
12115 case MVT::v8i16:
12116 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12117 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12118 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12119 case MVT::v16i16: {
12120 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12121 SmallVector<int, 8> RepeatedMask;
12122 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12123 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12124 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12125 BlendMask = 0;
12126 for (int i = 0; i < 8; ++i)
12127 if (RepeatedMask[i] >= 8)
12128 BlendMask |= 1ull << i;
12129 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12130 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12131 }
12132 // Use PBLENDW for lower/upper lanes and then blend lanes.
12133 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12134 // merge to VSELECT where useful.
12135 uint64_t LoMask = BlendMask & 0xFF;
12136 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12137 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12138 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12139 DAG.getTargetConstant(LoMask, DL, MVT::i8));
12140 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12141 DAG.getTargetConstant(HiMask, DL, MVT::i8));
12142 return DAG.getVectorShuffle(
12143 MVT::v16i16, DL, Lo, Hi,
12144 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12145 }
12146 LLVM_FALLTHROUGH;
12147 }
12148 case MVT::v32i8:
12149 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12150 LLVM_FALLTHROUGH;
12151 case MVT::v16i8: {
12152 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12153
12154 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12155 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12156 Subtarget, DAG))
12157 return Masked;
12158
12159 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12160 MVT IntegerType =
12161 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12162 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12163 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12164 }
12165
12166 // If we have VPTERNLOG, we can use that as a bit blend.
12167 if (Subtarget.hasVLX())
12168 if (SDValue BitBlend =
12169 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12170 return BitBlend;
12171
12172 // Scale the blend by the number of bytes per element.
12173 int Scale = VT.getScalarSizeInBits() / 8;
12174
12175 // This form of blend is always done on bytes. Compute the byte vector
12176 // type.
12177 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12178
12179 // x86 allows load folding with blendvb from the 2nd source operand. But
12180 // we are still using LLVM select here (see comment below), so that's V1.
12181 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12182 // allow that load-folding possibility.
12183 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12184 ShuffleVectorSDNode::commuteMask(Mask);
12185 std::swap(V1, V2);
12186 }
12187
12188 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12189 // mix of LLVM's code generator and the x86 backend. We tell the code
12190 // generator that boolean values in the elements of an x86 vector register
12191 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12192 // mapping a select to operand #1, and 'false' mapping to operand #2. The
12193 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12194 // of the element (the remaining are ignored) and 0 in that high bit would
12195 // mean operand #1 while 1 in the high bit would mean operand #2. So while
12196 // the LLVM model for boolean values in vector elements gets the relevant
12197 // bit set, it is set backwards and over constrained relative to x86's
12198 // actual model.
12199 SmallVector<SDValue, 32> VSELECTMask;
12200 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12201 for (int j = 0; j < Scale; ++j)
12202 VSELECTMask.push_back(
12203 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12204 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12205 MVT::i8));
12206
12207 V1 = DAG.getBitcast(BlendVT, V1);
12208 V2 = DAG.getBitcast(BlendVT, V2);
12209 return DAG.getBitcast(
12210 VT,
12211 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12212 V1, V2));
12213 }
12214 case MVT::v16f32:
12215 case MVT::v8f64:
12216 case MVT::v8i64:
12217 case MVT::v16i32:
12218 case MVT::v32i16:
12219 case MVT::v64i8: {
12220 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12221 bool OptForSize = DAG.shouldOptForSize();
12222 if (!OptForSize) {
12223 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12224 Subtarget, DAG))
12225 return Masked;
12226 }
12227
12228 // Otherwise load an immediate into a GPR, cast to k-register, and use a
12229 // masked move.
12230 MVT IntegerType =
12231 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12232 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12233 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12234 }
12235 default:
12236 llvm_unreachable("Not a supported integer vector type!");
12237 }
12238}
12239
12240/// Try to lower as a blend of elements from two inputs followed by
12241/// a single-input permutation.
12242///
12243/// This matches the pattern where we can blend elements from two inputs and
12244/// then reduce the shuffle to a single-input permutation.
12245static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12246 SDValue V1, SDValue V2,
12247 ArrayRef<int> Mask,
12248 SelectionDAG &DAG,
12249 bool ImmBlends = false) {
12250 // We build up the blend mask while checking whether a blend is a viable way
12251 // to reduce the shuffle.
12252 SmallVector<int, 32> BlendMask(Mask.size(), -1);
12253 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12254
12255 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12256 if (Mask[i] < 0)
12257 continue;
12258
12259 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12260
12261 if (BlendMask[Mask[i] % Size] < 0)
12262 BlendMask[Mask[i] % Size] = Mask[i];
12263 else if (BlendMask[Mask[i] % Size] != Mask[i])
12264 return SDValue(); // Can't blend in the needed input!
12265
12266 PermuteMask[i] = Mask[i] % Size;
12267 }
12268
12269 // If only immediate blends, then bail if the blend mask can't be widened to
12270 // i16.
12271 unsigned EltSize = VT.getScalarSizeInBits();
12272 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12273 return SDValue();
12274
12275 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12276 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12277}
12278
12279/// Try to lower as an unpack of elements from two inputs followed by
12280/// a single-input permutation.
12281///
12282/// This matches the pattern where we can unpack elements from two inputs and
12283/// then reduce the shuffle to a single-input (wider) permutation.
12284static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12285 SDValue V1, SDValue V2,
12286 ArrayRef<int> Mask,
12287 SelectionDAG &DAG) {
12288 int NumElts = Mask.size();
12289 int NumLanes = VT.getSizeInBits() / 128;
12290 int NumLaneElts = NumElts / NumLanes;
12291 int NumHalfLaneElts = NumLaneElts / 2;
12292
12293 bool MatchLo = true, MatchHi = true;
12294 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12295
12296 // Determine UNPCKL/UNPCKH type and operand order.
12297 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12298 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12299 int M = Mask[Lane + Elt];
12300 if (M < 0)
12301 continue;
12302
12303 SDValue &Op = Ops[Elt & 1];
12304 if (M < NumElts && (Op.isUndef() || Op == V1))
12305 Op = V1;
12306 else if (NumElts <= M && (Op.isUndef() || Op == V2))
12307 Op = V2;
12308 else
12309 return SDValue();
12310
12311 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12312 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12313 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12314 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12315 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12316 if (!MatchLo && !MatchHi)
12317 return SDValue();
12318 }
12319 }
12320 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12321
12322 // Now check that each pair of elts come from the same unpack pair
12323 // and set the permute mask based on each pair.
12324 // TODO - Investigate cases where we permute individual elements.
12325 SmallVector<int, 32> PermuteMask(NumElts, -1);
12326 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12327 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12328 int M0 = Mask[Lane + Elt + 0];
12329 int M1 = Mask[Lane + Elt + 1];
12330 if (0 <= M0 && 0 <= M1 &&
12331 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12332 return SDValue();
12333 if (0 <= M0)
12334 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12335 if (0 <= M1)
12336 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12337 }
12338 }
12339
12340 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12341 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12342 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12343}
12344
12345/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12346/// permuting the elements of the result in place.
12347static SDValue lowerShuffleAsByteRotateAndPermute(
12348 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12349 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12350 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12351 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12352 (VT.is512BitVector() && !Subtarget.hasBWI()))
12353 return SDValue();
12354
12355 // We don't currently support lane crossing permutes.
12356 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12357 return SDValue();
12358
12359 int Scale = VT.getScalarSizeInBits() / 8;
12360 int NumLanes = VT.getSizeInBits() / 128;
12361 int NumElts = VT.getVectorNumElements();
12362 int NumEltsPerLane = NumElts / NumLanes;
12363
12364 // Determine range of mask elts.
12365 bool Blend1 = true;
12366 bool Blend2 = true;
12367 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12368 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12369 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12370 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12371 int M = Mask[Lane + Elt];
12372 if (M < 0)
12373 continue;
12374 if (M < NumElts) {
12375 Blend1 &= (M == (Lane + Elt));
12376 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12377 M = M % NumEltsPerLane;
12378 Range1.first = std::min(Range1.first, M);
12379 Range1.second = std::max(Range1.second, M);
12380 } else {
12381 M -= NumElts;
12382 Blend2 &= (M == (Lane + Elt));
12383 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12384 M = M % NumEltsPerLane;
12385 Range2.first = std::min(Range2.first, M);
12386 Range2.second = std::max(Range2.second, M);
12387 }
12388 }
12389 }
12390
12391 // Bail if we don't need both elements.
12392 // TODO - it might be worth doing this for unary shuffles if the permute
12393 // can be widened.
12394 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12395 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12396 return SDValue();
12397
12398 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12399 return SDValue();
12400
12401 // Rotate the 2 ops so we can access both ranges, then permute the result.
12402 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12403 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12404 SDValue Rotate = DAG.getBitcast(
12405 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12406 DAG.getBitcast(ByteVT, Lo),
12407 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12408 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12409 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12410 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12411 int M = Mask[Lane + Elt];
12412 if (M < 0)
12413 continue;
12414 if (M < NumElts)
12415 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12416 else
12417 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12418 }
12419 }
12420 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12421 };
12422
12423 // Check if the ranges are small enough to rotate from either direction.
12424 if (Range2.second < Range1.first)
12425 return RotateAndPermute(V1, V2, Range1.first, 0);
12426 if (Range1.second < Range2.first)
12427 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12428 return SDValue();
12429}
12430
12431/// Generic routine to decompose a shuffle and blend into independent
12432/// blends and permutes.
12433///
12434/// This matches the extremely common pattern for handling combined
12435/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12436/// operations. It will try to pick the best arrangement of shuffles and
12437/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
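///
/// For example (illustrative only), a v4 mask <3, 6, 1, 4> decomposes into:
///   V1Mask    = <3, -1, 1, -1>   (pre-shuffle of V1)
///   V2Mask    = <-1, 2, -1, 0>   (pre-shuffle of V2)
///   FinalMask = <0, 5, 2, 7>     (blend of the two pre-shuffled inputs)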
12438static SDValue lowerShuffleAsDecomposedShuffleMerge(
12439 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12440 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12441 int NumElts = Mask.size();
12442 int NumLanes = VT.getSizeInBits() / 128;
12443 int NumEltsPerLane = NumElts / NumLanes;
12444
12445 // Shuffle the input elements into the desired positions in V1 and V2 and
12446 // unpack/blend them together.
12447 bool IsAlternating = true;
12448 SmallVector<int, 32> V1Mask(NumElts, -1);
12449 SmallVector<int, 32> V2Mask(NumElts, -1);
12450 SmallVector<int, 32> FinalMask(NumElts, -1);
12451 for (int i = 0; i < NumElts; ++i) {
12452 int M = Mask[i];
12453 if (M >= 0 && M < NumElts) {
12454 V1Mask[i] = M;
12455 FinalMask[i] = i;
12456 IsAlternating &= (i & 1) == 0;
12457 } else if (M >= NumElts) {
12458 V2Mask[i] = M - NumElts;
12459 FinalMask[i] = i + NumElts;
12460 IsAlternating &= (i & 1) == 1;
12461 }
12462 }
12463
12464 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12465 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12466 // the shuffle may be able to fold with a load or other benefit. However, when
12467 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12468 // pre-shuffle first is a better strategy.
12469 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12470 // Only prefer immediate blends to unpack/rotate.
12471 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12472 DAG, true))
12473 return BlendPerm;
12474 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12475 DAG))
12476 return UnpackPerm;
12477 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12478 DL, VT, V1, V2, Mask, Subtarget, DAG))
12479 return RotatePerm;
12480 // Unpack/rotate failed - try again with variable blends.
12481 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12482 DAG))
12483 return BlendPerm;
12484 }
12485
12486 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12487 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12488 // TODO: It doesn't have to be alternating - but each lane mustn't have more
12489 // than half the elements coming from each source.
12490 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12491 V1Mask.assign(NumElts, -1);
12492 V2Mask.assign(NumElts, -1);
12493 FinalMask.assign(NumElts, -1);
12494 for (int i = 0; i != NumElts; i += NumEltsPerLane)
12495 for (int j = 0; j != NumEltsPerLane; ++j) {
12496 int M = Mask[i + j];
12497 if (M >= 0 && M < NumElts) {
12498 V1Mask[i + (j / 2)] = M;
12499 FinalMask[i + j] = i + (j / 2);
12500 } else if (M >= NumElts) {
12501 V2Mask[i + (j / 2)] = M - NumElts;
12502 FinalMask[i + j] = i + (j / 2) + NumElts;
12503 }
12504 }
12505 }
12506
12507 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12508 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12509 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12510}
12511
12512/// Try to lower a vector shuffle as a bit rotation.
12513///
12514/// Look for a repeated rotation pattern in each sub group.
12515/// Returns an ISD::ROTL element rotation amount or -1 on failure.
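///
/// For example (illustrative only), with NumSubElts == 4 the repeated group
/// mask <1, 2, 3, 0> matches a rotate-left by 3 elements; for vXi8 inputs the
/// caller below scales this to a 24-bit rotation amount.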
12516static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12517 int NumElts = Mask.size();
12518 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
12519
12520 int RotateAmt = -1;
12521 for (int i = 0; i != NumElts; i += NumSubElts) {
12522 for (int j = 0; j != NumSubElts; ++j) {
12523 int M = Mask[i + j];
12524 if (M < 0)
12525 continue;
12526 if (!isInRange(M, i, i + NumSubElts))
12527 return -1;
12528 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12529 if (0 <= RotateAmt && Offset != RotateAmt)
12530 return -1;
12531 RotateAmt = Offset;
12532 }
12533 }
12534 return RotateAmt;
12535}
12536
12537static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12538 const X86Subtarget &Subtarget,
12539 ArrayRef<int> Mask) {
12540 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12541 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12542
12543 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12544 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12545 int MaxSubElts = 64 / EltSizeInBits;
12546 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12547 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12548 if (RotateAmt < 0)
12549 continue;
12550
12551 int NumElts = Mask.size();
12552 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12553 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12554 return RotateAmt * EltSizeInBits;
12555 }
12556
12557 return -1;
12558}
12559
12560/// Lower shuffle using X86ISD::VROTLI rotations.
12561static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12562 ArrayRef<int> Mask,
12563 const X86Subtarget &Subtarget,
12564 SelectionDAG &DAG) {
12565 // Only XOP + AVX512 targets have bit rotation instructions.
12566 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12567 bool IsLegal =
12568 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12569 if (!IsLegal && Subtarget.hasSSE3())
12570 return SDValue();
12571
12572 MVT RotateVT;
12573 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12574 Subtarget, Mask);
12575 if (RotateAmt < 0)
12576 return SDValue();
12577
12578 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12579 // expanded to OR(SRL,SHL), will be more efficient, but if they can
12580 // widen to vXi16 or more then the existing lowering will be better.
12581 if (!IsLegal) {
12582 if ((RotateAmt % 16) == 0)
12583 return SDValue();
12584 // TODO: Use getTargetVShiftByConstNode.
12585 unsigned ShlAmt = RotateAmt;
12586 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12587 V1 = DAG.getBitcast(RotateVT, V1);
12588 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12589 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12590 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12591 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12592 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12593 return DAG.getBitcast(VT, Rot);
12594 }
12595
12596 SDValue Rot =
12597 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12598 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12599 return DAG.getBitcast(VT, Rot);
12600}
12601
12602/// Try to match a vector shuffle as an element rotation.
12603///
12604/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12605static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12606 ArrayRef<int> Mask) {
12607 int NumElts = Mask.size();
12608
12609 // We need to detect various ways of spelling a rotation:
12610 // [11, 12, 13, 14, 15, 0, 1, 2]
12611 // [-1, 12, 13, 14, -1, -1, 1, -1]
12612 // [-1, -1, -1, -1, -1, -1, 1, 2]
12613 // [ 3, 4, 5, 6, 7, 8, 9, 10]
12614 // [-1, 4, 5, 6, -1, -1, 9, -1]
12615 // [-1, 4, 5, 6, -1, -1, -1, -1]
12616 int Rotation = 0;
12617 SDValue Lo, Hi;
12618 for (int i = 0; i < NumElts; ++i) {
12619 int M = Mask[i];
12620 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12621 "Unexpected mask index.");
12622 if (M < 0)
12623 continue;
12624
12625 // Determine where a rotated vector would have started.
12626 int StartIdx = i - (M % NumElts);
12627 if (StartIdx == 0)
12628 // The identity rotation isn't interesting, stop.
12629 return -1;
12630
12631 // If we found the tail of a vector the rotation must be the missing
12632 // front. If we found the head of a vector, it must be how much of the
12633 // head.
12634 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12635
12636 if (Rotation == 0)
12637 Rotation = CandidateRotation;
12638 else if (Rotation != CandidateRotation)
12639 // The rotations don't match, so we can't match this mask.
12640 return -1;
12641
12642 // Compute which value this mask is pointing at.
12643 SDValue MaskV = M < NumElts ? V1 : V2;
12644
12645 // Compute which of the two target values this index should be assigned
12646 // to. This reflects whether the high elements are remaining or the low
12647 // elements are remaining.
12648 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12649
12650 // Either set up this value if we've not encountered it before, or check
12651 // that it remains consistent.
12652 if (!TargetV)
12653 TargetV = MaskV;
12654 else if (TargetV != MaskV)
12655 // This may be a rotation, but it pulls from the inputs in some
12656 // unsupported interleaving.
12657 return -1;
12658 }
12659
12660 // Check that we successfully analyzed the mask, and normalize the results.
12661 assert(Rotation != 0 && "Failed to locate a viable rotation!");
12662 assert((Lo || Hi) && "Failed to find a rotated input vector!");
12663 if (!Lo)
12664 Lo = Hi;
12665 else if (!Hi)
12666 Hi = Lo;
12667
12668 V1 = Lo;
12669 V2 = Hi;
12670
12671 return Rotation;
12672}
12673
12674/// Try to lower a vector shuffle as a byte rotation.
12675///
12676/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12677/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12678/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12679/// try to generically lower a vector shuffle through such a pattern. It
12680/// does not check for the profitability of lowering either as PALIGNR or
12681/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12682/// This matches shuffle vectors that look like:
12683///
12684/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12685///
12686/// Essentially it concatenates V1 and V2, shifts right by some number of
12687/// elements, and takes the low elements as the result. Note that while this is
12688/// specified as a *right shift* because x86 is little-endian, it is a *left
12689/// rotate* of the vector lanes.
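///
/// For the v8i16 example above (illustrative only), the element rotation found
/// by matchShuffleAsElementRotate is 3, which matchShuffleAsByteRotate scales
/// by 16/8 == 2 bytes per element to a byte rotation amount of 6.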
12690static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12691 ArrayRef<int> Mask) {
12692 // Don't accept any shuffles with zero elements.
12693 if (isAnyZero(Mask))
12694 return -1;
12695
12696 // PALIGNR works on 128-bit lanes.
12697 SmallVector<int, 16> RepeatedMask;
12698 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12699 return -1;
12700
12701 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12702 if (Rotation <= 0)
12703 return -1;
12704
12705 // PALIGNR rotates bytes, so we need to scale the
12706 // rotation based on how many bytes are in the vector lane.
12707 int NumElts = RepeatedMask.size();
12708 int Scale = 16 / NumElts;
12709 return Rotation * Scale;
12710}
12711
12712static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12713 SDValue V2, ArrayRef<int> Mask,
12714 const X86Subtarget &Subtarget,
12715 SelectionDAG &DAG) {
12716 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12717
12718 SDValue Lo = V1, Hi = V2;
12719 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12720 if (ByteRotation <= 0)
12721 return SDValue();
12722
12723 // Cast the inputs to i8 vector of correct length to match PALIGNR or
12724 // PSLLDQ/PSRLDQ.
12725 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12726 Lo = DAG.getBitcast(ByteVT, Lo);
12727 Hi = DAG.getBitcast(ByteVT, Hi);
12728
12729 // SSSE3 targets can use the palignr instruction.
12730 if (Subtarget.hasSSSE3()) {
12731 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12732 "512-bit PALIGNR requires BWI instructions");
12733 return DAG.getBitcast(
12734 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12735 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12736 }
12737
12738 assert(VT.is128BitVector() &&
12739 "Rotate-based lowering only supports 128-bit lowering!");
12740 assert(Mask.size() <= 16 &&
12741 "Can shuffle at most 16 bytes in a 128-bit vector!");
12742 assert(ByteVT == MVT::v16i8 &&
12743 "SSE2 rotate lowering only needed for v16i8!");
12744
12745 // Default SSE2 implementation
12746 int LoByteShift = 16 - ByteRotation;
12747 int HiByteShift = ByteRotation;
12748
12749 SDValue LoShift =
12750 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12751 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12752 SDValue HiShift =
12753 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12754 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12755 return DAG.getBitcast(VT,
12756 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12757}
12758
12759/// Try to lower a vector shuffle as a dword/qword rotation.
12760///
12761/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12762/// rotation of the concatenation of two vectors; this routine will
12763/// try to generically lower a vector shuffle through such a pattern.
12764///
12765/// Essentially it concatenates V1 and V2, shifts right by some number of
12766/// elements, and takes the low elements as the result. Note that while this is
12767/// specified as a *right shift* because x86 is little-endian, it is a *left
12768/// rotate* of the vector lanes.
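///
/// For example (illustrative only), a v8i32 mask <3, 4, 5, 6, 7, 8, 9, 10>
/// matches an element rotation of 3 and becomes a VALIGND with immediate 3.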
12769static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12770 SDValue V2, ArrayRef<int> Mask,
12771 const X86Subtarget &Subtarget,
12772 SelectionDAG &DAG) {
12773 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12774 "Only 32-bit and 64-bit elements are supported!");
12775
12776 // 128/256-bit vectors are only supported with VLX.
12777 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12778 && "VLX required for 128/256-bit vectors");
12779
12780 SDValue Lo = V1, Hi = V2;
12781 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12782 if (Rotation <= 0)
12783 return SDValue();
12784
12785 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12786 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12787}
12788
12789/// Try to lower a vector shuffle as a byte shift sequence.
12790static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12791 SDValue V2, ArrayRef<int> Mask,
12792 const APInt &Zeroable,
12793 const X86Subtarget &Subtarget,
12794 SelectionDAG &DAG) {
12795 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12796 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12797
12798 // We need a shuffle that has zeros at one/both ends and a sequential
12799 // shuffle from one source within.
12800 unsigned ZeroLo = Zeroable.countTrailingOnes();
12801 unsigned ZeroHi = Zeroable.countLeadingOnes();
12802 if (!ZeroLo && !ZeroHi)
12803 return SDValue();
12804
12805 unsigned NumElts = Mask.size();
12806 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12807 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12808 return SDValue();
12809
12810 unsigned Scale = VT.getScalarSizeInBits() / 8;
12811 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12812 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12813 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12814 return SDValue();
12815
12816 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12817 Res = DAG.getBitcast(MVT::v16i8, Res);
12818
12819 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12820 // inner sequential set of elements, possibly offset:
12821 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12822 // 01234567 --> 4567zzzz --> zzzzz456
12823 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12824 if (ZeroLo == 0) {
12825 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12826 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12827 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12828 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12829 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12830 } else if (ZeroHi == 0) {
12831 unsigned Shift = Mask[ZeroLo] % NumElts;
12832 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12833 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12834 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12835 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12836 } else if (!Subtarget.hasSSSE3()) {
12837 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12838 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12839 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12840 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12841 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12842 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12843 Shift += Mask[ZeroLo] % NumElts;
12844 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12845 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12846 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12847 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12848 } else
12849 return SDValue();
12850
12851 return DAG.getBitcast(VT, Res);
12852}
12853
12854/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12855///
12856/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12857/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12858/// matches elements from one of the input vectors shuffled to the left or
12859/// right with zeroable elements 'shifted in'. It handles both the strictly
12860/// bit-wise element shifts and the byte shift across an entire 128-bit double
12861/// quad word lane.
12862///
12863/// PSHL : (little-endian) left bit shift.
12864/// [ zz, 0, zz, 2 ]
12865/// [ -1, 4, zz, -1 ]
12866/// PSRL : (little-endian) right bit shift.
12867/// [ 1, zz, 3, zz]
12868/// [ -1, -1, 7, zz]
12869/// PSLLDQ : (little-endian) left byte shift
12870/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12871/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12872/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12873/// PSRLDQ : (little-endian) right byte shift
12874/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12875/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12876/// [ 1, 2, -1, -1, -1, -1, zz, zz]
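///
/// For example (illustrative only), a v4i32 mask [ zz, 0, 1, 2 ] is matched at
/// Scale == 4 as a whole-vector byte shift: Opcode == X86ISD::VSHLDQ,
/// ShiftVT == v16i8 and ShiftAmt == 4 bytes.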
12877static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12878 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12879 int MaskOffset, const APInt &Zeroable,
12880 const X86Subtarget &Subtarget) {
12881 int Size = Mask.size();
12882 unsigned SizeInBits = Size * ScalarSizeInBits;
12883
12884 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12885 for (int i = 0; i < Size; i += Scale)
12886 for (int j = 0; j < Shift; ++j)
12887 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12888 return false;
12889
12890 return true;
12891 };
12892
12893 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12894 for (int i = 0; i != Size; i += Scale) {
12895 unsigned Pos = Left ? i + Shift : i;
12896 unsigned Low = Left ? i : i + Shift;
12897 unsigned Len = Scale - Shift;
12898 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12899 return -1;
12900 }
12901
12902 int ShiftEltBits = ScalarSizeInBits * Scale;
12903 bool ByteShift = ShiftEltBits > 64;
12904 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12905 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12906 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12907
12908 // Normalize the scale for byte shifts to still produce an i64 element
12909 // type.
12910 Scale = ByteShift ? Scale / 2 : Scale;
12911
12912 // We need to round trip through the appropriate type for the shift.
12913 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12914 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12915 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12916 return (int)ShiftAmt;
12917 };
12918
12919 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12920 // keep doubling the size of the integer elements up to that. We can
12921 // then shift the elements of the integer vector by whole multiples of
12922 // their width within the elements of the larger integer vector. Test each
12923 // multiple to see if we can find a match with the moved element indices
12924 // and that the shifted in elements are all zeroable.
12925 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12926 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12927 for (int Shift = 1; Shift != Scale; ++Shift)
12928 for (bool Left : {true, false})
12929 if (CheckZeros(Shift, Scale, Left)) {
12930 int ShiftAmt = MatchShift(Shift, Scale, Left);
12931 if (0 < ShiftAmt)
12932 return ShiftAmt;
12933 }
12934
12935 // no match
12936 return -1;
12937}
12938
12939static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12940 SDValue V2, ArrayRef<int> Mask,
12941 const APInt &Zeroable,
12942 const X86Subtarget &Subtarget,
12943 SelectionDAG &DAG) {
12944 int Size = Mask.size();
12945 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12946
12947 MVT ShiftVT;
12948 SDValue V = V1;
12949 unsigned Opcode;
12950
12951 // Try to match shuffle against V1 shift.
12952 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12953 Mask, 0, Zeroable, Subtarget);
12954
12955 // If V1 failed, try to match shuffle against V2 shift.
12956 if (ShiftAmt < 0) {
12957 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12958 Mask, Size, Zeroable, Subtarget);
12959 V = V2;
12960 }
12961
12962 if (ShiftAmt < 0)
12963 return SDValue();
12964
12965 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12966 "Illegal integer vector type");
12967 V = DAG.getBitcast(ShiftVT, V);
12968 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12969 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12970 return DAG.getBitcast(VT, V);
12971}
12972
12973// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12974// Remainder of lower half result is zero and upper half is all undef.
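// For example (illustrative only), a v8i16 mask <2, 3, zz, zz, u, u, u, u>
// matches with Len == 2 and Idx == 2, i.e. BitLen == 32 and BitIdx == 32.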
12975static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12976 ArrayRef<int> Mask, uint64_t &BitLen,
12977 uint64_t &BitIdx, const APInt &Zeroable) {
12978 int Size = Mask.size();
12979 int HalfSize = Size / 2;
12980 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12981 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
12982
12983 // Upper half must be undefined.
12984 if (!isUndefUpperHalf(Mask))
12985 return false;
12986
12987 // Determine the extraction length from the part of the
12988 // lower half that isn't zeroable.
12989 int Len = HalfSize;
12990 for (; Len > 0; --Len)
12991 if (!Zeroable[Len - 1])
12992 break;
12993 assert(Len > 0 && "Zeroable shuffle mask");
12994
12995 // Attempt to match first Len sequential elements from the lower half.
12996 SDValue Src;
12997 int Idx = -1;
12998 for (int i = 0; i != Len; ++i) {
12999 int M = Mask[i];
13000 if (M == SM_SentinelUndef)
13001 continue;
13002 SDValue &V = (M < Size ? V1 : V2);
13003 M = M % Size;
13004
13005 // The extracted elements must start at a valid index and all mask
13006 // elements must be in the lower half.
13007 if (i > M || M >= HalfSize)
13008 return false;
13009
13010 if (Idx < 0 || (Src == V && Idx == (M - i))) {
13011 Src = V;
13012 Idx = M - i;
13013 continue;
13014 }
13015 return false;
13016 }
13017
13018 if (!Src || Idx < 0)
13019 return false;
13020
13021 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13022 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13023 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13024 V1 = Src;
13025 return true;
13026}
13027
13028// INSERTQ: Extract lowest Len elements from lower half of second source and
13029// insert over first source, starting at Idx.
13030// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
13031static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13032 ArrayRef<int> Mask, uint64_t &BitLen,
13033 uint64_t &BitIdx) {
13034 int Size = Mask.size();
13035 int HalfSize = Size / 2;
13036 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13037
13038 // Upper half must be undefined.
13039 if (!isUndefUpperHalf(Mask))
13040 return false;
13041
13042 for (int Idx = 0; Idx != HalfSize; ++Idx) {
13043 SDValue Base;
13044
13045 // Attempt to match first source from mask before insertion point.
13046 if (isUndefInRange(Mask, 0, Idx)) {
13047 /* EMPTY */
13048 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13049 Base = V1;
13050 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13051 Base = V2;
13052 } else {
13053 continue;
13054 }
13055
13056 // Extend the extraction length looking to match both the insertion of
13057 // the second source and the remaining elements of the first.
13058 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13059 SDValue Insert;
13060 int Len = Hi - Idx;
13061
13062 // Match insertion.
13063 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13064 Insert = V1;
13065 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13066 Insert = V2;
13067 } else {
13068 continue;
13069 }
13070
13071 // Match the remaining elements of the lower half.
13072 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13073 /* EMPTY */
13074 } else if ((!Base || (Base == V1)) &&
13075 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13076 Base = V1;
13077 } else if ((!Base || (Base == V2)) &&
13078 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13079 Size + Hi)) {
13080 Base = V2;
13081 } else {
13082 continue;
13083 }
13084
13085 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13086 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13087 V1 = Base;
13088 V2 = Insert;
13089 return true;
13090 }
13091 }
13092
13093 return false;
13094}
13095
13096/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13097static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13098 SDValue V2, ArrayRef<int> Mask,
13099 const APInt &Zeroable, SelectionDAG &DAG) {
13100 uint64_t BitLen, BitIdx;
13101 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13102 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13103 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13104 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13105
13106 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13107 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13108 V2 ? V2 : DAG.getUNDEF(VT),
13109 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13110 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13111
13112 return SDValue();
13113}
13114
13115/// Lower a vector shuffle as a zero or any extension.
13116///
13117/// Given a specific number of elements, element bit width, and extension
13118/// stride, produce either a zero or any extension based on the available
13119/// features of the subtarget. The extended elements are consecutive and
13120/// can start from an offsetted element index in the input; to
13121/// avoid excess shuffling the offset must either be in the bottom lane
13122/// or at the start of a higher lane. All extended elements must be from
13123/// the same lane.
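///
/// For example (illustrative only), a v8i16 zero-extension mask
/// <0, zz, 1, zz, 2, zz, 3, zz> corresponds to Scale == 2 with Offset == 0 and,
/// on SSE4.1+ targets, is lowered through a v4i32 ZERO_EXTEND_VECTOR_INREG of
/// the input (typically selecting to PMOVZXWD).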
13124static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13125 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13126 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13127 assert(Scale > 1 && "Need a scale to extend.");
13128 int EltBits = VT.getScalarSizeInBits();
13129 int NumElements = VT.getVectorNumElements();
13130 int NumEltsPerLane = 128 / EltBits;
13131 int OffsetLane = Offset / NumEltsPerLane;
13132 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13133 "Only 8, 16, and 32 bit elements can be extended.");
13134 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13135 assert(0 <= Offset && "Extension offset must be positive.");
13136 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13137 "Extension offset must be in the first lane or start an upper lane.");
13138
13139 // Check that an index is in same lane as the base offset.
13140 auto SafeOffset = [&](int Idx) {
13141 return OffsetLane == (Idx / NumEltsPerLane);
13142 };
13143
13144 // Shift along an input so that the offset base moves to the first element.
13145 auto ShuffleOffset = [&](SDValue V) {
13146 if (!Offset)
13147 return V;
13148
13149 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13150 for (int i = 0; i * Scale < NumElements; ++i) {
13151 int SrcIdx = i + Offset;
13152 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13153 }
13154 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13155 };
13156
13157 // Found a valid a/zext mask! Try various lowering strategies based on the
13158 // input type and available ISA extensions.
13159 if (Subtarget.hasSSE41()) {
13160 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13161 // PUNPCK will catch this in a later shuffle match.
13162 if (Offset && Scale == 2 && VT.is128BitVector())
13163 return SDValue();
13164 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13165 NumElements / Scale);
13166 InputV = ShuffleOffset(InputV);
13167 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13168 DL, ExtVT, InputV, DAG);
13169 return DAG.getBitcast(VT, InputV);
13170 }
13171
13172 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13173
13174 // For any extends we can cheat for larger element sizes and use shuffle
13175 // instructions that can fold with a load and/or copy.
13176 if (AnyExt && EltBits == 32) {
13177 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13178 -1};
13179 return DAG.getBitcast(
13180 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13181 DAG.getBitcast(MVT::v4i32, InputV),
13182 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13183 }
13184 if (AnyExt && EltBits == 16 && Scale > 2) {
13185 int PSHUFDMask[4] = {Offset / 2, -1,
13186 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13187 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13188 DAG.getBitcast(MVT::v4i32, InputV),
13189 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13190 int PSHUFWMask[4] = {1, -1, -1, -1};
13191 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13192 return DAG.getBitcast(
13193 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13194 DAG.getBitcast(MVT::v8i16, InputV),
13195 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13196 }
13197
13198 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13199 // to 64-bits.
13200 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13201 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13202 assert(VT.is128BitVector() && "Unexpected vector width!");
13203
13204 int LoIdx = Offset * EltBits;
13205 SDValue Lo = DAG.getBitcast(
13206 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13207 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13208 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13209
13210 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13211 return DAG.getBitcast(VT, Lo);
13212
13213 int HiIdx = (Offset + 1) * EltBits;
13214 SDValue Hi = DAG.getBitcast(
13215 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13216 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13217 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13218 return DAG.getBitcast(VT,
13219 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13220 }
13221
13222 // If this would require more than 2 unpack instructions to expand, use
13223 // pshufb when available. We can only use more than 2 unpack instructions
13224 // when zero extending i8 elements which also makes it easier to use pshufb.
13225 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13226 assert(NumElements == 16 && "Unexpected byte vector width!");
13227 SDValue PSHUFBMask[16];
13228 for (int i = 0; i < 16; ++i) {
13229 int Idx = Offset + (i / Scale);
13230 if ((i % Scale == 0 && SafeOffset(Idx))) {
13231 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13232 continue;
13233 }
13234 PSHUFBMask[i] =
13235 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13236 }
13237 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13238 return DAG.getBitcast(
13239 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13240 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13241 }
13242
13243 // If we are extending from an offset, ensure we start on a boundary that
13244 // we can unpack from.
13245 int AlignToUnpack = Offset % (NumElements / Scale);
13246 if (AlignToUnpack) {
13247 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13248 for (int i = AlignToUnpack; i < NumElements; ++i)
13249 ShMask[i - AlignToUnpack] = i;
13250 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13251 Offset -= AlignToUnpack;
13252 }
13253
13254 // Otherwise emit a sequence of unpacks.
13255 do {
13256 unsigned UnpackLoHi = X86ISD::UNPCKL;
13257 if (Offset >= (NumElements / 2)) {
13258 UnpackLoHi = X86ISD::UNPCKH;
13259 Offset -= (NumElements / 2);
13260 }
13261
13262 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13263 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13264 : getZeroVector(InputVT, Subtarget, DAG, DL);
13265 InputV = DAG.getBitcast(InputVT, InputV);
13266 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13267 Scale /= 2;
13268 EltBits *= 2;
13269 NumElements /= 2;
13270 } while (Scale > 1);
13271 return DAG.getBitcast(VT, InputV);
13272}
13273
13274/// Try to lower a vector shuffle as a zero extension on any microarch.
13275///
13276/// This routine will try to do everything in its power to cleverly lower
13277/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13278/// check for the profitability of this lowering; it tries to aggressively
13279/// match this pattern. It will use all of the micro-architectural details it
13280/// can to emit an efficient lowering. It handles both blends with all-zero
13281/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13282/// masking out later).
13283///
13284/// The reason we have dedicated lowering for zext-style shuffles is that they
13285/// are both incredibly common and often quite performance sensitive.
13286static SDValue lowerShuffleAsZeroOrAnyExtend(
13287 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13288 const APInt &Zeroable, const X86Subtarget &Subtarget,
13289 SelectionDAG &DAG) {
13290 int Bits = VT.getSizeInBits();
13291 int NumLanes = Bits / 128;
13292 int NumElements = VT.getVectorNumElements();
13293 int NumEltsPerLane = NumElements / NumLanes;
13294 assert(VT.getScalarSizeInBits() <= 32 &&
13295 "Exceeds 32-bit integer zero extension limit");
13296 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13297
13298 // Define a helper function to check a particular ext-scale and lower to it if
13299 // valid.
13300 auto Lower = [&](int Scale) -> SDValue {
13301 SDValue InputV;
13302 bool AnyExt = true;
13303 int Offset = 0;
13304 int Matches = 0;
13305 for (int i = 0; i < NumElements; ++i) {
13306 int M = Mask[i];
13307 if (M < 0)
13308 continue; // Valid anywhere but doesn't tell us anything.
13309 if (i % Scale != 0) {
13310 // Each of the extended elements need to be zeroable.
13311 if (!Zeroable[i])
13312 return SDValue();
13313
13314 // We no longer are in the anyext case.
13315 AnyExt = false;
13316 continue;
13317 }
13318
13319 // Each of the base elements needs to be consecutive indices into the
13320 // same input vector.
13321 SDValue V = M < NumElements ? V1 : V2;
13322 M = M % NumElements;
13323 if (!InputV) {
13324 InputV = V;
13325 Offset = M - (i / Scale);
13326 } else if (InputV != V)
13327 return SDValue(); // Flip-flopping inputs.
13328
13329 // Offset must start in the lowest 128-bit lane or at the start of an
13330 // upper lane.
13331 // FIXME: Is it ever worth allowing a negative base offset?
13332 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13333 (Offset % NumEltsPerLane) == 0))
13334 return SDValue();
13335
13336 // If we are offsetting, all referenced entries must come from the same
13337 // lane.
13338 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13339 return SDValue();
13340
13341 if ((M % NumElements) != (Offset + (i / Scale)))
13342 return SDValue(); // Non-consecutive strided elements.
13343 Matches++;
13344 }
13345
13346 // If we fail to find an input, we have a zero-shuffle which should always
13347 // have already been handled.
13348 // FIXME: Maybe handle this here in case during blending we end up with one?
13349 if (!InputV)
13350 return SDValue();
13351
13352 // If we are offsetting, don't extend if we only match a single input, we
13353 // can always do better by using a basic PSHUF or PUNPCK.
13354 if (Offset != 0 && Matches < 2)
13355 return SDValue();
13356
13357 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13358 InputV, Mask, Subtarget, DAG);
13359 };
13360
13361 // The widest scale possible for extending is to a 64-bit integer.
13362 assert(Bits % 64 == 0 &&
13363 "The number of bits in a vector must be divisible by 64 on x86!");
13364 int NumExtElements = Bits / 64;
13365
13366 // Each iteration, try extending the elements half as much, but into twice as
13367 // many elements.
13368 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13369 assert(NumElements % NumExtElements == 0 &&
13370 "The input vector size must be divisible by the extended size.");
13371 if (SDValue V = Lower(NumElements / NumExtElements))
13372 return V;
13373 }
13374
13375 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13376 if (Bits != 128)
13377 return SDValue();
13378
13379 // Returns one of the source operands if the shuffle can be reduced to a
13380 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13381 auto CanZExtLowHalf = [&]() {
13382 for (int i = NumElements / 2; i != NumElements; ++i)
13383 if (!Zeroable[i])
13384 return SDValue();
13385 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13386 return V1;
13387 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13388 return V2;
13389 return SDValue();
13390 };
13391
13392 if (SDValue V = CanZExtLowHalf()) {
13393 V = DAG.getBitcast(MVT::v2i64, V);
13394 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13395 return DAG.getBitcast(VT, V);
13396 }
13397
13398 // No viable ext lowering found.
13399 return SDValue();
13400}
13401
13402/// Try to get a scalar value for a specific element of a vector.
13403///
13404/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13405static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13406 SelectionDAG &DAG) {
13407 MVT VT = V.getSimpleValueType();
13408 MVT EltVT = VT.getVectorElementType();
13409 V = peekThroughBitcasts(V);
13410
13411 // If the bitcasts shift the element size, we can't extract an equivalent
13412 // element from it.
13413 MVT NewVT = V.getSimpleValueType();
13414 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13415 return SDValue();
13416
13417 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13418 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13419 // Ensure the scalar operand is the same size as the destination.
13420 // FIXME: Add support for scalar truncation where possible.
13421 SDValue S = V.getOperand(Idx);
13422 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13423 return DAG.getBitcast(EltVT, S);
13424 }
13425
13426 return SDValue();
13427}
13428
13429/// Helper to test for a load that can be folded with x86 shuffles.
13430///
13431/// This is particularly important because the set of instructions varies
13432/// significantly based on whether the operand is a load or not.
13433static bool isShuffleFoldableLoad(SDValue V) {
13434 V = peekThroughBitcasts(V);
13435 return ISD::isNON_EXTLoad(V.getNode());
13436}
13437
13438/// Try to lower insertion of a single element into a zero vector.
13439///
13440/// This is a common pattern for which we have especially efficient lowerings
13441/// across all subtarget feature sets.
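///
/// For example (illustrative only), a v4i32 mask <4, zz, zz, zz> (V2's low
/// element inserted into an otherwise-zero vector) is handled here via the
/// X86ISD::VZEXT_MOVL pattern.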
13442static SDValue lowerShuffleAsElementInsertion(
13443 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13444 const APInt &Zeroable, const X86Subtarget &Subtarget,
13445 SelectionDAG &DAG) {
13446 MVT ExtVT = VT;
13447 MVT EltVT = VT.getVectorElementType();
13448
13449 int V2Index =
13450 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13451 Mask.begin();
13452 bool IsV1Zeroable = true;
13453 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13454 if (i != V2Index && !Zeroable[i]) {
13455 IsV1Zeroable = false;
13456 break;
13457 }
13458
13459 // Check for a single input from a SCALAR_TO_VECTOR node.
13460 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13461 // all the smarts here sunk into that routine. However, the current
13462 // lowering of BUILD_VECTOR makes that nearly impossible until the old
13463 // vector shuffle lowering is dead.
13464 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13465 DAG);
13466 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13467 // We need to zext the scalar if it is smaller than an i32.
13468 V2S = DAG.getBitcast(EltVT, V2S);
13469 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
13470 // Using zext to expand a narrow element won't work for non-zero
13471 // insertions.
13472 if (!IsV1Zeroable)
13473 return SDValue();
13474
13475 // Zero-extend directly to i32.
13476 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13477 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13478 }
13479 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13480 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13481 EltVT == MVT::i16) {
13482 // Either not inserting from the low element of the input or the input
13483 // element size is too small to use VZEXT_MOVL to clear the high bits.
13484 return SDValue();
13485 }
13486
13487 if (!IsV1Zeroable) {
13488 // If V1 can't be treated as a zero vector we have fewer options to lower
13489 // this. We can't support integer vectors or non-zero targets cheaply, and
13490 // the V1 elements can't be permuted in any way.
13491 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13492 if (!VT.isFloatingPoint() || V2Index != 0)
13493 return SDValue();
13494 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
13495 V1Mask[V2Index] = -1;
13496 if (!isNoopShuffleMask(V1Mask))
13497 return SDValue();
13498 if (!VT.is128BitVector())
13499 return SDValue();
13500
13501 // Otherwise, use MOVSD or MOVSS.
13502 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
13503 "Only two types of floating point element types to handle!");
13504 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
13505 ExtVT, V1, V2);
13506 }
13507
13508 // This lowering only works for the low element with floating point vectors.
13509 if (VT.isFloatingPoint() && V2Index != 0)
13510 return SDValue();
13511
13512 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13513 if (ExtVT != VT)
13514 V2 = DAG.getBitcast(VT, V2);
13515
13516 if (V2Index != 0) {
13517 // If we have 4 or fewer lanes we can cheaply shuffle the element into
13518 // the desired position. Otherwise it is more efficient to do a vector
13519 // shift left. We know that we can do a vector shift left because all
13520 // the inputs are zero.
13521 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
13522 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13523 V2Shuffle[V2Index] = 0;
13524 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13525 } else {
13526 V2 = DAG.getBitcast(MVT::v16i8, V2);
13527 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13528 DAG.getTargetConstant(
13529 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
13530 V2 = DAG.getBitcast(VT, V2);
13531 }
13532 }
13533 return V2;
13534}
13535
13536/// Try to lower broadcast of a single - truncated - integer element,
13537/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13538///
13539/// This assumes we have AVX2.
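///
/// For example (illustrative only), broadcasting i8 element 5 of a
/// build_vector of i32 elements reads V0 operand 5/4 == 1, shifts it right by
/// (5 % 4) * 8 == 8 bits, truncates to i8 and then emits the VBROADCAST.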
13540static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13541 int BroadcastIdx,
13542 const X86Subtarget &Subtarget,
13543 SelectionDAG &DAG) {
13544 assert(Subtarget.hasAVX2() &&
13545 "We can only lower integer broadcasts with AVX2!");
13546
13547 MVT EltVT = VT.getVectorElementType();
13548 MVT V0VT = V0.getSimpleValueType();
13549
13550 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13551 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13552
13553 MVT V0EltVT = V0VT.getVectorElementType();
13554 if (!V0EltVT.isInteger())
13555 return SDValue();
13556
13557 const unsigned EltSize = EltVT.getSizeInBits();
13558 const unsigned V0EltSize = V0EltVT.getSizeInBits();
13559
13560 // This is only a truncation if the original element type is larger.
13561 if (V0EltSize <= EltSize)
13562 return SDValue();
13563
13564 assert(((V0EltSize % EltSize) == 0) &&
13565 "Scalar type sizes must all be powers of 2 on x86!");
13566
13567 const unsigned V0Opc = V0.getOpcode();
13568 const unsigned Scale = V0EltSize / EltSize;
13569 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13570
13571 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13572 V0Opc != ISD::BUILD_VECTOR)
13573 return SDValue();
13574
13575 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13576
13577 // If we're extracting non-least-significant bits, shift so we can truncate.
13578 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13579 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13580 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13581 if (const int OffsetIdx = BroadcastIdx % Scale)
13582 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13583 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13584
13585 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13586 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13587}
13588
13589/// Test whether this can be lowered with a single SHUFPS instruction.
13590///
13591/// This is used to disable more specialized lowerings when the shufps lowering
13592/// will happen to be efficient.
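///
/// For example (illustrative only), <0, 1, 4, 5> satisfies this (each half
/// reads a single input) while <0, 4, 1, 5> does not.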
13593static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13594 // This routine only handles 128-bit shufps.
13595 assert(Mask.size() == 4 && "Unsupported mask size!");
13596 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13597 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13598 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13599 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13600
13601 // To lower with a single SHUFPS we need to have the low half and high half
13602 // each requiring a single input.
13603 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13604 return false;
13605 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13606 return false;
13607
13608 return true;
13609}
13610
13611/// If we are extracting two 128-bit halves of a vector and shuffling the
13612/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13613/// multi-shuffle lowering.
13614static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13615 SDValue N1, ArrayRef<int> Mask,
13616 SelectionDAG &DAG) {
13617 MVT VT = N0.getSimpleValueType();
13618 assert((VT.is128BitVector() &&
13619 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13620 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13621
13622 // Check that both sources are extracts of the same source vector.
13623 if (!N0.hasOneUse() || !N1.hasOneUse() ||
13624 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13625 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13626 N0.getOperand(0) != N1.getOperand(0))
13627 return SDValue();
13628
13629 SDValue WideVec = N0.getOperand(0);
13630 MVT WideVT = WideVec.getSimpleValueType();
13631 if (!WideVT.is256BitVector())
13632 return SDValue();
13633
13634 // Match extracts of each half of the wide source vector. Commute the shuffle
13635 // if the extract of the low half is N1.
13636 unsigned NumElts = VT.getVectorNumElements();
13637 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13638 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13639 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13640 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13641 ShuffleVectorSDNode::commuteMask(NewMask);
13642 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13643 return SDValue();
13644
13645 // Final bailout: if the mask is simple, we are better off using an extract
13646 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13647 // because that avoids a constant load from memory.
13648 if (NumElts == 4 &&
13649 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
13650 return SDValue();
13651
13652 // Extend the shuffle mask with undef elements.
13653 NewMask.append(NumElts, -1);
13654
13655 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13656 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13657 NewMask);
13658 // This is free: ymm -> xmm.
13659 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13660 DAG.getIntPtrConstant(0, DL));
13661}
13662
13663/// Try to lower broadcast of a single element.
13664///
13665/// For convenience, this code also bundles all of the subtarget feature set
13666/// filtering. While a little annoying to re-dispatch on type here, there isn't
13667/// a convenient way to factor it out.
13668static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13669 SDValue V2, ArrayRef<int> Mask,
13670 const X86Subtarget &Subtarget,
13671 SelectionDAG &DAG) {
13672 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13673 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
13674 (Subtarget.hasAVX2() && VT.isInteger())))
13675 return SDValue();
13676
13677 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13678 // we can only broadcast from a register with AVX2.
13679 unsigned NumEltBits = VT.getScalarSizeInBits();
13680 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13681 ? X86ISD::MOVDDUP
13682 : X86ISD::VBROADCAST;
13683 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13684
13685 // Check that the mask is a broadcast.
13686 int BroadcastIdx = getSplatIndex(Mask);
13687 if (BroadcastIdx < 0)
13688 return SDValue();
13689 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13690 "a sorted mask where the broadcast "
13691 "comes from V1.");
13692
13693 // Go up the chain of (vector) values to find a scalar load that we can
13694 // combine with the broadcast.
13695 // TODO: Combine this logic with findEltLoadSrc() used by
13696 // EltsFromConsecutiveLoads().
13697 int BitOffset = BroadcastIdx * NumEltBits;
13698 SDValue V = V1;
13699 for (;;) {
13700 switch (V.getOpcode()) {
13701 case ISD::BITCAST: {
13702 V = V.getOperand(0);
13703 continue;
13704 }
13705 case ISD::CONCAT_VECTORS: {
13706 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13707 int OpIdx = BitOffset / OpBitWidth;
13708 V = V.getOperand(OpIdx);
13709 BitOffset %= OpBitWidth;
13710 continue;
13711 }
13712 case ISD::EXTRACT_SUBVECTOR: {
13713 // The extraction index adds to the existing offset.
13714 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13715 unsigned Idx = V.getConstantOperandVal(1);
13716 unsigned BeginOffset = Idx * EltBitWidth;
13717 BitOffset += BeginOffset;
13718 V = V.getOperand(0);
13719 continue;
13720 }
13721 case ISD::INSERT_SUBVECTOR: {
13722 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13723 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13724 int Idx = (int)V.getConstantOperandVal(2);
13725 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13726 int BeginOffset = Idx * EltBitWidth;
13727 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13728 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13729 BitOffset -= BeginOffset;
13730 V = VInner;
13731 } else {
13732 V = VOuter;
13733 }
13734 continue;
13735 }
13736 }
13737 break;
13738 }
13739 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13740 BroadcastIdx = BitOffset / NumEltBits;
13741
13742 // Do we need to bitcast the source to retrieve the original broadcast index?
13743 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13744
13745 // Check if this is a broadcast of a scalar. We special case lowering
13746 // for scalars so that we can more effectively fold with loads.
13747 // If the original value has a larger element type than the shuffle, the
13748 // broadcast element is in essence truncated. Make that explicit to ease
13749 // folding.
13750 if (BitCastSrc && VT.isInteger())
13751 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13752 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13753 return TruncBroadcast;
13754
13755 // Also check the simpler case, where we can directly reuse the scalar.
13756 if (!BitCastSrc &&
13757 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13758 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13759 V = V.getOperand(BroadcastIdx);
13760
13761 // If we can't broadcast from a register, check that the input is a load.
13762 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13763 return SDValue();
13764 } else if (ISD::isNormalLoad(V.getNode()) &&
13765 cast<LoadSDNode>(V)->isSimple()) {
13766 // We do not check for one-use of the vector load because a broadcast load
13767 // is expected to be a win for code size, register pressure, and possibly
13768 // uops even if the original vector load is not eliminated.
13769
13770 // Reduce the vector load and shuffle to a broadcasted scalar load.
13771 LoadSDNode *Ld = cast<LoadSDNode>(V);
13772 SDValue BaseAddr = Ld->getOperand(1);
13773 MVT SVT = VT.getScalarType();
13774 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13775 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13776 SDValue NewAddr =
13777 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
13778
13779 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13780 // than MOVDDUP.
13781 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13782 if (Opcode == X86ISD::VBROADCAST) {
13783 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13784 SDValue Ops[] = {Ld->getChain(), NewAddr};
13785 V = DAG.getMemIntrinsicNode(
13786 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13787 DAG.getMachineFunction().getMachineMemOperand(
13788 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13789 DAG.makeEquivalentMemoryOrdering(Ld, V);
13790 return DAG.getBitcast(VT, V);
13791 }
13792 assert(SVT == MVT::f64 && "Unexpected VT!");
13793 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13794 DAG.getMachineFunction().getMachineMemOperand(
13795 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13796 DAG.makeEquivalentMemoryOrdering(Ld, V);
13797 } else if (!BroadcastFromReg) {
13798 // We can't broadcast from a vector register.
13799 return SDValue();
13800 } else if (BitOffset != 0) {
13801 // We can only broadcast from the zero-element of a vector register,
13802 // but it can be advantageous to broadcast from the zero-element of a
13803 // subvector.
13804 if (!VT.is256BitVector() && !VT.is512BitVector())
13805 return SDValue();
13806
13807 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13808 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13809 return SDValue();
13810
13811 // Only broadcast the zero-element of a 128-bit subvector.
13812 if ((BitOffset % 128) != 0)
13813 return SDValue();
13814
13815 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13816 "Unexpected bit-offset");
13817 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13818 "Unexpected vector size");
13819 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13820 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13821 }
13822
13823 // On AVX we can use VBROADCAST directly for scalar sources.
13824 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13825 V = DAG.getBitcast(MVT::f64, V);
13826 if (Subtarget.hasAVX()) {
13827 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13828 return DAG.getBitcast(VT, V);
13829 }
13830 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13831 }
13832
13833 // If this is a scalar, do the broadcast on this type and bitcast.
13834 if (!V.getValueType().isVector()) {
13835 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13836 "Unexpected scalar size");
13837 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13838 VT.getVectorNumElements());
13839 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13840 }
13841
13842 // We only support broadcasting from 128-bit vectors to minimize the
13843 // number of patterns we need to deal with in isel. So extract down to
13844 // 128-bits, removing as many bitcasts as possible.
13845 if (V.getValueSizeInBits() > 128)
13846 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13847
13848 // Otherwise cast V to a vector with the same element type as VT, but
13849 // possibly narrower than VT. Then perform the broadcast.
13850 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13851 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13852 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13853}
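// --- Editor's illustrative sketch (stand-alone; not part of X86ISelLowering.cpp) ---
// A minimal stand-in for the splat-index / bit-offset bookkeeping in the
// broadcast lowering above: find the single lane that every defined mask
// element refers to, then express it as a bit offset into the source. The
// helper name is hypothetical; LLVM's real getSplatIndex lives in its shuffle
// utilities and handles more cases.
#include <cstdio>
#include <vector>

static int getSplatIndexSketch(const std::vector<int> &Mask) {
  int Splat = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;           // undef lanes do not constrain the splat.
    if (Splat < 0)
      Splat = M;
    else if (Splat != M)
      return -1;          // two different source lanes: not a broadcast.
  }
  return Splat;
}

int main() {
  std::vector<int> Mask = {2, -1, 2, 2};   // v4i32-style broadcast of lane 2.
  unsigned NumEltBits = 32;
  int Idx = getSplatIndexSketch(Mask);
  if (Idx >= 0)
    std::printf("broadcast lane %d, bit offset %u\n", Idx, Idx * NumEltBits);
}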
13854
13855// Check for whether we can use INSERTPS to perform the shuffle. We only use
13856// INSERTPS when the V1 elements are already in the correct locations
13857// because otherwise we can just always use two SHUFPS instructions which
13858// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13859// perform INSERTPS if a single V1 element is out of place and all V2
13860// elements are zeroable.
13861static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13862 unsigned &InsertPSMask,
13863 const APInt &Zeroable,
13864 ArrayRef<int> Mask, SelectionDAG &DAG) {
13865 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13866 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13867 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13868
13869 // Attempt to match INSERTPS with one element from VA or VB being
13870 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13871 // are updated.
13872 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13873 ArrayRef<int> CandidateMask) {
13874 unsigned ZMask = 0;
13875 int VADstIndex = -1;
13876 int VBDstIndex = -1;
13877 bool VAUsedInPlace = false;
13878
13879 for (int i = 0; i < 4; ++i) {
13880 // Synthesize a zero mask from the zeroable elements (includes undefs).
13881 if (Zeroable[i]) {
13882 ZMask |= 1 << i;
13883 continue;
13884 }
13885
13886 // Flag if we use any VA inputs in place.
13887 if (i == CandidateMask[i]) {
13888 VAUsedInPlace = true;
13889 continue;
13890 }
13891
13892 // We can only insert a single non-zeroable element.
13893 if (VADstIndex >= 0 || VBDstIndex >= 0)
13894 return false;
13895
13896 if (CandidateMask[i] < 4) {
13897 // VA input out of place for insertion.
13898 VADstIndex = i;
13899 } else {
13900 // VB input for insertion.
13901 VBDstIndex = i;
13902 }
13903 }
13904
13905 // Don't bother if we have no (non-zeroable) element for insertion.
13906 if (VADstIndex < 0 && VBDstIndex < 0)
13907 return false;
13908
13909 // Determine element insertion src/dst indices. The src index is from the
13910 // start of the inserted vector, not the start of the concatenated vector.
13911 unsigned VBSrcIndex = 0;
13912 if (VADstIndex >= 0) {
13913 // If we have a VA input out of place, we use VA as the V2 element
13914 // insertion and don't use the original V2 at all.
13915 VBSrcIndex = CandidateMask[VADstIndex];
13916 VBDstIndex = VADstIndex;
13917 VB = VA;
13918 } else {
13919 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13920 }
13921
13922 // If no V1 inputs are used in place, then the result is created only from
13923 // the zero mask and the V2 insertion - so remove V1 dependency.
13924 if (!VAUsedInPlace)
13925 VA = DAG.getUNDEF(MVT::v4f32);
13926
13927 // Update V1, V2 and InsertPSMask accordingly.
13928 V1 = VA;
13929 V2 = VB;
13930
13931 // Insert the V2 element into the desired position.
13932 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13933 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13934 return true;
13935 };
13936
13937 if (matchAsInsertPS(V1, V2, Mask))
13938 return true;
13939
13940 // Commute and try again.
13941 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13942 ShuffleVectorSDNode::commuteMask(CommutedMask);
13943 if (matchAsInsertPS(V2, V1, CommutedMask))
13944 return true;
13945
13946 return false;
13947}
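// --- Editor's illustrative sketch (stand-alone; not part of X86ISelLowering.cpp) ---
// Layout of the INSERTPS immediate assembled by the matcher above: bits [7:6]
// select the source lane of the inserted operand, bits [5:4] select the
// destination lane, and bits [3:0] zero individual result lanes. The encoder
// below is a hypothetical stand-alone helper, not an LLVM API.
#include <cassert>
#include <cstdio>

static unsigned encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                  unsigned ZMask) {
  assert(SrcIdx < 4 && DstIdx < 4 && ZMask < 16 && "field out of range");
  return (SrcIdx << 6) | (DstIdx << 4) | ZMask;
}

int main() {
  // Insert the source's lane 2 into destination lane 1 and zero lane 3:
  // 0b10'01'1000 == 0x98.
  std::printf("0x%02X\n", encodeInsertPSImm(2, 1, 1u << 3));
}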
13948
13949static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13950 ArrayRef<int> Mask, const APInt &Zeroable,
13951 SelectionDAG &DAG) {
13952 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13953 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13954
13955 // Attempt to match the insertps pattern.
13956 unsigned InsertPSMask = 0;
13957 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13958 return SDValue();
13959
13960 // Insert the V2 element into the desired position.
13961 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13962 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13963}
13964
13965/// Try to lower a shuffle as a permute of the inputs followed by an
13966/// UNPCK instruction.
13967///
13968/// This specifically targets cases where we end up with alternating between
13969/// the two inputs, and so can permute them into something that feeds a single
13970/// UNPCK instruction. Note that this routine only targets integer vectors
13971/// because for floating point vectors we have a generalized SHUFPS lowering
13972/// strategy that handles everything that doesn't *exactly* match an unpack,
13973/// making this clever lowering unnecessary.
13974static SDValue lowerShuffleAsPermuteAndUnpack(
13975 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13976 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13977 assert(!VT.isFloatingPoint() &&
13978 "This routine only supports integer vectors.");
13979 assert(VT.is128BitVector() &&
13980 "This routine only works on 128-bit vectors.");
13981 assert(!V2.isUndef() &&
13982 "This routine should only be used when blending two inputs.");
13983 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13984
13985 int Size = Mask.size();
13986
13987 int NumLoInputs =
13988 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13989 int NumHiInputs =
13990 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13991
13992 bool UnpackLo = NumLoInputs >= NumHiInputs;
13993
13994 auto TryUnpack = [&](int ScalarSize, int Scale) {
13995 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13996 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13997
13998 for (int i = 0; i < Size; ++i) {
13999 if (Mask[i] < 0)
14000 continue;
14001
14002 // Each element of the unpack contains Scale elements from this mask.
14003 int UnpackIdx = i / Scale;
14004
14005 // We only handle the case where V1 feeds the first slots of the unpack.
14006 // We rely on canonicalization to ensure this is the case.
14007 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14008 return SDValue();
14009
14010 // Setup the mask for this input. The indexing is tricky as we have to
14011 // handle the unpack stride.
14012 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14013 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14014 Mask[i] % Size;
14015 }
14016
14017 // If we will have to shuffle both inputs to use the unpack, check whether
14018 // we can just unpack first and shuffle the result. If so, skip this unpack.
14019 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14020 !isNoopShuffleMask(V2Mask))
14021 return SDValue();
14022
14023 // Shuffle the inputs into place.
14024 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14025 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14026
14027 // Cast the inputs to the type we will use to unpack them.
14028 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14029 V1 = DAG.getBitcast(UnpackVT, V1);
14030 V2 = DAG.getBitcast(UnpackVT, V2);
14031
14032 // Unpack the inputs and cast the result back to the desired type.
14033 return DAG.getBitcast(
14034 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14035 UnpackVT, V1, V2));
14036 };
14037
14038 // We try each unpack from the largest to the smallest to try and find one
14039 // that fits this mask.
14040 int OrigScalarSize = VT.getScalarSizeInBits();
14041 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14042 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14043 return Unpack;
14044
14045 // If we're shuffling with a zero vector then we're better off not doing
14046 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14047 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14048 ISD::isBuildVectorAllZeros(V2.getNode()))
14049 return SDValue();
14050
14051 // If none of the unpack-rooted lowerings worked (or were profitable) try an
14052 // initial unpack.
14053 if (NumLoInputs == 0 || NumHiInputs == 0) {
14054 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14055 "We have to have *some* inputs!");
14056 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14057
14058 // FIXME: We could consider the total complexity of the permute of each
14059 // possible unpacking. Or at the least we should consider how many
14060 // half-crossings are created.
14061 // FIXME: We could consider commuting the unpacks.
14062
14063 SmallVector<int, 32> PermMask((unsigned)Size, -1);
14064 for (int i = 0; i < Size; ++i) {
14065 if (Mask[i] < 0)
14066 continue;
14067
14068 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14069
14070 PermMask[i] =
14071 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14072 }
14073 return DAG.getVectorShuffle(
14074 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14075 DL, VT, V1, V2),
14076 DAG.getUNDEF(VT), PermMask);
14077 }
14078
14079 return SDValue();
14080}
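// --- Editor's illustrative sketch (stand-alone; not part of X86ISelLowering.cpp) ---
// The core mask bookkeeping of the permute+unpack lowering above, specialised
// to Scale == 1 and an unpack-lo: split a two-input mask into the per-input
// permutes that let a single UNPCKL produce the requested interleave. The
// function name is hypothetical; this models the mask math only, not DAG
// construction.
#include <cstdio>
#include <vector>

static bool splitForUnpackLo(const std::vector<int> &Mask,
                             std::vector<int> &V1Mask,
                             std::vector<int> &V2Mask) {
  int Size = (int)Mask.size();
  V1Mask.assign(Size, -1);
  V2Mask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    // Even unpack slots must be fed by V1, odd slots by V2.
    if ((i % 2 == 0) != (Mask[i] < Size))
      return false;
    std::vector<int> &VMask = (i % 2 == 0) ? V1Mask : V2Mask;
    VMask[i / 2] = Mask[i] % Size;   // lane UNPCKL will read for this slot.
  }
  return true;
}

int main() {
  // v4i32 mask {0, 5, 1, 4}: V1 is already in place, V2 needs lanes 0 and 1
  // swapped, and UNPCKL then interleaves them.
  std::vector<int> V1Mask, V2Mask;
  if (splitForUnpackLo({0, 5, 1, 4}, V1Mask, V2Mask)) {
    for (int M : V1Mask) std::printf("%d ", M);
    std::printf("| ");
    for (int M : V2Mask) std::printf("%d ", M);
    std::printf("\n");               // prints: 0 1 -1 -1 | 1 0 -1 -1
  }
}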
14081
14082/// Handle lowering of 2-lane 64-bit floating point shuffles.
14083///
14084/// This is the basis function for the 2-lane 64-bit shuffles as we have full
14085/// support for floating point shuffles but not integer shuffles. These
14086/// instructions will incur a domain crossing penalty on some chips though so
14087/// it is better to avoid lowering through this for integer vectors where
14088/// possible.
14089static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14090 const APInt &Zeroable, SDValue V1, SDValue V2,
14091 const X86Subtarget &Subtarget,
14092 SelectionDAG &DAG) {
14093 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14094 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14095 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14096
14097 if (V2.isUndef()) {
14098 // Check for being able to broadcast a single element.
14099 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14100 Mask, Subtarget, DAG))
14101 return Broadcast;
14102
14103 // Straight shuffle of a single input vector. Simulate this by using the
14104 // single input as both of the "inputs" to this instruction.
14105 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14106
14107 if (Subtarget.hasAVX()) {
14108 // If we have AVX, we can use VPERMILPS which will allow folding a load
14109 // into the shuffle.
14110 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14111 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14112 }
14113
14114 return DAG.getNode(
14115 X86ISD::SHUFP, DL, MVT::v2f64,
14116 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14117 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14118 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14119 }
14120 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14121 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14122 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14123 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14124
14125 if (Subtarget.hasAVX2())
14126 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14127 return Extract;
14128
14129 // When loading a scalar and then shuffling it into a vector we can often do
14130 // the insertion cheaply.
14131 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14132 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14133 return Insertion;
14134 // Try inverting the insertion since for v2 masks it is easy to do and we
14135 // can't reliably sort the mask one way or the other.
14136 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14137 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14138 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14139 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14140 return Insertion;
14141
14142 // Try to use one of the special instruction patterns to handle two common
14143 // blend patterns if a zero-blend above didn't work.
14144 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14145 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14146 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14147 // We can either use a special instruction to load over the low double or
14148 // to move just the low double.
14149 return DAG.getNode(
14150 X86ISD::MOVSD, DL, MVT::v2f64, V2,
14151 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14152
14153 if (Subtarget.hasSSE41())
14154 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14155 Zeroable, Subtarget, DAG))
14156 return Blend;
14157
14158 // Use dedicated unpack instructions for masks that match their pattern.
14159 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14160 return V;
14161
14162 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14163 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14164 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14165}
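// --- Editor's illustrative sketch (stand-alone; not part of X86ISelLowering.cpp) ---
// The SHUFPD immediate built at the end of the v2f64 path: bit 0 picks which
// lane of the first operand lands in result lane 0, bit 1 picks which lane of
// the second operand lands in result lane 1. This assumes the mask is already
// canonicalised so Mask[0] selects from V1 (0 or 1) and Mask[1] from V2 (2 or
// 3), as the asserts above guarantee.
#include <cstdio>

static unsigned shufpdImmForV2F64(int M0, int M1) {
  return (unsigned)(M0 == 1) | ((unsigned)((M1 - 2) == 1) << 1);
}

int main() {
  // Mask {1, 2}: take V1's high lane and V2's low lane -> immediate 0b01.
  std::printf("imm = %u\n", shufpdImmForV2F64(1, 2));
}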
14166
14167/// Handle lowering of 2-lane 64-bit integer shuffles.
14168///
14169/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14170/// the integer unit to minimize domain crossing penalties. However, for blends
14171/// it falls back to the floating point shuffle operation with appropriate bit
14172/// casting.
14173static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14174 const APInt &Zeroable, SDValue V1, SDValue V2,
14175 const X86Subtarget &Subtarget,
14176 SelectionDAG &DAG) {
14177 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14178 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14179 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14180
14181 if (V2.isUndef()) {
14182 // Check for being able to broadcast a single element.
14183 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14184 Mask, Subtarget, DAG))
14185 return Broadcast;
14186
14187 // Straight shuffle of a single input vector. For everything from SSE2
14188 // onward this has a single fast instruction with no scary immediates.
14189 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14190 V1 = DAG.getBitcast(MVT::v4i32, V1);
14191 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14192 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14193 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14194 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14195 return DAG.getBitcast(
14196 MVT::v2i64,
14197 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14198 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14199 }
14200 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14201 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14202 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14203 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14204
14205 if (Subtarget.hasAVX2())
14206 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14207 return Extract;
14208
14209 // Try to use shift instructions.
14210 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14211 Zeroable, Subtarget, DAG))
14212 return Shift;
14213
14214 // When loading a scalar and then shuffling it into a vector we can often do
14215 // the insertion cheaply.
14216 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14217 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14218 return Insertion;
14219 // Try inverting the insertion since for v2 masks it is easy to do and we
14220 // can't reliably sort the mask one way or the other.
14221 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14222 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14223 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14224 return Insertion;
14225
14226 // We have different paths for blend lowering, but they all must use the
14227 // *exact* same predicate.
14228 bool IsBlendSupported = Subtarget.hasSSE41();
14229 if (IsBlendSupported)
14230 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14231 Zeroable, Subtarget, DAG))
14232 return Blend;
14233
14234 // Use dedicated unpack instructions for masks that match their pattern.
14235 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14236 return V;
14237
14238 // Try to use byte rotation instructions.
14239 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14240 if (Subtarget.hasSSSE3()) {
14241 if (Subtarget.hasVLX())
14242 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14243 Subtarget, DAG))
14244 return Rotate;
14245
14246 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14247 Subtarget, DAG))
14248 return Rotate;
14249 }
14250
14251 // If we have direct support for blends, we should lower by decomposing into
14252 // a permute. That will be faster than the domain cross.
14253 if (IsBlendSupported)
14254 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14255 Subtarget, DAG);
14256
14257 // We implement this with SHUFPD which is pretty lame because it will likely
14258 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14259 // However, all the alternatives are still more cycles and newer chips don't
14260 // have this problem. It would be really nice if x86 had better shuffles here.
14261 V1 = DAG.getBitcast(MVT::v2f64, V1);
14262 V2 = DAG.getBitcast(MVT::v2f64, V2);
14263 return DAG.getBitcast(MVT::v2i64,
14264 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14265}
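// --- Editor's illustrative sketch (stand-alone; not part of X86ISelLowering.cpp) ---
// The v2i64 -> v4i32 widening used for the single-input case above: each
// defined 64-bit lane M becomes the 32-bit lane pair {2*M, 2*M+1}, so the
// shuffle can be issued as a single PSHUFD; undef 64-bit lanes stay undef in
// both halves. The helper name is hypothetical.
#include <cstdio>

static void widenV2I64Mask(const int In[2], int Out[4]) {
  for (int i = 0; i < 2; ++i) {
    Out[2 * i + 0] = In[i] < 0 ? -1 : 2 * In[i];
    Out[2 * i + 1] = In[i] < 0 ? -1 : 2 * In[i] + 1;
  }
}

int main() {
  int In[2] = {1, 0}, Out[4];     // swap the two 64-bit lanes.
  widenV2I64Mask(In, Out);
  for (int M : Out)
    std::printf("%d ", M);        // prints: 2 3 0 1
  std::printf("\n");
}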
14266
14267/// Lower a vector shuffle using the SHUFPS instruction.
14268///
14269/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14270 /// It makes no assumptions about whether this is the *best* lowering; it simply
14271/// uses it.
14272static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14273 ArrayRef<int> Mask, SDValue V1,
14274 SDValue V2, SelectionDAG &DAG) {
14275 SDValue LowV = V1, HighV = V2;
14276 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14277 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14278
14279 if (NumV2Elements == 1) {
14280 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14281
14282 // Compute the index adjacent to V2Index and in the same half by toggling
14283 // the low bit.
14284 int V2AdjIndex = V2Index ^ 1;
14285
14286 if (Mask[V2AdjIndex] < 0) {
14287 // Handles all the cases where we have a single V2 element and an undef.
14288 // This will only ever happen in the high lanes because we commute the
14289 // vector otherwise.
14290 if (V2Index < 2)
14291 std::swap(LowV, HighV);
14292 NewMask[V2Index] -= 4;
14293 } else {
14294 // Handle the case where the V2 element ends up adjacent to a V1 element.
14295 // To make this work, blend them together as the first step.
14296 int V1Index = V2AdjIndex;
14297 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14298 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14299 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14300
14301 // Now proceed to reconstruct the final blend as we have the necessary
14302 // high or low half formed.
14303 if (V2Index < 2) {
14304 LowV = V2;
14305 HighV = V1;
14306 } else {
14307 HighV = V2;
14308 }
14309 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14310 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14311 }
14312 } else if (NumV2Elements == 2) {
14313 if (Mask[0] < 4 && Mask[1] < 4) {
14314 // Handle the easy case where we have V1 in the low lanes and V2 in the
14315 // high lanes.
14316 NewMask[2] -= 4;
14317 NewMask[3] -= 4;
14318 } else if (Mask[2] < 4 && Mask[3] < 4) {
14319 // We also handle the reversed case because this utility may get called
14320 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14321 // arrange things in the right direction.
14322 NewMask[0] -= 4;
14323 NewMask[1] -= 4;
14324 HighV = V1;
14325 LowV = V2;
14326 } else {
14327 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14328 // trying to place elements directly, just blend them and set up the final
14329 // shuffle to place them.
14330
14331 // The first two blend mask elements are for V1, the second two are for
14332 // V2.
14333 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14334 Mask[2] < 4 ? Mask[2] : Mask[3],
14335 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14336 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14337 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14338 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14339
14340 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14341 // a blend.
14342 LowV = HighV = V1;
14343 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14344 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14345 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14346 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14347 }
14348 } else if (NumV2Elements == 3) {
14349 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14350 // we can get here due to other paths (e.g. repeated mask matching) that we
14351 // don't want to do another round of lowerVECTOR_SHUFFLE.
14352 ShuffleVectorSDNode::commuteMask(NewMask);
14353 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14354 }
14355 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14356 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14357}
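// --- Editor's illustrative sketch (stand-alone; not part of X86ISelLowering.cpp) ---
// A scalar model of the "mixture in both low and high lanes" case above: one
// SHUFPS pulls the needed V1 lanes into the low half and the needed V2 lanes
// into the high half, and a second SHUFPS of that temporary with itself puts
// them in their final order. Checked here for the mask {0, 4, 2, 6}, for which
// the routine above computes BlendMask = {0, 2, 0, 2} and NewMask = {0, 2, 1, 3}.
#include <array>
#include <cstdio>

using V4 = std::array<float, 4>;

// SHUFPS semantics: the low two result lanes come from A, the high two from B.
static V4 shufps(const V4 &A, const V4 &B, const int M[4]) {
  return {A[M[0]], A[M[1]], B[M[2]], B[M[3]]};
}

int main() {
  V4 V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
  int Blend[4] = {0, 2, 0, 2};    // t = [V1[0], V1[2], V2[0], V2[2]]
  int Fix[4] = {0, 2, 1, 3};      // result = [t[0], t[2], t[1], t[3]]
  V4 T = shufps(V1, V2, Blend);
  V4 R = shufps(T, T, Fix);
  for (float F : R)
    std::printf("%g ", F);        // prints: 10 20 12 22
  std::printf("\n");
}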
14358
14359/// Lower 4-lane 32-bit floating point shuffles.
14360///
14361/// Uses instructions exclusively from the floating point unit to minimize
14362/// domain crossing penalties, as these are sufficient to implement all v4f32
14363/// shuffles.
14364static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14365 const APInt &Zeroable, SDValue V1, SDValue V2,
14366 const X86Subtarget &Subtarget,
14367 SelectionDAG &DAG) {
14368 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14369 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14370 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14371
14372 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14373
14374 if (NumV2Elements == 0) {
14375 // Check for being able to broadcast a single element.
14376 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14377 Mask, Subtarget, DAG))
14378 return Broadcast;
14379
14380 // Use even/odd duplicate instructions for masks that match their pattern.
14381 if (Subtarget.hasSSE3()) {
14382 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14383 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14384 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14385 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14386 }
14387
14388 if (Subtarget.hasAVX()) {
14389 // If we have AVX, we can use VPERMILPS which will allow folding a load
14390 // into the shuffle.
14391 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14392 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14393 }
14394
14395 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14396 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14397 if (!Subtarget.hasSSE2()) {
14398 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14399 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14400 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14401 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14402 }
14403
14404 // Otherwise, use a straight shuffle of a single input vector. We pass the
14405 // input vector to both operands to simulate this with a SHUFPS.
14406 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14407 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14408 }
14409
14410 if (Subtarget.hasAVX2())
14411 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14412 return Extract;
14413
14414 // There are special ways we can lower some single-element blends. However, we
14415 // have custom ways we can lower more complex single-element blends below that
14416 // we defer to if both this and BLENDPS fail to match, so restrict this to
14417 // when the V2 input is targeting element 0 of the mask -- that is the fast
14418 // case here.
14419 if (NumV2Elements == 1 && Mask[0] >= 4)
14420 if (SDValue V = lowerShuffleAsElementInsertion(
14421 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14422 return V;
14423
14424 if (Subtarget.hasSSE41()) {
14425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14426 Zeroable, Subtarget, DAG))
14427 return Blend;
14428
14429 // Use INSERTPS if we can complete the shuffle efficiently.
14430 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14431 return V;
14432
14433 if (!isSingleSHUFPSMask(Mask))
14434 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14435 V2, Mask, DAG))
14436 return BlendPerm;
14437 }
14438
14439 // Use low/high mov instructions. These are only valid in SSE1 because
14440 // otherwise they are widened to v2f64 and never get here.
14441 if (!Subtarget.hasSSE2()) {
14442 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14443 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14444 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14445 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14446 }
14447
14448 // Use dedicated unpack instructions for masks that match their pattern.
14449 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14450 return V;
14451
14452 // Otherwise fall back to a SHUFPS lowering strategy.
14453 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14454}
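// --- Editor's illustrative sketch (stand-alone; not part of X86ISelLowering.cpp) ---
// The 8-bit immediate that SHUFPS/PSHUFD/VPERMILPS consume in the v4 lowerings
// above packs one 2-bit source-lane selector per destination lane. This is a
// hypothetical stand-in for LLVM's getV4X86ShuffleImm8ForMask; undef lanes are
// simply encoded as lane 0 here, and the real helper may handle them differently.
#include <cstdio>

static unsigned v4ShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i] < 0 ? 0 : (Mask[i] & 3);   // undef -> pick lane 0.
    Imm |= (unsigned)M << (2 * i);
  }
  return Imm;
}

int main() {
  int Reverse[4] = {3, 2, 1, 0};                     // lane reversal.
  std::printf("0x%02X\n", v4ShuffleImm8(Reverse));   // prints: 0x1B
}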
14455
14456/// Lower 4-lane i32 vector shuffles.
14457///
14458/// We try to handle these with integer-domain shuffles where we can, but for
14459/// blends we use the floating point domain blend instructions.
14460static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14461 const APInt &Zeroable, SDValue V1, SDValue V2,
14462 const X86Subtarget &Subtarget,
14463 SelectionDAG &DAG) {
14464 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14465 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14466 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14467
14468 // Whenever we can lower this as a zext, that instruction is strictly faster
14469 // than any alternative. It also allows us to fold memory operands into the
14470 // shuffle in many cases.
14471 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14472 Zeroable, Subtarget, DAG))
14473 return ZExt;
14474
14475 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14476
14477 if (NumV2Elements == 0) {
14478 // Try to use broadcast unless the mask only has one non-undef element.
14479 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14480 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14481 Mask, Subtarget, DAG))
14482 return Broadcast;
14483 }
14484
14485 // Straight shuffle of a single input vector. For everything from SSE2
14486 // onward this has a single fast instruction with no scary immediates.
14487 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14488 // but we aren't actually going to use the UNPCK instruction because doing
14489 // so prevents folding a load into this instruction or making a copy.
14490 const int UnpackLoMask[] = {0, 0, 1, 1};
14491 const int UnpackHiMask[] = {2, 2, 3, 3};
14492 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14493 Mask = UnpackLoMask;
14494 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14495 Mask = UnpackHiMask;
14496
14497 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14498 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14499 }
14500
14501 if (Subtarget.hasAVX2())
14502 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14503 return Extract;
14504
14505 // Try to use shift instructions.
14506 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14507 Zeroable, Subtarget, DAG))
14508 return Shift;
14509
14510 // There are special ways we can lower some single-element blends.
14511 if (NumV2Elements == 1)
14512 if (SDValue V = lowerShuffleAsElementInsertion(
14513 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14514 return V;
14515
14516 // We have different paths for blend lowering, but they all must use the
14517 // *exact* same predicate.
14518 bool IsBlendSupported = Subtarget.hasSSE41();
14519 if (IsBlendSupported)
14520 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14521 Zeroable, Subtarget, DAG))
14522 return Blend;
14523
14524 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14525 Zeroable, Subtarget, DAG))
14526 return Masked;
14527
14528 // Use dedicated unpack instructions for masks that match their pattern.
14529 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14530 return V;
14531
14532 // Try to use byte rotation instructions.
14533 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14534 if (Subtarget.hasSSSE3()) {
14535 if (Subtarget.hasVLX())
14536 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14537 Subtarget, DAG))
14538 return Rotate;
14539
14540 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14541 Subtarget, DAG))
14542 return Rotate;
14543 }
14544
14545 // Assume that a single SHUFPS is faster than an alternative sequence of
14546 // multiple instructions (even if the CPU has a domain penalty).
14547 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14548 if (!isSingleSHUFPSMask(Mask)) {
14549 // If we have direct support for blends, we should lower by decomposing into
14550 // a permute. That will be faster than the domain cross.
14551 if (IsBlendSupported)
14552 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14553 Subtarget, DAG);
14554
14555 // Try to lower by permuting the inputs into an unpack instruction.
14556 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14557 Mask, Subtarget, DAG))
14558 return Unpack;
14559 }
14560
14561 // We implement this with SHUFPS because it can blend from two vectors.
14562 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14563 // up the inputs, bypassing domain shift penalties that we would incur if we
14564 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14565 // relevant.
14566 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14567 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14568 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14569 return DAG.getBitcast(MVT::v4i32, ShufPS);
14570}
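// --- Editor's illustrative sketch (stand-alone; not part of X86ISelLowering.cpp) ---
// Why the single-input v4i32 path above keeps PSHUFD even for unpack-shaped
// masks: unpacking a register with itself and shuffling it with the mask
// {0, 0, 1, 1} produce the same lanes, but PSHUFD can fold its single operand
// from memory. A scalar check of that equivalence:
#include <array>
#include <cstdio>

using V4i = std::array<int, 4>;

static V4i pshufd(const V4i &A, const int M[4]) {
  return {A[M[0]], A[M[1]], A[M[2]], A[M[3]]};
}

static V4i unpckldq(const V4i &A, const V4i &B) {
  return {A[0], B[0], A[1], B[1]};    // interleave the low halves.
}

int main() {
  V4i A = {10, 11, 12, 13};
  int LoMask[4] = {0, 0, 1, 1};
  std::printf("%s\n", pshufd(A, LoMask) == unpckldq(A, A) ? "equal"
                                                          : "different");
}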
14571
14572/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14573/// shuffle lowering, and the most complex part.
14574///
14575/// The lowering strategy is to try to form pairs of input lanes which are
14576/// targeted at the same half of the final vector, and then use a dword shuffle
14577/// to place them onto the right half, and finally unpack the paired lanes into
14578/// their final position.
14579///
14580/// The exact breakdown of how to form these dword pairs and align them on the
14581/// correct sides is really tricky. See the comments within the function for
14582/// more of the details.
14583///
14584/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14585/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14586/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14587/// vector, form the analogous 128-bit 8-element Mask.
14588static SDValue lowerV8I16GeneralSingleInputShuffle(
14589 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14590 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14591 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14592 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14593
14594 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14595 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14596 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14597
14598 // Attempt to directly match PSHUFLW or PSHUFHW.
14599 if (isUndefOrInRange(LoMask, 0, 4) &&
14600 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14601 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14602 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14603 }
14604 if (isUndefOrInRange(HiMask, 4, 8) &&
14605 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14606 for (int i = 0; i != 4; ++i)
14607 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14608 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14609 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14610 }
14611
14612 SmallVector<int, 4> LoInputs;
14613 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14614 array_pod_sort(LoInputs.begin(), LoInputs.end());
14615 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
14616 SmallVector<int, 4> HiInputs;
14617 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14618 array_pod_sort(HiInputs.begin(), HiInputs.end());
14619 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
14620 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14621 int NumHToL = LoInputs.size() - NumLToL;
14622 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14623 int NumHToH = HiInputs.size() - NumLToH;
14624 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14625 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14626 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14627 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14628
14629 // If we are shuffling values from one half, check how many different DWORD
14630 // pairs we need to create. If only 1 or 2 then we can perform this as a
14631 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14632 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14633 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14634 V = DAG.getNode(ShufWOp, DL, VT, V,
14635 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14636 V = DAG.getBitcast(PSHUFDVT, V);
14637 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14638 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14639 return DAG.getBitcast(VT, V);
14640 };
14641
14642 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14643 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14644 SmallVector<std::pair<int, int>, 4> DWordPairs;
14645 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14646
14647 // Collect the different DWORD pairs.
14648 for (int DWord = 0; DWord != 4; ++DWord) {
14649 int M0 = Mask[2 * DWord + 0];
14650 int M1 = Mask[2 * DWord + 1];
14651 M0 = (M0 >= 0 ? M0 % 4 : M0);
14652 M1 = (M1 >= 0 ? M1 % 4 : M1);
14653 if (M0 < 0 && M1 < 0)
14654 continue;
14655
14656 bool Match = false;
14657 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14658 auto &DWordPair = DWordPairs[j];
14659 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14660 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14661 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14662 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14663 PSHUFDMask[DWord] = DOffset + j;
14664 Match = true;
14665 break;
14666 }
14667 }
14668 if (!Match) {
14669 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14670 DWordPairs.push_back(std::make_pair(M0, M1));
14671 }
14672 }
14673
14674 if (DWordPairs.size() <= 2) {
14675 DWordPairs.resize(2, std::make_pair(-1, -1));
14676 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14677 DWordPairs[1].first, DWordPairs[1].second};
14678 if ((NumHToL + NumHToH) == 0)
14679 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14680 if ((NumLToL + NumLToH) == 0)
14681 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14682 }
14683 }
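// Editor's note (illustrative example, not from the original source): the
// all-from-low-half mask {2, 3, 0, 1, 2, 3, 0, 1} collects two distinct word
// pairs, (2, 3) and (0, 1). PSHUFLW with the half mask {2, 3, 0, 1} forms
// those pairs in dwords 0 and 1, and PSHUFD with {0, 1, 0, 1} then places
// them, one instruction shorter than the generic PSHUFD+PSHUFLW+PSHUFHW
// chain built further below.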
14684
14685 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14686 // such inputs we can swap two of the dwords across the half mark and end up
14687 // with <=2 inputs to each half in each half. Once there, we can fall through
14688 // to the generic code below. For example:
14689 //
14690 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14691 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14692 //
14693 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14694 // and an existing 2-into-2 on the other half. In this case we may have to
14695 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14696 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14697 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14698 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14699 // half than the one we target for fixing) will be fixed when we re-enter this
14700 // path. We will also combine away any sequence of PSHUFD instructions that
14701 // result into a single instruction. Here is an example of the tricky case:
14702 //
14703 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14704 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14705 //
14706 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14707 //
14708 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14709 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14710 //
14711 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14712 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14713 //
14714 // The result is fine to be handled by the generic logic.
14715 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14716 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14717 int AOffset, int BOffset) {
14718 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14719 "Must call this with A having 3 or 1 inputs from the A half.");
14720 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14721 "Must call this with B having 1 or 3 inputs from the B half.");
14722 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14723 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14724
14725 bool ThreeAInputs = AToAInputs.size() == 3;
14726
14727 // Compute the index of dword with only one word among the three inputs in
14728 // a half by taking the sum of the half with three inputs and subtracting
14729 // the sum of the actual three inputs. The difference is the remaining
14730 // slot.
14731 int ADWord = 0, BDWord = 0;
14732 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14733 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14734 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14735 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14736 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14737 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14738 int TripleNonInputIdx =
14739 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14740 TripleDWord = TripleNonInputIdx / 2;
14741
14742 // We use xor with one to compute the adjacent DWord to whichever one the
14743 // OneInput is in.
14744 OneInputDWord = (OneInput / 2) ^ 1;
14745
14746 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14747 // and BToA inputs. If there is also such a problem with the BToB and AToB
14748 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14749 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14750 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14751 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14752 // Compute how many inputs will be flipped by swapping these DWords. We
14753 // need to balance this to ensure we don't form a 3-1 shuffle in the
14754 // other half.
14756 int NumFlippedAToBInputs =
14757 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
14758 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
14759 int NumFlippedBToBInputs =
14760 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
14761 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
14762 if ((NumFlippedAToBInputs == 1 &&
14763 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14764 (NumFlippedBToBInputs == 1 &&
14765 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14766 // We choose whether to fix the A half or B half based on whether that
14767 // half has zero flipped inputs. At zero, we may not be able to fix it
14768 // with that half. We also bias towards fixing the B half because that
14769 // will more commonly be the high half, and we have to bias one way.
14770 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14771 ArrayRef<int> Inputs) {
14772 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14773 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14774 // Determine whether the free index is in the flipped dword or the
14775 // unflipped dword based on where the pinned index is. We use this bit
14776 // in an xor to conditionally select the adjacent dword.
14777 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14778 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14779 if (IsFixIdxInput == IsFixFreeIdxInput)
14780 FixFreeIdx += 1;
14781 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14782 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14783 "We need to be changing the number of flipped inputs!");
14784 int PSHUFHalfMask[] = {0, 1, 2, 3};
14785 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14786 V = DAG.getNode(
14787 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14788 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14789 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14790
14791 for (int &M : Mask)
14792 if (M >= 0 && M == FixIdx)
14793 M = FixFreeIdx;
14794 else if (M >= 0 && M == FixFreeIdx)
14795 M = FixIdx;
14796 };
14797 if (NumFlippedBToBInputs != 0) {
14798 int BPinnedIdx =
14799 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14800 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14801 } else {
14802 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14803 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14804 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14805 }
14806 }
14807 }
14808
14809 int PSHUFDMask[] = {0, 1, 2, 3};
14810 PSHUFDMask[ADWord] = BDWord;
14811 PSHUFDMask[BDWord] = ADWord;
14812 V = DAG.getBitcast(
14813 VT,
14814 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14815 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14816
14817 // Adjust the mask to match the new locations of A and B.
14818 for (int &M : Mask)
14819 if (M >= 0 && M/2 == ADWord)
14820 M = 2 * BDWord + M % 2;
14821 else if (M >= 0 && M/2 == BDWord)
14822 M = 2 * ADWord + M % 2;
14823
14824 // Recurse back into this routine to re-compute state now that this isn't
14825 // a 3 and 1 problem.
14826 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14827 };
14828 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14829 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14830 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14831 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14832
14833 // At this point there are at most two inputs to the low and high halves from
14834 // each half. That means the inputs can always be grouped into dwords and
14835 // those dwords can then be moved to the correct half with a dword shuffle.
14836 // We use at most one low and one high word shuffle to collect these paired
14837 // inputs into dwords, and finally a dword shuffle to place them.
14838 int PSHUFLMask[4] = {-1, -1, -1, -1};
14839 int PSHUFHMask[4] = {-1, -1, -1, -1};
14840 int PSHUFDMask[4] = {-1, -1, -1, -1};
14841
14842 // First fix the masks for all the inputs that are staying in their
14843 // original halves. This will then dictate the targets of the cross-half
14844 // shuffles.
14845 auto fixInPlaceInputs =
14846 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14847 MutableArrayRef<int> SourceHalfMask,
14848 MutableArrayRef<int> HalfMask, int HalfOffset) {
14849 if (InPlaceInputs.empty())
14850 return;
14851 if (InPlaceInputs.size() == 1) {
14852 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14853 InPlaceInputs[0] - HalfOffset;
14854 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14855 return;
14856 }
14857 if (IncomingInputs.empty()) {
14858 // Just fix all of the in place inputs.
14859 for (int Input : InPlaceInputs) {
14860 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14861 PSHUFDMask[Input / 2] = Input / 2;
14862 }
14863 return;
14864 }
14865
14866 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14867 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14868 InPlaceInputs[0] - HalfOffset;
14869 // Put the second input next to the first so that they are packed into
14870 // a dword. We find the adjacent index by toggling the low bit.
14871 int AdjIndex = InPlaceInputs[0] ^ 1;
14872 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14873 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
14874 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14875 };
14876 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14877 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14878
14879 // Now gather the cross-half inputs and place them into a free dword of
14880 // their target half.
14881 // FIXME: This operation could almost certainly be simplified dramatically to
14882 // look more like the 3-1 fixing operation.
14883 auto moveInputsToRightHalf = [&PSHUFDMask](
14884 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14885 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14886 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14887 int DestOffset) {
14888 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14889 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14890 };
14891 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14892 int Word) {
14893 int LowWord = Word & ~1;
14894 int HighWord = Word | 1;
14895 return isWordClobbered(SourceHalfMask, LowWord) ||
14896 isWordClobbered(SourceHalfMask, HighWord);
14897 };
14898
14899 if (IncomingInputs.empty())
14900 return;
14901
14902 if (ExistingInputs.empty()) {
14903 // Map any dwords with inputs from them into the right half.
14904 for (int Input : IncomingInputs) {
14905 // If the source half mask maps over the inputs, turn those into
14906 // swaps and use the swapped lane.
14907 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14908 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14909 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14910 Input - SourceOffset;
14911 // We have to swap the uses in our half mask in one sweep.
14912 for (int &M : HalfMask)
14913 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14914 M = Input;
14915 else if (M == Input)
14916 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14917 } else {
14918 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14919 Input - SourceOffset &&
14920 "Previous placement doesn't match!");
14921 }
14922 // Note that this correctly re-maps both when we do a swap and when
14923 // we observe the other side of the swap above. We rely on that to
14924 // avoid swapping the members of the input list directly.
14925 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14926 }
14927
14928 // Map the input's dword into the correct half.
14929 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14930 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14931 else
14932 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14933 Input / 2 &&
14934 "Previous placement doesn't match!");
14935 }
14936
14937 // And just directly shift any other-half mask elements to be same-half
14938 // as we will have mirrored the dword containing the element into the
14939 // same position within that half.
14940 for (int &M : HalfMask)
14941 if (M >= SourceOffset && M < SourceOffset + 4) {
14942 M = M - SourceOffset + DestOffset;
14943 assert(M >= 0 && "This should never wrap below zero!");
14944 }
14945 return;
14946 }
14947
14948 // Ensure we have the input in a viable dword of its current half. This
14949 // is particularly tricky because the original position may be clobbered
14950 // by inputs being moved and *staying* in that half.
14951 if (IncomingInputs.size() == 1) {
14952 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14953 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14954 SourceOffset;
14955 SourceHalfMask[InputFixed - SourceOffset] =
14956 IncomingInputs[0] - SourceOffset;
14957 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
14958 InputFixed);
14959 IncomingInputs[0] = InputFixed;
14960 }
14961 } else if (IncomingInputs.size() == 2) {
14962 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14963 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14964 // We have two non-adjacent or clobbered inputs we need to extract from
14965 // the source half. To do this, we need to map them into some adjacent
14966 // dword slot in the source mask.
14967 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14968 IncomingInputs[1] - SourceOffset};
14969
14970 // If there is a free slot in the source half mask adjacent to one of
14971 // the inputs, place the other input in it. We use (Index XOR 1) to
14972 // compute an adjacent index.
14973 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14974 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14975 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14976 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14977 InputsFixed[1] = InputsFixed[0] ^ 1;
14978 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14979 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14980 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14981 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14982 InputsFixed[0] = InputsFixed[1] ^ 1;
14983 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14984 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14985 // The two inputs are in the same DWord but it is clobbered and the
14986 // adjacent DWord isn't used at all. Move both inputs to the free
14987 // slot.
14988 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14989 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14990 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14991 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14992 } else {
14993 // The only way we hit this point is if there is no clobbering
14994 // (because there are no off-half inputs to this half) and there is no
14995 // free slot adjacent to one of the inputs. In this case, we have to
14996 // swap an input with a non-input.
14997 for (int i = 0; i < 4; ++i)
14998 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14999 "We can't handle any clobbers here!");
15000 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
15001 "Cannot have adjacent inputs here!");
15002
15003 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15004 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15005
15006 // We also have to update the final source mask in this case because
15007 // it may need to undo the above swap.
15008 for (int &M : FinalSourceHalfMask)
15009 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15010 M = InputsFixed[1] + SourceOffset;
15011 else if (M == InputsFixed[1] + SourceOffset)
15012 M = (InputsFixed[0] ^ 1) + SourceOffset;
15013
15014 InputsFixed[1] = InputsFixed[0] ^ 1;
15015 }
15016
15017 // Point everything at the fixed inputs.
15018 for (int &M : HalfMask)
15019 if (M == IncomingInputs[0])
15020 M = InputsFixed[0] + SourceOffset;
15021 else if (M == IncomingInputs[1])
15022 M = InputsFixed[1] + SourceOffset;
15023
15024 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15025 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15026 }
15027 } else {
15028 llvm_unreachable("Unhandled input size!");
15029 }
15030
15031 // Now hoist the DWord down to the right half.
15032 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15033 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
15034 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15035 for (int &M : HalfMask)
15036 for (int Input : IncomingInputs)
15037 if (M == Input)
15038 M = FreeDWord * 2 + Input % 2;
15039 };
15040 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15041 /*SourceOffset*/ 4, /*DestOffset*/ 0);
15042 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15043 /*SourceOffset*/ 0, /*DestOffset*/ 4);
15044
15045 // Now enact all the shuffles we've computed to move the inputs into their
15046 // target half.
15047 if (!isNoopShuffleMask(PSHUFLMask))
15048 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15049 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15050 if (!isNoopShuffleMask(PSHUFHMask))
15051 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15052 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15053 if (!isNoopShuffleMask(PSHUFDMask))
15054 V = DAG.getBitcast(
15055 VT,
15056 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15057 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15058
15059 // At this point, each half should contain all its inputs, and we can then
15060 // just shuffle them into their final position.
15061 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15062 "Failed to lift all the high half inputs to the low mask!");
15063 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15064 "Failed to lift all the low half inputs to the high mask!");
15065
15066 // Do a half shuffle for the low mask.
15067 if (!isNoopShuffleMask(LoMask))
15068 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15069 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15070
15071 // Do a half shuffle with the high mask after shifting its values down.
15072 for (int &M : HiMask)
15073 if (M >= 0)
15074 M -= 4;
15075 if (!isNoopShuffleMask(HiMask))
15076 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15077 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15078
15079 return V;
15080}
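// A minimal standalone sketch (hypothetical helper, not LLVM's
// getV4X86ShuffleImm8ForMask) of how a 4-element word/dword shuffle mask is
// packed into the 2-bits-per-element immediate consumed by the
// PSHUFLW/PSHUFHW/PSHUFD nodes above; undef handling is simplified here.
#include <cassert>
#include <cstdint>

static uint8_t packV4ShuffleImm8(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? i : Mask[i]; // treat undef as "stay in place"
    assert(M >= 0 && M < 4 && "each element needs a 2-bit selector");
    Imm |= uint8_t(M) << (2 * i);
  }
  return Imm;
}
// For example, the identity mask {0, 1, 2, 3} packs to 0xE4, which is why a
// no-op mask never needs one of the PSHUF nodes emitted above.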
15081
15082/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15083/// blend if only one input is used.
15084static SDValue lowerShuffleAsBlendOfPSHUFBs(
15085 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15086 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15087 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15088 "Lane crossing shuffle masks not supported");
15089
15090 int NumBytes = VT.getSizeInBits() / 8;
15091 int Size = Mask.size();
15092 int Scale = NumBytes / Size;
15093
15094 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15095 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15096 V1InUse = false;
15097 V2InUse = false;
15098
15099 for (int i = 0; i < NumBytes; ++i) {
15100 int M = Mask[i / Scale];
15101 if (M < 0)
15102 continue;
15103
15104 const int ZeroMask = 0x80;
15105 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15106 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15107 if (Zeroable[i / Scale])
15108 V1Idx = V2Idx = ZeroMask;
15109
15110 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15111 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15112 V1InUse |= (ZeroMask != V1Idx);
15113 V2InUse |= (ZeroMask != V2Idx);
15114 }
15115
15116 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15117 if (V1InUse)
15118 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15119 DAG.getBuildVector(ShufVT, DL, V1Mask));
15120 if (V2InUse)
15121 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15122 DAG.getBuildVector(ShufVT, DL, V2Mask));
15123
15124 // If we need shuffled inputs from both, blend the two.
15125 SDValue V;
15126 if (V1InUse && V2InUse)
15127 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15128 else
15129 V = V1InUse ? V1 : V2;
15130
15131 // Cast the result back to the correct type.
15132 return DAG.getBitcast(VT, V);
15133}
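// A minimal standalone sketch (hypothetical helper, not the routine above) of
// the per-byte PSHUFB control computation for the v8i16 case: Size = 8 words,
// NumBytes = 16, Scale = 2, and 0x80 in a control byte zeroes that destination
// byte. Unlike the real code, undef words are left as zeroing controls here
// instead of undef constants.
#include <array>
#include <cstdint>

struct PSHUFBControls {
  std::array<uint8_t, 16> V1, V2;
};

static PSHUFBControls buildV8I16ByteControls(const std::array<int, 8> &WordMask) {
  const int Size = 8, NumBytes = 16, Scale = NumBytes / Size;
  const uint8_t ZeroMask = 0x80;
  PSHUFBControls R;
  R.V1.fill(ZeroMask);
  R.V2.fill(ZeroMask);
  for (int i = 0; i != NumBytes; ++i) {
    int M = WordMask[i / Scale];
    if (M < 0)
      continue; // undef word
    R.V1[i] = M < Size ? uint8_t(M * Scale + i % Scale) : ZeroMask;
    R.V2[i] = M < Size ? ZeroMask : uint8_t((M - Size) * Scale + i % Scale);
  }
  return R;
}
// E.g. WordMask[0] = 9 (word 1 of V2) yields V2 byte controls {2, 3} for bytes
// 0-1 and zeroing controls in V1, so ORing the two PSHUFB results blends them.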
15134
15135/// Generic lowering of 8-lane i16 shuffles.
15136///
15137/// This handles both single-input shuffles and combined shuffle/blends with
15138/// two inputs. The single input shuffles are immediately delegated to
15139/// a dedicated lowering routine.
15140///
15141/// The blends are lowered in one of three fundamental ways. If there are few
15142/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15143/// of the input is significantly cheaper when lowered as an interleaving of
15144/// the two inputs, try to interleave them. Otherwise, blend the low and high
15145/// halves of the inputs separately (making them have relatively few inputs)
15146/// and then concatenate them.
15147static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15148 const APInt &Zeroable, SDValue V1, SDValue V2,
15149 const X86Subtarget &Subtarget,
15150 SelectionDAG &DAG) {
15151 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15152 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15153 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15154
15155 // Whenever we can lower this as a zext, that instruction is strictly faster
15156 // than any alternative.
15157 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15158 Zeroable, Subtarget, DAG))
15159 return ZExt;
15160
15161 // Try to lower using a truncation.
15162 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15163 Subtarget, DAG))
15164 return V;
15165
15166 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15167
15168 if (NumV2Inputs == 0) {
15169 // Try to use shift instructions.
15170 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15171 Zeroable, Subtarget, DAG))
15172 return Shift;
15173
15174 // Check for being able to broadcast a single element.
15175 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15176 Mask, Subtarget, DAG))
15177 return Broadcast;
15178
15179 // Try to use bit rotation instructions.
15180 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15181 Subtarget, DAG))
15182 return Rotate;
15183
15184 // Use dedicated unpack instructions for masks that match their pattern.
15185 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15186 return V;
15187
15188 // Use dedicated pack instructions for masks that match their pattern.
15189 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15190 Subtarget))
15191 return V;
15192
15193 // Try to use byte rotation instructions.
15194 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15195 Subtarget, DAG))
15196 return Rotate;
15197
15198 // Make a copy of the mask so it can be modified.
15199 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15200 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15201 Subtarget, DAG);
15202 }
15203
15204 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15205 "All single-input shuffles should be canonicalized to be V1-input "
15206 "shuffles.");
15207
15208 // Try to use shift instructions.
15209 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15210 Zeroable, Subtarget, DAG))
15211 return Shift;
15212
15213 // See if we can use SSE4A Extraction / Insertion.
15214 if (Subtarget.hasSSE4A())
15215 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15216 Zeroable, DAG))
15217 return V;
15218
15219 // There are special ways we can lower some single-element blends.
15220 if (NumV2Inputs == 1)
15221 if (SDValue V = lowerShuffleAsElementInsertion(
15222 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15223 return V;
15224
15225 // We have different paths for blend lowering, but they all must use the
15226 // *exact* same predicate.
15227 bool IsBlendSupported = Subtarget.hasSSE41();
15228 if (IsBlendSupported)
15229 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15230 Zeroable, Subtarget, DAG))
15231 return Blend;
15232
15233 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15234 Zeroable, Subtarget, DAG))
15235 return Masked;
15236
15237 // Use dedicated unpack instructions for masks that match their pattern.
15238 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15239 return V;
15240
15241 // Use dedicated pack instructions for masks that match their pattern.
15242 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15243 Subtarget))
15244 return V;
15245
15246 // Try to lower using a truncation.
15247 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15248 Subtarget, DAG))
15249 return V;
15250
15251 // Try to use byte rotation instructions.
15252 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15253 Subtarget, DAG))
15254 return Rotate;
15255
15256 if (SDValue BitBlend =
15257 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15258 return BitBlend;
15259
15260 // Try to use byte shift instructions to mask.
15261 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15262 Zeroable, Subtarget, DAG))
15263 return V;
15264
15265 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15266 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15267 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15268 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15269 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15270 !Subtarget.hasVLX()) {
15271 SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15272 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15273 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15274 SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15275 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15276 DWordClearMask);
15277 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15278 DWordClearMask);
15279 // Now pack things back together.
15280 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15281 if (NumEvenDrops == 2) {
15282 Result = DAG.getBitcast(MVT::v4i32, Result);
15283 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15284 }
15285 return Result;
15286 }
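// Worked example (illustrative, not from the original source): with
// NumEvenDrops == 1 the build vector is <0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF>, so
// the ANDs clear the odd word of every dword; PACKUS (packusdw) then
// saturates each dword to 16 bits, which simply keeps words 0, 2, 4, 6 of V1
// followed by words 0, 2, 4, 6 of V2. With NumEvenDrops == 2 only words 0 and
// 4 of each input survive the AND, and the second PACKUS above compacts them
// into the low half of the result.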
15287
15288 // Try to lower by permuting the inputs into an unpack instruction.
15289 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15290 Mask, Subtarget, DAG))
15291 return Unpack;
15292
15293 // If we can't directly blend but can use PSHUFB, that will be better as it
15294 // can both shuffle and set up the inefficient blend.
15295 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15296 bool V1InUse, V2InUse;
15297 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15298 Zeroable, DAG, V1InUse, V2InUse);
15299 }
15300
15301 // We can always bit-blend if we have to so the fallback strategy is to
15302 // decompose into single-input permutes and blends/unpacks.
15303 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15304 Mask, Subtarget, DAG);
15305}
15306
15307 // Lowers unary/binary shuffle as VPERMV/VPERMV3; for non-VLX targets,
15308// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
15309// the active subvector is extracted.
15310static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15311 ArrayRef<int> Mask, SDValue V1, SDValue V2,
15312 const X86Subtarget &Subtarget,
15313 SelectionDAG &DAG) {
15314 MVT MaskVT = VT.changeTypeToInteger();
15315 SDValue MaskNode;
15316 MVT ShuffleVT = VT;
15317 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15318 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15319 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15320 ShuffleVT = V1.getSimpleValueType();
15321
15322 // Adjust mask to correct indices for the second input.
15323 int NumElts = VT.getVectorNumElements();
15324 unsigned Scale = 512 / VT.getSizeInBits();
15325 SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15326 for (int &M : AdjustedMask)
15327 if (NumElts <= M)
15328 M += (Scale - 1) * NumElts;
15329 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15330 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15331 } else {
15332 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15333 }
15334
15335 SDValue Result;
15336 if (V2.isUndef())
15337 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15338 else
15339 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15340
15341 if (VT != ShuffleVT)
15342 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15343
15344 return Result;
15345}
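// A minimal standalone sketch (hypothetical helper mirroring the loop above,
// not LLVM's code) of the index adjustment applied when a sub-512-bit VPERMV3
// is widened: with VT = v16i8, NumElts = 16 and Scale = 512 / 128 = 4, so
// indices referring to V2 (>= 16) are bumped by (Scale - 1) * NumElts = 48 to
// land on the second widened operand.
#include <vector>

static std::vector<int> widenPermV3Mask(std::vector<int> Mask, int NumElts,
                                        int Scale) {
  for (int &M : Mask)
    if (NumElts <= M)
      M += (Scale - 1) * NumElts; // e.g. index 17 (element 1 of V2) becomes 65
  return Mask;
}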
15346
15347/// Generic lowering of v16i8 shuffles.
15348///
15349/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15350/// detect any complexity reducing interleaving. If that doesn't help, it uses
15351/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15352/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15353/// back together.
15354static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15355 const APInt &Zeroable, SDValue V1, SDValue V2,
15356 const X86Subtarget &Subtarget,
15357 SelectionDAG &DAG) {
15358 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15359 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15360 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15361
15362 // Try to use shift instructions.
15363 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15364 Zeroable, Subtarget, DAG))
15365 return Shift;
15366
15367 // Try to use byte rotation instructions.
15368 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15369 Subtarget, DAG))
15370 return Rotate;
15371
15372 // Use dedicated pack instructions for masks that match their pattern.
15373 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15374 Subtarget))
15375 return V;
15376
15377 // Try to use a zext lowering.
15378 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15379 Zeroable, Subtarget, DAG))
15380 return ZExt;
15381
15382 // Try to lower using a truncation.
15383 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15384 Subtarget, DAG))
15385 return V;
15386
15387 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15388 Subtarget, DAG))
15389 return V;
15390
15391 // See if we can use SSE4A Extraction / Insertion.
15392 if (Subtarget.hasSSE4A())
15393 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15394 Zeroable, DAG))
15395 return V;
15396
15397 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15398
15399 // For single-input shuffles, there are some nicer lowering tricks we can use.
15400 if (NumV2Elements == 0) {
15401 // Check for being able to broadcast a single element.
15402 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15403 Mask, Subtarget, DAG))
15404 return Broadcast;
15405
15406 // Try to use bit rotation instructions.
15407 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15408 Subtarget, DAG))
15409 return Rotate;
15410
15411 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15412 return V;
15413
15414 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15415 // Notably, this handles splat and partial-splat shuffles more efficiently.
15416 // However, it only makes sense if the pre-duplication shuffle simplifies
15417 // things significantly. Currently, this means we need to be able to
15418 // express the pre-duplication shuffle as an i16 shuffle.
15419 //
15420 // FIXME: We should check for other patterns which can be widened into an
15421 // i16 shuffle as well.
15422 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15423 for (int i = 0; i < 16; i += 2)
15424 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15425 return false;
15426
15427 return true;
15428 };
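// For example (illustrative), a byte splat such as <5,5,5,...,5> or a
// pairwise-duplicated mask like <0,0, 3,3, 9,9, ...> passes this check, while
// <0,1,2,3,...> fails because its byte pairs disagree and cannot be expressed
// as a widened i16 shuffle plus duplication.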
15429 auto tryToWidenViaDuplication = [&]() -> SDValue {
15430 if (!canWidenViaDuplication(Mask))
15431 return SDValue();
15432 SmallVector<int, 4> LoInputs;
15433 copy_if(Mask, std::back_inserter(LoInputs),
15434 [](int M) { return M >= 0 && M < 8; });
15435 array_pod_sort(LoInputs.begin(), LoInputs.end());
15436 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
15437 LoInputs.end());
15438 SmallVector<int, 4> HiInputs;
15439 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15440 array_pod_sort(HiInputs.begin(), HiInputs.end());
15441 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
15442 HiInputs.end());
15443
15444 bool TargetLo = LoInputs.size() >= HiInputs.size();
15445 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15446 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15447
15448 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15449 SmallDenseMap<int, int, 8> LaneMap;
15450 for (int I : InPlaceInputs) {
15451 PreDupI16Shuffle[I/2] = I/2;
15452 LaneMap[I] = I;
15453 }
15454 int j = TargetLo ? 0 : 4, je = j + 4;
15455 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15456 // Check if j is already a shuffle of this input. This happens when
15457 // there are two adjacent bytes after we move the low one.
15458 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15459 // If we haven't yet mapped the input, search for a slot into which
15460 // we can map it.
15461 while (j < je && PreDupI16Shuffle[j] >= 0)
15462 ++j;
15463
15464 if (j == je)
15465 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15466 return SDValue();
15467
15468 // Map this input with the i16 shuffle.
15469 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15470 }
15471
15472 // Update the lane map based on the mapping we ended up with.
15473 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15474 }
15475 V1 = DAG.getBitcast(
15476 MVT::v16i8,
15477 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15478 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15479
15480 // Unpack the bytes to form the i16s that will be shuffled into place.
15481 bool EvenInUse = false, OddInUse = false;
15482 for (int i = 0; i < 16; i += 2) {
15483 EvenInUse |= (Mask[i + 0] >= 0);
15484 OddInUse |= (Mask[i + 1] >= 0);
15485 if (EvenInUse && OddInUse)
15486 break;
15487 }
15488 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15489 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15490 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15491
15492 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15493 for (int i = 0; i < 16; ++i)
15494 if (Mask[i] >= 0) {
15495 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15496 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15497 if (PostDupI16Shuffle[i / 2] < 0)
15498 PostDupI16Shuffle[i / 2] = MappedMask;
15499 else
15500 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15501 "Conflicting entries in the original shuffle!");
15502 }
15503 return DAG.getBitcast(
15504 MVT::v16i8,
15505 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15506 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15507 };
15508 if (SDValue V = tryToWidenViaDuplication())
15509 return V;
15510 }
15511
15512 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15513 Zeroable, Subtarget, DAG))
15514 return Masked;
15515
15516 // Use dedicated unpack instructions for masks that match their pattern.
15517 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15518 return V;
15519
15520 // Try to use byte shift instructions to mask.
15521 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15522 Zeroable, Subtarget, DAG))
15523 return V;
15524
15525 // Check for compaction patterns.
15526 bool IsSingleInput = V2.isUndef();
15527 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
15528
15529 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15530 // with PSHUFB. It is important to do this before we attempt to generate any
15531 // blends but after all of the single-input lowerings. If the single input
15532 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15533 // want to preserve that and we can DAG combine any longer sequences into
15534 // a PSHUFB in the end. But once we start blending from multiple inputs,
15535 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15536 // and there are *very* few patterns that would actually be faster than the
15537 // PSHUFB approach because of its ability to zero lanes.
15538 //
15539 // If the mask is a binary compaction, we can more efficiently perform this
15540 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15541 //
15542 // FIXME: The only exceptions to the above are blends which are exact
15543 // interleavings with direct instructions supporting them. We currently don't
15544 // handle those well here.
15545 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15546 bool V1InUse = false;
15547 bool V2InUse = false;
15548
15549 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15550 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15551
15552 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15553 // do so. This avoids using them to handle blends-with-zero which is
15554 // important as a single pshufb is significantly faster for that.
15555 if (V1InUse && V2InUse) {
15556 if (Subtarget.hasSSE41())
15557 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15558 Zeroable, Subtarget, DAG))
15559 return Blend;
15560
15561 // We can use an unpack to do the blending rather than an or in some
15562 // cases. Even though the OR may be (very minorly) more efficient, we
15563 // prefer this lowering because there are common cases where part of
15564 // the complexity of the shuffles goes away when we do the final blend as
15565 // an unpack.
15566 // FIXME: It might be worth trying to detect if the unpack-feeding
15567 // shuffles will both be pshufb, in which case we shouldn't bother with
15568 // this.
15569 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15570 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15571 return Unpack;
15572
15573 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15574 if (Subtarget.hasVBMI())
15575 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15576 DAG);
15577
15578 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15579 if (Subtarget.hasXOP()) {
15580 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15581 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15582 }
15583
15584 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15585 // PALIGNR will be cheaper than the second PSHUFB+OR.
15586 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15587 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15588 return V;
15589 }
15590
15591 return PSHUFB;
15592 }
15593
15594 // There are special ways we can lower some single-element blends.
15595 if (NumV2Elements == 1)
15596 if (SDValue V = lowerShuffleAsElementInsertion(
15597 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15598 return V;
15599
15600 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15601 return Blend;
15602
15603 // Check whether a compaction lowering can be done. This handles shuffles
15604 // which take every Nth element for some even N. See the helper function for
15605 // details.
15606 //
15607 // We special case these as they can be particularly efficiently handled with
15608 // the PACKUSWB instruction on x86 and they show up in common patterns of
15609 // rearranging bytes to truncate wide elements.
15610 if (NumEvenDrops) {
15611 // NumEvenDrops is the power of two stride of the elements. Another way of
15612 // thinking about it is that we need to drop the even elements this many
15613 // times to get the original input.
15614
15615 // First we need to zero all the dropped bytes.
15616 assert(NumEvenDrops <= 3 &&
15617 "No support for dropping even elements more than 3 times.");
15618 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15619 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15620 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15621 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15622 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15623 WordClearMask);
15624 if (!IsSingleInput)
15625 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15626 WordClearMask);
15627
15628 // Now pack things back together.
15629 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15630 IsSingleInput ? V1 : V2);
15631 for (int i = 1; i < NumEvenDrops; ++i) {
15632 Result = DAG.getBitcast(MVT::v8i16, Result);
15633 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15634 }
15635 return Result;
15636 }
15637
15638 // Handle multi-input cases by blending/unpacking single-input shuffles.
15639 if (NumV2Elements > 0)
15640 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15641 Subtarget, DAG);
15642
15643 // The fallback path for single-input shuffles widens this into two v8i16
15644 // vectors with unpacks, shuffles those, and then pulls them back together
15645 // with a pack.
15646 SDValue V = V1;
15647
15648 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15649 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15650 for (int i = 0; i < 16; ++i)
15651 if (Mask[i] >= 0)
15652 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15653
15654 SDValue VLoHalf, VHiHalf;
15655 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15656 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15657 // i16s.
15658 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15659 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15660 // Use a mask to drop the high bytes.
15661 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15662 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15663 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15664
15665 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15666 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15667
15668 // Squash the masks to point directly into VLoHalf.
15669 for (int &M : LoBlendMask)
15670 if (M >= 0)
15671 M /= 2;
15672 for (int &M : HiBlendMask)
15673 if (M >= 0)
15674 M /= 2;
15675 } else {
15676 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15677 // VHiHalf so that we can blend them as i16s.
15678 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15679
15680 VLoHalf = DAG.getBitcast(
15681 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15682 VHiHalf = DAG.getBitcast(
15683 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15684 }
15685
15686 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15687 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15688
15689 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15690}
15691
15692/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15693///
15694/// This routine breaks down the specific type of 128-bit shuffle and
15695/// dispatches to the lowering routines accordingly.
15696static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15697 MVT VT, SDValue V1, SDValue V2,
15698 const APInt &Zeroable,
15699 const X86Subtarget &Subtarget,
15700 SelectionDAG &DAG) {
15701 switch (VT.SimpleTy) {
15702 case MVT::v2i64:
15703 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15704 case MVT::v2f64:
15705 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15706 case MVT::v4i32:
15707 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15708 case MVT::v4f32:
15709 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15710 case MVT::v8i16:
15711 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15712 case MVT::v16i8:
15713 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15714
15715 default:
15716 llvm_unreachable("Unimplemented!");
15717 }
15718}
15719
15720/// Generic routine to split vector shuffle into half-sized shuffles.
15721///
15722/// This routine just extracts two subvectors, shuffles them independently, and
15723/// then concatenates them back together. This should work effectively with all
15724/// AVX vector shuffle types.
15725static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15726 SDValue V2, ArrayRef<int> Mask,
15727 SelectionDAG &DAG) {
15728 assert(VT.getSizeInBits() >= 256 &&
15729 "Only for 256-bit or wider vector shuffles!");
15730 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15731 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15732
15733 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15734 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15735
15736 int NumElements = VT.getVectorNumElements();
15737 int SplitNumElements = NumElements / 2;
15738 MVT ScalarVT = VT.getVectorElementType();
15739 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15740
15741 // Use splitVector/extractSubVector so that split build-vectors just build two
15742 // narrower build vectors. This helps shuffling with splats and zeros.
15743 auto SplitVector = [&](SDValue V) {
15744 SDValue LoV, HiV;
15745 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15746 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15747 DAG.getBitcast(SplitVT, HiV));
15748 };
15749
15750 SDValue LoV1, HiV1, LoV2, HiV2;
15751 std::tie(LoV1, HiV1) = SplitVector(V1);
15752 std::tie(LoV2, HiV2) = SplitVector(V2);
15753
15754 // Now create two 4-way blends of these half-width vectors.
15755 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15756 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
15757 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15758 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15759 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15760 for (int i = 0; i < SplitNumElements; ++i) {
15761 int M = HalfMask[i];
15762 if (M >= NumElements) {
15763 if (M >= NumElements + SplitNumElements)
15764 UseHiV2 = true;
15765 else
15766 UseLoV2 = true;
15767 V2BlendMask[i] = M - NumElements;
15768 BlendMask[i] = SplitNumElements + i;
15769 } else if (M >= 0) {
15770 if (M >= SplitNumElements)
15771 UseHiV1 = true;
15772 else
15773 UseLoV1 = true;
15774 V1BlendMask[i] = M;
15775 BlendMask[i] = i;
15776 }
15777 }
15778
15779 // Because the lowering happens after all combining takes place, we need to
15780 // manually combine these blend masks as much as possible so that we create
15781 // a minimal number of high-level vector shuffle nodes.
15782
15783 // First try just blending the halves of V1 or V2.
15784 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15785 return DAG.getUNDEF(SplitVT);
15786 if (!UseLoV2 && !UseHiV2)
15787 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15788 if (!UseLoV1 && !UseHiV1)
15789 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15790
15791 SDValue V1Blend, V2Blend;
15792 if (UseLoV1 && UseHiV1) {
15793 V1Blend =
15794 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15795 } else {
15796 // We only use half of V1 so map the usage down into the final blend mask.
15797 V1Blend = UseLoV1 ? LoV1 : HiV1;
15798 for (int i = 0; i < SplitNumElements; ++i)
15799 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15800 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15801 }
15802 if (UseLoV2 && UseHiV2) {
15803 V2Blend =
15804 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15805 } else {
15806 // We only use half of V2 so map the usage down into the final blend mask.
15807 V2Blend = UseLoV2 ? LoV2 : HiV2;
15808 for (int i = 0; i < SplitNumElements; ++i)
15809 if (BlendMask[i] >= SplitNumElements)
15810 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15811 }
15812 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15813 };
15814 SDValue Lo = HalfBlend(LoMask);
15815 SDValue Hi = HalfBlend(HiMask);
15816 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15817}
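// A minimal standalone sketch (hypothetical helper, not part of the routine
// above) of where a wide-shuffle mask element lands once both operands are
// split, matching the bookkeeping in HalfBlend. It assumes a defined mask
// element (M >= 0).
struct SplitRef {
  int Operand;   // 0 = V1, 1 = V2
  bool HighHalf; // which half of that operand
  int Index;     // index within the half
};

static SplitRef classifySplitIndex(int M, int NumElements) {
  int SplitNumElements = NumElements / 2;
  SplitRef R;
  R.Operand = M >= NumElements ? 1 : 0;
  int Local = M - R.Operand * NumElements;
  R.HighHalf = Local >= SplitNumElements;
  R.Index = Local % SplitNumElements;
  return R;
}
// E.g. with NumElements = 8, mask value 13 refers to element 1 of V2's high
// half, so HalfBlend above sets UseHiV2 and V2BlendMask[i] = 5.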
15818
15819/// Either split a vector in halves or decompose the shuffles and the
15820/// blend/unpack.
15821///
15822/// This is provided as a good fallback for many lowerings of non-single-input
15823/// shuffles with more than one 128-bit lane. In those cases, we want to select
15824/// between splitting the shuffle into 128-bit components and stitching those
15825/// back together vs. extracting the single-input shuffles and blending those
15826/// results.
15827static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15828 SDValue V2, ArrayRef<int> Mask,
15829 const X86Subtarget &Subtarget,
15830 SelectionDAG &DAG) {
15831 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15832 "shuffles as it could then recurse on itself.");
15833 int Size = Mask.size();
15834
15835 // If this can be modeled as a broadcast of two elements followed by a blend,
15836 // prefer that lowering. This is especially important because broadcasts can
15837 // often fold with memory operands.
15838 auto DoBothBroadcast = [&] {
15839 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15840 for (int M : Mask)
15841 if (M >= Size) {
15842 if (V2BroadcastIdx < 0)
15843 V2BroadcastIdx = M - Size;
15844 else if (M - Size != V2BroadcastIdx)
15845 return false;
15846 } else if (M >= 0) {
15847 if (V1BroadcastIdx < 0)
15848 V1BroadcastIdx = M;
15849 else if (M != V1BroadcastIdx)
15850 return false;
15851 }
15852 return true;
15853 };
15854 if (DoBothBroadcast())
15855 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15856 DAG);
15857
15858 // If the inputs all stem from a single 128-bit lane of each input, then we
15859 // split them rather than blending because the split will decompose to
15860 // unusually few instructions.
15861 int LaneCount = VT.getSizeInBits() / 128;
15862 int LaneSize = Size / LaneCount;
15863 SmallBitVector LaneInputs[2];
15864 LaneInputs[0].resize(LaneCount, false);
15865 LaneInputs[1].resize(LaneCount, false);
15866 for (int i = 0; i < Size; ++i)
15867 if (Mask[i] >= 0)
15868 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15869 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15870 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15871
15872 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15873 // requires that the decomposed single-input shuffles don't end up here.
15874 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15875 DAG);
15876}
15877
15878// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15879// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15880static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15881 SDValue V1, SDValue V2,
15882 ArrayRef<int> Mask,
15883 SelectionDAG &DAG) {
15884 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15885
15886 int LHSMask[4] = {-1, -1, -1, -1};
15887 int RHSMask[4] = {-1, -1, -1, -1};
15888 unsigned SHUFPMask = 0;
15889
15890 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15891 // perform the shuffle once the lanes have been shuffled in place.
15892 for (int i = 0; i != 4; ++i) {
15893 int M = Mask[i];
15894 if (M < 0)
15895 continue;
15896 int LaneBase = i & ~1;
15897 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15898 LaneMask[LaneBase + (M & 1)] = M;
15899 SHUFPMask |= (M & 1) << i;
15900 }
15901
15902 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15903 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15904 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15905 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15906}
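// Worked example (illustrative): for Mask = {1, 5, 2, 7} the loop above
// produces LHSMask = {-1, 1, 2, -1}, RHSMask = {-1, 5, -1, 7} and
// SHUFPMask = 0b1011. VSHUFPD then takes destination elements 0/2 from the
// LHS lanes and 1/3 from the RHS lanes, each immediate bit selecting the even
// or odd double within its 128-bit lane, which reassembles {1, 5, 2, 7}.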
15907
15908/// Lower a vector shuffle crossing multiple 128-bit lanes as
15909/// a lane permutation followed by a per-lane permutation.
15910///
15911/// This is mainly for cases where we can have non-repeating permutes
15912/// in each lane.
15913///
15914/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15915/// we should investigate merging them.
15916static SDValue lowerShuffleAsLanePermuteAndPermute(
15917 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15918 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15919 int NumElts = VT.getVectorNumElements();
15920 int NumLanes = VT.getSizeInBits() / 128;
15921 int NumEltsPerLane = NumElts / NumLanes;
15922 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15923
15924 /// Attempts to find a sublane permute with the given size
15925 /// that gets all elements into their target lanes.
15926 ///
15927 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
15928 /// shuffle; if unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
15929 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15930 int NumSublanesPerLane = NumSublanes / NumLanes;
15931 int NumEltsPerSublane = NumElts / NumSublanes;
15932
15933 SmallVector<int, 16> CrossLaneMask;
15934 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15935 // CrossLaneMask but one entry == one sublane.
15936 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15937
15938 for (int i = 0; i != NumElts; ++i) {
15939 int M = Mask[i];
15940 if (M < 0)
15941 continue;
15942
15943 int SrcSublane = M / NumEltsPerSublane;
15944 int DstLane = i / NumEltsPerLane;
15945
15946 // We only need to get the elements into the right lane, not sublane.
15947 // So search all sublanes that make up the destination lane.
15948 bool Found = false;
15949 int DstSubStart = DstLane * NumSublanesPerLane;
15950 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15951 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15952 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15953 continue;
15954
15955 Found = true;
15956 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15957 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15958 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15959 break;
15960 }
15961 if (!Found)
15962 return SDValue();
15963 }
15964
15965 // Fill CrossLaneMask using CrossLaneMaskLarge.
15966 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15967
15968 if (!CanUseSublanes) {
15969 // If we're only shuffling a single lowest lane and the rest are identity
15970 // then don't bother.
15971 // TODO - isShuffleMaskInputInPlace could be extended to something like
15972 // this.
15973 int NumIdentityLanes = 0;
15974 bool OnlyShuffleLowestLane = true;
15975 for (int i = 0; i != NumLanes; ++i) {
15976 int LaneOffset = i * NumEltsPerLane;
15977 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15978 i * NumEltsPerLane))
15979 NumIdentityLanes++;
15980 else if (CrossLaneMask[LaneOffset] != 0)
15981 OnlyShuffleLowestLane = false;
15982 }
15983 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15984 return SDValue();
15985 }
15986
15987 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15988 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15989 InLaneMask);
15990 };
15991
15992 // First attempt a solution with full lanes.
15993 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15994 return V;
15995
15996 // The rest of the solutions use sublanes.
15997 if (!CanUseSublanes)
15998 return SDValue();
15999
16000 // Then attempt a solution with 64-bit sublanes (vpermq).
16001 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16002 return V;
16003
16004 // If that doesn't work and we have fast variable cross-lane shuffle,
16005 // attempt 32-bit sublanes (vpermd).
16006 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16007 return SDValue();
16008
16009 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16010}
16011
16012/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16013/// source with a lane permutation.
16014///
16015/// This lowering strategy results in four instructions in the worst case for a
16016/// single-input cross lane shuffle which is lower than any other fully general
16017/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16018/// shuffle pattern should be handled prior to trying this lowering.
16019static SDValue lowerShuffleAsLanePermuteAndShuffle(
16020 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16021 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16022 // FIXME: This should probably be generalized for 512-bit vectors as well.
16023 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16024 int Size = Mask.size();
16025 int LaneSize = Size / 2;
16026
16027 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16028 // Only do this if the elements aren't all from the lower lane,
16029 // otherwise we're (probably) better off doing a split.
16030 if (VT == MVT::v4f64 &&
16031 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16032 if (SDValue V =
16033 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16034 return V;
16035
16036 // If there are only inputs from one 128-bit lane, splitting will in fact be
16037 // less expensive. The flags track whether the given lane contains an element
16038 // that crosses to another lane.
16039 if (!Subtarget.hasAVX2()) {
16040 bool LaneCrossing[2] = {false, false};
16041 for (int i = 0; i < Size; ++i)
16042 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16043 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16044 if (!LaneCrossing[0] || !LaneCrossing[1])
16045 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16046 } else {
16047 bool LaneUsed[2] = {false, false};
16048 for (int i = 0; i < Size; ++i)
16049 if (Mask[i] >= 0)
16050 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16051 if (!LaneUsed[0] || !LaneUsed[1])
16052 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16053 }
16054
16055 // TODO - we could support shuffling V2 in the Flipped input.
16056 assert(V2.isUndef() &&
16057 "This last part of this routine only works on single input shuffles");
16058
16059 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16060 for (int i = 0; i < Size; ++i) {
16061 int &M = InLaneMask[i];
16062 if (M < 0)
16063 continue;
16064 if (((M % Size) / LaneSize) != (i / LaneSize))
16065 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16066 }
16067 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16068 "In-lane shuffle mask expected");
16069
16070 // Flip the lanes, and shuffle the results which should now be in-lane.
16071 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16072 SDValue Flipped = DAG.getBitcast(PVT, V1);
16073 Flipped =
16074 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16075 Flipped = DAG.getBitcast(VT, Flipped);
16076 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16077}
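// Worked example (illustrative): for v8f32 (Size = 8, LaneSize = 4) a mask
// element M = 6 requested in destination slot i = 1 crosses lanes, so it is
// rewritten to (6 % 4) + 0 * 4 + 8 = 10, i.e. element 2 of Flipped; because
// Flipped holds V1 with its 128-bit lanes swapped, that is exactly the
// original element 6, and the final shuffle stays in-lane.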
16078
16079/// Handle lowering 2-lane 128-bit shuffles.
16080static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16081 SDValue V2, ArrayRef<int> Mask,
16082 const APInt &Zeroable,
16083 const X86Subtarget &Subtarget,
16084 SelectionDAG &DAG) {
16085 if (V2.isUndef()) {
16086 // Attempt to match VBROADCAST*128 subvector broadcast load.
16087 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16088 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16089 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16090 MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
16091 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16092 if (!Ld->isNonTemporal()) {
16093 MVT MemVT = VT.getHalfNumVectorElementsVT();
16094 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16095 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
16096 SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
16097 TypeSize::Fixed(Ofs), DL);
16098 SDValue Ops[] = {Ld->getChain(), Ptr};
16099 SDValue BcastLd = DAG.getMemIntrinsicNode(
16100 X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
16101 DAG.getMachineFunction().getMachineMemOperand(
16102 Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
16103 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
16104 return BcastLd;
16105 }
16106 }
16107
16108 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16109 if (Subtarget.hasAVX2())
16110 return SDValue();
16111 }
16112
16113 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16114
16115 SmallVector<int, 4> WidenedMask;
16116 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16117 return SDValue();
16118
16119 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16120 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16121
16122 // Try to use an insert into a zero vector.
16123 if (WidenedMask[0] == 0 && IsHighZero) {
16124 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16125 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16126 DAG.getIntPtrConstant(0, DL));
16127 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16128 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16129 DAG.getIntPtrConstant(0, DL));
16130 }
16131
16132 // TODO: If minimizing size and one of the inputs is a zero vector and
16133 // the zero vector has only one use, we could use a VPERM2X128 to save the
16134 // instruction bytes needed to explicitly generate the zero vector.
16135
16136 // Blends are faster and handle all the non-lane-crossing cases.
16137 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16138 Subtarget, DAG))
16139 return Blend;
16140
16141 // If either input operand is a zero vector, use VPERM2X128 because its mask
16142 // allows us to replace the zero input with an implicit zero.
16143 if (!IsLowZero && !IsHighZero) {
16144 // Check for patterns which can be matched with a single insert of a 128-bit
16145 // subvector.
16146 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16147 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16148
16149 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16150 // this will likely become vinsertf128 which can't fold a 256-bit memop.
16151 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16152 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16153 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16154 OnlyUsesV1 ? V1 : V2,
16155 DAG.getIntPtrConstant(0, DL));
16156 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16157 DAG.getIntPtrConstant(2, DL));
16158 }
16159 }
16160
16161 // Try to use SHUF128 if possible.
16162 if (Subtarget.hasVLX()) {
16163 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16164 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16165 ((WidenedMask[1] % 2) << 1);
16166 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16167 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16168 }
16169 }
16170 }
16171
16172 // Otherwise form a 128-bit permutation. After accounting for undefs,
16173 // convert the 64-bit shuffle mask selection values into 128-bit
16174 // selection bits by dividing the indexes by 2 and shifting into positions
16175 // defined by a vperm2*128 instruction's immediate control byte.
16176
16177 // The immediate permute control byte looks like this:
16178 // [1:0] - select 128 bits from sources for low half of destination
16179 // [2] - ignore
16180 // [3] - zero low half of destination
16181 // [5:4] - select 128 bits from sources for high half of destination
16182 // [6] - ignore
16183 // [7] - zero high half of destination
16184
16185 assert((WidenedMask[0] >= 0 || IsLowZero) &&
16186 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16187
16188 unsigned PermMask = 0;
16189 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16190 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16191
16192 // Check the immediate mask and replace unused sources with undef.
16193 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16194 V1 = DAG.getUNDEF(VT);
16195 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16196 V2 = DAG.getUNDEF(VT);
16197
16198 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16199 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16200}
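// Worked example (illustrative): a widened mask of {1, 2} (V1's high 128 bits
// into the low half, V2's low 128 bits into the high half) yields
// PermMask = 1 | (2 << 4) = 0x21, while a zeroable high half sets bit 7
// (0x80) so vperm2f128/vperm2i128 writes zeros there instead of a source lane.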
16201
16202/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16203/// shuffling each lane.
16204///
16205/// This attempts to create a repeated lane shuffle where each lane uses one
16206/// or two of the lanes of the inputs. The lanes of the input vectors are
16207/// shuffled in one or two independent shuffles to get the lanes into the
16208/// position needed by the final shuffle.
16209static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16210 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16211 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16212 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
16213
16214 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16215 return SDValue();
16216
16217 int NumElts = Mask.size();
16218 int NumLanes = VT.getSizeInBits() / 128;
16219 int NumLaneElts = 128 / VT.getScalarSizeInBits();
16220 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16221 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16222
16223 // First pass will try to fill in the RepeatMask from lanes that need two
16224 // sources.
16225 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16226 int Srcs[2] = {-1, -1};
16227 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16228 for (int i = 0; i != NumLaneElts; ++i) {
16229 int M = Mask[(Lane * NumLaneElts) + i];
16230 if (M < 0)
16231 continue;
16232 // Determine which of the possible input lanes (NumLanes from each source)
16233 // this element comes from. Assign that as one of the sources for this
16234 // lane. We can assign up to 2 sources for this lane. If we run out of
16235 // sources we can't do anything.
16236 int LaneSrc = M / NumLaneElts;
16237 int Src;
16238 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16239 Src = 0;
16240 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16241 Src = 1;
16242 else
16243 return SDValue();
16244
16245 Srcs[Src] = LaneSrc;
16246 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16247 }
16248
16249 // If this lane has two sources, see if it fits with the repeat mask so far.
16250 if (Srcs[1] < 0)
16251 continue;
16252
16253 LaneSrcs[Lane][0] = Srcs[0];
16254 LaneSrcs[Lane][1] = Srcs[1];
16255
16256 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16257 assert(M1.size() == M2.size() && "Unexpected mask size");
16258 for (int i = 0, e = M1.size(); i != e; ++i)
16259 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16260 return false;
16261 return true;
16262 };
16263
16264 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16265 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16266 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16267 int M = Mask[i];
16268 if (M < 0)
16269 continue;
16270 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16271 "Unexpected mask element");
16272 MergedMask[i] = M;
16273 }
16274 };
16275
16276 if (MatchMasks(InLaneMask, RepeatMask)) {
16277 // Merge this lane mask into the final repeat mask.
16278 MergeMasks(InLaneMask, RepeatMask);
16279 continue;
16280 }
16281
16282 // Didn't find a match. Swap the operands and try again.
16283 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16284 ShuffleVectorSDNode::commuteMask(InLaneMask);
16285
16286 if (MatchMasks(InLaneMask, RepeatMask)) {
16287 // Merge this lane mask into the final repeat mask.
16288 MergeMasks(InLaneMask, RepeatMask);
16289 continue;
16290 }
16291
16292 // Couldn't find a match with the operands in either order.
16293 return SDValue();
16294 }
16295
16296 // Now handle any lanes with only one source.
16297 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16298 // If this lane has already been processed, skip it.
16299 if (LaneSrcs[Lane][0] >= 0)
16300 continue;
16301
16302 for (int i = 0; i != NumLaneElts; ++i) {
16303 int M = Mask[(Lane * NumLaneElts) + i];
16304 if (M < 0)
16305 continue;
16306
16307 // If RepeatMask isn't defined yet we can define it ourselves.
16308 if (RepeatMask[i] < 0)
16309 RepeatMask[i] = M % NumLaneElts;
16310
16311 if (RepeatMask[i] < NumElts) {
16312 if (RepeatMask[i] != M % NumLaneElts)
16313 return SDValue();
16314 LaneSrcs[Lane][0] = M / NumLaneElts;
16315 } else {
16316 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16317 return SDValue();
16318 LaneSrcs[Lane][1] = M / NumLaneElts;
16319 }
16320 }
16321
16322 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16323 return SDValue();
16324 }
16325
16326 SmallVector<int, 16> NewMask(NumElts, -1);
16327 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16328 int Src = LaneSrcs[Lane][0];
16329 for (int i = 0; i != NumLaneElts; ++i) {
16330 int M = -1;
16331 if (Src >= 0)
16332 M = Src * NumLaneElts + i;
16333 NewMask[Lane * NumLaneElts + i] = M;
16334 }
16335 }
16336 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16337 // Ensure we didn't get back the shuffle we started with.
16338 // FIXME: This is a hack to make up for some splat handling code in
16339 // getVectorShuffle.
16340 if (isa<ShuffleVectorSDNode>(NewV1) &&
16341 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16342 return SDValue();
16343
16344 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16345 int Src = LaneSrcs[Lane][1];
16346 for (int i = 0; i != NumLaneElts; ++i) {
16347 int M = -1;
16348 if (Src >= 0)
16349 M = Src * NumLaneElts + i;
16350 NewMask[Lane * NumLaneElts + i] = M;
16351 }
16352 }
16353 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16354 // Ensure we didn't get back the shuffle we started with.
16355 // FIXME: This is a hack to make up for some splat handling code in
16356 // getVectorShuffle.
16357 if (isa<ShuffleVectorSDNode>(NewV2) &&
16358 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16359 return SDValue();
16360
16361 for (int i = 0; i != NumElts; ++i) {
16362 NewMask[i] = RepeatMask[i % NumLaneElts];
16363 if (NewMask[i] < 0)
16364 continue;
16365
16366 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16367 }
16368 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16369}
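A worked example may help here (hypothetical mask, not taken from this analysis run): for a two-input v8f32 shuffle with mask <1,0,3,2, 13,12,15,14>, the lanes are gathered first and the repeating per-lane pattern is applied second, so the routine above would build roughly:

//   SDValue Lanes = DAG.getVectorShuffle(MVT::v8f32, DL, V1, V2,
//                                        {0, 1, 2, 3, 12, 13, 14, 15}); // V1's low lane, V2's high lane
//   return DAG.getVectorShuffle(MVT::v8f32, DL, Lanes, DAG.getUNDEF(MVT::v8f32),
//                               {1, 0, 3, 2, 5, 4, 7, 6});              // repeated in-lane swap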
16370
16371/// If the input shuffle mask results in a vector that is undefined in all upper
16372/// or lower half elements and that mask accesses only 2 halves of the
16373/// shuffle's operands, return true. A mask of half the width with mask indexes
16374/// adjusted to access the extracted halves of the original shuffle operands is
16375/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16376/// lower half of each input operand is accessed.
16377static bool
16378getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16379 int &HalfIdx1, int &HalfIdx2) {
16380 assert((Mask.size() == HalfMask.size() * 2) &&((void)0)
16381 "Expected input mask to be twice as long as output")((void)0);
16382
16383 // Exactly one half of the result must be undef to allow narrowing.
16384 bool UndefLower = isUndefLowerHalf(Mask);
16385 bool UndefUpper = isUndefUpperHalf(Mask);
16386 if (UndefLower == UndefUpper)
16387 return false;
16388
16389 unsigned HalfNumElts = HalfMask.size();
16390 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16391 HalfIdx1 = -1;
16392 HalfIdx2 = -1;
16393 for (unsigned i = 0; i != HalfNumElts; ++i) {
16394 int M = Mask[i + MaskIndexOffset];
16395 if (M < 0) {
16396 HalfMask[i] = M;
16397 continue;
16398 }
16399
16400 // Determine which of the 4 half vectors this element is from.
16401 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16402 int HalfIdx = M / HalfNumElts;
16403
16404 // Determine the element index into its half vector source.
16405 int HalfElt = M % HalfNumElts;
16406
16407 // We can shuffle with up to 2 half vectors, set the new 'half'
16408 // shuffle mask accordingly.
16409 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16410 HalfMask[i] = HalfElt;
16411 HalfIdx1 = HalfIdx;
16412 continue;
16413 }
16414 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16415 HalfMask[i] = HalfElt + HalfNumElts;
16416 HalfIdx2 = HalfIdx;
16417 continue;
16418 }
16419
16420 // Too many half vectors referenced.
16421 return false;
16422 }
16423
16424 return true;
16425}
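A small usage sketch under assumed values (the wrapper function below is hypothetical and only illustrates the contract described in the comment above):

static void halfShuffleMaskExample() {
  // v8f32 mask whose lower half is undef; the defined half reads the low
  // halves of both operands.
  int Mask[8] = {-1, -1, -1, -1, 2, 10, 3, 11};
  SmallVector<int, 4> HalfMask(4);
  int HalfIdx1, HalfIdx2;
  bool Ok = getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2);
  // Ok == true, HalfIdx1 == 0 (lower half of V1), HalfIdx2 == 2 (lower half
  // of V2), and HalfMask == {2, 6, 3, 7}.
  (void)Ok;
}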
16426
16427/// Given the output values from getHalfShuffleMask(), create a half width
16428/// shuffle of extracted vectors followed by an insert back to full width.
16429static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16430 ArrayRef<int> HalfMask, int HalfIdx1,
16431 int HalfIdx2, bool UndefLower,
16432 SelectionDAG &DAG, bool UseConcat = false) {
16433 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?")((void)0);
16434 assert(V1.getValueType().isSimple() && "Expecting only simple types")((void)0);
16435
16436 MVT VT = V1.getSimpleValueType();
16437 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16438 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16439
16440 auto getHalfVector = [&](int HalfIdx) {
16441 if (HalfIdx < 0)
16442 return DAG.getUNDEF(HalfVT);
16443 SDValue V = (HalfIdx < 2 ? V1 : V2);
16444 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16445 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16446 DAG.getIntPtrConstant(HalfIdx, DL));
16447 };
16448
16449 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16450 SDValue Half1 = getHalfVector(HalfIdx1);
16451 SDValue Half2 = getHalfVector(HalfIdx2);
16452 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16453 if (UseConcat) {
16454 SDValue Op0 = V;
16455 SDValue Op1 = DAG.getUNDEF(HalfVT);
16456 if (UndefLower)
16457 std::swap(Op0, Op1);
16458 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16459 }
16460
16461 unsigned Offset = UndefLower ? HalfNumElts : 0;
16462 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16463 DAG.getIntPtrConstant(Offset, DL));
16464}
16465
16466/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16467/// This allows for fast cases such as subvector extraction/insertion
16468/// or shuffling smaller vector types which can lower more efficiently.
16469static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16470 SDValue V2, ArrayRef<int> Mask,
16471 const X86Subtarget &Subtarget,
16472 SelectionDAG &DAG) {
16473 assert((VT.is256BitVector() || VT.is512BitVector()) &&((void)0)
16474 "Expected 256-bit or 512-bit vector")((void)0);
16475
16476 bool UndefLower = isUndefLowerHalf(Mask);
16477 if (!UndefLower && !isUndefUpperHalf(Mask))
16478 return SDValue();
16479
16480 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&((void)0)
16481 "Completely undef shuffle mask should have been simplified already")((void)0);
16482
16483 // Upper half is undef and lower half is whole upper subvector.
16484 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16485 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16486 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16487 if (!UndefLower &&
16488 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16489 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16490 DAG.getIntPtrConstant(HalfNumElts, DL));
16491 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16492 DAG.getIntPtrConstant(0, DL));
16493 }
16494
16495 // Lower half is undef and upper half is whole lower subvector.
16496 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16497 if (UndefLower &&
16498 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16499 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16500 DAG.getIntPtrConstant(0, DL));
16501 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16502 DAG.getIntPtrConstant(HalfNumElts, DL));
16503 }
16504
16505 int HalfIdx1, HalfIdx2;
16506 SmallVector<int, 8> HalfMask(HalfNumElts);
16507 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16508 return SDValue();
16509
16510 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length")((void)0);
16511
16512 // Only shuffle the halves of the inputs when useful.
16513 unsigned NumLowerHalves =
16514 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16515 unsigned NumUpperHalves =
16516 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16517 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed")((void)0);
16518
16519 // Determine the larger pattern of undef/halves, then decide if it's worth
16520 // splitting the shuffle based on subtarget capabilities and types.
16521 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16522 if (!UndefLower) {
16523 // XXXXuuuu: no insert is needed.
16524 // Always extract lowers when setting lower - these are all free subreg ops.
16525 if (NumUpperHalves == 0)
16526 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16527 UndefLower, DAG);
16528
16529 if (NumUpperHalves == 1) {
16530 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16531 if (Subtarget.hasAVX2()) {
16532 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16533 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16534 !is128BitUnpackShuffleMask(HalfMask) &&
16535 (!isSingleSHUFPSMask(HalfMask) ||
16536 Subtarget.hasFastVariableCrossLaneShuffle()))
16537 return SDValue();
16538 // If this is a unary shuffle (assume that the 2nd operand is
16539 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16540 // are better off extracting the upper half of 1 operand and using a
16541 // narrow shuffle.
16542 if (EltWidth == 64 && V2.isUndef())
16543 return SDValue();
16544 }
16545 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16546 if (Subtarget.hasAVX512() && VT.is512BitVector())
16547 return SDValue();
16548 // Extract + narrow shuffle is better than the wide alternative.
16549 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16550 UndefLower, DAG);
16551 }
16552
16553 // Don't extract both uppers; instead shuffle and then extract.
16554 assert(NumUpperHalves == 2 && "Half vector count went wrong")((void)0);
16555 return SDValue();
16556 }
16557
16558 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16559 if (NumUpperHalves == 0) {
16560 // AVX2 has efficient 64-bit element cross-lane shuffles.
16561 // TODO: Refine to account for unary shuffle, splat, and other masks?
16562 if (Subtarget.hasAVX2() && EltWidth == 64)
16563 return SDValue();
16564 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16565 if (Subtarget.hasAVX512() && VT.is512BitVector())
16566 return SDValue();
16567 // Narrow shuffle + insert is better than the wide alternative.
16568 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16569 UndefLower, DAG);
16570 }
16571
16572 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16573 return SDValue();
16574}
16575
16576/// Test whether the specified input (0 or 1) is in-place blended by the
16577/// given mask.
16578///
16579/// This returns true if the elements from a particular input are already in the
16580 /// slots required by the given mask and require no permutation.
16581static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16582 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.")((void)0);
16583 int Size = Mask.size();
16584 for (int i = 0; i < Size; ++i)
16585 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16586 return false;
16587
16588 return true;
16589}
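For illustration, a hypothetical mask and wrapper (not part of this file):

static void inPlaceMaskExample() {
  // v4f64 mask <0, 5, 2, 7>: V1 supplies slots 0 and 2 from its own elements
  // 0 and 2, and V2 supplies slots 1 and 3 from its elements 1 and 3
  // (5 % 4 == 1, 7 % 4 == 3), so both inputs count as "in place".
  int Mask[4] = {0, 5, 2, 7};
  bool V1InPlace = isShuffleMaskInputInPlace(0, Mask); // true
  bool V2InPlace = isShuffleMaskInputInPlace(1, Mask); // true
  (void)V1InPlace;
  (void)V2InPlace;
}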
16590
16591 /// Handle the case where the shuffle sources for each 128-bit lane come from a
16592 /// single source lane and every lane can be represented by the same repeating
16593 /// mask, allowing us to shuffle the sources with the repeating shuffle and then
16594 /// permute the result into the destination lanes.
16595static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16596 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16597 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16598 int NumElts = VT.getVectorNumElements();
16599 int NumLanes = VT.getSizeInBits() / 128;
16600 int NumLaneElts = NumElts / NumLanes;
16601
16602 // On AVX2 we may be able to just shuffle the lowest elements and then
16603 // broadcast the result.
16604 if (Subtarget.hasAVX2()) {
16605 for (unsigned BroadcastSize : {16, 32, 64}) {
16606 if (BroadcastSize <= VT.getScalarSizeInBits())
16607 continue;
16608 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16609
16610 // Attempt to match a repeating pattern every NumBroadcastElts,
16611 // accounting for UNDEFs, that references only the lowest 128-bit
16612 // lane of the inputs.
16613 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16614 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16615 for (int j = 0; j != NumBroadcastElts; ++j) {
16616 int M = Mask[i + j];
16617 if (M < 0)
16618 continue;
16619 int &R = RepeatMask[j];
16620 if (0 != ((M % NumElts) / NumLaneElts))
16621 return false;
16622 if (0 <= R && R != M)
16623 return false;
16624 R = M;
16625 }
16626 return true;
16627 };
16628
16629 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16630 if (!FindRepeatingBroadcastMask(RepeatMask))
16631 continue;
16632
16633 // Shuffle the (lowest) repeated elements in place for broadcast.
16634 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16635
16636 // Shuffle the actual broadcast.
16637 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16638 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16639 for (int j = 0; j != NumBroadcastElts; ++j)
16640 BroadcastMask[i + j] = j;
16641 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16642 BroadcastMask);
16643 }
16644 }
16645
16646 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16647 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16648 return SDValue();
16649
16650 // Bail if we already have a repeated lane shuffle mask.
16651 SmallVector<int, 8> RepeatedShuffleMask;
16652 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
16653 return SDValue();
16654
16655 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16656 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
16657 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
16658 int NumSubLanes = NumLanes * SubLaneScale;
16659 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16660
16661 // Check that all the sources are coming from the same lane and see if we can
16662 // form a repeating shuffle mask (local to each sub-lane). At the same time,
16663 // determine the source sub-lane for each destination sub-lane.
16664 int TopSrcSubLane = -1;
16665 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16666 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
16667 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
16668 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
16669
16670 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16671 // Extract the sub-lane mask, check that it all comes from the same lane
16672 // and normalize the mask entries to come from the first lane.
16673 int SrcLane = -1;
16674 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16675 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16676 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16677 if (M < 0)
16678 continue;
16679 int Lane = (M % NumElts) / NumLaneElts;
16680 if ((0 <= SrcLane) && (SrcLane != Lane))
16681 return SDValue();
16682 SrcLane = Lane;
16683 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16684 SubLaneMask[Elt] = LocalM;
16685 }
16686
16687 // Whole sub-lane is UNDEF.
16688 if (SrcLane < 0)
16689 continue;
16690
16691 // Attempt to match against the candidate repeated sub-lane masks.
16692 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16693 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16694 for (int i = 0; i != NumSubLaneElts; ++i) {
16695 if (M1[i] < 0 || M2[i] < 0)
16696 continue;
16697 if (M1[i] != M2[i])
16698 return false;
16699 }
16700 return true;
16701 };
16702
16703 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16704 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16705 continue;
16706
16707 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16708 for (int i = 0; i != NumSubLaneElts; ++i) {
16709 int M = SubLaneMask[i];
16710 if (M < 0)
16711 continue;
16712 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&((void)0)
16713 "Unexpected mask element")((void)0);
16714 RepeatedSubLaneMask[i] = M;
16715 }
16716
16717 // Track the topmost source sub-lane; by setting the remaining to UNDEF
16718 // we can greatly simplify shuffle matching.
16719 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16720 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16721 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16722 break;
16723 }
16724
16725 // Bail if we failed to find a matching repeated sub-lane mask.
16726 if (Dst2SrcSubLanes[DstSubLane] < 0)
16727 return SDValue();
16728 }
16729 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&((void)0)
16730 "Unexpected source lane")((void)0);
16731
16732 // Create a repeating shuffle mask for the entire vector.
16733 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16734 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16735 int Lane = SubLane / SubLaneScale;
16736 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16737 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16738 int M = RepeatedSubLaneMask[Elt];
16739 if (M < 0)
16740 continue;
16741 int Idx = (SubLane * NumSubLaneElts) + Elt;
16742 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16743 }
16744 }
16745 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16746
16747 // Shuffle each source sub-lane to its destination.
16748 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16749 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16750 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16751 if (SrcSubLane < 0)
16752 continue;
16753 for (int j = 0; j != NumSubLaneElts; ++j)
16754 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16755 }
16756
16757 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16758 SubLaneMask);
16759}
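As a worked example (hypothetical single-input mask): a v8f32 mask <4,4,6,6, 0,0,2,2> crosses lanes, but every lane repeats the pattern <0,0,2,2>, so the routine above would emit roughly:

//   SDValue Rep = DAG.getVectorShuffle(MVT::v8f32, DL, V1, V2,
//                                      {0, 0, 2, 2, 4, 4, 6, 6}); // repeated in-lane shuffle
//   return DAG.getVectorShuffle(MVT::v8f32, DL, Rep, DAG.getUNDEF(MVT::v8f32),
//                               {4, 5, 6, 7, 0, 1, 2, 3});        // swap the 128-bit lanes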
16760
16761static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16762 bool &ForceV1Zero, bool &ForceV2Zero,
16763 unsigned &ShuffleImm, ArrayRef<int> Mask,
16764 const APInt &Zeroable) {
16765 int NumElts = VT.getVectorNumElements();
16766 assert(VT.getScalarSizeInBits() == 64 &&((void)0)
16767 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&((void)0)
16768 "Unexpected data type for VSHUFPD")((void)0);
16769 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&((void)0)
16770 "Illegal shuffle mask")((void)0);
16771
16772 bool ZeroLane[2] = { true, true };
16773 for (int i = 0; i < NumElts; ++i)
16774 ZeroLane[i & 1] &= Zeroable[i];
16775
16776 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
16777 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ...
16778 ShuffleImm = 0;
16779 bool ShufpdMask = true;
16780 bool CommutableMask = true;
16781 for (int i = 0; i < NumElts; ++i) {
16782 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16783 continue;
16784 if (Mask[i] < 0)
16785 return false;
16786 int Val = (i & 6) + NumElts * (i & 1);
16787 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16788 if (Mask[i] < Val || Mask[i] > Val + 1)
16789 ShufpdMask = false;
16790 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16791 CommutableMask = false;
16792 ShuffleImm |= (Mask[i] % 2) << i;
16793 }
16794
16795 if (!ShufpdMask && !CommutableMask)
16796 return false;
16797
16798 if (!ShufpdMask && CommutableMask)
16799 std::swap(V1, V2);
16800
16801 ForceV1Zero = ZeroLane[0];
16802 ForceV2Zero = ZeroLane[1];
16803 return true;
16804}
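A worked example of the immediate construction (hypothetical mask): for v4f64 and Mask = <1, 5, 2, 7>, even slots must read V1 and odd slots V2 within each 128-bit lane, and the immediate collects the low/high selection bit of every slot:

//   ShuffleImm = (1 % 2) << 0 | (5 % 2) << 1 | (2 % 2) << 2 | (7 % 2) << 3 == 0xB,
//   giving the result { V1[1], V2[1], V1[2], V2[3] }.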
16805
16806static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16807 SDValue V2, ArrayRef<int> Mask,
16808 const APInt &Zeroable,
16809 const X86Subtarget &Subtarget,
16810 SelectionDAG &DAG) {
16811 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&((void)0)
16812 "Unexpected data type for VSHUFPD")((void)0);
16813
16814 unsigned Immediate = 0;
16815 bool ForceV1Zero = false, ForceV2Zero = false;
16816 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16817 Mask, Zeroable))
16818 return SDValue();
16819
16820 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16821 if (ForceV1Zero)
16822 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16823 if (ForceV2Zero)
16824 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16825
16826 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16827 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16828}
16829
16830 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16831// by zeroable elements in the remaining 24 elements. Turn this into two
16832// vmovqb instructions shuffled together.
16833static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16834 SDValue V1, SDValue V2,
16835 ArrayRef<int> Mask,
16836 const APInt &Zeroable,
16837 SelectionDAG &DAG) {
16838 assert(VT == MVT::v32i8 && "Unexpected type!")((void)0);
16839
16840 // The first 8 indices should be every 8th element.
16841 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16842 return SDValue();
16843
16844 // Remaining elements need to be zeroable.
16845 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
16846 return SDValue();
16847
16848 V1 = DAG.getBitcast(MVT::v4i64, V1);
16849 V2 = DAG.getBitcast(MVT::v4i64, V2);
16850
16851 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16852 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16853
16854 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16855 // the upper bits of the result using an unpckldq.
16856 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16857 { 0, 1, 2, 3, 16, 17, 18, 19,
16858 4, 5, 6, 7, 20, 21, 22, 23 });
16859 // Insert the unpckldq into a zero vector to widen to v32i8.
16860 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16861 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16862 DAG.getIntPtrConstant(0, DL));
16863}
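To make the unpack mask above less opaque, here is how the bytes land for the intended pattern (a summary of the code above, not output from this analysis run):

//   result bytes 0-3   = V1 elements {0, 8, 16, 24}  (low byte of each i64)
//   result bytes 4-7   = V2 elements {0, 8, 16, 24}  (elements 32, 40, 48, 56 of the concatenated inputs)
//   result bytes 8-15  = zeros carried over from the truncations
//   result bytes 16-31 = zeros from the surrounding zero vector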
16864
16865
16866/// Handle lowering of 4-lane 64-bit floating point shuffles.
16867///
16868/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16869/// isn't available.
16870static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16871 const APInt &Zeroable, SDValue V1, SDValue V2,
16872 const X86Subtarget &Subtarget,
16873 SelectionDAG &DAG) {
16874 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")((void)0);
16875 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")((void)0);
16876 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
16877
16878 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16879 Subtarget, DAG))
16880 return V;
16881
16882 if (V2.isUndef()) {
16883 // Check for being able to broadcast a single element.
16884 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16885 Mask, Subtarget, DAG))
16886 return Broadcast;
16887
16888 // Use low duplicate instructions for masks that match their pattern.
16889 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16890 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16891
16892 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16893 // Non-half-crossing single input shuffles can be lowered with an
16894 // interleaved permutation.
16895 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16896 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16897 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16898 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16899 }
16900
16901 // With AVX2 we have direct support for this permutation.
16902 if (Subtarget.hasAVX2())
16903 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16904 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16905
16906 // Try to create an in-lane repeating shuffle mask and then shuffle the
16907 // results into the target lanes.
16908 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16909 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16910 return V;
16911
16912 // Try to permute the lanes and then use a per-lane permute.
16913 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16914 Mask, DAG, Subtarget))
16915 return V;
16916
16917 // Otherwise, fall back.
16918 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16919 DAG, Subtarget);
16920 }
16921
16922 // Use dedicated unpack instructions for masks that match their pattern.
16923 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
16924 return V;
16925
16926 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16927 Zeroable, Subtarget, DAG))
16928 return Blend;
16929
16930 // Check if the blend happens to exactly fit that of SHUFPD.
16931 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16932 Zeroable, Subtarget, DAG))
16933 return Op;
16934
16935 // If we have lane-crossing shuffles AND they don't all come from the lower
16936 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16937 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16938 // canonicalizes to a blend of splats, which isn't necessary for this combine.
16939 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16940 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16941 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16942 (V2.getOpcode() != ISD::BUILD_VECTOR))
16943 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
16944 Mask, DAG))
16945 return Op;
16946
16947 // If we have one input in place, then we can permute the other input and
16948 // blend the result.
16949 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16950 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16951 Subtarget, DAG);
16952
16953 // Try to create an in-lane repeating shuffle mask and then shuffle the
16954 // results into the target lanes.
16955 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16956 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16957 return V;
16958
16959 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16960 // shuffle. However, if we have AVX2 and either input is already in place,
16961 // we will be able to shuffle the other input even across lanes in a single
16962 // instruction, so skip this pattern.
16963 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
16964 isShuffleMaskInputInPlace(1, Mask))))
16965 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16966 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16967 return V;
16968
16969 // If we have VLX support, we can use VEXPAND.
16970 if (Subtarget.hasVLX())
16971 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
16972 DAG, Subtarget))
16973 return V;
16974
16975 // If we have AVX2 then we always want to lower with a blend because at v4 we
16976 // can fully permute the elements.
16977 if (Subtarget.hasAVX2())
16978 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16979 Subtarget, DAG);
16980
16981 // Otherwise fall back on generic lowering.
16982 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
16983 Subtarget, DAG);
16984}
16985
16986/// Handle lowering of 4-lane 64-bit integer shuffles.
16987///
16988/// This routine is only called when we have AVX2 and thus a reasonable
16989 /// instruction set for v4i64 shuffling.
16990static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16991 const APInt &Zeroable, SDValue V1, SDValue V2,
16992 const X86Subtarget &Subtarget,
16993 SelectionDAG &DAG) {
16994 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")((void)0);
16995 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")((void)0);
16996 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
16997 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!")((void)0);
16998
16999 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17000 Subtarget, DAG))
17001 return V;
17002
17003 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17004 Zeroable, Subtarget, DAG))
17005 return Blend;
17006
17007 // Check for being able to broadcast a single element.
17008 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17009 Subtarget, DAG))
17010 return Broadcast;
17011
17012 if (V2.isUndef()) {
17013 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17014 // can use lower latency instructions that will operate on both lanes.
17015 SmallVector<int, 2> RepeatedMask;
17016 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17017 SmallVector<int, 4> PSHUFDMask;
17018 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17019 return DAG.getBitcast(
17020 MVT::v4i64,
17021 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17022 DAG.getBitcast(MVT::v8i32, V1),
17023 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17024 }
17025
17026 // AVX2 provides a direct instruction for permuting a single input across
17027 // lanes.
17028 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17029 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17030 }
17031
17032 // Try to use shift instructions.
17033 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
17034 Zeroable, Subtarget, DAG))
17035 return Shift;
17036
17037 // If we have VLX support, we can use VALIGN or VEXPAND.
17038 if (Subtarget.hasVLX()) {
17039 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17040 Subtarget, DAG))
17041 return Rotate;
17042
17043 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17044 DAG, Subtarget))
17045 return V;
17046 }
17047
17048 // Try to use PALIGNR.
17049 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17050 Subtarget, DAG))
17051 return Rotate;
17052
17053 // Use dedicated unpack instructions for masks that match their pattern.
17054 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
17055 return V;
17056
17057 // If we have one input in place, then we can permute the other input and
17058 // blend the result.
17059 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17060 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17061 Subtarget, DAG);
17062
17063 // Try to create an in-lane repeating shuffle mask and then shuffle the
17064 // results into the target lanes.
17065 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17066 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17067 return V;
17068
17069 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17070 // shuffle. However, if we have AVX2 and either input is already in place,
17071 // we will be able to shuffle the other input even across lanes in a single
17072 // instruction, so skip this pattern.
17073 if (!isShuffleMaskInputInPlace(0, Mask) &&
17074 !isShuffleMaskInputInPlace(1, Mask))
17075 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17076 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17077 return Result;
17078
17079 // Otherwise fall back on generic blend lowering.
17080 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17081 Subtarget, DAG);
17082}
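For the single-input lane-repeated path above, a quick worked example (hypothetical mask): a v4i64 mask <1, 0, 3, 2> has the repeated mask <1, 0>, which narrowShuffleMaskElts expands into the v8i32 mask <2, 3, 0, 1>, i.e. a single PSHUFD:

//   imm8 = 2 | (3 << 2) | (0 << 4) | (1 << 6) == 0x4E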
17083
17084/// Handle lowering of 8-lane 32-bit floating point shuffles.
17085///
17086/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17087/// isn't available.
17088static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17089 const APInt &Zeroable, SDValue V1, SDValue V2,
17090 const X86Subtarget &Subtarget,
17091 SelectionDAG &DAG) {
17092 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")((void)0);
17093 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")((void)0);
17094 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
17095
17096 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17097 Zeroable, Subtarget, DAG))
17098 return Blend;
17099
17100 // Check for being able to broadcast a single element.
17101 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17102 Subtarget, DAG))
17103 return Broadcast;
17104
17105 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17106 // options to efficiently lower the shuffle.
17107 SmallVector<int, 4> RepeatedMask;
17108 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17109 assert(RepeatedMask.size() == 4 &&((void)0)
17110 "Repeated masks must be half the mask width!")((void)0);
17111
17112 // Use even/odd duplicate instructions for masks that match their pattern.
17113 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17114 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17115 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17116 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17117
17118 if (V2.isUndef())
17119 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17120 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17121
17122 // Use dedicated unpack instructions for masks that match their pattern.
17123 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17124 return V;
17125
17126 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17127 // have already handled any direct blends.
17128 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17129 }
17130
17131 // Try to create an in-lane repeating shuffle mask and then shuffle the
17132 // results into the target lanes.
17133 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17134 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17135 return V;
17136
17137 // If we have a single-input shuffle with different shuffle patterns in the
17138 // two 128-bit lanes, use a variable mask with VPERMILPS.
17139 if (V2.isUndef()) {
17140 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17141 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17142 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17143 }
17144 if (Subtarget.hasAVX2()) {
17145 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17146 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17147 }
17148 // Otherwise, fall back.
17149 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17150 DAG, Subtarget);
17151 }
17152
17153 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17154 // shuffle.
17155 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17156 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17157 return Result;
17158
17159 // If we have VLX support, we can use VEXPAND.
17160 if (Subtarget.hasVLX())
17161 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17162 DAG, Subtarget))
17163 return V;
17164
17165 // For non-AVX512, if the mask matches a 16-bit-element (word) unpack pattern,
17166 // try to split, since after the split we get more efficient code using
17167 // vpunpcklwd and vpunpckhwd instructions than with vblend.
17168 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17169 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17170 DAG);
17171
17172 // If we have AVX2 then we always want to lower with a blend because at v8 we
17173 // can fully permute the elements.
17174 if (Subtarget.hasAVX2())
17175 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17176 Subtarget, DAG);
17177
17178 // Otherwise fall back on generic lowering.
17179 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17180 Subtarget, DAG);
17181}
17182
17183/// Handle lowering of 8-lane 32-bit integer shuffles.
17184///
17185/// This routine is only called when we have AVX2 and thus a reasonable
17186 /// instruction set for v8i32 shuffling.
17187static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17188 const APInt &Zeroable, SDValue V1, SDValue V2,
17189 const X86Subtarget &Subtarget,
17190 SelectionDAG &DAG) {
17191 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")((void)0);
17192 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")((void)0);
17193 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
17194 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!")((void)0);
17195
17196 // Whenever we can lower this as a zext, that instruction is strictly faster
17197 // than any alternative. It also allows us to fold memory operands into the
17198 // shuffle in many cases.
17199 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17200 Zeroable, Subtarget, DAG))
17201 return ZExt;
17202
17203 // For non-AVX512, if the mask matches a 16-bit-element (word) unpack pattern,
17204 // try to split, since after the split we get more efficient code than vblend
17205 // by using vpunpcklwd and vpunpckhwd instructions.
17206 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17207 !Subtarget.hasAVX512())
17208 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17209 DAG);
17210
17211 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17212 Zeroable, Subtarget, DAG))
17213 return Blend;
17214
17215 // Check for being able to broadcast a single element.
17216 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17217 Subtarget, DAG))
17218 return Broadcast;
17219
17220 // If the shuffle mask is repeated in each 128-bit lane we can use more
17221 // efficient instructions that mirror the shuffles across the two 128-bit
17222 // lanes.
17223 SmallVector<int, 4> RepeatedMask;
17224 bool Is128BitLaneRepeatedShuffle =
17225 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17226 if (Is128BitLaneRepeatedShuffle) {
17227 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((void)0);
17228 if (V2.isUndef())
17229 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17230 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17231
17232 // Use dedicated unpack instructions for masks that match their pattern.
17233 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17234 return V;
17235 }
17236
17237 // Try to use shift instructions.
17238 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17239 Zeroable, Subtarget, DAG))
17240 return Shift;
17241
17242 // If we have VLX support, we can use VALIGN or EXPAND.
17243 if (Subtarget.hasVLX()) {
17244 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17245 Subtarget, DAG))
17246 return Rotate;
17247
17248 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17249 DAG, Subtarget))
17250 return V;
17251 }
17252
17253 // Try to use byte rotation instructions.
17254 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17255 Subtarget, DAG))
17256 return Rotate;
17257
17258 // Try to create an in-lane repeating shuffle mask and then shuffle the
17259 // results into the target lanes.
17260 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17261 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17262 return V;
17263
17264 if (V2.isUndef()) {
17265 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17266 // because that should be faster than the variable permute alternatives.
17267 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17268 return V;
17269
17270 // If the shuffle patterns aren't repeated but it's a single input, directly
17271 // generate a cross-lane VPERMD instruction.
17272 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17273 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17274 }
17275
17276 // Assume that a single SHUFPS is faster than an alternative sequence of
17277 // multiple instructions (even if the CPU has a domain penalty).
17278 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17279 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17280 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17281 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17282 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17283 CastV1, CastV2, DAG);
17284 return DAG.getBitcast(MVT::v8i32, ShufPS);
17285 }
17286
17287 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17288 // shuffle.
17289 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17290 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17291 return Result;
17292
17293 // Otherwise fall back on generic blend lowering.
17294 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17295 Subtarget, DAG);
17296}
17297
17298/// Handle lowering of 16-lane 16-bit integer shuffles.
17299///
17300/// This routine is only called when we have AVX2 and thus a reasonable
17301 /// instruction set for v16i16 shuffling.
17302static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17303 const APInt &Zeroable, SDValue V1, SDValue V2,
17304 const X86Subtarget &Subtarget,
17305 SelectionDAG &DAG) {
17306 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")((void)0);
17307 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")((void)0);
17308 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((void)0);
17309 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!")((void)0);
17310
17311 // Whenever we can lower this as a zext, that instruction is strictly faster
17312 // than any alternative. It also allows us to fold memory operands into the
17313 // shuffle in many cases.
17314 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17315 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17316 return ZExt;
17317
17318 // Check for being able to broadcast a single element.
17319 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17320 Subtarget, DAG))
17321 return Broadcast;
17322
17323 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17324 Zeroable, Subtarget, DAG))
17325 return Blend;
17326
17327 // Use dedicated unpack instructions for masks that match their pattern.
17328 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17329 return V;
17330
17331 // Use dedicated pack instructions for masks that match their pattern.
17332 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17333 Subtarget))
17334 return V;
17335
17336 // Try to lower using a truncation.
17337 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17338 Subtarget, DAG))
17339 return V;
17340
17341 // Try to use shift instructions.
17342 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17343 Zeroable, Subtarget, DAG))
17344 return Shift;
17345
17346 // Try to use byte rotation instructions.
17347 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17348 Subtarget, DAG))
17349 return Rotate;
17350
17351 // Try to create an in-lane repeating shuffle mask and then shuffle the
17352 // results into the target lanes.
17353 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17354 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17355 return V;
17356
17357 if (V2.isUndef()) {
17358 // Try to use bit rotation instructions.
17359 if (SDValue Rotate =
17360 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17361 return Rotate;
17362
17363 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17364 // because that should be faster than the variable permute alternatives.
17365 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17366 return V;
17367
17368 // There are no generalized cross-lane shuffle operations available on i16
17369 // element types.
17370 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17371 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17372 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17373 return V;
17374
17375 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17376 DAG, Subtarget);
17377 }
17378
17379 SmallVector<int, 8> RepeatedMask;
17380 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17381 // As this is a single-input shuffle, the repeated mask should be
17382 // a strictly valid v8i16 mask that we can pass through to the v8i16
17383 // lowering to handle even the v16 case.
17384 return lowerV8I16GeneralSingleInputShuffle(
17385 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17386 }
17387 }
17388
17389 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17390 Zeroable, Subtarget, DAG))
17391 return PSHUFB;
17392
17393 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17394 if (Subtarget.hasBWI())
17395 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17396
17397 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17398 // shuffle.
17399 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17400 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17401 return Result;
17402
17403 // Try to permute the lanes and then use a per-lane permute.
17404 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17405 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17406 return V;
17407
17408 // Otherwise fall back on generic lowering.
17409 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17410 Subtarget, DAG);
17411}
17412
17413/// Handle lowering of 32-lane 8-bit integer shuffles.
17414///
17415/// This routine is only called when we have AVX2 and thus a reasonable
17416 /// instruction set for v32i8 shuffling.
17417static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17418 const APInt &Zeroable, SDValue V1, SDValue V2,
17419 const X86Subtarget &Subtarget,
17420 SelectionDAG &DAG) {
17421 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")((void)0);
17422 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")((void)0);
17423 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")((void)0);
17424 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!")((void)0);
17425
17426 // Whenever we can lower this as a zext, that instruction is strictly faster
17427 // than any alternative. It also allows us to fold memory operands into the
17428 // shuffle in many cases.
17429 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17430 Zeroable, Subtarget, DAG))
17431 return ZExt;
17432
17433 // Check for being able to broadcast a single element.
17434 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17435 Subtarget, DAG))
17436 return Broadcast;
17437
17438 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17439 Zeroable, Subtarget, DAG))
17440 return Blend;
17441
17442 // Use dedicated unpack instructions for masks that match their pattern.
17443 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
17444 return V;
17445
17446 // Use dedicated pack instructions for masks that match their pattern.
17447 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
17448 Subtarget))
17449 return V;
17450
17451 // Try to lower using a truncation.
17452 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17453 Subtarget, DAG))
17454 return V;
17455
17456 // Try to use shift instructions.
17457 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
17458 Zeroable, Subtarget, DAG))
17459 return Shift;
17460
17461 // Try to use byte rotation instructions.
17462 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17463 Subtarget, DAG))
17464 return Rotate;
17465
17466 // Try to use bit rotation instructions.
17467 if (V2.isUndef())
17468 if (SDValue Rotate =
17469 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17470 return Rotate;
17471
17472 // Try to create an in-lane repeating shuffle mask and then shuffle the
17473 // results into the target lanes.
17474 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17475 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17476 return V;
17477
17478 // There are no generalized cross-lane shuffle operations available on i8
17479 // element types.
17480 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17481 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17482 // because that should be faster than the variable permute alternatives.
17483 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
17484 return V;
17485
17486 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17487 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17488 return V;
17489
17490 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17491 DAG, Subtarget);
17492 }
17493
17494 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17495 Zeroable, Subtarget, DAG))
17496 return PSHUFB;
17497
17498 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17499 if (Subtarget.hasVBMI())
17500 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17501
17502 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17503 // shuffle.
17504 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17505 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17506 return Result;
17507
17508 // Try to permute the lanes and then use a per-lane permute.
17509 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17510 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17511 return V;
17512
17513 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17514 // by zeroable elements in the remaining 24 elements. Turn this into two
17515 // vmovqb instructions shuffled together.
17516 if (Subtarget.hasVLX())
17517 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17518 Mask, Zeroable, DAG))
17519 return V;
17520
17521 // Otherwise fall back on generic lowering.
17522 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
17523 Subtarget, DAG);
17524}
17525
17526/// High-level routine to lower various 256-bit x86 vector shuffles.
17527///
17528/// This routine either breaks down the specific type of a 256-bit x86 vector
17529/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17530/// together based on the available instructions.
17531static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17532 SDValue V1, SDValue V2, const APInt &Zeroable,
17533 const X86Subtarget &Subtarget,
17534 SelectionDAG &DAG) {
17535 // If we have a single input to the zero element, insert that into V1 if we
17536 // can do so cheaply.
17537 int NumElts = VT.getVectorNumElements();
17538 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17539
17540 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17541 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17542 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17543 return Insertion;
17544
17545 // Handle special cases where the lower or upper half is UNDEF.
17546 if (SDValue V =
17547 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17548 return V;
17549
17550 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17551 // can check for those subtargets here and avoid much of the subtarget
17552 // querying in the per-vector-type lowering routines. With AVX1 we have
17553 // essentially *zero* ability to manipulate a 256-bit vector with integer
17554 // types. Since we'll use floating point types there eventually, just
17555 // immediately cast everything to a float and operate entirely in that domain.
17556 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17557 int ElementBits = VT.getScalarSizeInBits();
17558 if (ElementBits < 32) {
17559 // No floating point type is available; if we can't use the bit operations
17560 // for masking/blending then decompose into 128-bit vectors.
17561 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17562 Subtarget, DAG))
17563 return V;
17564 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17565 return V;
17566 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17567 }
17568
17569 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17570 VT.getVectorNumElements());
17571 V1 = DAG.getBitcast(FpVT, V1);
17572 V2 = DAG.getBitcast(FpVT, V2);
17573 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17574 }
17575
17576 switch (VT.SimpleTy) {
17577 case MVT::v4f64:
17578 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17579 case MVT::v4i64:
17580 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17581 case MVT::v8f32:
17582 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17583 case MVT::v8i32:
17584 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17585 case MVT::v16i16:
17586 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17587 case MVT::v32i8:
17588 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17589
17590 default:
17591 llvm_unreachable("Not a valid 256-bit x86 vector type!")__builtin_unreachable();
17592 }
17593}
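The AVX1 integer fallback near the top of lower256BitShuffle boils down to a domain switch; a minimal sketch of that step for a v8i32 shuffle (the helper name is hypothetical and not part of this file):

static SDValue shuffleV8I32ViaFloatDomain(SelectionDAG &DAG, const SDLoc &DL,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask) {
  // Bitcast into the floating point domain, shuffle there, and cast back.
  SDValue F1 = DAG.getBitcast(MVT::v8f32, V1);
  SDValue F2 = DAG.getBitcast(MVT::v8f32, V2);
  SDValue Shuf = DAG.getVectorShuffle(MVT::v8f32, DL, F1, F2, Mask);
  return DAG.getBitcast(MVT::v8i32, Shuf);
}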
17594
17595 /// Try to lower a vector shuffle as a series of 128-bit shuffles.
17596static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17597 const APInt &Zeroable, SDValue V1, SDValue V2,
17598 const X86Subtarget &Subtarget,
17599 SelectionDAG &DAG) {
17600 assert(VT.getScalarSizeInBits() == 64 &&((void)0)
17601 "Unexpected element type size for 128bit shuffle.")((void)0);
17602
17603 // Handling a 256-bit vector requires VLX, and the function
17604 // lowerV2X128VectorShuffle() is most likely the better solution.
17605 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.")((void)0);
17606
17607 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17608 SmallVector<int, 4> Widened128Mask;
17609 if (!canWidenShuffleElements(Mask, Widened128Mask))
17610 return SDValue();
17611 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch")((void)0);
17612
17613 // Try to use an insert into a zero vector.
17614 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17615 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17616 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17617 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17618 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17619 DAG.getIntPtrConstant(0, DL));
17620 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17621 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17622 DAG.getIntPtrConstant(0, DL));
17623 }
17624
17625 // Check for patterns which can be matched with a single insert of a 256-bit
17626 // subvector.
17627 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17628 if (OnlyUsesV1 ||
17629 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17630 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17631 SDValue SubVec =
17632 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17633 DAG.getIntPtrConstant(0, DL));
17634 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17635 DAG.getIntPtrConstant(4, DL));
17636 }
17637
17638 // See if this is an insertion of the lower 128-bits of V2 into V1.
17639 bool IsInsert = true;
17640 int V2Index = -1;
17641 for (int i = 0; i < 4; ++i) {
17642 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")((void)0);
17643 if (Widened128Mask[i] < 0)
17644 continue;
17645
17646 // Make sure all V1 subvectors are in place.
17647 if (Widened128Mask[i] < 4) {
17648 if (Widened128Mask[i] != i) {
17649 IsInsert = false;
17650 break;
17651 }
17652 } else {
17653 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17654 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17655 IsInsert = false;
17656 break;
17657 }
17658 V2Index = i;
17659 }
17660 }
17661 if (IsInsert && V2Index >= 0) {
17662 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17663 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17664 DAG.getIntPtrConstant(0, DL));
17665 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17666 }
17667
17668 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17669 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17670 // widening where possible we at least ensure the lanes stay sequential to
17671 // help later combines.
17672 SmallVector<int, 2> Widened256Mask;
17673 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17674 Widened128Mask.clear();
17675 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17676 }
17677
17678 // Try to lower to vshuf64x2/vshuf32x4.
17679 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17680 unsigned PermMask = 0;
17681 // Ensure elements came from the same Op.
17682 for (int i = 0; i < 4; ++i) {
17683 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")((void)0);
17684 if (Widened128Mask[i] < 0)
17685 continue;
17686
17687 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17688 unsigned OpIndex = i / 2;
17689 if (Ops[OpIndex].isUndef())
17690 Ops[OpIndex] = Op;
17691 else if (Ops[OpIndex] != Op)
17692 return SDValue();
17693
17694 // Convert the 128-bit shuffle mask selection values into 128-bit selection
17695 // bits defined by a vshuf64x2 instruction's immediate control byte.
17696 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
17697 }
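// Worked example (hypothetical mask, for illustration only): for
// Widened128Mask = {0, 2, 5, 7}, elements 0-1 come from V1 and elements 2-3
// from V2, so Ops = {V1, V2} and
//   PermMask = (0 << 0) | (2 << 2) | ((5 % 4) << 4) | ((7 % 4) << 6)
//            = 0x00 | 0x08 | 0x10 | 0xC0 = 0xD8.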
17698
17699 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17700 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17701}
17702
17703/// Handle lowering of 8-lane 64-bit floating point shuffles.
17704static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17705 const APInt &Zeroable, SDValue V1, SDValue V2,
17706 const X86Subtarget &Subtarget,
17707 SelectionDAG &DAG) {
17708 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")((void)0);
17709 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")((void)0);
17710 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
17711
17712 if (V2.isUndef()) {
17713 // Use low duplicate instructions for masks that match their pattern.
17714 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17715 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17716
17717 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17718 // Non-half-crossing single input shuffles can be lowered with an
17719 // interleaved permutation.
17720 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17721 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17722 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17723 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17724 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17725 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17726 }
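// Worked example (hypothetical mask): for Mask = {1, 0, 3, 2, 5, 4, 7, 6},
// which swaps the two doubles inside every 128-bit lane, the comparisons
// above set bits 0, 2, 4 and 6, giving VPERMILPMask = 0x55.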
17727
17728 SmallVector<int, 4> RepeatedMask;
17729 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17730 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17731 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17732 }
17733
17734 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17735 V2, Subtarget, DAG))
17736 return Shuf128;
17737
17738 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
17739 return Unpck;
17740
17741 // Check if the blend happens to exactly fit that of SHUFPD.
17742 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17743 Zeroable, Subtarget, DAG))
17744 return Op;
17745
17746 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
17747 DAG, Subtarget))
17748 return V;
17749
17750 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17751 Zeroable, Subtarget, DAG))
17752 return Blend;
17753
17754 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17755}
17756
17757/// Handle lowering of 16-lane 32-bit floating point shuffles.
17758static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17759 const APInt &Zeroable, SDValue V1, SDValue V2,
17760 const X86Subtarget &Subtarget,
17761 SelectionDAG &DAG) {
17762 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")((void)0);
17763 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")((void)0);
17764 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((void)0);
17765
17766 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17767 // options to efficiently lower the shuffle.
17768 SmallVector<int, 4> RepeatedMask;
17769 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17770 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((void)0);
17771
17772 // Use even/odd duplicate instructions for masks that match their pattern.
17773 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17774 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17775 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17776 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17777
17778 if (V2.isUndef())
17779 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17780 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17781
17782 // Use dedicated unpack instructions for masks that match their pattern.
17783 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
17784 return V;
17785
17786 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17787 Zeroable, Subtarget, DAG))
17788 return Blend;
17789
17790 // Otherwise, fall back to a SHUFPS sequence.
17791 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17792 }
17793
17794 // Try to create an in-lane repeating shuffle mask and then shuffle the
17795 // results into the target lanes.
17796 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17797 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17798 return V;
17799
17800 // If we have a single-input shuffle with different shuffle patterns in the
17801 // 128-bit lanes that doesn't cross lanes, use a variable-mask VPERMILPS.
17802 if (V2.isUndef() &&
17803 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17804 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17805 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17806 }
17807
17808 // If we have AVX512F support, we can use VEXPAND.
17809 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
17810 V1, V2, DAG, Subtarget))
17811 return V;
17812
17813 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17814}
17815
17816/// Handle lowering of 8-lane 64-bit integer shuffles.
17817static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17818 const APInt &Zeroable, SDValue V1, SDValue V2,
17819 const X86Subtarget &Subtarget,
17820 SelectionDAG &DAG) {
17821 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")((void)0);
17822 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")((void)0);
17823 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
17824
17825 if (V2.isUndef()) {
17826 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17827 // can use lower latency instructions that will operate on all four
17828 // 128-bit lanes.
17829 SmallVector<int, 2> Repeated128Mask;
17830 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17831 SmallVector<int, 4> PSHUFDMask;
17832 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17833 return DAG.getBitcast(
17834 MVT::v8i64,
17835 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17836 DAG.getBitcast(MVT::v16i32, V1),
17837 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17838 }
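// Worked example (hypothetical mask): for Mask = {1, 0, 3, 2, 5, 4, 7, 6}
// the repeated 128-bit mask is {1, 0}; narrowing it to 32-bit elements gives
// PSHUFDMask = {2, 3, 0, 1}, i.e. a PSHUFD immediate of 0x4E applied to the
// v16i32 bitcast of V1.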
17839
17840 SmallVector<int, 4> Repeated256Mask;
17841 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17842 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17843 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17844 }
17845
17846 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17847 V2, Subtarget, DAG))
17848 return Shuf128;
17849
17850 // Try to use shift instructions.
17851 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
17852 Zeroable, Subtarget, DAG))
17853 return Shift;
17854
17855 // Try to use VALIGN.
17856 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17857 Subtarget, DAG))
17858 return Rotate;
17859
17860 // Try to use PALIGNR.
17861 if (Subtarget.hasBWI())
17862 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17863 Subtarget, DAG))
17864 return Rotate;
17865
17866 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
17867 return Unpck;
17868
17869 // If we have AVX512F support, we can use VEXPAND.
17870 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
17871 DAG, Subtarget))
17872 return V;
17873
17874 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17875 Zeroable, Subtarget, DAG))
17876 return Blend;
17877
17878 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17879}
17880
17881/// Handle lowering of 16-lane 32-bit integer shuffles.
17882static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17883 const APInt &Zeroable, SDValue V1, SDValue V2,
17884 const X86Subtarget &Subtarget,
17885 SelectionDAG &DAG) {
17886 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")((void)0);
17887 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")((void)0);
17888 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((void)0);
17889
17890 // Whenever we can lower this as a zext, that instruction is strictly faster
17891 // than any alternative. It also allows us to fold memory operands into the
17892 // shuffle in many cases.
17893 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17894 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17895 return ZExt;
17896
17897 // If the shuffle mask is repeated in each 128-bit lane we can use more
17898 // efficient instructions that mirror the shuffles across the four 128-bit
17899 // lanes.
17900 SmallVector<int, 4> RepeatedMask;
17901 bool Is128BitLaneRepeatedShuffle =
17902 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17903 if (Is128BitLaneRepeatedShuffle) {
17904 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((void)0);
17905 if (V2.isUndef())
17906 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17907 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17908
17909 // Use dedicated unpack instructions for masks that match their pattern.
17910 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17911 return V;
17912 }
17913
17914 // Try to use shift instructions.
17915 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
17916 Zeroable, Subtarget, DAG))
17917 return Shift;
17918
17919 // Try to use VALIGN.
17920 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17921 Subtarget, DAG))
17922 return Rotate;
17923
17924 // Try to use byte rotation instructions.
17925 if (Subtarget.hasBWI())
17926 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17927 Subtarget, DAG))
17928 return Rotate;
17929
17930 // Assume that a single SHUFPS is faster than using a permv shuffle.
17931 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17932 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17933 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17934 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17935 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17936 CastV1, CastV2, DAG);
17937 return DAG.getBitcast(MVT::v16i32, ShufPS);
17938 }
17939
17940 // Try to create an in-lane repeating shuffle mask and then shuffle the
17941 // results into the target lanes.
17942 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17943 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17944 return V;
17945
17946 // If we have AVX512F support, we can use VEXPAND.
17947 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17948 DAG, Subtarget))
17949 return V;
17950
17951 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17952 Zeroable, Subtarget, DAG))
17953 return Blend;
17954
17955 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17956}
17957
17958/// Handle lowering of 32-lane 16-bit integer shuffles.
17959static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17960 const APInt &Zeroable, SDValue V1, SDValue V2,
17961 const X86Subtarget &Subtarget,
17962 SelectionDAG &DAG) {
17963 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")((void)0);
17964 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")((void)0);
17965 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")((void)0);
17966 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!")((void)0);
17967
17968 // Whenever we can lower this as a zext, that instruction is strictly faster
17969 // than any alternative. It also allows us to fold memory operands into the
17970 // shuffle in many cases.
17971 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17972 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17973 return ZExt;
17974
17975 // Use dedicated unpack instructions for masks that match their pattern.
17976 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17977 return V;
17978
17979 // Use dedicated pack instructions for masks that match their pattern.
17980 if (SDValue V =
17981 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17982 return V;
17983
17984 // Try to use shift instructions.
17985 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17986 Zeroable, Subtarget, DAG))
17987 return Shift;
17988
17989 // Try to use byte rotation instructions.
17990 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17991 Subtarget, DAG))
17992 return Rotate;
17993
17994 if (V2.isUndef()) {
17995 // Try to use bit rotation instructions.
17996 if (SDValue Rotate =
17997 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17998 return Rotate;
17999
18000 SmallVector<int, 8> RepeatedMask;
18001 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18002 // As this is a single-input shuffle, the repeated mask should be
18003 // a strictly valid v8i16 mask that we can pass through to the v8i16
18004 // lowering to handle even the v32 case.
18005 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18006 RepeatedMask, Subtarget, DAG);
18007 }
18008 }
18009
18010 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18011 Zeroable, Subtarget, DAG))
18012 return Blend;
18013
18014 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18015 Zeroable, Subtarget, DAG))
18016 return PSHUFB;
18017
18018 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18019}
18020
18021/// Handle lowering of 64-lane 8-bit integer shuffles.
18022static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18023 const APInt &Zeroable, SDValue V1, SDValue V2,
18024 const X86Subtarget &Subtarget,
18025 SelectionDAG &DAG) {
18026 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")((void)0);
18027 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")((void)0);
18028 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!")((void)0);
18029 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!")((void)0);
18030
18031 // Whenever we can lower this as a zext, that instruction is strictly faster
18032 // than any alternative. It also allows us to fold memory operands into the
18033 // shuffle in many cases.
18034 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18035 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18036 return ZExt;
18037
18038 // Use dedicated unpack instructions for masks that match their pattern.
18039 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
18040 return V;
18041
18042 // Use dedicated pack instructions for masks that match their pattern.
18043 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
18044 Subtarget))
18045 return V;
18046
18047 // Try to use shift instructions.
18048 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
18049 Zeroable, Subtarget, DAG))
18050 return Shift;
18051
18052 // Try to use byte rotation instructions.
18053 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18054 Subtarget, DAG))
18055 return Rotate;
18056
18057 // Try to use bit rotation instructions.
18058 if (V2.isUndef())
18059 if (SDValue Rotate =
18060 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18061 return Rotate;
18062
18063 // Lower as AND if possible.
18064 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18065 Zeroable, Subtarget, DAG))
18066 return Masked;
18067
18068 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18069 Zeroable, Subtarget, DAG))
18070 return PSHUFB;
18071
18072 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18073 if (Subtarget.hasVBMI())
18074 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18075
18076 // Try to create an in-lane repeating shuffle mask and then shuffle the
18077 // results into the target lanes.
18078 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18079 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18080 return V;
18081
18082 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18083 Zeroable, Subtarget, DAG))
18084 return Blend;
18085
18086 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18087 // shuffle.
18088 if (!V2.isUndef())
18089 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18090 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18091 return Result;
18092
18093 // FIXME: Implement direct support for this type!
18094 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18095}
18096
18097/// High-level routine to lower various 512-bit x86 vector shuffles.
18098///
18099/// This routine either breaks down the specific type of a 512-bit x86 vector
18100/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18101/// together based on the available instructions.
18102static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18103 MVT VT, SDValue V1, SDValue V2,
18104 const APInt &Zeroable,
18105 const X86Subtarget &Subtarget,
18106 SelectionDAG &DAG) {
18107 assert(Subtarget.hasAVX512() &&((void)0)
18108 "Cannot lower 512-bit vectors w/ basic ISA!")((void)0);
18109
18110 // If we have a single input to the zero element, insert that into V1 if we
18111 // can do so cheaply.
18112 int NumElts = Mask.size();
18113 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18114
18115 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18116 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18117 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18118 return Insertion;
18119
18120 // Handle special cases where the lower or upper half is UNDEF.
18121 if (SDValue V =
18122 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18123 return V;
18124
18125 // Check for being able to broadcast a single element.
18126 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18127 Subtarget, DAG))
18128 return Broadcast;
18129
18130 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18131 // Try using bit ops for masking and blending before falling back to
18132 // splitting.
18133 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18134 Subtarget, DAG))
18135 return V;
18136 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18137 return V;
18138
18139 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18140 }
18141
18142 // Dispatch to each element type for lowering. If we don't have support for
18143 // specific element type shuffles at 512 bits, immediately split them and
18144 // lower them. Each lowering routine of a given type is allowed to assume that
18145 // the requisite ISA extensions for that element type are available.
18146 switch (VT.SimpleTy) {
18147 case MVT::v8f64:
18148 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18149 case MVT::v16f32:
18150 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18151 case MVT::v8i64:
18152 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18153 case MVT::v16i32:
18154 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18155 case MVT::v32i16:
18156 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18157 case MVT::v64i8:
18158 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18159
18160 default:
18161 llvm_unreachable("Not a valid 512-bit x86 vector type!")__builtin_unreachable();
18162 }
18163}
18164
18165static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18166 MVT VT, SDValue V1, SDValue V2,
18167 const X86Subtarget &Subtarget,
18168 SelectionDAG &DAG) {
18169 // Shuffle should be unary.
18170 if (!V2.isUndef())
18171 return SDValue();
18172
18173 int ShiftAmt = -1;
18174 int NumElts = Mask.size();
18175 for (int i = 0; i != NumElts; ++i) {
18176 int M = Mask[i];
18177 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&((void)0)
18178 "Unexpected mask index.")((void)0);
18179 if (M < 0)
18180 continue;
18181
18182 // The first non-undef element determines our shift amount.
18183 if (ShiftAmt < 0) {
18184 ShiftAmt = M - i;
18185 // Need to be shifting right.
18186 if (ShiftAmt <= 0)
18187 return SDValue();
18188 }
18189 // All non-undef elements must shift by the same amount.
18190 if (ShiftAmt != M - i)
18191 return SDValue();
18192 }
18193 assert(ShiftAmt >= 0 && "All undef?")((void)0);
18194
18195 // Great, we found a shift right.
18196 MVT WideVT = VT;
18197 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18198 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18199 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18200 DAG.getUNDEF(WideVT), V1,
18201 DAG.getIntPtrConstant(0, DL));
18202 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18203 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18204 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18205 DAG.getIntPtrConstant(0, DL));
18206}
18207
18208// Determine if this shuffle can be implemented with a KSHIFT instruction.
18209// Returns the shift amount if possible or -1 if not. This is a simplified
18210// version of matchShuffleAsShift.
18211static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18212 int MaskOffset, const APInt &Zeroable) {
18213 int Size = Mask.size();
18214
18215 auto CheckZeros = [&](int Shift, bool Left) {
18216 for (int j = 0; j < Shift; ++j)
18217 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18218 return false;
18219
18220 return true;
18221 };
18222
18223 auto MatchShift = [&](int Shift, bool Left) {
18224 unsigned Pos = Left ? Shift : 0;
18225 unsigned Low = Left ? 0 : Shift;
18226 unsigned Len = Size - Shift;
18227 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18228 };
18229
18230 for (int Shift = 1; Shift != Size; ++Shift)
18231 for (bool Left : {true, false})
18232 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18233 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18234 return Shift;
18235 }
18236
18237 return -1;
18238}
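// Worked example (hypothetical inputs): with MaskOffset = 0, an 8-element
// Mask = {2, 3, 4, 5, 6, 7, -1, -1} whose top two elements are zeroable
// matches the right-shift case with Shift = 2, so Opcode is set to
// X86ISD::KSHIFTR and 2 is returned.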
18239
18240
18241// Lower vXi1 vector shuffles.
18242 // There is no dedicated instruction on AVX-512 that shuffles the masks.
18243 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
18244 // vector, shuffle, and then truncate it back.
18245static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18246 MVT VT, SDValue V1, SDValue V2,
18247 const APInt &Zeroable,
18248 const X86Subtarget &Subtarget,
18249 SelectionDAG &DAG) {
18250 assert(Subtarget.hasAVX512() &&((void)0)
18251 "Cannot lower 512-bit vectors w/o basic ISA!")((void)0);
18252
18253 int NumElts = Mask.size();
18254
18255 // Try to recognize shuffles that are just padding a subvector with zeros.
18256 int SubvecElts = 0;
18257 int Src = -1;
18258 for (int i = 0; i != NumElts; ++i) {
18259 if (Mask[i] >= 0) {
18260 // Grab the source from the first valid mask. All subsequent elements need
18261 // to use this same source.
18262 if (Src < 0)
18263 Src = Mask[i] / NumElts;
18264 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18265 break;
18266 }
18267
18268 ++SubvecElts;
18269 }
18270 assert(SubvecElts != NumElts && "Identity shuffle?")((void)0);
18271
18272 // Clip to a power of 2.
18273 SubvecElts = PowerOf2Floor(SubvecElts);
18274
18275 // Make sure the number of zeroable bits in the top at least covers the bits
18276 // not covered by the subvector.
18277 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18278 assert(Src >= 0 && "Expected a source!")((void)0);
18279 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18280 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18281 Src == 0 ? V1 : V2,
18282 DAG.getIntPtrConstant(0, DL));
18283 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18284 DAG.getConstant(0, DL, VT),
18285 Extract, DAG.getIntPtrConstant(0, DL));
18286 }
18287
18288 // Try a simple shift right with undef elements. Later we'll try with zeros.
18289 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18290 DAG))
18291 return Shift;
18292
18293 // Try to match KSHIFTs.
18294 unsigned Offset = 0;
18295 for (SDValue V : { V1, V2 }) {
18296 unsigned Opcode;
18297 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18298 if (ShiftAmt >= 0) {
18299 MVT WideVT = VT;
18300 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18301 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18302 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18303 DAG.getUNDEF(WideVT), V,
18304 DAG.getIntPtrConstant(0, DL));
18305 // Widened right shifts need two shifts to ensure we shift in zeroes.
18306 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18307 int WideElts = WideVT.getVectorNumElements();
18308 // Shift left to put the original vector in the MSBs of the new size.
18309 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18310 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18311 // Increase the shift amount to account for the left shift.
18312 ShiftAmt += WideElts - NumElts;
18313 }
18314
18315 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18316 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18317 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18318 DAG.getIntPtrConstant(0, DL));
18319 }
18320 Offset += NumElts; // Increment for next iteration.
18321 }
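// Worked example of the widening fix-up above (hypothetical v4i1 input
// without DQI): WideVT becomes v16i1, so a right shift by 1 is emitted as
// KSHIFTL by 12 (WideElts - NumElts) followed by KSHIFTR by 13 (1 + 12),
// guaranteeing that zeros rather than the undef upper bits land in the low
// four lanes before the v4i1 result is extracted.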
18322
18323
18324
18325 MVT ExtVT;
18326 switch (VT.SimpleTy) {
18327 default:
18328 llvm_unreachable("Expected a vector of i1 elements")__builtin_unreachable();
18329 case MVT::v2i1:
18330 ExtVT = MVT::v2i64;
18331 break;
18332 case MVT::v4i1:
18333 ExtVT = MVT::v4i32;
18334 break;
18335 case MVT::v8i1:
18336 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18337 // shuffle.
18338 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18339 break;
18340 case MVT::v16i1:
18341 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18342 // 256-bit operation available.
18343 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18344 break;
18345 case MVT::v32i1:
18346 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18347 // 256-bit operation available.
18348 assert(Subtarget.hasBWI() && "Expected AVX512BW support")((void)0);
18349 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18350 break;
18351 case MVT::v64i1:
18352 // Fall back to scalarization. FIXME: We can do better if the shuffle
18353 // can be partitioned cleanly.
18354 if (!Subtarget.useBWIRegs())
18355 return SDValue();
18356 ExtVT = MVT::v64i8;
18357 break;
18358 }
18359
18360 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18361 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18362
18363 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18364 // i1 was sign extended we can use X86ISD::CVT2MASK.
18365 int NumElems = VT.getVectorNumElements();
18366 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18367 (Subtarget.hasDQI() && (NumElems < 32)))
18368 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18369 Shuffle, ISD::SETGT);
18370
18371 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18372}
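// A sketch of the fallback path above (hypothetical v16i1 shuffle where
// canExtendTo512DQ() holds): both operands are sign-extended to v16i32, the
// shuffle is done there, and the mask is recovered with setcc(0 > element,
// SETGT), which is true exactly for lanes whose i1 value was sign-extended to
// all-ones.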
18373
18374/// Helper function that returns true if the shuffle mask should be
18375/// commuted to improve canonicalization.
18376static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18377 int NumElements = Mask.size();
18378
18379 int NumV1Elements = 0, NumV2Elements = 0;
18380 for (int M : Mask)
18381 if (M < 0)
18382 continue;
18383 else if (M < NumElements)
18384 ++NumV1Elements;
18385 else
18386 ++NumV2Elements;
18387
18388 // Commute the shuffle as needed such that more elements come from V1 than
18389 // V2. This allows us to match the shuffle pattern strictly on how many
18390 // elements come from V1 without handling the symmetric cases.
18391 if (NumV2Elements > NumV1Elements)
18392 return true;
18393
18394 assert(NumV1Elements > 0 && "No V1 indices")((void)0);
18395
18396 if (NumV2Elements == 0)
18397 return false;
18398
18399 // When the numbers of V1 and V2 elements are the same, try to minimize the
18400 // number of uses of V2 in the low half of the vector. When that is tied,
18401 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18402 // indices for V2. When those are equal, try to ensure that the number of odd
18403 // indices for V1 is lower than the number of odd indices for V2.
18404 if (NumV1Elements == NumV2Elements) {
18405 int LowV1Elements = 0, LowV2Elements = 0;
18406 for (int M : Mask.slice(0, NumElements / 2))
18407 if (M >= NumElements)
18408 ++LowV2Elements;
18409 else if (M >= 0)
18410 ++LowV1Elements;
18411 if (LowV2Elements > LowV1Elements)
18412 return true;
18413 if (LowV2Elements == LowV1Elements) {
18414 int SumV1Indices = 0, SumV2Indices = 0;
18415 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18416 if (Mask[i] >= NumElements)
18417 SumV2Indices += i;
18418 else if (Mask[i] >= 0)
18419 SumV1Indices += i;
18420 if (SumV2Indices < SumV1Indices)
18421 return true;
18422 if (SumV2Indices == SumV1Indices) {
18423 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18424 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18425 if (Mask[i] >= NumElements)
18426 NumV2OddIndices += i % 2;
18427 else if (Mask[i] >= 0)
18428 NumV1OddIndices += i % 2;
18429 if (NumV2OddIndices < NumV1OddIndices)
18430 return true;
18431 }
18432 }
18433 }
18434
18435 return false;
18436}
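// Worked example (hypothetical v4i32 mask): for Mask = {4, 5, 2, 3} each input
// supplies two elements, but the low half uses only V2, so this returns true
// and the caller swaps V1/V2 and commutes the mask to {0, 1, 6, 7}.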
18437
18438/// Top-level lowering for x86 vector shuffles.
18439///
18440/// This handles decomposition, canonicalization, and lowering of all x86
18441/// vector shuffles. Most of the specific lowering strategies are encapsulated
18442/// above in helper routines. The canonicalization attempts to widen shuffles
18443/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18444/// s.t. only one of the two inputs needs to be tested, etc.
18445static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18446 SelectionDAG &DAG) {
18447 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18448 ArrayRef<int> OrigMask = SVOp->getMask();
18449 SDValue V1 = Op.getOperand(0);
18450 SDValue V2 = Op.getOperand(1);
18451 MVT VT = Op.getSimpleValueType();
18452 int NumElements = VT.getVectorNumElements();
18453 SDLoc DL(Op);
18454 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18455
18456 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&((void)0)
18457 "Can't lower MMX shuffles")((void)0);
18458
18459 bool V1IsUndef = V1.isUndef();
18460 bool V2IsUndef = V2.isUndef();
18461 if (V1IsUndef && V2IsUndef)
18462 return DAG.getUNDEF(VT);
18463
18464 // When we create a shuffle node we put the UNDEF node in the second operand,
18465 // but in some cases the first operand may be transformed to UNDEF.
18466 // In this case we should just commute the node.
18467 if (V1IsUndef)
18468 return DAG.getCommutedVectorShuffle(*SVOp);
18469
18470 // Check for non-undef masks pointing at an undef vector and make the masks
18471 // undef as well. This makes it easier to match the shuffle based solely on
18472 // the mask.
18473 if (V2IsUndef &&
18474 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18475 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
18476 for (int &M : NewMask)
18477 if (M >= NumElements)
18478 M = -1;
18479 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18480 }
18481
18482 // Check for illegal shuffle mask element index values.
18483 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18484 (void)MaskUpperLimit;
18485 assert(llvm::all_of(OrigMask,((void)0)
18486 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&((void)0)
18487 "Out of bounds shuffle index")((void)0);
18488
18489 // We actually see shuffles that are entirely re-arrangements of a set of
18490 // zero inputs. This mostly happens while decomposing complex shuffles into
18491 // simple ones. Directly lower these as a buildvector of zeros.
18492 APInt KnownUndef, KnownZero;
18493 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18494
18495 APInt Zeroable = KnownUndef | KnownZero;
18496 if (Zeroable.isAllOnesValue())
18497 return getZeroVector(VT, Subtarget, DAG, DL);
18498
18499 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18500
18501 // Try to collapse shuffles into using a vector type with fewer elements but
18502 // wider element types. We cap this to not form integers or floating point
18503 // elements wider than 64 bits. It does not seem beneficial to form i128
18504 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18505 SmallVector<int, 16> WidenedMask;
18506 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18507 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18508 // Shuffle mask widening should not interfere with a broadcast opportunity
18509 // by obfuscating the operands with bitcasts.
18510 // TODO: Avoid lowering directly from this top-level function: make this
18511 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18512 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18513 Subtarget, DAG))
18514 return Broadcast;
18515
18516 MVT NewEltVT = VT.isFloatingPoint()
18517 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18518 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18519 int NewNumElts = NumElements / 2;
18520 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18521 // Make sure that the new vector type is legal. For example, v2f64 isn't
18522 // legal on SSE1.
18523 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18524 if (V2IsZero) {
18525 // Modify the new Mask to take all zeros from the all-zero vector.
18526 // Choose indices that are blend-friendly.
18527 bool UsedZeroVector = false;
18528 assert(is_contained(WidenedMask, SM_SentinelZero) &&((void)0)
18529 "V2's non-undef elements are used?!")((void)0);
18530 for (int i = 0; i != NewNumElts; ++i)
18531 if (WidenedMask[i] == SM_SentinelZero) {
18532 WidenedMask[i] = i + NewNumElts;
18533 UsedZeroVector = true;
18534 }
18535 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18536 // some elements to be undef.
18537 if (UsedZeroVector)
18538 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18539 }
18540 V1 = DAG.getBitcast(NewVT, V1);
18541 V2 = DAG.getBitcast(NewVT, V2);
18542 return DAG.getBitcast(
18543 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18544 }
18545 }
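// Worked example of the widening above (hypothetical v8i32 shuffle): the mask
// {0, 1, 2, 3, 8, 9, 10, 11} widens to the v4i64 mask {0, 1, 4, 5}; both
// operands are bitcast to v4i64, shuffled there, and the result is bitcast
// back to v8i32.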
18546
18547 // Commute the shuffle if it will improve canonicalization.
18548 SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
18549 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18550 ShuffleVectorSDNode::commuteMask(Mask);
18551 std::swap(V1, V2);
18552 }
18553
18554 // For each vector width, delegate to a specialized lowering routine.
18555 if (VT.is128BitVector())
18556 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18557
18558 if (VT.is256BitVector())
18559 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18560
18561 if (VT.is512BitVector())
18562 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18563
18564 if (Is1BitVector)
18565 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18566
18567 llvm_unreachable("Unimplemented!")__builtin_unreachable();
18568}
18569
18570/// Try to lower a VSELECT instruction to a vector shuffle.
18571static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18572 const X86Subtarget &Subtarget,
18573 SelectionDAG &DAG) {
18574 SDValue Cond = Op.getOperand(0);
18575 SDValue LHS = Op.getOperand(1);
18576 SDValue RHS = Op.getOperand(2);
18577 MVT VT = Op.getSimpleValueType();
18578
18579 // Only non-legal VSELECTs reach this lowering; convert those into generic
18580 // shuffles and re-use the shuffle lowering path for blends.
18581 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18582 SmallVector<int, 32> Mask;
18583 if (createShuffleMaskFromVSELECT(Mask, Cond))
18584 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18585 }
18586
18587 return SDValue();
18588}
18589
18590SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18591 SDValue Cond = Op.getOperand(0);
18592 SDValue LHS = Op.getOperand(1);
18593 SDValue RHS = Op.getOperand(2);
18594
18595 // A vselect where all conditions and data are constants can be optimized into
18596 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18597 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18598 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18599 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18600 return SDValue();
18601
18602 // Try to lower this to a blend-style vector shuffle. This can handle all
18603 // constant condition cases.
18604 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18605 return BlendOp;
18606
18607 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18608 // with patterns on the mask registers on AVX-512.
18609 MVT CondVT = Cond.getSimpleValueType();
18610 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18611 if (CondEltSize == 1)
18612 return Op;
18613
18614 // Variable blends are only legal from SSE4.1 onward.
18615 if (!Subtarget.hasSSE41())
18616 return SDValue();
18617
18618 SDLoc dl(Op);
18619 MVT VT = Op.getSimpleValueType();
18620 unsigned EltSize = VT.getScalarSizeInBits();
18621 unsigned NumElts = VT.getVectorNumElements();
18622
18623 // Expand v32i16/v64i8 without BWI.
18624 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18625 return SDValue();
18626
18627 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18628 // into an i1 condition so that we can use the mask-based 512-bit blend
18629 // instructions.
18630 if (VT.getSizeInBits() == 512) {
18631 // Build a mask by testing the condition against zero.
18632 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18633 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18634 DAG.getConstant(0, dl, CondVT),
18635 ISD::SETNE);
18636 // Now return a new VSELECT using the mask.
18637 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18638 }
18639
18640 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18641 if (CondEltSize != EltSize) {
18642 // If we don't have a sign splat, rely on the expansion.
18643 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18644 return SDValue();
18645
18646 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18647 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18648 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18649 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18650 }
18651
18652 // Only some types will be legal on some subtargets. If we can emit a legal
18653 // VSELECT-matching blend, return Op, but if we need to expand, return
18654 // a null value.
18655 switch (VT.SimpleTy) {
18656 default:
18657 // Most of the vector types have blends past SSE4.1.
18658 return Op;
18659
18660 case MVT::v32i8:
18661 // The byte blends for AVX vectors were introduced only in AVX2.
18662 if (Subtarget.hasAVX2())
18663 return Op;
18664
18665 return SDValue();
18666
18667 case MVT::v8i16:
18668 case MVT::v16i16: {
18669 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18670 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18671 Cond = DAG.getBitcast(CastVT, Cond);
18672 LHS = DAG.getBitcast(CastVT, LHS);
18673 RHS = DAG.getBitcast(CastVT, RHS);
18674 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18675 return DAG.getBitcast(VT, Select);
18676 }
18677 }
18678}
18679
18680static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18681 MVT VT = Op.getSimpleValueType();
18682 SDValue Vec = Op.getOperand(0);
18683 SDValue Idx = Op.getOperand(1);
18684 assert(isa<ConstantSDNode>(Idx) && "Constant index expected")((void)0);
18685 SDLoc dl(Op);
18686
18687 if (!Vec.getSimpleValueType().is128BitVector())
18688 return SDValue();
18689
18690 if (VT.getSizeInBits() == 8) {
18691 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18692 // we're going to zero extend the register or fold the store.
18693 if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
18694 !MayFoldIntoStore(Op))
18695 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18696 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18697 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18698
18699 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
18700 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18701 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18702 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18703 }
18704
18705 if (VT == MVT::f32) {
18706 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18707 // the result back to FR32 register. It's only worth matching if the
18708 // result has a single use which is a store or a bitcast to i32. And in
18709 // the case of a store, it's not worth it if the index is a constant 0,
18710 // because a MOVSSmr can be used instead, which is smaller and faster.
18711 if (!Op.hasOneUse())
18712 return SDValue();
18713 SDNode *User = *Op.getNode()->use_begin();
18714 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18715 (User->getOpcode() != ISD::BITCAST ||
18716 User->getValueType(0) != MVT::i32))
18717 return SDValue();
18718 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18719 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18720 return DAG.getBitcast(MVT::f32, Extract);
18721 }
18722
18723 if (VT == MVT::i32 || VT == MVT::i64)
18724 return Op;
18725
18726 return SDValue();
18727}
18728
18729 /// Extract one bit from a mask vector, like v16i1 or v8i1.
18730/// AVX-512 feature.
18731static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18732 const X86Subtarget &Subtarget) {
18733 SDValue Vec = Op.getOperand(0);
18734 SDLoc dl(Vec);
18735 MVT VecVT = Vec.getSimpleValueType();
18736 SDValue Idx = Op.getOperand(1);
18737 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18738 MVT EltVT = Op.getSimpleValueType();
18739
18740 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&((void)0)
18741 "Unexpected vector type in ExtractBitFromMaskVector")((void)0);
18742
18743 // A variable index can't be handled in mask registers;
18744 // extend the vector to VR512/128.
18745 if (!IdxC) {
18746 unsigned NumElts = VecVT.getVectorNumElements();
18747 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18748 // than extending to 128/256-bit.
18749 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18750 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18751 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18752 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18753 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18754 }
18755
18756 unsigned IdxVal = IdxC->getZExtValue();
18757 if (IdxVal == 0) // the operation is legal
18758 return Op;
18759
18760 // Extend to natively supported kshift.
18761 unsigned NumElems = VecVT.getVectorNumElements();
18762 MVT WideVecVT = VecVT;
18763 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18764 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18765 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18766 DAG.getUNDEF(WideVecVT), Vec,
18767 DAG.getIntPtrConstant(0, dl));
18768 }
18769
18770 // Use kshiftr instruction to move to the lower element.
18771 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18772 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18773
18774 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18775 DAG.getIntPtrConstant(0, dl));
18776}
18777
18778SDValue
18779X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18780 SelectionDAG &DAG) const {
18781 SDLoc dl(Op);
18782 SDValue Vec = Op.getOperand(0);
18783 MVT VecVT = Vec.getSimpleValueType();
18784 SDValue Idx = Op.getOperand(1);
18785 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18786
18787 if (VecVT.getVectorElementType() == MVT::i1)
18788 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18789
18790 if (!IdxC) {
18791 // It's more profitable to go through memory (1 cycle throughput)
18792 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18793 // The IACA tool was used to get the performance estimation
18794 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18795 //
18796 // example : extractelement <16 x i8> %a, i32 %i
18797 //
18798 // Block Throughput: 3.00 Cycles
18799 // Throughput Bottleneck: Port5
18800 //
18801 // | Num Of | Ports pressure in cycles | |
18802 // | Uops | 0 - DV | 5 | 6 | 7 | |
18803 // ---------------------------------------------
18804 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18805 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18806 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18807 // Total Num Of Uops: 4
18808 //
18809 //
18810 // Block Throughput: 1.00 Cycles
18811 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18812 //
18813 // | | Ports pressure in cycles | |
18814 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18815 // ---------------------------------------------------------
18816 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18817 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18818 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18819 // Total Num Of Uops: 4
18820
18821 return SDValue();
18822 }
18823
18824 unsigned IdxVal = IdxC->getZExtValue();
18825
18826 // If this is a 256-bit vector result, first extract the 128-bit vector and
18827 // then extract the element from the 128-bit vector.
18828 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18829 // Get the 128-bit vector.
18830 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18831 MVT EltVT = VecVT.getVectorElementType();
18832
18833 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18834 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2")((void)0);
18835
18836 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18837 // this can be done with a mask.
18838 IdxVal &= ElemsPerChunk - 1;
18839 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18840 DAG.getIntPtrConstant(IdxVal, dl));
18841 }
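// Worked example (hypothetical v32i8 extract of element 21): ElemsPerChunk is
// 16, so the 128-bit chunk holding elements 16-31 is extracted and the
// in-chunk index becomes 21 & 15 = 5.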
18842
18843 assert(VecVT.is128BitVector() && "Unexpected vector length")((void)0);
18844
18845 MVT VT = Op.getSimpleValueType();
18846
18847 if (VT.getSizeInBits() == 16) {
18848 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18849 // we're going to zero extend the register or fold the store (SSE41 only).
18850 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
18851 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
18852 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18853 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18854 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18855
18856 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18857 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18858 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18859 }
18860
18861 if (Subtarget.hasSSE41())
18862 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18863 return Res;
18864
18865 // TODO: We only extract a single element from v16i8; we can probably afford
18866 // to be more aggressive here before using the default approach of spilling
18867 // to the stack.
18868 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
18869 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18870 int DWordIdx = IdxVal / 4;
18871 if (DWordIdx == 0) {
18872 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18873 DAG.getBitcast(MVT::v4i32, Vec),
18874 DAG.getIntPtrConstant(DWordIdx, dl));
18875 int ShiftVal = (IdxVal % 4) * 8;
18876 if (ShiftVal != 0)
18877 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18878 DAG.getConstant(ShiftVal, dl, MVT::i8));
18879 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18880 }
18881
18882 int WordIdx = IdxVal / 2;
18883 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18884 DAG.getBitcast(MVT::v8i16, Vec),
18885 DAG.getIntPtrConstant(WordIdx, dl));
18886 int ShiftVal = (IdxVal % 2) * 8;
18887 if (ShiftVal != 0)
18888 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18889 DAG.getConstant(ShiftVal, dl, MVT::i8));
18890 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18891 }
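// Worked example (hypothetical v16i8 extract): for IdxVal = 2, the low i32
// dword is extracted, shifted right by 16 and truncated to recover byte 2;
// for IdxVal = 7, i16 word 3 is extracted, shifted right by 8 and truncated.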
18892
18893 if (VT.getSizeInBits() == 32) {
18894 if (IdxVal == 0)
18895 return Op;
18896
18897 // SHUFPS the element to the lowest double word, then movss.
18898 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
18899 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18900 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18901 DAG.getIntPtrConstant(0, dl));
18902 }
18903
18904 if (VT.getSizeInBits() == 64) {
18905 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18906 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18907 // to match extract_elt for f64.
18908 if (IdxVal == 0)
18909 return Op;
18910
18911 // UNPCKHPD the element to the lowest double word, then movsd.
18912 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18913 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18914 int Mask[2] = { 1, -1 };
18915 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18916 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18917 DAG.getIntPtrConstant(0, dl));
18918 }
18919
18920 return SDValue();
18921}
18922
18923 /// Insert one bit into a mask vector, like v16i1 or v8i1.
18924/// AVX-512 feature.
18925static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18926 const X86Subtarget &Subtarget) {
18927 SDLoc dl(Op);
18928 SDValue Vec = Op.getOperand(0);
18929 SDValue Elt = Op.getOperand(1);
18930 SDValue Idx = Op.getOperand(2);
18931 MVT VecVT = Vec.getSimpleValueType();
18932
18933 if (!isa<ConstantSDNode>(Idx)) {
18934 // Non-constant index. Extend the source and destination,
18935 // insert the element, and then truncate the result.
18936 unsigned NumElts = VecVT.getVectorNumElements();
18937 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18938 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18939 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18940 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18941 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18942 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18943 }
18944
18945 // Copy into a k-register, extract to v1i1 and insert_subvector.
18946 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18947 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18948}
18949
18950SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18951 SelectionDAG &DAG) const {
18952 MVT VT = Op.getSimpleValueType();
18953 MVT EltVT = VT.getVectorElementType();
18954 unsigned NumElts = VT.getVectorNumElements();
18955 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18956
18957 if (EltVT == MVT::i1)
18958 return InsertBitToMaskVector(Op, DAG, Subtarget);
18959
18960 SDLoc dl(Op);
18961 SDValue N0 = Op.getOperand(0);
18962 SDValue N1 = Op.getOperand(1);
18963 SDValue N2 = Op.getOperand(2);
18964 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18965
18966 if (!N2C) {
18967 // Variable insertion indices: usually we're better off spilling to the stack,
18968 // but AVX512 can use a variable compare+select by comparing against all
18969 // possible vector indices, and FP insertion has less gpr->simd traffic.
18970 if (!(Subtarget.hasBWI() ||
18971 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18972 (Subtarget.hasSSE41() && VT.isFloatingPoint())))
18973 return SDValue();
18974
18975 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18976 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18977 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18978 return SDValue();
18979
18980 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18981 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18982 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18983
18984 SmallVector<SDValue, 16> RawIndices;
18985 for (unsigned I = 0; I != NumElts; ++I)
18986 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18987 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18988
18989 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18990 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18991 ISD::CondCode::SETEQ);
18992 }
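// A sketch of the compare+select above (hypothetical v4i32 insert on the
// AVX-512 path with a variable index i): the index is splatted to
// {i, i, i, i} and compared for equality against {0, 1, 2, 3}; the resulting
// mask selects the splatted element in the one matching lane and N0
// everywhere else.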
18993
18994 if (N2C->getAPIntValue().uge(NumElts))
18995 return SDValue();
18996 uint64_t IdxVal = N2C->getZExtValue();
18997
18998 bool IsZeroElt = X86::isZeroNode(N1);
18999 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19000
19001 // If we are inserting an element, see if we can do this more efficiently with
19002 // a blend shuffle with a rematerializable vector than a costly integer
19003 // insertion.
19004 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
19005 (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
19006 SmallVector<int, 8> BlendMask;
19007 for (unsigned i = 0; i != NumElts; ++i)
19008 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19009 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19010 : getOnesVector(VT, DAG, dl);
19011 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19012 }
19013
19014 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19015 // into that, and then insert the subvector back into the result.
19016 if (VT.is256BitVector() || VT.is512BitVector()) {
19017 // With a 256-bit vector, we can insert into the zero element efficiently
19018 // using a blend if we have AVX or AVX2 and the right data type.
19019 if (VT.is256BitVector() && IdxVal == 0) {
19020 // TODO: It is worthwhile to cast integer to floating point and back
19021 // and incur a domain crossing penalty if that's what we'll end up
19022 // doing anyway after extracting to a 128-bit vector.
19023 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19024 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19025 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19026 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19027 DAG.getTargetConstant(1, dl, MVT::i8));
19028 }
19029 }
19030
19031 // Get the desired 128-bit vector chunk.
19032 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19033
19034 // Insert the element into the desired chunk.
19035 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19036 assert(isPowerOf2_32(NumEltsIn128))((void)0);
19037 // Since NumEltsIn128 is a power of 2, we can use a mask instead of a modulo.
19038 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19039
19040 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19041 DAG.getIntPtrConstant(IdxIn128, dl));
19042
19043 // Insert the changed part back into the bigger vector
19044 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19045 }
19046 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!")((void)0);
19047
19048 // This will be just movd/movq/movss/movsd.
19049 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19050 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19051 EltVT == MVT::i64) {
19052 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19053 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19054 }
19055
19056 // We can't directly insert an i8 or i16 into a vector, so zero extend
19057 // it to i32 first.
19058 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19059 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19060 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19061 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19062 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19063 return DAG.getBitcast(VT, N1);
19064 }
19065 }
19066
19067 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19068 // argument. SSE41 is required for pinsrb.
19069 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19070 unsigned Opc;
19071 if (VT == MVT::v8i16) {
19072 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW")((void)0);
19073 Opc = X86ISD::PINSRW;
19074 } else {
19075 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector")((void)0);
19076 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB")((void)0);
19077 Opc = X86ISD::PINSRB;
19078 }
19079
19080 assert(N1.getValueType() != MVT::i32 && "Unexpected VT")((void)0);
19081 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19082 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19083 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19084 }
19085
19086 if (Subtarget.hasSSE41()) {
19087 if (EltVT == MVT::f32) {
19088 // Bits [7:6] of the constant are the source select. This will always be
19089 // zero here. The DAG Combiner may combine an extract_elt index into
19090 // these bits. For example (insert (extract, 3), 2) could be matched by
19091 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19092 // Bits [5:4] of the constant are the destination select. This is the
19093 // value of the incoming immediate.
19094 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19095 // fold either a bitwise AND or an insert of float 0.0 into these bits.
19096
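// The immediate layout described above can be packed with a small helper;
// a sketch assuming <cstdint> (the helper name is only illustrative):
//
//   #include <cstdint>
//   // SrcIdx -> bits [7:6], DstIdx -> bits [5:4], ZMask -> bits [3:0].
//   static uint8_t insertpsImmModel(unsigned SrcIdx, unsigned DstIdx,
//                                   unsigned ZMask) {
//     return (uint8_t)((SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF));
//   }
//   // The INSERTPS lowering below builds the equivalent of
//   // insertpsImmModel(0, IdxVal, 0), i.e. IdxVal << 4.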
19097 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19098 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
19099 // If this is an insertion of 32 bits into the low 32 bits of
19100 // a vector, we prefer to generate a blend with an immediate rather
19101 // than an insertps. Blends are simpler operations in hardware and so
19102 // will always have equal or better performance than insertps.
19103 // But if optimizing for size and there's a load folding opportunity,
19104 // generate insertps because blendps does not have a 32-bit memory
19105 // operand form.
19106 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19107 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19108 DAG.getTargetConstant(1, dl, MVT::i8));
19109 }
19110 // Create this as a scalar-to-vector.
19111 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19112 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19113 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19114 }
19115
19116 // PINSR* works with constant index.
19117 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19118 return Op;
19119 }
19120
19121 return SDValue();
19122}
19123
19124static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19125 SelectionDAG &DAG) {
19126 SDLoc dl(Op);
19127 MVT OpVT = Op.getSimpleValueType();
19128
19129 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
19130 // further combines.
19131 if (X86::isZeroNode(Op.getOperand(0)))
19132 return getZeroVector(OpVT, Subtarget, DAG, dl);
19133
19134 // If this is a 256-bit vector result, first insert into a 128-bit
19135 // vector and then insert into the 256-bit vector.
19136 if (!OpVT.is128BitVector()) {
19137 // Insert into a 128-bit vector.
19138 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19139 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19140 OpVT.getVectorNumElements() / SizeFactor);
19141
19142 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19143
19144 // Insert the 128-bit vector.
19145 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19146 }
19147 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&((void)0)
19148 "Expected an SSE type!")((void)0);
19149
19150 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
19151 if (OpVT == MVT::v4i32)
19152 return Op;
19153
19154 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19155 return DAG.getBitcast(
19156 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19157}
19158
19159// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19160// simple superregister reference or explicit instructions to insert
19161// the upper bits of a vector.
19162static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19163 SelectionDAG &DAG) {
19164 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1)((void)0);
19165
19166 return insert1BitVector(Op, DAG, Subtarget);
19167}
19168
19169static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19170 SelectionDAG &DAG) {
19171 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&((void)0)
19172 "Only vXi1 extract_subvectors need custom lowering")((void)0);
19173
19174 SDLoc dl(Op);
19175 SDValue Vec = Op.getOperand(0);
19176 uint64_t IdxVal = Op.getConstantOperandVal(1);
19177
19178 if (IdxVal == 0) // the operation is legal
19179 return Op;
19180
19181 MVT VecVT = Vec.getSimpleValueType();
19182 unsigned NumElems = VecVT.getVectorNumElements();
19183
19184 // Extend to natively supported kshift.
19185 MVT WideVecVT = VecVT;
19186 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19187 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19188 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19189 DAG.getUNDEF(WideVecVT), Vec,
19190 DAG.getIntPtrConstant(0, dl));
19191 }
19192
19193 // Shift to the LSB.
19194 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19195 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19196
19197 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19198 DAG.getIntPtrConstant(0, dl));
19199}
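// Treating the vXi1 mask as an integer, the kshift-based extraction above
// behaves like this scalar sketch (assuming <cstdint>; the helper name is
// only illustrative):
//
//   #include <cstdint>
//   // Extract an 8-lane sub-mask starting at lane IdxVal of a 16-lane mask.
//   static uint8_t extractSubMaskModel(uint16_t Mask, unsigned IdxVal) {
//     return (uint8_t)(Mask >> IdxVal); // KSHIFTR, then keep the low lanes
//   }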
19200
19201// Returns the appropriate wrapper opcode for a global reference.
19202unsigned X86TargetLowering::getGlobalWrapperKind(
19203 const GlobalValue *GV, const unsigned char OpFlags) const {
19204 // References to absolute symbols are never PC-relative.
19205 if (GV && GV->isAbsoluteSymbolRef())
19206 return X86ISD::Wrapper;
19207
19208 CodeModel::Model M = getTargetMachine().getCodeModel();
19209 if (Subtarget.isPICStyleRIPRel() &&
19210 (M == CodeModel::Small || M == CodeModel::Kernel))
19211 return X86ISD::WrapperRIP;
19212
19213 // GOTPCREL references must always use RIP.
19214 if (OpFlags == X86II::MO_GOTPCREL)
19215 return X86ISD::WrapperRIP;
19216
19217 return X86ISD::Wrapper;
19218}
19219
19220// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19221// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
19222// one of the above-mentioned nodes. It has to be wrapped because otherwise
19223// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19224// be used to form an addressing mode. These wrapped nodes will be selected
19225// into MOV32ri.
19226SDValue
19227X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19228 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19229
19230 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19231 // global base reg.
19232 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19233
19234 auto PtrVT = getPointerTy(DAG.getDataLayout());
19235 SDValue Result = DAG.getTargetConstantPool(
19236 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19237 SDLoc DL(CP);
19238 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19239 // With PIC, the address is actually $g + Offset.
19240 if (OpFlag) {
19241 Result =
19242 DAG.getNode(ISD::ADD, DL, PtrVT,
19243 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19244 }
19245
19246 return Result;
19247}
19248
19249SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19250 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19251
19252 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19253 // global base reg.
19254 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19255
19256 auto PtrVT = getPointerTy(DAG.getDataLayout());
19257 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19258 SDLoc DL(JT);
19259 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19260
19261 // With PIC, the address is actually $g + Offset.
19262 if (OpFlag)
19263 Result =
19264 DAG.getNode(ISD::ADD, DL, PtrVT,
19265 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19266
19267 return Result;
19268}
19269
19270SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19271 SelectionDAG &DAG) const {
19272 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19273}
19274
19275SDValue
19276X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19277 // Create the TargetBlockAddressAddress node.
19278 unsigned char OpFlags =
19279 Subtarget.classifyBlockAddressReference();
19280 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19281 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19282 SDLoc dl(Op);
19283 auto PtrVT = getPointerTy(DAG.getDataLayout());
19284 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19285 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19286
19287 // With PIC, the address is actually $g + Offset.
19288 if (isGlobalRelativeToPICBase(OpFlags)) {
19289 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19290 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19291 }
19292
19293 return Result;
19294}
19295
19296/// Creates target global address or external symbol nodes for calls or
19297/// other uses.
19298SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19299 bool ForCall) const {
19300 // Unpack the global address or external symbol.
19301 const SDLoc &dl = SDLoc(Op);
19302 const GlobalValue *GV = nullptr;
19303 int64_t Offset = 0;
19304 const char *ExternalSym = nullptr;
19305 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19306 GV = G->getGlobal();
19307 Offset = G->getOffset();
19308 } else {
19309 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19310 ExternalSym = ES->getSymbol();
19311 }
19312
19313 // Calculate some flags for address lowering.
19314 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19315 unsigned char OpFlags;
19316 if (ForCall)
19317 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19318 else
19319 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19320 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19321 bool NeedsLoad = isGlobalStubReference(OpFlags);
19322
19323 CodeModel::Model M = DAG.getTarget().getCodeModel();
19324 auto PtrVT = getPointerTy(DAG.getDataLayout());
19325 SDValue Result;
19326
19327 if (GV) {
19328 // Create a target global address if this is a global. If possible, fold the
19329 // offset into the global address reference. Otherwise, ADD it on later.
19330 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19331 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19332 // relocation will compute to a negative value, which is invalid.
19333 int64_t GlobalOffset = 0;
19334 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19335 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19336 std::swap(GlobalOffset, Offset);
19337 }
19338 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19339 } else {
19340 // If this is not a global address, this must be an external symbol.
19341 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19342 }
19343
19344 // If this is a direct call, avoid the wrapper if we don't need to do any
19345 // loads or adds. This allows SDAG ISel to match direct calls.
19346 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19347 return Result;
19348
19349 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19350
19351 // With PIC, the address is actually $g + Offset.
19352 if (HasPICReg) {
19353 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19354 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19355 }
19356
19357 // For globals that require a load from a stub to get the address, emit the
19358 // load.
19359 if (NeedsLoad)
19360 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19361 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19362
19363 // If there was a non-zero offset that we didn't fold, create an explicit
19364 // addition for it.
19365 if (Offset != 0)
19366 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19367 DAG.getConstant(Offset, dl, PtrVT));
19368
19369 return Result;
19370}
19371
19372SDValue
19373X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19374 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19375}
19376
19377static SDValue
19378GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
19379 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
19380 unsigned char OperandFlags, bool LocalDynamic = false) {
19381 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19382 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19383 SDLoc dl(GA);
19384 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19385 GA->getValueType(0),
19386 GA->getOffset(),
19387 OperandFlags);
19388
19389 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
19390 : X86ISD::TLSADDR;
19391
19392 if (InFlag) {
19393 SDValue Ops[] = { Chain, TGA, *InFlag };
19394 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19395 } else {
19396 SDValue Ops[] = { Chain, TGA };
19397 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19398 }
19399
19400 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
19401 MFI.setAdjustsStack(true);
19402 MFI.setHasCalls(true);
19403
19404 SDValue Flag = Chain.getValue(1);
19405 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
19406}
19407
19408// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19409static SDValue
19410LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19411 const EVT PtrVT) {
19412 SDValue InFlag;
19413 SDLoc dl(GA); // ? function entry point might be better
19414 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19415 DAG.getNode(X86ISD::GlobalBaseReg,
19416 SDLoc(), PtrVT), InFlag);
19417 InFlag = Chain.getValue(1);
19418
19419 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
19420}
19421
19422// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19423static SDValue
19424LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19425 const EVT PtrVT) {
19426 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19427 X86::RAX, X86II::MO_TLSGD);
19428}
19429
19430// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19431static SDValue
19432LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19433 const EVT PtrVT) {
19434 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19435 X86::EAX, X86II::MO_TLSGD);
19436}
19437
19438static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19439 SelectionDAG &DAG, const EVT PtrVT,
19440 bool Is64Bit, bool Is64BitLP64) {
19441 SDLoc dl(GA);
19442
19443 // Get the start address of the TLS block for this module.
19444 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19445 .getInfo<X86MachineFunctionInfo>();
19446 MFI->incNumLocalDynamicTLSAccesses();
19447
19448 SDValue Base;
19449 if (Is64Bit) {
19450 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19451 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
19452 X86II::MO_TLSLD, /*LocalDynamic=*/true);
19453 } else {
19454 SDValue InFlag;
19455 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19456 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
19457 InFlag = Chain.getValue(1);
19458 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
19459 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
19460 }
19461
19462 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19463 // of Base.
19464
19465 // Build x@dtpoff.
19466 unsigned char OperandFlags = X86II::MO_DTPOFF;
19467 unsigned WrapperKind = X86ISD::Wrapper;
19468 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19469 GA->getValueType(0),
19470 GA->getOffset(), OperandFlags);
19471 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19472
19473 // Add x@dtpoff with the base.
19474 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19475}
19476
19477// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19478static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19479 const EVT PtrVT, TLSModel::Model model,
19480 bool is64Bit, bool isPIC) {
19481 SDLoc dl(GA);
19482
19483 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19484 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
19485 is64Bit ? 257 : 256));
19486
19487 SDValue ThreadPointer =
19488 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19489 MachinePointerInfo(Ptr));
19490
19491 unsigned char OperandFlags = 0;
19492 // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
19493 // initial exec.
19494 unsigned WrapperKind = X86ISD::Wrapper;
19495 if (model == TLSModel::LocalExec) {
19496 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19497 } else if (model == TLSModel::InitialExec) {
19498 if (is64Bit) {
19499 OperandFlags = X86II::MO_GOTTPOFF;
19500 WrapperKind = X86ISD::WrapperRIP;
19501 } else {
19502 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19503 }
19504 } else {
19505 llvm_unreachable("Unexpected model")__builtin_unreachable();
19506 }
19507
19508 // emit "addl x@ntpoff,%eax" (local exec)
19509 // or "addl x@indntpoff,%eax" (initial exec)
19510 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19511 SDValue TGA =
19512 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19513 GA->getOffset(), OperandFlags);
19514 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19515
19516 if (model == TLSModel::InitialExec) {
19517 if (isPIC && !is64Bit) {
19518 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19519 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19520 Offset);
19521 }
19522
19523 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19524 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19525 }
19526
19527 // The address of the thread-local variable is the sum of the thread
19528 // pointer and the offset of the variable.
19529 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19530}
19531
19532SDValue
19533X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19534
19535 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19536
19537 if (DAG.getTarget().useEmulatedTLS())
19538 return LowerToTLSEmulatedModel(GA, DAG);
19539
19540 const GlobalValue *GV = GA->getGlobal();
19541 auto PtrVT = getPointerTy(DAG.getDataLayout());
19542 bool PositionIndependent = isPositionIndependent();
19543
19544 if (Subtarget.isTargetELF()) {
19545 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19546 switch (model) {
19547 case TLSModel::GeneralDynamic:
19548 if (Subtarget.is64Bit()) {
19549 if (Subtarget.isTarget64BitLP64())
19550 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19551 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19552 }
19553 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19554 case TLSModel::LocalDynamic:
19555 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19556 Subtarget.isTarget64BitLP64());
19557 case TLSModel::InitialExec:
19558 case TLSModel::LocalExec:
19559 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19560 PositionIndependent);
19561 }
19562 llvm_unreachable("Unknown TLS model.")__builtin_unreachable();
19563 }
19564
19565 if (Subtarget.isTargetDarwin()) {
19566 // Darwin only has one model of TLS. Lower to that.
19567 unsigned char OpFlag = 0;
19568 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
19569 X86ISD::WrapperRIP : X86ISD::Wrapper;
19570
19571 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19572 // global base reg.
19573 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19574 if (PIC32)
19575 OpFlag = X86II::MO_TLVP_PIC_BASE;
19576 else
19577 OpFlag = X86II::MO_TLVP;
19578 SDLoc DL(Op);
19579 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19580 GA->getValueType(0),
19581 GA->getOffset(), OpFlag);
19582 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19583
19584 // With PIC32, the address is actually $g + Offset.
19585 if (PIC32)
19586 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19587 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19588 Offset);
19589
19590 // Lowering the machine ISD node will make sure everything ends up in the
19591 // right location.
19592 SDValue Chain = DAG.getEntryNode();
19593 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19594 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19595 SDValue Args[] = { Chain, Offset };
19596 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19597 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
19598 DAG.getIntPtrConstant(0, DL, true),
19599 Chain.getValue(1), DL);
19600
19601 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
19602 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19603 MFI.setAdjustsStack(true);
19604
19605 // And our return value (tls address) is in the standard call return value
19606 // location.
19607 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19608 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19609 }
19610
19611 if (Subtarget.isOSWindows()) {
19612 // Just use the implicit TLS architecture.
19613 // We need to generate something similar to:
19614 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19615 // ; from TEB
19616 // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
19617 // mov rcx, qword [rdx+rcx*8]
19618 // mov eax, .tls$:tlsvar
19619 // [rax+rcx] contains the address
19620 // Windows 64bit: gs:0x58
19621 // Windows 32bit: fs:__tls_array
19622
19623 SDLoc dl(GA);
19624 SDValue Chain = DAG.getEntryNode();
19625
19626 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19627 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19628 // use its literal value of 0x2C.
19629 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
19630 ? Type::getInt8PtrTy(*DAG.getContext(),
19631 256)
19632 : Type::getInt32PtrTy(*DAG.getContext(),
19633 257));
19634
19635 SDValue TlsArray = Subtarget.is64Bit()
19636 ? DAG.getIntPtrConstant(0x58, dl)
19637 : (Subtarget.isTargetWindowsGNU()
19638 ? DAG.getIntPtrConstant(0x2C, dl)
19639 : DAG.getExternalSymbol("_tls_array", PtrVT));
19640
19641 SDValue ThreadPointer =
19642 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19643
19644 SDValue res;
19645 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19646 res = ThreadPointer;
19647 } else {
19648 // Load the _tls_index variable
19649 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19650 if (Subtarget.is64Bit())
19651 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19652 MachinePointerInfo(), MVT::i32);
19653 else
19654 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19655
19656 const DataLayout &DL = DAG.getDataLayout();
19657 SDValue Scale =
19658 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19659 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19660
19661 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19662 }
19663
19664 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19665
19666 // Get the offset of start of .tls section
19667 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19668 GA->getValueType(0),
19669 GA->getOffset(), X86II::MO_SECREL);
19670 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19671
19672 // The address of the thread-local variable is the sum of the thread
19673 // pointer and the offset of the variable.
19674 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19675 }
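// The Windows implicit-TLS sequence above roughly computes the address below;
// a sketch only, with hypothetical parameter names (`Teb`, `TlsIndex`,
// `SecrelOffset`). In the local-exec case the lowering skips the _tls_index
// load and implicitly uses slot 0 of the array.
//
//   #include <cstdint>
//   static void *win64TlsAddrModel(char *Teb, uint32_t TlsIndex,
//                                  uint64_t SecrelOffset) {
//     char *TlsArray = *(char **)(Teb + 0x58);  // ThreadLocalStoragePointer
//     char *TlsBlock = *(char **)(TlsArray + TlsIndex * 8u);
//     return TlsBlock + SecrelOffset;           // + x@SECREL
//   }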
19676
19677 llvm_unreachable("TLS not implemented for this target.")__builtin_unreachable();
19678}
19679
19680/// Lower SRA_PARTS and friends, which return two i32 values
19681/// and take a 2 x i32 value to shift plus a shift amount.
19682/// TODO: Can this be moved to general expansion code?
19683static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19684 SDValue Lo, Hi;
19685 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19686 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19687}
19688
19689static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
19690 SelectionDAG &DAG) {
19691 MVT VT = Op.getSimpleValueType();
19692 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&((void)0)
19693 "Unexpected funnel shift opcode!")((void)0);
19694
19695 SDLoc DL(Op);
19696 SDValue Op0 = Op.getOperand(0);
19697 SDValue Op1 = Op.getOperand(1);
19698 SDValue Amt = Op.getOperand(2);
19699
19700 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
19701
19702 if (VT.isVector()) {
19703 assert(Subtarget.hasVBMI2() && "Expected VBMI2")((void)0);
19704
19705 if (IsFSHR)
19706 std::swap(Op0, Op1);
19707
19708 // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
19709 if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
19710 Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
19711 Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
19712 }
19713
19714 SDValue Funnel;
19715 APInt APIntShiftAmt;
19716 MVT ResultVT = Op0.getSimpleValueType();
19717 if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
19718 uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
19719 Funnel =
19720 DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
19721 Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19722 } else {
19723 if (!Subtarget.hasVLX() && !VT.is512BitVector())
19724 Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
19725 Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
19726 ResultVT, Op0, Op1, Amt);
19727 }
19728 if (!Subtarget.hasVLX() && !VT.is512BitVector())
19729 Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
19730 return Funnel;
19731 }
19732 assert(((void)0)
19733 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&((void)0)
19734 "Unexpected funnel shift type!")((void)0);
19735
19736 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
19737 bool OptForSize = DAG.shouldOptForSize();
19738 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
19739
19740 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
19741 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
19742 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
19743 !isa<ConstantSDNode>(Amt)) {
19744 unsigned EltSizeInBits = VT.getScalarSizeInBits();
19745 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
19746 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
19747 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
19748 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
19749 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
19750 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
19751 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
19752 if (IsFSHR) {
19753 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
19754 } else {
19755 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
19756 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
19757 }
19758 return DAG.getZExtOrTrunc(Res, DL, VT);
19759 }
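// The concat-shift expansion above can be checked against this scalar i8
// sketch (assuming <cstdint>; the helper names are only illustrative):
//
//   #include <cstdint>
//   static uint8_t fshl8Model(uint8_t X, uint8_t Y, unsigned Z) {
//     uint32_t Concat = ((uint32_t)X << 8) | Y; // (aext(x) << bw) | zext(y)
//     Z &= 7;                                   // z & (bw-1)
//     return (uint8_t)((Concat << Z) >> 8);     // take the high byte
//   }
//   static uint8_t fshr8Model(uint8_t X, uint8_t Y, unsigned Z) {
//     uint32_t Concat = ((uint32_t)X << 8) | Y;
//     Z &= 7;
//     return (uint8_t)(Concat >> Z);            // take the low byte
//   }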
19760
19761 if (VT == MVT::i8 || ExpandFunnel)
19762 return SDValue();
19763
19764 // i16 needs its shift amount taken modulo 16, but i32/i64 have an implicit modulo.
19765 if (VT == MVT::i16) {
19766 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
19767 DAG.getConstant(15, DL, Amt.getValueType()));
19768 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
19769 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
19770 }
19771
19772 return Op;
19773}
19774
19775// Try to use a packed vector operation to handle i64 on 32-bit targets when
19776// AVX512DQ is enabled.
19777static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
19778 const X86Subtarget &Subtarget) {
19779 assert((Op.getOpcode() == ISD::SINT_TO_FP ||((void)0)
19780 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||((void)0)
19781 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||((void)0)
19782 Op.getOpcode() == ISD::UINT_TO_FP) &&((void)0)
19783 "Unexpected opcode!")((void)0);
19784 bool IsStrict = Op->isStrictFPOpcode();
19785 unsigned OpNo = IsStrict ? 1 : 0;
19786 SDValue Src = Op.getOperand(OpNo);
19787 MVT SrcVT = Src.getSimpleValueType();
19788 MVT VT = Op.getSimpleValueType();
19789
19790 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19791 (VT != MVT::f32 && VT != MVT::f64))
19792 return SDValue();
19793
19794 // Pack the i64 into a vector, do the operation and extract.
19795
19796 // Use a 256-bit input vector so the result is 128 bits in the f32 case.
19797 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19798 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19799 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19800
19801 SDLoc dl(Op);
19802 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19803 if (IsStrict) {
19804 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19805 {Op.getOperand(0), InVec});
19806 SDValue Chain = CvtVec.getValue(1);
19807 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19808 DAG.getIntPtrConstant(0, dl));
19809 return DAG.getMergeValues({Value, Chain}, dl);
19810 }
19811
19812 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19813
19814 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19815 DAG.getIntPtrConstant(0, dl));
19816}
19817
19818static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19819 const X86Subtarget &Subtarget) {
19820 switch (Opcode) {
19821 case ISD::SINT_TO_FP:
19822 // TODO: Handle wider types with AVX/AVX512.
19823 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19824 return false;
19825 // CVTDQ2PS or (V)CVTDQ2PD
19826 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19827
19828 case ISD::UINT_TO_FP:
19829 // TODO: Handle wider types and i64 elements.
19830 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19831 return false;
19832 // VCVTUDQ2PS or VCVTUDQ2PD
19833 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19834
19835 default:
19836 return false;
19837 }
19838}
19839
19840/// Given a scalar cast operation that is extracted from a vector, try to
19841/// vectorize the cast op followed by extraction. This will avoid an expensive
19842/// round-trip between XMM and GPR.
19843static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
19844 const X86Subtarget &Subtarget) {
19845 // TODO: This could be enhanced to handle smaller integer types by peeking
19846 // through an extend.
19847 SDValue Extract = Cast.getOperand(0);
19848 MVT DestVT = Cast.getSimpleValueType();
19849 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19850 !isa<ConstantSDNode>(Extract.getOperand(1)))
19851 return SDValue();
19852
19853 // See if we have a 128-bit vector cast op for this type of cast.
19854 SDValue VecOp = Extract.getOperand(0);
19855 MVT FromVT = VecOp.getSimpleValueType();
19856 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19857 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19858 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19859 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19860 return SDValue();
19861
19862 // If we are extracting from a non-zero element, first shuffle the source
19863 // vector to allow extracting from element zero.
19864 SDLoc DL(Cast);
19865 if (!isNullConstant(Extract.getOperand(1))) {
19866 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19867 Mask[0] = Extract.getConstantOperandVal(1);
19868 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19869 }
19870 // If the source vector is wider than 128-bits, extract the low part. Do not
19871 // create an unnecessarily wide vector cast op.
19872 if (FromVT != Vec128VT)
19873 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19874
19875 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19876 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19877 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19878 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19879 DAG.getIntPtrConstant(0, DL));
19880}
19881
19882/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19883/// try to vectorize the cast ops. This will avoid an expensive round-trip
19884/// between XMM and GPR.
19885static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
19886 const X86Subtarget &Subtarget) {
19887 // TODO: Allow FP_TO_UINT.
19888 SDValue CastToInt = CastToFP.getOperand(0);
19889 MVT VT = CastToFP.getSimpleValueType();
19890 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19891 return SDValue();
19892
19893 MVT IntVT = CastToInt.getSimpleValueType();
19894 SDValue X = CastToInt.getOperand(0);
19895 MVT SrcVT = X.getSimpleValueType();
19896 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19897 return SDValue();
19898
19899 // See if we have 128-bit vector cast instructions for this type of cast.
19900 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19901 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19902 IntVT != MVT::i32)
19903 return SDValue();
19904
19905 unsigned SrcSize = SrcVT.getSizeInBits();
19906 unsigned IntSize = IntVT.getSizeInBits();
19907 unsigned VTSize = VT.getSizeInBits();
19908 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19909 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19910 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19911
19912 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19913 unsigned ToIntOpcode =
19914 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19915 unsigned ToFPOpcode =
19916 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19917
19918 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19919 //
19920 // We are not defining the high elements (by zeroing them, for example) because
19921 // that could nullify any performance advantage that we hoped to gain from
19922 // this vector op hack. We do not expect any adverse effects (like denormal
19923 // penalties) with cast ops.
19924 SDLoc DL(CastToFP);
19925 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19926 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19927 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19928 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19929 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19930}
19931
19932static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
19933 const X86Subtarget &Subtarget) {
19934 SDLoc DL(Op);
19935 bool IsStrict = Op->isStrictFPOpcode();
19936 MVT VT = Op->getSimpleValueType(0);
19937 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19938
19939 if (Subtarget.hasDQI()) {
19940 assert(!Subtarget.hasVLX() && "Unexpected features")((void)0);
19941
19942 assert((Src.getSimpleValueType() == MVT::v2i64 ||((void)0)
19943 Src.getSimpleValueType() == MVT::v4i64) &&((void)0)
19944 "Unsupported custom type")((void)0);
19945
19946 // With AVX512DQ but not VLX, we need to widen to get a 512-bit result type.
19947 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&((void)0)
19948 "Unexpected VT!")((void)0);
19949 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19950
19951 // Need to concat with zero vector for strict fp to avoid spurious
19952 // exceptions.
19953 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19954 : DAG.getUNDEF(MVT::v8i64);
19955 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19956 DAG.getIntPtrConstant(0, DL));
19957 SDValue Res, Chain;
19958 if (IsStrict) {
19959 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19960 {Op->getOperand(0), Src});
19961 Chain = Res.getValue(1);
19962 } else {
19963 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19964 }
19965
19966 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19967 DAG.getIntPtrConstant(0, DL));
19968
19969 if (IsStrict)
19970 return DAG.getMergeValues({Res, Chain}, DL);
19971 return Res;
19972 }
19973
19974 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19975 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19976 if (VT != MVT::v4f32 || IsSigned)
19977 return SDValue();
19978
19979 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19980 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19981 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19982 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19983 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19984 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19985 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19986 SmallVector<SDValue, 4> SignCvts(4);
19987 SmallVector<SDValue, 4> Chains(4);
19988 for (int i = 0; i != 4; ++i) {
19989 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19990 DAG.getIntPtrConstant(i, DL));
19991 if (IsStrict) {
19992 SignCvts[i] =
19993 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19994 {Op.getOperand(0), Elt});
19995 Chains[i] = SignCvts[i].getValue(1);
19996 } else {
19997 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19998 }
19999 }
20000 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20001
20002 SDValue Slow, Chain;
20003 if (IsStrict) {
20004 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20005 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20006 {Chain, SignCvt, SignCvt});
20007 Chain = Slow.getValue(1);
20008 } else {
20009 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20010 }
20011
20012 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20013 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20014
20015 if (IsStrict)
20016 return DAG.getMergeValues({Cvt, Chain}, DL);
20017
20018 return Cvt;
20019}
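// For inputs with the sign bit set, the lowering above halves the value with
// the low bit folded back in, converts as signed, and then doubles the
// result. A scalar sketch (assuming <cstdint>; the helper name is only
// illustrative):
//
//   #include <cstdint>
//   static float uint64ToF32Model(uint64_t X) {
//     if ((int64_t)X >= 0)
//       return (float)(int64_t)X;          // value fits in a signed convert
//     uint64_t Half = (X >> 1) | (X & 1);  // halve, keeping the rounding bit
//     float F = (float)(int64_t)Half;
//     return F + F;                        // double it back up
//   }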
20020
20021SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20022 SelectionDAG &DAG) const {
20023 bool IsStrict = Op->isStrictFPOpcode();
20024 unsigned OpNo = IsStrict ? 1 : 0;
20025 SDValue Src = Op.getOperand(OpNo);
20026 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20027 MVT SrcVT = Src.getSimpleValueType();
20028 MVT VT = Op.getSimpleValueType();
20029 SDLoc dl(Op);
20030
20031 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20032 return Extract;
20033
20034 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20035 return R;
20036
20037 if (SrcVT.isVector()) {
20038 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20039 // Note: since v2f64 is a legal type, we don't need to zero extend the
20040 // source for strict FP.
20041 if (IsStrict)
20042 return DAG.getNode(
20043 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20044 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20045 DAG.getUNDEF(SrcVT))});
20046 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20047 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20048 DAG.getUNDEF(SrcVT)));
20049 }
20050 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20051 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20052
20053 return SDValue();
20054 }
20055
20056 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&((void)0)
20057 "Unknown SINT_TO_FP to lower!")((void)0);
20058
20059 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20060
20061 // These are really Legal; return the operand so the caller accepts it as
20062 // Legal.
20063 if (SrcVT == MVT::i32 && UseSSEReg)
20064 return Op;
20065 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20066 return Op;
20067
20068 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20069 return V;
20070
20071 // SSE doesn't have an i16 conversion so we need to promote.
20072 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20073 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20074 if (IsStrict)
20075 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20076 {Chain, Ext});
20077
20078 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20079 }
20080
20081 if (VT == MVT::f128)
20082 return SDValue();
20083
20084 SDValue ValueToStore = Src;
20085 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20086 // Bitcasting to f64 here allows us to do a single 64-bit store from
20087 // an SSE register, avoiding the store forwarding penalty that would come
20088 // with two 32-bit stores.
20089 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20090
20091 unsigned Size = SrcVT.getStoreSize();
20092 Align Alignment(Size);
20093 MachineFunction &MF = DAG.getMachineFunction();
20094 auto PtrVT = getPointerTy(MF.getDataLayout());
20095 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20096 MachinePointerInfo MPI =
20097 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20098 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20099 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20100 std::pair<SDValue, SDValue> Tmp =
20101 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20102
20103 if (IsStrict)
20104 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20105
20106 return Tmp.first;
20107}
20108
20109std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20110 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20111 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20112 // Build the FILD
20113 SDVTList Tys;
20114 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20115 if (useSSE)
20116 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20117 else
20118 Tys = DAG.getVTList(DstVT, MVT::Other);
20119
20120 SDValue FILDOps[] = {Chain, Pointer};
20121 SDValue Result =
20122 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20123 Alignment, MachineMemOperand::MOLoad);
20124 Chain = Result.getValue(1);
20125
20126 if (useSSE) {
20127 MachineFunction &MF = DAG.getMachineFunction();
20128 unsigned SSFISize = DstVT.getStoreSize();
20129 int SSFI =
20130 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20131 auto PtrVT = getPointerTy(MF.getDataLayout());
20132 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20133 Tys = DAG.getVTList(MVT::Other);
20134 SDValue FSTOps[] = {Chain, Result, StackSlot};
20135 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20136 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20137 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20138
20139 Chain =
20140 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20141 Result = DAG.getLoad(
20142 DstVT, DL, Chain, StackSlot,
20143 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20144 Chain = Result.getValue(1);
20145 }
20146
20147 return { Result, Chain };
20148}
20149
20150/// Horizontal vector math instructions may be slower than normal math with
20151/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20152/// implementation, and likely shuffle complexity of the alternate sequence.
20153static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20154 const X86Subtarget &Subtarget) {
20155 bool IsOptimizingSize = DAG.shouldOptForSize();
20156 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20157 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20158}
20159
20160/// 64-bit unsigned integer to double expansion.
20161static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20162 const X86Subtarget &Subtarget) {
20163 // We can't use this algorithm for strict FP. It produces -0.0 instead of +0.0
20164 // when converting 0 while rounding toward negative infinity. The caller will
20165 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20166 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!")((void)0);
20167 // This algorithm is not obvious. Here is what we're trying to output:
20168 /*
20169 movq %rax, %xmm0
20170 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20171 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20172 #ifdef __SSE3__
20173 haddpd %xmm0, %xmm0
20174 #else
20175 pshufd $0x4e, %xmm0, %xmm1
20176 addpd %xmm1, %xmm0
20177 #endif
20178 */
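// A scalar sketch of the punpckldq/subpd/haddpd sequence above (assuming
// <cstdint>, <cstring>, and round-to-nearest; the helper name is only
// illustrative):
//
//   #include <cstdint>
//   #include <cstring>
//   static double uint64ToF64Model(uint64_t X) {
//     uint64_t LoBits = ((uint64_t)0x43300000u << 32) | (uint32_t)X; // 2^52 + lo32
//     uint64_t HiBits = ((uint64_t)0x45300000u << 32) | (X >> 32);   // 2^84 + hi32*2^32
//     double Lo, Hi;
//     std::memcpy(&Lo, &LoBits, 8);
//     std::memcpy(&Hi, &HiBits, 8);
//     Lo -= 0x1.0p52;   // exactly lo32
//     Hi -= 0x1.0p84;   // exactly hi32 * 2^32
//     return Lo + Hi;   // single rounding, like the final haddpd
//   }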
20179
20180 SDLoc dl(Op);
20181 LLVMContext *Context = DAG.getContext();
20182
20183 // Build some magic constants.
20184 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20185 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20186 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20187 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20188
20189 SmallVector<Constant*,2> CV1;
20190 CV1.push_back(
20191 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20192 APInt(64, 0x4330000000000000ULL))));
20193 CV1.push_back(
20194 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20195 APInt(64, 0x4530000000000000ULL))));
20196 Constant *C1 = ConstantVector::get(CV1);
20197 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20198
20199 // Load the 64-bit value into an XMM register.
20200 SDValue XR1 =
20201 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20202 SDValue CLod0 = DAG.getLoad(
20203 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20204 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20205 SDValue Unpck1 =
20206 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20207
20208 SDValue CLod1 = DAG.getLoad(
20209 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20210 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20211 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20212 // TODO: Are there any fast-math-flags to propagate here?
20213 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20214 SDValue Result;
20215
20216 if (Subtarget.hasSSE3() &&
20217 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20218 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20219 } else {
20220 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20221 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20222 }
20223 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20224 DAG.getIntPtrConstant(0, dl));
20225 return Result;
20226}
20227
20228/// 32-bit unsigned integer to float expansion.
20229static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20230 const X86Subtarget &Subtarget) {
20231 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20232 SDLoc dl(Op);
20233 // FP constant to bias correct the final result.
20234 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20235 MVT::f64);
20236
20237 // Load the 32-bit value into an XMM register.
20238 SDValue Load =
20239 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20240
20241 // Zero out the upper parts of the register.
20242 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20243
20244 // Or the load with the bias.
20245 SDValue Or = DAG.getNode(
20246 ISD::OR, dl, MVT::v2i64,
20247 DAG.getBitcast(MVT::v2i64, Load),
20248 DAG.getBitcast(MVT::v2i64,
20249 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20250 Or =
20251 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20252 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20253
20254 if (Op.getNode()->isStrictFPOpcode()) {
20255 // Subtract the bias.
20256 // TODO: Are there any fast-math-flags to propagate here?
20257 SDValue Chain = Op.getOperand(0);
20258 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20259 {Chain, Or, Bias});
20260
20261 if (Op.getValueType() == Sub.getValueType())
20262 return Sub;
20263
20264 // Handle final rounding.
20265 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20266 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20267
20268 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20269 }
20270
20271 // Subtract the bias.
20272 // TODO: Are there any fast-math-flags to propagate here?
20273 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20274
20275 // Handle final rounding.
20276 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20277}
20278
20279static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20280 const X86Subtarget &Subtarget,
20281 const SDLoc &DL) {
20282 if (Op.getSimpleValueType() != MVT::v2f64)
20283 return SDValue();
20284
20285 bool IsStrict = Op->isStrictFPOpcode();
20286
20287 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20288 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type")((void)0);
20289
20290 if (Subtarget.hasAVX512()) {
20291 if (!Subtarget.hasVLX()) {
20292 // Let generic type legalization widen this.
20293 if (!IsStrict)
20294 return SDValue();
20295 // Otherwise pad the integer input with 0s and widen the operation.
20296 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20297 DAG.getConstant(0, DL, MVT::v2i32));
20298 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20299 {Op.getOperand(0), N0});
20300 SDValue Chain = Res.getValue(1);
20301 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20302 DAG.getIntPtrConstant(0, DL));
20303 return DAG.getMergeValues({Res, Chain}, DL);
20304 }
20305
20306 // Legalize to v4i32 type.
20307 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20308 DAG.getUNDEF(MVT::v2i32));
20309 if (IsStrict)
20310 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20311 {Op.getOperand(0), N0});
20312 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20313 }
20314
20315 // Zero extend to v2i64, then OR with the floating-point representation of 2^52.
20316 // This gives us the floating-point equivalent of 2^52 + the i32 integer,
20317 // since double has 52 bits of mantissa. Then subtract 2^52 in floating
20318 // point, leaving just our i32 integers in double format.
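// A scalar sketch of the 2^52 trick described above (assuming <cstdint> and
// <cstring>; the helper name is only illustrative):
//
//   #include <cstdint>
//   #include <cstring>
//   static double uint32ToF64Model(uint32_t X) {
//     uint64_t Bits = 0x4330000000000000ULL | X; // double bits of 2^52 + X
//     double D;
//     std::memcpy(&D, &Bits, 8);
//     return D - 0x1.0p52;                       // leaves exactly (double)X
//   }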
20319 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20320 SDValue VBias =
20321 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20322 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20323 DAG.getBitcast(MVT::v2i64, VBias));
20324 Or = DAG.getBitcast(MVT::v2f64, Or);
20325
20326 if (IsStrict)
20327 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20328 {Op.getOperand(0), Or, VBias});
20329 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20330}
20331
20332static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20333 const X86Subtarget &Subtarget) {
20334 SDLoc DL(Op);
20335 bool IsStrict = Op->isStrictFPOpcode();
20336 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20337 MVT VecIntVT = V.getSimpleValueType();
20338 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&((void)0)
20339 "Unsupported custom type")((void)0);
20340
20341 if (Subtarget.hasAVX512()) {
20342 // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
20343 assert(!Subtarget.hasVLX() && "Unexpected features")((void)0);
20344 MVT VT = Op->getSimpleValueType(0);
20345
20346 // v8i32->v8f64 is legal with AVX512 so just return it.
20347 if (VT == MVT::v8f64)
20348 return Op;
20349
20350 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&((void)0)
20351 "Unexpected VT!")((void)0);
20352 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20353 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20354 // Need to concat with zero vector for strict fp to avoid spurious
20355 // exceptions.
20356 SDValue Tmp =
20357 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20358 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20359 DAG.getIntPtrConstant(0, DL));
20360 SDValue Res, Chain;
20361 if (IsStrict) {
20362 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20363 {Op->getOperand(0), V});
20364 Chain = Res.getValue(1);
20365 } else {
20366 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20367 }
20368
20369 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20370 DAG.getIntPtrConstant(0, DL));
20371
20372 if (IsStrict)
20373 return DAG.getMergeValues({Res, Chain}, DL);
20374 return Res;
20375 }
20376
20377 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20378 Op->getSimpleValueType(0) == MVT::v4f64) {
20379 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20380 Constant *Bias = ConstantFP::get(
20381 *DAG.getContext(),
20382 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20383 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20384 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20385 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20386 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20387 SDValue VBias = DAG.getMemIntrinsicNode(
20388 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20389 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20390 MachineMemOperand::MOLoad);
20391
20392 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20393 DAG.getBitcast(MVT::v4i64, VBias));
20394 Or = DAG.getBitcast(MVT::v4f64, Or);
20395
20396 if (IsStrict)
20397 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20398 {Op.getOperand(0), Or, VBias});
20399 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20400 }
20401
20402 // The algorithm is the following:
20403 // #ifdef __SSE4_1__
20404 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20405 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20406 // (uint4) 0x53000000, 0xaa);
20407 // #else
20408 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20409 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20410 // #endif
20411 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20412 // return (float4) lo + fhi;
20413
20414 bool Is128 = VecIntVT == MVT::v4i32;
20415 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20416 // If we convert to something other than the supported type, e.g., to v4f64,
20417 // abort early.
20418 if (VecFloatVT != Op->getSimpleValueType(0))
20419 return SDValue();
20420
20421 // In the #ifdef/#else code, we have in common:
20422 // - The vector of constants:
20423 // -- 0x4b000000
20424 // -- 0x53000000
20425 // - A shift:
20426 // -- v >> 16
20427
20428 // Create the splat vector for 0x4b000000.
20429 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20430 // Create the splat vector for 0x53000000.
20431 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20432
20433 // Create the right shift.
20434 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20435 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20436
20437 SDValue Low, High;
20438 if (Subtarget.hasSSE41()) {
20439 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20440 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20441 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20442 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20443 // Low will be bitcasted right away, so do not bother bitcasting back to its
20444 // original type.
20445 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20446 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20447 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20448 // (uint4) 0x53000000, 0xaa);
20449 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20450 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20451 // High will be bitcasted right away, so do not bother bitcasting back to
20452 // its original type.
20453 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20454 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20455 } else {
20456 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20457 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20458 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20459 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20460
20461 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20462 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20463 }
20464
20465 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20466 SDValue VecCstFSub = DAG.getConstantFP(
20467 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20468
20469 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20470 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20471 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20472 // enabled. See PR24512.
20473 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20474 // TODO: Are there any fast-math-flags to propagate here?
20475 // (float4) lo;
20476 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20477 // return (float4) lo + fhi;
20478 if (IsStrict) {
20479 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20480 {Op.getOperand(0), HighBitcast, VecCstFSub});
20481 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20482 {FHigh.getValue(1), LowBitcast, FHigh});
20483 }
20484
20485 SDValue FHigh =
20486 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20487 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20488}
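A standalone scalar model (not part of the annotated file) of the split-and-bias algorithm spelled out in the comment block above; function names are illustrative.

#include <cstdint>
#include <cstdio>
#include <cstring>

static float bits_to_float(uint32_t b) {
  float f;
  std::memcpy(&f, &b, sizeof(f));
  return f;
}

// Embed the low and high 16-bit halves in the mantissas of 2^23 and 2^39,
// then cancel the biases in float arithmetic; only the final add rounds.
static float u32_to_float_split(uint32_t v) {
  float lo = bits_to_float((v & 0xffffu) | 0x4b000000u); // 2^23 + low 16 bits
  float hi = bits_to_float((v >> 16) | 0x53000000u);     // 2^39 + high 16 bits * 2^16
  float fhi = hi - (0x1p39f + 0x1p23f);                  // remove both biases
  return lo + fhi;                                       // == (float)v
}

int main() {
  std::printf("%.1f\n", u32_to_float_split(0x80000001u)); // 2147483648.0
}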
20489
20490static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20491 const X86Subtarget &Subtarget) {
20492 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20493 SDValue N0 = Op.getOperand(OpNo);
20494 MVT SrcVT = N0.getSimpleValueType();
20495 SDLoc dl(Op);
20496
20497 switch (SrcVT.SimpleTy) {
20498 default:
20499 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20500 case MVT::v2i32:
20501 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20502 case MVT::v4i32:
20503 case MVT::v8i32:
20504 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20505 case MVT::v2i64:
20506 case MVT::v4i64:
20507 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20508 }
20509}
20510
20511SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20512 SelectionDAG &DAG) const {
20513 bool IsStrict = Op->isStrictFPOpcode();
20514 unsigned OpNo = IsStrict ? 1 : 0;
20515 SDValue Src = Op.getOperand(OpNo);
20516 SDLoc dl(Op);
20517 auto PtrVT = getPointerTy(DAG.getDataLayout());
20518 MVT SrcVT = Src.getSimpleValueType();
20519 MVT DstVT = Op->getSimpleValueType(0);
20520 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20521
20522 if (DstVT == MVT::f128)
20523 return SDValue();
20524
20525 if (DstVT.isVector())
20526 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20527
20528 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20529 return Extract;
20530
20531 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20532 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20533 // Conversions from unsigned i32 to f32/f64 are legal,
20534 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20535 return Op;
20536 }
20537
20538 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20539 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20540 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20541 if (IsStrict)
20542 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20543 {Chain, Src});
20544 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20545 }
20546
20547 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20548 return V;
20549
20550 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20551 // infinity. It produces -0.0, so disable under strictfp.
20552 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20553 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20554 if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20555 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20556 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20557 (DstVT == MVT::f32 || DstVT == MVT::f64))
20558 return SDValue();
20559
20560 // Make a 64-bit buffer, and use it to build an FILD.
20561 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20562 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20563 Align SlotAlign(8);
20564 MachinePointerInfo MPI =
20565 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20566 if (SrcVT == MVT::i32) {
20567 SDValue OffsetSlot =
20568 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20569 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20570 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20571 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20572 std::pair<SDValue, SDValue> Tmp =
20573 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20574 if (IsStrict)
20575 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20576
20577 return Tmp.first;
20578 }
20579
20580 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20581 SDValue ValueToStore = Src;
20582 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20583 // Bitcasting to f64 here allows us to do a single 64-bit store from
20584 // an SSE register, avoiding the store forwarding penalty that would come
20585 // with two 32-bit stores.
20586 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20587 }
20588 SDValue Store =
20589 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20590 // For i64 source, we need to add the appropriate power of 2 if the input
20591 // was negative. We must be careful to do the computation in x87 extended
20592 // precision, not in SSE.
20593 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20594 SDValue Ops[] = { Store, StackSlot };
20595 SDValue Fild =
20596 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20597 SlotAlign, MachineMemOperand::MOLoad);
20598 Chain = Fild.getValue(1);
20599
20600
20601 // Check whether the sign bit is set.
20602 SDValue SignSet = DAG.getSetCC(
20603 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20604 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20605
20606 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20607 APInt FF(64, 0x5F80000000000000ULL);
20608 SDValue FudgePtr = DAG.getConstantPool(
20609 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20610 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20611
20612 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20613 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20614 SDValue Four = DAG.getIntPtrConstant(4, dl);
20615 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20616 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20617
20618 // Load the value out, extending it from f32 to f80.
20619 SDValue Fudge = DAG.getExtLoad(
20620 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20621 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20622 CPAlignment);
20623 Chain = Fudge.getValue(1);
20624 // Extend everything to 80 bits to force it to be done on x87.
20625 // TODO: Are there any fast-math-flags to propagate here?
20626 if (IsStrict) {
20627 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20628 {Chain, Fild, Fudge});
20629 // STRICT_FP_ROUND can't handle equal types.
20630 if (DstVT == MVT::f80)
20631 return Add;
20632 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20633 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
20634 }
20635 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
20636 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20637 DAG.getIntPtrConstant(0, dl));
20638}
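A minimal standalone sketch (not part of the annotated file) of the FILD-plus-fudge idea used above: FILD interprets the i64 as signed, so 2^64 is added back whenever the sign bit was set (0x5F800000 is the f32 encoding of 2^64). The helper name is illustrative.

#include <cstdint>
#include <cstdio>

static long double u64_to_fp_via_fild(uint64_t x) {
  long double v = (long double)(int64_t)x; // signed interpretation, as FILD produces
  if ((int64_t)x < 0)                      // sign bit set: the signed view is x - 2^64
    v += 0x1p64L;                          // add the fudge factor back
  return v;
}

int main() {
  std::printf("%.0Lf\n", u64_to_fp_via_fild(~0ULL)); // 18446744073709551615
}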
20639
20640// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20641// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20642// just return an SDValue().
20643// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20644// to i16, i32 or i64, and we lower it to a legal sequence and return the
20645// result.
20646SDValue
20647X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20648 bool IsSigned, SDValue &Chain) const {
20649 bool IsStrict = Op->isStrictFPOpcode();
20650 SDLoc DL(Op);
20651
20652 EVT DstTy = Op.getValueType();
20653 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20654 EVT TheVT = Value.getValueType();
20655 auto PtrVT = getPointerTy(DAG.getDataLayout());
20656
20657 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20658 // f16 must be promoted before using the lowering in this routine.
20659 // fp128 does not use this lowering.
20660 return SDValue();
20661 }
20662
20663 // If using FIST to compute an unsigned i64, we'll need some fixup
20664 // to handle values above the maximum signed i64. A FIST is always
20665 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20666 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20667
20668 // FIXME: This does not generate an invalid exception if the input does not
20669 // fit in i32. PR44019
20670 if (!IsSigned && DstTy != MVT::i64) {
20671 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20672 // The low 32 bits of the fist result will have the correct uint32 result.
20673 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20674 DstTy = MVT::i64;
20675 }
20676
20677 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20678 DstTy.getSimpleVT() >= MVT::i16 &&
20679 "Unknown FP_TO_INT to lower!");
20680
20681 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20682 // stack slot.
20683 MachineFunction &MF = DAG.getMachineFunction();
20684 unsigned MemSize = DstTy.getStoreSize();
20685 int SSFI =
20686 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20687 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20688
20689 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20690
20691 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20692
20693 if (UnsignedFixup) {
20694 //
20695 // Conversion to unsigned i64 is implemented with a select,
20696 // depending on whether the source value fits in the range
20697 // of a signed i64. Let Thresh be the FP equivalent of
20698 // 0x8000000000000000ULL.
20699 //
20700 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20701 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20702 // FistSrc = (Value - FltOfs);
20703 // Fist-to-mem64 FistSrc
20704 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20705 // to XOR'ing the high 32 bits with Adjust.
20706 //
20707 // Being a power of 2, Thresh is exactly representable in all FP formats.
20708 // For X87 we'd like to use the smallest FP type for this constant, but
20709 // for DAG type consistency we have to match the FP operand type.
20710
20711 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20712 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
20713 bool LosesInfo = false;
20714 if (TheVT == MVT::f64)
20715 // The rounding mode is irrelevant as the conversion should be exact.
20716 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20717 &LosesInfo);
20718 else if (TheVT == MVT::f80)
20719 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20720 APFloat::rmNearestTiesToEven, &LosesInfo);
20721
20722 assert(Status == APFloat::opOK && !LosesInfo &&
20723 "FP conversion should have been exact");
20724
20725 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20726
20727 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20728 *DAG.getContext(), TheVT);
20729 SDValue Cmp;
20730 if (IsStrict) {
20731 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20732 /*IsSignaling*/ true);
20733 Chain = Cmp.getValue(1);
20734 } else {
20735 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20736 }
20737
20738 // Our preferred lowering of
20739 //
20740 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20741 //
20742 // is
20743 //
20744 // (Value >= Thresh) << 63
20745 //
20746 // but since we can get here after LegalOperations, DAGCombine might do the
20747 // wrong thing if we create a select. So, directly create the preferred
20748 // version.
20749 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20750 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20751 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20752
20753 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20754 DAG.getConstantFP(0.0, DL, TheVT));
20755
20756 if (IsStrict) {
20757 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20758 { Chain, Value, FltOfs });
20759 Chain = Value.getValue(1);
20760 } else
20761 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20762 }
20763
20764 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20765
20766 // FIXME This causes a redundant load/store if the SSE-class value is already
20767 // in memory, such as if it is on the callstack.
20768 if (isScalarFPTypeInSSEReg(TheVT)) {
20769 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20770 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20771 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20772 SDValue Ops[] = { Chain, StackSlot };
20773
20774 unsigned FLDSize = TheVT.getStoreSize();
20775 assert(FLDSize <= MemSize && "Stack slot not big enough");
20776 MachineMemOperand *MMO = MF.getMachineMemOperand(
20777 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20778 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20779 Chain = Value.getValue(1);
20780 }
20781
20782 // Build the FP_TO_INT*_IN_MEM
20783 MachineMemOperand *MMO = MF.getMachineMemOperand(
20784 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20785 SDValue Ops[] = { Chain, Value, StackSlot };
20786 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20787 DAG.getVTList(MVT::Other),
20788 Ops, DstTy, MMO);
20789
20790 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20791 Chain = Res.getValue(1);
20792
20793 // If we need an unsigned fixup, XOR the result with adjust.
20794 if (UnsignedFixup)
20795 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20796
20797 return Res;
20798}
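A standalone scalar sketch (not part of the annotated file) of the unsigned-fixup path above: values at or above 2^63 do not fit a signed 64-bit convert, so the threshold is subtracted first and the high bit is XOR'ed back into the integer result afterwards. Names are illustrative.

#include <cstdint>
#include <cstdio>

static uint64_t fp_to_u64_via_signed(double value) {
  const double thresh = 0x1p63;                        // 2^63, exactly representable
  uint64_t adjust = (value >= thresh) ? (1ULL << 63) : 0;
  double fltOfs = (value >= thresh) ? thresh : 0.0;
  int64_t fist = (int64_t)(value - fltOfs);            // signed truncating convert (the FIST)
  return (uint64_t)fist ^ adjust;                      // re-apply the high bit
}

int main() {
  std::printf("%llu\n",
              (unsigned long long)fp_to_u64_via_signed(0x1.8p63)); // 13835058055282163712
}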
20799
20800static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20801 const X86Subtarget &Subtarget) {
20802 MVT VT = Op.getSimpleValueType();
20803 SDValue In = Op.getOperand(0);
20804 MVT InVT = In.getSimpleValueType();
20805 SDLoc dl(Op);
20806 unsigned Opc = Op.getOpcode();
20807
20808 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20809 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20810 "Unexpected extension opcode");
20811 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20812 "Expected same number of elements");
20813 assert((VT.getVectorElementType() == MVT::i16 ||
20814 VT.getVectorElementType() == MVT::i32 ||
20815 VT.getVectorElementType() == MVT::i64) &&
20816 "Unexpected element type");
20817 assert((InVT.getVectorElementType() == MVT::i8 ||
20818 InVT.getVectorElementType() == MVT::i16 ||
20819 InVT.getVectorElementType() == MVT::i32) &&
20820 "Unexpected element type");
20821
20822 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
20823
20824 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20825 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20826 return splitVectorIntUnary(Op, DAG);
20827 }
20828
20829 if (Subtarget.hasInt256())
20830 return Op;
20831
20832 // Optimize vectors in AVX mode:
20833 //
20834 // v8i16 -> v8i32
20835 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20836 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20837 // Concat upper and lower parts.
20838 //
20839 // v4i32 -> v4i64
20840 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20841 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20842 // Concat upper and lower parts.
20843 //
20844 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20845 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20846
20847 // Short-circuit if we can determine that each 128-bit half is the same value.
20848 // Otherwise, this is difficult to match and optimize.
20849 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20850 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20851 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20852
20853 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20854 SDValue Undef = DAG.getUNDEF(InVT);
20855 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20856 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20857 OpHi = DAG.getBitcast(HalfVT, OpHi);
20858
20859 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20860}
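For illustration only (not part of the annotated file), a scalar model of the AVX-only path above: the upper half of a v8i16 is zero-extended by unpacking it against a zero vector, since interleaving each word with a zero word is exactly the little-endian layout of a 32-bit zero extension.

#include <array>
#include <cstdint>
#include <cstdio>

// Interleave the high four words of 'a' with the high four words of 'b':
// { a4,b4, a5,b5, a6,b6, a7,b7 }, which is what PUNPCKHWD produces.
static std::array<uint16_t, 8> unpackhi16(const std::array<uint16_t, 8> &a,
                                          const std::array<uint16_t, 8> &b) {
  return {a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7]};
}

int main() {
  std::array<uint16_t, 8> in = {1, 2, 3, 4, 5, 6, 0xffff, 8};
  std::array<uint16_t, 8> zero{};
  auto hi = unpackhi16(in, zero);
  // Reading adjacent (word, 0) pairs as little-endian i32 gives 5, 6, 65535, 8.
  uint32_t third = (uint32_t)hi[4] | ((uint32_t)hi[5] << 16);
  std::printf("%u\n", third); // 65535
}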
20861
20862// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20863static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20864 const SDLoc &dl, SelectionDAG &DAG) {
20865 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20866 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20867 DAG.getIntPtrConstant(0, dl));
20868 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20869 DAG.getIntPtrConstant(8, dl));
20870 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20871 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20872 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20873 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20874}
20875
20876static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
20877 const X86Subtarget &Subtarget,
20878 SelectionDAG &DAG) {
20879 MVT VT = Op->getSimpleValueType(0);
20880 SDValue In = Op->getOperand(0);
20881 MVT InVT = In.getSimpleValueType();
20882 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20883 SDLoc DL(Op);
20884 unsigned NumElts = VT.getVectorNumElements();
20885
20886 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20887 // avoids a constant pool load.
20888 if (VT.getVectorElementType() != MVT::i8) {
20889 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20890 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20891 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20892 }
20893
20894 // Extend VT if BWI is not supported.
20895 MVT ExtVT = VT;
20896 if (!Subtarget.hasBWI()) {
20897 // If v16i32 is to be avoided, we'll need to split and concatenate.
20898 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20899 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20900
20901 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20902 }
20903
20904 // Widen to 512-bits if VLX is not supported.
20905 MVT WideVT = ExtVT;
20906 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20907 NumElts *= 512 / ExtVT.getSizeInBits();
20908 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20909 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20910 In, DAG.getIntPtrConstant(0, DL));
20911 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20912 NumElts);
20913 }
20914
20915 SDValue One = DAG.getConstant(1, DL, WideVT);
20916 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20917
20918 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20919
20920 // Truncate if we had to extend above.
20921 if (VT != ExtVT) {
20922 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20923 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20924 }
20925
20926 // Extract back to 128/256-bit if we widened.
20927 if (WideVT != VT)
20928 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20929 DAG.getIntPtrConstant(0, DL));
20930
20931 return SelectedVal;
20932}
20933
20934static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20935 SelectionDAG &DAG) {
20936 SDValue In = Op.getOperand(0);
20937 MVT SVT = In.getSimpleValueType();
20938
20939 if (SVT.getVectorElementType() == MVT::i1)
20940 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20941
20942 assert(Subtarget.hasAVX() && "Expected AVX support");
20943 return LowerAVXExtend(Op, DAG, Subtarget);
20944}
20945
20946/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20947/// It makes use of the fact that vectors with enough leading sign/zero bits
20948/// prevent the PACKSS/PACKUS from saturating the results.
20949/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20950/// within each 128-bit lane.
20951static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20952 const SDLoc &DL, SelectionDAG &DAG,
20953 const X86Subtarget &Subtarget) {
20954 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20955 "Unexpected PACK opcode");
20956 assert(DstVT.isVector() && "VT not a vector?");
20957
20958 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20959 if (!Subtarget.hasSSE2())
20960 return SDValue();
20961
20962 EVT SrcVT = In.getValueType();
20963
20964 // No truncation required, we might get here due to recursive calls.
20965 if (SrcVT == DstVT)
20966 return In;
20967
20968 // We only support vector truncation to 64bits or greater from a
20969 // 128bits or greater source.
20970 unsigned DstSizeInBits = DstVT.getSizeInBits();
20971 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20972 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
20973 return SDValue();
20974
20975 unsigned NumElems = SrcVT.getVectorNumElements();
20976 if (!isPowerOf2_32(NumElems))
20977 return SDValue();
20978
20979 LLVMContext &Ctx = *DAG.getContext();
20980 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20981 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20982
20983 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20984
20985 // Pack to the largest type possible:
20986 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20987 EVT InVT = MVT::i16, OutVT = MVT::i8;
20988 if (SrcVT.getScalarSizeInBits() > 16 &&
20989 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20990 InVT = MVT::i32;
20991 OutVT = MVT::i16;
20992 }
20993
20994 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
20995 if (SrcVT.is128BitVector()) {
20996 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20997 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20998 In = DAG.getBitcast(InVT, In);
20999 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21000 Res = extractSubVector(Res, 0, DAG, DL, 64);
21001 return DAG.getBitcast(DstVT, Res);
21002 }
21003
21004 // Split lower/upper subvectors.
21005 SDValue Lo, Hi;
21006 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21007
21008 unsigned SubSizeInBits = SrcSizeInBits / 2;
21009 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21010 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21011
21012 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21013 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21014 Lo = DAG.getBitcast(InVT, Lo);
21015 Hi = DAG.getBitcast(InVT, Hi);
21016 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21017 return DAG.getBitcast(DstVT, Res);
21018 }
21019
21020 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21021 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21022 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21023 Lo = DAG.getBitcast(InVT, Lo);
21024 Hi = DAG.getBitcast(InVT, Hi);
21025 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21026
21027 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21028 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21029 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21030 SmallVector<int, 64> Mask;
21031 int Scale = 64 / OutVT.getScalarSizeInBits();
21032 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21033 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21034
21035 if (DstVT.is256BitVector())
21036 return DAG.getBitcast(DstVT, Res);
21037
21038 // If 512bit -> 128bit truncate another stage.
21039 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21040 Res = DAG.getBitcast(PackedVT, Res);
21041 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21042 }
21043
21044 // Recursively pack lower/upper subvectors, concat result and pack again.
21045 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21046 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21047 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21048 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21049
21050 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21051 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21052 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21053}
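A small standalone illustration (not part of the annotated file) of why the PACK-based truncation above works: when the caller guarantees enough leading zero bits (PACKUS) or sign bits (PACKSS), the saturating pack can never clamp, so it degenerates into a plain truncate of each lane.

#include <cstdint>
#include <cstdio>

static uint8_t packus_lane(uint16_t w) {          // models one PACKUSWB lane
  return w > 0xff ? 0xff : (uint8_t)w;            // unsigned saturation to u8
}

int main() {
  uint16_t masked = 0x1234 & 0x00ff;              // leading zeros guaranteed by the caller
  std::printf("%u\n", packus_lane(masked));       // 52 (0x34): saturation never triggers
}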
21054
21055static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21056 const X86Subtarget &Subtarget) {
21057
21058 SDLoc DL(Op);
21059 MVT VT = Op.getSimpleValueType();
21060 SDValue In = Op.getOperand(0);
21061 MVT InVT = In.getSimpleValueType();
21062
21063 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21064
21065 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21066 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21067 if (InVT.getScalarSizeInBits() <= 16) {
21068 if (Subtarget.hasBWI()) {
21069 // legal, will go to VPMOVB2M, VPMOVW2M
21070 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21071 // We need to shift to get the lsb into sign position.
21072 // Shift packed bytes not supported natively, bitcast to word
21073 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21074 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21075 DAG.getBitcast(ExtVT, In),
21076 DAG.getConstant(ShiftInx, DL, ExtVT));
21077 In = DAG.getBitcast(InVT, In);
21078 }
21079 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21080 In, ISD::SETGT);
21081 }
21082 // Use TESTD/Q, extended vector to packed dword/qword.
21083 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21084 "Unexpected vector type.");
21085 unsigned NumElts = InVT.getVectorNumElements();
21086 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21087 // We need to change to a wider element type that we have support for.
21088 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21089 // For 16 element vectors we extend to v16i32 unless we are explicitly
21090 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21091 // we need to split into two 8 element vectors which we can extend to v8i32,
21092 // truncate and concat the results. There's an additional complication if
21093 // the original type is v16i8. In that case we can't split the v16i8
21094 // directly, so we need to shuffle high elements to low and use
21095 // sign_extend_vector_inreg.
21096 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21097 SDValue Lo, Hi;
21098 if (InVT == MVT::v16i8) {
21099 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21100 Hi = DAG.getVectorShuffle(
21101 InVT, DL, In, In,
21102 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21103 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21104 } else {
21105 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21106 Lo = extract128BitVector(In, 0, DAG, DL);
21107 Hi = extract128BitVector(In, 8, DAG, DL);
21108 }
21109 // We're split now, just emit two truncates and a concat. The two
21110 // truncates will trigger legalization to come back to this function.
21111 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21112 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21113 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21114 }
21115 // We either have 8 elements or we're allowed to use 512-bit vectors.
21116 // If we have VLX, we want to use the narrowest vector that can get the
21117 // job done so we use vXi32.
21118 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21119 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21120 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21121 InVT = ExtVT;
21122 ShiftInx = InVT.getScalarSizeInBits() - 1;
21123 }
21124
21125 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21126 // We need to shift to get the lsb into sign position.
21127 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21128 DAG.getConstant(ShiftInx, DL, InVT));
21129 }
21130 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21131 if (Subtarget.hasDQI())
21132 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21133 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21134}
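A standalone scalar sketch (not part of the annotated file) of the trick used above for truncation to i1: shift the least significant bit into the sign position, then test the sign, so "0 > (x << (bits-1))" holds exactly when bit 0 of x is set. The helper name is illustrative.

#include <cstdint>
#include <cstdio>

static bool trunc_to_i1(int32_t x) {
  int32_t shifted = (int32_t)((uint32_t)x << 31); // LSB moved into the sign bit
  return 0 > shifted;                             // the setgt(0, shifted) comparison above
}

int main() {
  std::printf("%d %d\n", trunc_to_i1(6), trunc_to_i1(7)); // 0 1
}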
21135
21136SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21137 SDLoc DL(Op);
21138 MVT VT = Op.getSimpleValueType();
21139 SDValue In = Op.getOperand(0);
21140 MVT InVT = In.getSimpleValueType();
21141 unsigned InNumEltBits = InVT.getScalarSizeInBits();
21142
21143 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21144 "Invalid TRUNCATE operation");
21145
21146 // If we're called by the type legalizer, handle a few cases.
21147 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21148 if (!TLI.isTypeLegal(InVT)) {
21149 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21150 VT.is128BitVector()) {
21151 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21152 "Unexpected subtarget!");
21153 // The default behavior is to truncate one step, concatenate, and then
21154 // truncate the remainder. We'd rather produce two 64-bit results and
21155 // concatenate those.
21156 SDValue Lo, Hi;
21157 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21158
21159 EVT LoVT, HiVT;
21160 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21161
21162 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21163 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21164 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21165 }
21166
21167 // Otherwise let default legalization handle it.
21168 return SDValue();
21169 }
21170
21171 if (VT.getVectorElementType() == MVT::i1)
21172 return LowerTruncateVecI1(Op, DAG, Subtarget);
21173
21174 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21175 if (Subtarget.hasAVX512()) {
21176 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21177 assert(VT == MVT::v32i8 && "Unexpected VT!");
21178 return splitVectorIntUnary(Op, DAG);
21179 }
21180
21181 // Word to byte only under BWI. Otherwise we have to promote to v16i32
21182 // and then truncate that. But we should only do that if we haven't been
21183 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21184 // handled by isel patterns.
21185 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21186 Subtarget.canExtendTo512DQ())
21187 return Op;
21188 }
21189
21190 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21191 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21192
21193 // Truncate with PACKUS if we are truncating a vector with leading zero bits
21194 // that extend all the way to the packed/truncated value.
21195 // Pre-SSE41 we can only use PACKUSWB.
21196 KnownBits Known = DAG.computeKnownBits(In);
21197 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21198 if (SDValue V =
21199 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21200 return V;
21201
21202 // Truncate with PACKSS if we are truncating a vector with sign-bits that
21203 // extend all the way to the packed/truncated value.
21204 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21205 if (SDValue V =
21206 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21207 return V;
21208
21209 // Handle truncation of V256 to V128 using shuffles.
21210 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21211
21212 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21213 In = DAG.getBitcast(MVT::v8i32, In);
21214
21215 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21216 if (Subtarget.hasInt256()) {
21217 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21218 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21219 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21220 DAG.getIntPtrConstant(0, DL));
21221 }
21222
21223 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21224 DAG.getIntPtrConstant(0, DL));
21225 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21226 DAG.getIntPtrConstant(4, DL));
21227 static const int ShufMask[] = {0, 2, 4, 6};
21228 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21229 }
21230
21231 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21232 In = DAG.getBitcast(MVT::v32i8, In);
21233
21234 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21235 if (Subtarget.hasInt256()) {
21236 // The PSHUFB mask:
21237 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21238 -1, -1, -1, -1, -1, -1, -1, -1,
21239 16, 17, 20, 21, 24, 25, 28, 29,
21240 -1, -1, -1, -1, -1, -1, -1, -1 };
21241 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21242 In = DAG.getBitcast(MVT::v4i64, In);
21243
21244 static const int ShufMask2[] = {0, 2, -1, -1};
21245 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21246 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21247 DAG.getBitcast(MVT::v16i16, In),
21248 DAG.getIntPtrConstant(0, DL));
21249 }
21250
21251 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21252 DAG.getIntPtrConstant(0, DL));
21253 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21254 DAG.getIntPtrConstant(16, DL));
21255
21256 // The PSHUFB mask:
21257 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
21258 -1, -1, -1, -1, -1, -1, -1, -1};
21259
21260 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21261 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21262
21263 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21264 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21265
21266 // The MOVLHPS Mask:
21267 static const int ShufMask2[] = {0, 1, 4, 5};
21268 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21269 return DAG.getBitcast(MVT::v8i16, res);
21270 }
21271
21272 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21273 // Use an AND to zero upper bits for PACKUS.
21274 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21275
21276 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21277 DAG.getIntPtrConstant(0, DL));
21278 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21279 DAG.getIntPtrConstant(8, DL));
21280 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21281 }
21282
21283 llvm_unreachable("All 256->128 cases should have been handled above!");
21284}
21285
21286// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21287// behaves on out of range inputs to generate optimized conversions.
21288static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21289 SelectionDAG &DAG,
21290 const X86Subtarget &Subtarget) {
21291 MVT SrcVT = Src.getSimpleValueType();
21292 unsigned DstBits = VT.getScalarSizeInBits();
21293 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21294
21295 // Calculate the converted result for values in the range 0 to
21296 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21297 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21298 SDValue Big =
21299 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21300 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21301 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21302
21303 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21304 // and only if the value was out of range. So we can use that
21305 // as our indicator that we rather use "Big" instead of "Small".
21306 //
21307 // Use "Small" if "IsOverflown" has all bits cleared
21308 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21309
21310 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21311 // use the slightly slower blendv select instead.
21312 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21313 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21314 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21315 }
21316
21317 SDValue IsOverflown =
21318 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21319 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21320 return DAG.getNode(ISD::OR, dl, VT, Small,
21321 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21322}
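A standalone scalar model (not part of the annotated file) of expandFP_TO_UINT_SSE: an out-of-range cvttps2si produces 0x80000000, whose sign bit is smeared into a mask that selects "Big | 0x80000000" instead of "Small". The helper modelling the instruction is an assumption for illustration.

#include <cstdint>
#include <cstdio>

static int32_t cvttps2si_model(float f) {            // scalar behaviour of cvttps2si
  if (f >= 2147483648.0f || f < -2147483648.0f)
    return INT32_MIN;                                // the hardware "integer indefinite" value
  return (int32_t)f;
}

static uint32_t fp_to_u32_sse(float x) {
  int32_t small = cvttps2si_model(x);
  int32_t big = cvttps2si_model(x - 2147483648.0f);  // in range whenever x >= 2^31
  int32_t overflown = small < 0 ? -1 : 0;            // all-ones iff Small overflowed (the VSRAI by 31)
  return (uint32_t)(small | (big & overflown));
}

int main() {
  std::printf("%u %u\n", fp_to_u32_sse(100.0f), fp_to_u32_sse(3000000000.0f)); // 100 3000000000
}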
21323
21324SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21325 bool IsStrict = Op->isStrictFPOpcode();
21326 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21327 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21328 MVT VT = Op->getSimpleValueType(0);
21329 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21330 MVT SrcVT = Src.getSimpleValueType();
21331 SDLoc dl(Op);
21332
21333 if (VT.isVector()) {
21334 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21335 MVT ResVT = MVT::v4i32;
21336 MVT TruncVT = MVT::v4i1;
21337 unsigned Opc;
21338 if (IsStrict)
21339 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21340 else
21341 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21342
21343 if (!IsSigned && !Subtarget.hasVLX()) {
21344 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21345 // Widen to 512-bits.
21346 ResVT = MVT::v8i32;
21347 TruncVT = MVT::v8i1;
21348 Opc = Op.getOpcode();
21349 // Need to concat with zero vector for strict fp to avoid spurious
21350 // exceptions.
21351 // TODO: Should we just do this for non-strict as well?
21352 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21353 : DAG.getUNDEF(MVT::v8f64);
21354 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21355 DAG.getIntPtrConstant(0, dl));
21356 }
21357 SDValue Res, Chain;
21358 if (IsStrict) {
21359 Res =
21360 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
21361 Chain = Res.getValue(1);
21362 } else {
21363 Res = DAG.getNode(Opc, dl, ResVT, Src);
21364 }
21365
21366 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21367 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21368 DAG.getIntPtrConstant(0, dl));
21369 if (IsStrict)
21370 return DAG.getMergeValues({Res, Chain}, dl);
21371 return Res;
21372 }
21373
21374 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21375 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21376 assert(!IsSigned && "Expected unsigned conversion!");
21377 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21378 return Op;
21379 }
21380
21381 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21382 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21383 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21384 Subtarget.useAVX512Regs()) {
21385 assert(!IsSigned && "Expected unsigned conversion!");
21386 assert(!Subtarget.hasVLX() && "Unexpected features!");
21387 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21388 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21389 // Need to concat with zero vector for strict fp to avoid spurious
21390 // exceptions.
21391 // TODO: Should we just do this for non-strict as well?
21392 SDValue Tmp =
21393 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21394 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21395 DAG.getIntPtrConstant(0, dl));
21396
21397 SDValue Res, Chain;
21398 if (IsStrict) {
21399 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21400 {Op->getOperand(0), Src});
21401 Chain = Res.getValue(1);
21402 } else {
21403 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21404 }
21405
21406 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21407 DAG.getIntPtrConstant(0, dl));
21408
21409 if (IsStrict)
21410 return DAG.getMergeValues({Res, Chain}, dl);
21411 return Res;
21412 }
21413
21414 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21415 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21416 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21417 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21418 assert(!Subtarget.hasVLX() && "Unexpected features!");
21419 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21420 // Need to concat with zero vector for strict fp to avoid spurious
21421 // exceptions.
21422 // TODO: Should we just do this for non-strict as well?
21423 SDValue Tmp =
21424 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21425 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21426 DAG.getIntPtrConstant(0, dl));
21427
21428 SDValue Res, Chain;
21429 if (IsStrict) {
21430 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21431 {Op->getOperand(0), Src});
21432 Chain = Res.getValue(1);
21433 } else {
21434 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21435 }
21436
21437 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21438 DAG.getIntPtrConstant(0, dl));
21439
21440 if (IsStrict)
21441 return DAG.getMergeValues({Res, Chain}, dl);
21442 return Res;
21443 }
21444
21445 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21446 if (!Subtarget.hasVLX()) {
21447 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21448 // legalizer and then widened again by vector op legalization.
21449 if (!IsStrict)
21450 return SDValue();
21451
21452 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21453 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21454 {Src, Zero, Zero, Zero});
21455 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21456 {Op->getOperand(0), Tmp});
21457 SDValue Chain = Tmp.getValue(1);
21458 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21459 DAG.getIntPtrConstant(0, dl));
21460 return DAG.getMergeValues({Tmp, Chain}, dl);
21461 }
21462
21463 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21464 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21465 DAG.getUNDEF(MVT::v2f32));
21466 if (IsStrict) {
21467 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21468 : X86ISD::STRICT_CVTTP2UI;
21469 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21470 }
21471 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21472 return DAG.getNode(Opc, dl, VT, Tmp);
21473 }
21474
21475 // Generate optimized instructions for pre AVX512 unsigned conversions from
21476 // vXf32 to vXi32.
21477 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21478 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21479 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21480 assert(!IsSigned && "Expected unsigned conversion!");
21481 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21482 }
21483
21484 return SDValue();
21485 }
21486
21487 assert(!VT.isVector());
21488
21489 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21490
21491 if (!IsSigned && UseSSEReg) {
21492 // Conversions from f32/f64 with AVX512 should be legal.
21493 if (Subtarget.hasAVX512())
21494 return Op;
21495
21496 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21497 // behaves on out of range inputs to generate optimized conversions.
21498 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21499 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21500 unsigned DstBits = VT.getScalarSizeInBits();
21501 APInt UIntLimit = APInt::getSignMask(DstBits);
21502 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21503 DAG.getConstant(UIntLimit, dl, VT));
21504 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21505
21506 // Calculate the converted result for values in the range:
21507 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21508 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21509 SDValue Small =
21510 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21511 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21512 SDValue Big = DAG.getNode(
21513 X86ISD::CVTTS2SI, dl, VT,
21514 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21515 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21516
21517 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21518 // and only if the value was out of range. So we can use that
21519 // as our indicator that we rather use "Big" instead of "Small".
21520 //
21521 // Use "Small" if "IsOverflown" has all bits cleared
21522 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21523 SDValue IsOverflown = DAG.getNode(
21524 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21525 return DAG.getNode(ISD::OR, dl, VT, Small,
21526 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21527 }
21528
21529 // Use default expansion for i64.
21530 if (VT == MVT::i64)
21531 return SDValue();
21532
21533 assert(VT == MVT::i32 && "Unexpected VT!");
21534
21535 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21536 // FIXME: This does not generate an invalid exception if the input does not
21537 // fit in i32. PR44019
21538 if (Subtarget.is64Bit()) {
21539 SDValue Res, Chain;
21540 if (IsStrict) {
21541 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
21542 { Op.getOperand(0), Src });
21543 Chain = Res.getValue(1);
21544 } else
21545 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21546
21547 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21548 if (IsStrict)
21549 return DAG.getMergeValues({ Res, Chain }, dl);
21550 return Res;
21551 }
21552
21553 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21554 // use fisttp which will be handled later.
21555 if (!Subtarget.hasSSE3())
21556 return SDValue();
21557 }
21558
21559 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21560 // FIXME: This does not generate an invalid exception if the input does not
21561 // fit in i16. PR44019
21562 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21563 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21564 SDValue Res, Chain;
21565 if (IsStrict) {
21566 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
21567 { Op.getOperand(0), Src });
21568 Chain = Res.getValue(1);
21569 } else
21570 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21571
21572 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21573 if (IsStrict)
21574 return DAG.getMergeValues({ Res, Chain }, dl);
21575 return Res;
21576 }
21577
21578 // If this is a FP_TO_SINT using SSEReg we're done.
21579 if (UseSSEReg && IsSigned)
21580 return Op;
21581
21582 // fp128 needs to use a libcall.
21583 if (SrcVT == MVT::f128) {
21584 RTLIB::Libcall LC;
21585 if (IsSigned)
21586 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21587 else
21588 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21589
21590 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21591 MakeLibCallOptions CallOptions;
21592 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21593 SDLoc(Op), Chain);
21594
21595 if (IsStrict)
21596 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21597
21598 return Tmp.first;
21599 }
21600
21601 // Fall back to X87.
21602 SDValue Chain;
21603 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21604 if (IsStrict)
21605 return DAG.getMergeValues({V, Chain}, dl);
21606 return V;
21607 }
21608
21609 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21610}
21611
21612SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21613 SelectionDAG &DAG) const {
21614 SDValue Src = Op.getOperand(0);
21615 MVT SrcVT = Src.getSimpleValueType();
21616
21617 // If the source is in an SSE register, the node is Legal.
21618 if (isScalarFPTypeInSSEReg(SrcVT))
21619 return Op;
21620
21621 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21622}
21623
21624SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21625 SelectionDAG &DAG) const {
21626 EVT DstVT = N->getValueType(0);
21627 SDValue Src = N->getOperand(0);
21628 EVT SrcVT = Src.getValueType();
21629
21630 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21631 // f16 must be promoted before using the lowering in this routine.
21632 // fp128 does not use this lowering.
21633 return SDValue();
21634 }
21635
21636 SDLoc DL(N);
21637 SDValue Chain = DAG.getEntryNode();
21638
21639 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21640
21641 // If we're converting from SSE, the stack slot needs to hold both types.
21642 // Otherwise it only needs to hold the DstVT.
21643 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21644 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21645 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21646 MachinePointerInfo MPI =
21647 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21648
21649 if (UseSSE) {
21650 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21651 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21652 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21653 SDValue Ops[] = { Chain, StackPtr };
21654
21655 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21656 /*Align*/ None, MachineMemOperand::MOLoad);
21657 Chain = Src.getValue(1);
21658 }
21659
21660 SDValue StoreOps[] = { Chain, Src, StackPtr };
21661 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21662 StoreOps, DstVT, MPI, /*Align*/ None,
21663 MachineMemOperand::MOStore);
21664
21665 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21666}
21667
21668SDValue
21669X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21670 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21671 // but making use of X86 specifics to produce better instruction sequences.
21672 SDNode *Node = Op.getNode();
21673 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21674 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21675 SDLoc dl(SDValue(Node, 0));
21676 SDValue Src = Node->getOperand(0);
21677
21678 // There are three types involved here: SrcVT is the source floating point
21679 // type, DstVT is the type of the result, and TmpVT is the result of the
21680 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21681 // DstVT).
21682 EVT SrcVT = Src.getValueType();
21683 EVT DstVT = Node->getValueType(0);
21684 EVT TmpVT = DstVT;
21685
21686 // This code is only for floats and doubles. Fall back to generic code for
21687 // anything else.
21688 if (!isScalarFPTypeInSSEReg(SrcVT))
21689 return SDValue();
21690
21691 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21692 unsigned SatWidth = SatVT.getScalarSizeInBits();
21693 unsigned DstWidth = DstVT.getScalarSizeInBits();
21694 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21695 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21696 "Expected saturation width smaller than result width");
21697
21698 // Promote result of FP_TO_*INT to at least 32 bits.
21699 if (TmpWidth < 32) {
21700 TmpVT = MVT::i32;
21701 TmpWidth = 32;
21702 }
21703
21704 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21705 // us to use a native signed conversion instead.
21706 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21707 TmpVT = MVT::i64;
21708 TmpWidth = 64;
21709 }
21710
21711 // If the saturation width is smaller than the size of the temporary result,
21712 // we can always use signed conversion, which is native.
21713 if (SatWidth < TmpWidth)
21714 FpToIntOpcode = ISD::FP_TO_SINT;
21715
21716 // Determine minimum and maximum integer values and their corresponding
21717 // floating-point values.
21718 APInt MinInt, MaxInt;
21719 if (IsSigned) {
21720 MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
21721 MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
21722 } else {
21723 MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
21724 MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
21725 }
21726
21727 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21728 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21729
21730 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21731 MinInt, IsSigned, APFloat::rmTowardZero);
21732 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21733 MaxInt, IsSigned, APFloat::rmTowardZero);
21734 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21735 && !(MaxStatus & APFloat::opStatus::opInexact);
21736
21737 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21738 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21739
21740 // If the integer bounds are exactly representable as floats, emit a
21741 // min+max+fptoi sequence. Otherwise use comparisons and selects.
21742 if (AreExactFloatBounds) {
21743 if (DstVT != TmpVT) {
21744 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21745 SDValue MinClamped = DAG.getNode(
21746 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21747 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21748 SDValue BothClamped = DAG.getNode(
21749 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21750 // Convert clamped value to integer.
21751 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21752
21753 // NaN will become INDVAL, with the top bit set and the rest zero.
21754 // Truncation will discard the top bit, resulting in zero.
21755 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21756 }
21757
21758 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21759 SDValue MinClamped = DAG.getNode(
21760 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21761 // Clamp by MaxFloat from above. NaN cannot occur.
21762 SDValue BothClamped = DAG.getNode(
21763 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21764 // Convert clamped value to integer.
21765 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21766
21767 if (!IsSigned) {
21768 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21769 // which is zero.
21770 return FpToInt;
21771 }
21772
21773 // Otherwise, select zero if Src is NaN.
21774 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21775 return DAG.getSelectCC(
21776 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21777 }
21778
21779 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21780 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21781
21782 // Result of direct conversion, which may be selected away.
21783 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21784
21785 if (DstVT != TmpVT) {
21786 // NaN will become INDVAL, with the top bit set and the rest zero.
21787 // Truncation will discard the top bit, resulting in zero.
21788 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21789 }
21790
21791 SDValue Select = FpToInt;
21792 // For signed conversions where we saturate to the same size as the
21793 // result type of the fptoi instructions, INDVAL coincides with integer
21794 // minimum, so we don't need to explicitly check it.
21795 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21796 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21797 // MinInt if Src is NaN.
21798 Select = DAG.getSelectCC(
21799 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21800 }
21801
21802 // If Src OGT MaxFloat, select MaxInt.
21803 Select = DAG.getSelectCC(
21804 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21805
21806 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21807 // is already zero. The promoted case was already handled above.
21808 if (!IsSigned || DstVT != TmpVT) {
21809 return Select;
21810 }
21811
21812 // Otherwise, select 0 if Src is NaN.
21813 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21814 return DAG.getSelectCC(
21815 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21816}
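// Illustrative sketch, not from the original source: a scalar C++ analogue of
// the clamp-then-convert strategy above for FP_TO_SINT_SAT. The input is
// clamped to integer bounds representable as floats, NaN is mapped to zero,
// and the in-range value is converted with truncation. The helper name
// satF32ToI32 is invented for this example.
#include <cmath>
#include <cstdint>
static int32_t satF32ToI32(float X) {
  if (std::isnan(X))
    return 0;                        // NaN saturates to 0, as in the lowering.
  const float MinF = -2147483648.0f; // INT32_MIN is exactly representable.
  const float MaxF = 2147483520.0f;  // Largest float not exceeding INT32_MAX.
  if (X < MinF)
    return INT32_MIN;
  if (X > MaxF)
    return INT32_MAX;
  return static_cast<int32_t>(X);    // In-range conversion truncates toward 0.
}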
21817
21818SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21819 bool IsStrict = Op->isStrictFPOpcode();
21820
21821 SDLoc DL(Op);
21822 MVT VT = Op.getSimpleValueType();
21823 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21824 MVT SVT = In.getSimpleValueType();
21825
21826 if (VT == MVT::f128)
21827 return SDValue();
21828
21829 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!")((void)0);
21830
21831 SDValue Res =
21832 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21833 if (IsStrict)
21834 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21835 {Op->getOperand(0), Res});
21836 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21837}
21838
21839SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21840 bool IsStrict = Op->isStrictFPOpcode();
21841 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21842 // It's legal except when f128 is involved
21843 if (In.getSimpleValueType() != MVT::f128)
21844 return Op;
21845
21846 return SDValue();
21847}
21848
21849static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21850 bool IsStrict = Op->isStrictFPOpcode();
21851 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21852 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&((void)0)
21853 "Unexpected VT!")((void)0);
21854
21855 SDLoc dl(Op);
21856 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21857 DAG.getConstant(0, dl, MVT::v8i16), Src,
21858 DAG.getIntPtrConstant(0, dl));
21859
21860 SDValue Chain;
21861 if (IsStrict) {
21862 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21863 {Op.getOperand(0), Res});
21864 Chain = Res.getValue(1);
21865 } else {
21866 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21867 }
21868
21869 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21870 DAG.getIntPtrConstant(0, dl));
21871
21872 if (IsStrict)
21873 return DAG.getMergeValues({Res, Chain}, dl);
21874
21875 return Res;
21876}
21877
21878static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21879 bool IsStrict = Op->isStrictFPOpcode();
21880 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21881 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&((void)0)
21882 "Unexpected VT!")((void)0);
21883
21884 SDLoc dl(Op);
21885 SDValue Res, Chain;
21886 if (IsStrict) {
21887 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21888 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21889 DAG.getIntPtrConstant(0, dl));
21890 Res = DAG.getNode(
21891 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21892 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21893 Chain = Res.getValue(1);
21894 } else {
21895 // FIXME: Should we use zeros for upper elements for non-strict?
21896 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21897 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21898 DAG.getTargetConstant(4, dl, MVT::i32));
21899 }
21900
21901 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21902 DAG.getIntPtrConstant(0, dl));
21903
21904 if (IsStrict)
21905 return DAG.getMergeValues({Res, Chain}, dl);
21906
21907 return Res;
21908}
21909
21910/// Depending on uarch and/or optimizing for size, we might prefer to use a
21911/// vector operation in place of the typical scalar operation.
21912static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
21913 const X86Subtarget &Subtarget) {
21914 // If both operands have other uses, this is probably not profitable.
21915 SDValue LHS = Op.getOperand(0);
21916 SDValue RHS = Op.getOperand(1);
21917 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21918 return Op;
21919
21920 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21921 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21922 if (IsFP && !Subtarget.hasSSE3())
21923 return Op;
21924 if (!IsFP && !Subtarget.hasSSSE3())
21925 return Op;
21926
21927 // Extract from a common vector.
21928 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21929 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21930 LHS.getOperand(0) != RHS.getOperand(0) ||
21931 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21932 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21933 !shouldUseHorizontalOp(true, DAG, Subtarget))
21934 return Op;
21935
21936 // Allow commuted 'hadd' ops.
21937 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21938 unsigned HOpcode;
21939 switch (Op.getOpcode()) {
21940 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21941 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21942 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21943 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21944 default:
21945 llvm_unreachable("Trying to lower unsupported opcode to horizontal op")__builtin_unreachable();
21946 }
21947 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21948 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21949 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21950 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21951 std::swap(LExtIndex, RExtIndex);
21952
21953 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21954 return Op;
21955
21956 SDValue X = LHS.getOperand(0);
21957 EVT VecVT = X.getValueType();
21958 unsigned BitWidth = VecVT.getSizeInBits();
21959 unsigned NumLanes = BitWidth / 128;
21960 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21961 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&((void)0)
21962 "Not expecting illegal vector widths here")((void)0);
21963
21964 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21965 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21966 SDLoc DL(Op);
21967 if (BitWidth == 256 || BitWidth == 512) {
21968 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21969 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21970 LExtIndex %= NumEltsPerLane;
21971 }
21972
21973 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21974 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21975 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21976 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21977 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21978 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21979 DAG.getIntPtrConstant(LExtIndex / 2, DL));
21980}
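// Illustrative sketch, not from the original source: a scalar model of the
// 4 x f32 horizontal add the code above targets. haddps(A, B) produces
// { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }, so A[2*i] + A[2*i+1] can be
// read back from element i of haddps(A, A), matching the LExtIndex / 2 extract.
#include <array>
static std::array<float, 4> haddps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}
// Example: X[2] + X[3] == haddps(X, X)[1].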
21981
21982/// Depending on uarch and/or optimizing for size, we might prefer to use a
21983/// vector operation in place of the typical scalar operation.
21984SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21985 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&((void)0)
21986 "Only expecting float/double")((void)0);
21987 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21988}
21989
21990/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21991/// This mode isn't supported in hardware on X86. But as long as we aren't
21992/// compiling with trapping math, we can emulate this with
21993/// floor(X + copysign(nextafter(0.5, 0.0), X)).
21994static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21995 SDValue N0 = Op.getOperand(0);
21996 SDLoc dl(Op);
21997 MVT VT = Op.getSimpleValueType();
21998
21999 // N0 += copysign(nextafter(0.5, 0.0), N0)
22000 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22001 bool Ignored;
22002 APFloat Point5Pred = APFloat(0.5f);
22003 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22004 Point5Pred.next(/*nextDown*/true);
22005
22006 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22007 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22008 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22009
22010 // Truncate the result to remove fraction.
22011 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22012}
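// Illustrative sketch, not from the original source: the emulation described
// in the comment above, written with the standard library. It assumes
// non-trapping math; values too large to have a fractional part pass through
// trunc unchanged.
#include <cmath>
static float roundHalfAwayFromZero(float X) {
  // nextafter(0.5, 0.0) is the largest float strictly below 0.5, so exact
  // halfway cases are still pushed past the next integer, while values just
  // below .5 are not.
  float Adder = std::copysign(std::nextafter(0.5f, 0.0f), X);
  return std::trunc(X + Adder);      // FADD followed by FTRUNC, as above.
}
// roundHalfAwayFromZero(2.5f) == 3.0f and roundHalfAwayFromZero(-2.5f) == -3.0f.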
22013
22014/// The only differences between FABS and FNEG are the mask and the logic op.
22015/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22016static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22017 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&((void)0)
22018 "Wrong opcode for lowering FABS or FNEG.")((void)0);
22019
22020 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22021
22022 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22023 // into an FNABS. We'll lower the FABS after that if it is still in use.
22024 if (IsFABS)
22025 for (SDNode *User : Op->uses())
22026 if (User->getOpcode() == ISD::FNEG)
22027 return Op;
22028
22029 SDLoc dl(Op);
22030 MVT VT = Op.getSimpleValueType();
22031
22032 bool IsF128 = (VT == MVT::f128);
22033 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||((void)0)
22034 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||((void)0)
22035 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&((void)0)
22036 "Unexpected type in LowerFABSorFNEG")((void)0);
22037
22038 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22039 // decide if we should generate a 16-byte constant mask when we only need 4 or
22040 // 8 bytes for the scalar case.
22041
22042 // There are no scalar bitwise logical SSE/AVX instructions, so we
22043 // generate a 16-byte vector constant and logic op even for the scalar case.
22044 // Using a 16-byte mask allows folding the load of the mask with
22045 // the logic op, so it can save (~4 bytes) on code size.
22046 bool IsFakeVector = !VT.isVector() && !IsF128;
22047 MVT LogicVT = VT;
22048 if (IsFakeVector)
22049 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
22050
22051 unsigned EltBits = VT.getScalarSizeInBits();
22052 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22053 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22054 APInt::getSignMask(EltBits);
22055 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22056 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22057
22058 SDValue Op0 = Op.getOperand(0);
22059 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22060 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22061 IsFNABS ? X86ISD::FOR :
22062 X86ISD::FXOR;
22063 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22064
22065 if (VT.isVector() || IsF128)
22066 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22067
22068 // For the scalar case extend to a 128-bit vector, perform the logic op,
22069 // and extract the scalar result back out.
22070 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22071 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22072 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22073 DAG.getIntPtrConstant(0, dl));
22074}
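// Illustrative sketch, not from the original source: the two masks used above,
// applied to a single 32-bit float. FABS clears the sign bit (AND with the
// signed-max pattern 0x7fffffff); FNEG flips it (XOR with the sign-mask
// pattern 0x80000000).
#include <cstdint>
#include <cstring>
static float fabsViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu;               // APInt::getSignedMaxValue(32) pattern.
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}
static float fnegViaMask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;               // APInt::getSignMask(32) pattern.
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}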
22075
22076static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22077 SDValue Mag = Op.getOperand(0);
22078 SDValue Sign = Op.getOperand(1);
22079 SDLoc dl(Op);
22080
22081 // If the sign operand is smaller, extend it first.
22082 MVT VT = Op.getSimpleValueType();
22083 if (Sign.getSimpleValueType().bitsLT(VT))
22084 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22085
22086 // And if it is bigger, shrink it first.
22087 if (Sign.getSimpleValueType().bitsGT(VT))
22088 Sign =
22089 DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22090
22091 // At this point the operands and the result should have the same
22092 // type, and that won't be f80 since that is not custom lowered.
22093 bool IsF128 = (VT == MVT::f128);
22094 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||((void)0)
22095 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||((void)0)
22096 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&((void)0)
22097 "Unexpected type in LowerFCOPYSIGN")((void)0);
22098
22099 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22100
22101 // Perform all scalar logic operations as 16-byte vectors because there are no
22102 // scalar FP logic instructions in SSE.
22103 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22104 // unnecessary splats, but we might miss load folding opportunities. Should
22105 // this decision be based on OptimizeForSize?
22106 bool IsFakeVector = !VT.isVector() && !IsF128;
22107 MVT LogicVT = VT;
22108 if (IsFakeVector)
22109 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
22110
22111 // The mask constants are automatically splatted for vector types.
22112 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22113 SDValue SignMask = DAG.getConstantFP(
22114 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22115 SDValue MagMask = DAG.getConstantFP(
22116 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22117
22118 // First, clear all bits but the sign bit from the second operand (sign).
22119 if (IsFakeVector)
22120 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22121 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22122
22123 // Next, clear the sign bit from the first operand (magnitude).
22124 // TODO: If we had general constant folding for FP logic ops, this check
22125 // wouldn't be necessary.
22126 SDValue MagBits;
22127 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22128 APFloat APF = Op0CN->getValueAPF();
22129 APF.clearSign();
22130 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22131 } else {
22132 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22133 if (IsFakeVector)
22134 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22135 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22136 }
22137
22138 // OR the magnitude value with the sign bit.
22139 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22140 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22141 DAG.getIntPtrConstant(0, dl));
22142}
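// Illustrative sketch, not from the original source: the same AND/AND/OR
// structure as LowerFCOPYSIGN, applied to one 32-bit float. The magnitude
// bits come from Mag and the sign bit comes from Sign.
#include <cstdint>
#include <cstring>
static float copysignViaMasks(float Mag, float Sign) {
  uint32_t MagBits, SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(MagBits));
  std::memcpy(&SignBits, &Sign, sizeof(SignBits));
  uint32_t Out = (MagBits & 0x7fffffffu) |   // FAND with MagMask.
                 (SignBits & 0x80000000u);   // FAND with SignMask, then FOR.
  float Result;
  std::memcpy(&Result, &Out, sizeof(Result));
  return Result;
}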
22143
22144static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22145 SDValue N0 = Op.getOperand(0);
22146 SDLoc dl(Op);
22147 MVT VT = Op.getSimpleValueType();
22148
22149 MVT OpVT = N0.getSimpleValueType();
22150 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&((void)0)
22151 "Unexpected type for FGETSIGN")((void)0);
22152
22153 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22154 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22155 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22156 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22157 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22158 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22159 return Res;
22160}
22161
22162/// Helper for creating a X86ISD::SETCC node.
22163static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22164 SelectionDAG &DAG) {
22165 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22166 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22167}
22168
22169/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22170/// style scalarized (associative) reduction patterns. Partial reductions
22171/// are supported when the pointer SrcMask is non-null.
22172/// TODO - move this to SelectionDAG?
22173static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22174 SmallVectorImpl<SDValue> &SrcOps,
22175 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22176 SmallVector<SDValue, 8> Opnds;
22177 DenseMap<SDValue, APInt> SrcOpMap;
22178 EVT VT = MVT::Other;
22179
22180 // Recognize a special case where a vector is cast into a wide integer to
22181 // test all 0s.
22182 assert(Op.getOpcode() == unsigned(BinOp) &&((void)0)
22183 "Unexpected bit reduction opcode")((void)0);
22184 Opnds.push_back(Op.getOperand(0));
22185 Opnds.push_back(Op.getOperand(1));
22186
22187 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22188 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22189 // BFS traverse all BinOp operands.
22190 if (I->getOpcode() == unsigned(BinOp)) {
22191 Opnds.push_back(I->getOperand(0));
22192 Opnds.push_back(I->getOperand(1));
22193 // Re-evaluate the number of nodes to be traversed.
22194 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22195 continue;
22196 }
22197
22198 // Quit if this is not an EXTRACT_VECTOR_ELT.
22199 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22200 return false;
22201
22202 // Quit if the index is not a constant.
22203 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22204 if (!Idx)
22205 return false;
22206
22207 SDValue Src = I->getOperand(0);
22208 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22209 if (M == SrcOpMap.end()) {
22210 VT = Src.getValueType();
22211 // Quit if not the same type.
22212 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22213 return false;
22214 unsigned NumElts = VT.getVectorNumElements();
22215 APInt EltCount = APInt::getNullValue(NumElts);
22216 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22217 SrcOps.push_back(Src);
22218 }
22219
22220 // Quit if element already used.
22221 unsigned CIdx = Idx->getZExtValue();
22222 if (M->second[CIdx])
22223 return false;
22224 M->second.setBit(CIdx);
22225 }
22226
22227 if (SrcMask) {
22228 // Collect the source partial masks.
22229 for (SDValue &SrcOp : SrcOps)
22230 SrcMask->push_back(SrcOpMap[SrcOp]);
22231 } else {
22232 // Quit if not all elements are used.
22233 for (const auto &I : SrcOpMap)
22234 if (!I.second.isAllOnesValue())
22235 return false;
22236 }
22237
22238 return true;
22239}
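// Illustrative sketch, not from the original source: the worklist traversal
// above, modelled on a toy expression type. Nodes with the same binop push
// both operands; every leaf must be a distinct element index; at the end all
// elements must be covered. Expr and matchOrReduction are invented names.
#include <algorithm>
#include <cstddef>
#include <vector>
struct Expr {
  bool IsOr;                         // True: OR of L and R. False: leaf Index.
  int Index;
  const Expr *L, *R;
};
static bool matchOrReduction(const Expr *Root, unsigned NumElts) {
  std::vector<const Expr *> Worklist{Root};
  std::vector<bool> Used(NumElts, false);
  for (std::size_t I = 0; I < Worklist.size(); ++I) {
    const Expr *N = Worklist[I];
    if (N->IsOr) {                   // Same binop: traverse both operands.
      Worklist.push_back(N->L);
      Worklist.push_back(N->R);
      continue;
    }
    if (N->Index < 0 || unsigned(N->Index) >= NumElts || Used[N->Index])
      return false;                  // Not a usable leaf, or element reused.
    Used[N->Index] = true;
  }
  return std::all_of(Used.begin(), Used.end(), [](bool B) { return B; });
}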
22240
22241// Helper function for comparing all bits of a vector against zero.
22242static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
22243 const APInt &Mask,
22244 const X86Subtarget &Subtarget,
22245 SelectionDAG &DAG, X86::CondCode &X86CC) {
22246 EVT VT = V.getValueType();
22247 unsigned ScalarSize = VT.getScalarSizeInBits();
22248 if (Mask.getBitWidth() != ScalarSize) {
22249 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch")((void)0);
22250 return SDValue();
22251 }
22252
22253 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode")((void)0);
22254 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22255
22256 auto MaskBits = [&](SDValue Src) {
22257 if (Mask.isAllOnesValue())
22258 return Src;
22259 EVT SrcVT = Src.getValueType();
22260 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22261 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22262 };
22263
22264 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22265 if (VT.getSizeInBits() < 128) {
22266 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22267 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
22268 return SDValue();
22269 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22270 DAG.getBitcast(IntVT, MaskBits(V)),
22271 DAG.getConstant(0, DL, IntVT));
22272 }
22273
22274 // Quit if not splittable to 128/256-bit vector.
22275 if (!isPowerOf2_32(VT.getSizeInBits()))
22276 return SDValue();
22277
22278 // Split down to 128/256-bit vector.
22279 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
22280 while (VT.getSizeInBits() > TestSize) {
22281 auto Split = DAG.SplitVector(V, DL);
22282 VT = Split.first.getValueType();
22283 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22284 }
22285
22286 bool UsePTEST = Subtarget.hasSSE41();
22287 if (UsePTEST) {
22288 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
22289 V = DAG.getBitcast(TestVT, MaskBits(V));
22290 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22291 }
22292
22293 // Without PTEST, a masked v2i64 or-reduction is not faster than
22294 // scalarization.
22295 if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
22296 return SDValue();
22297
22298 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
22299 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
22300 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
22301 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22302 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22303 DAG.getConstant(0xFFFF, DL, MVT::i32));
22304}
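// Illustrative sketch, not from the original source: the split-and-OR strategy
// above in scalar form. A wide vector is modelled as 64-bit chunks; halves are
// OR'd together until one 128-bit chunk remains, which is then compared
// against zero (the PTEST / PCMPEQB+MOVMSK step in the real code).
#include <cstddef>
#include <cstdint>
#include <vector>
static bool vectorAllZero(std::vector<uint64_t> V) { // V.size() is a power of 2.
  while (V.size() > 2) {             // Wider than 128 bits: OR the halves.
    std::size_t Half = V.size() / 2;
    for (std::size_t I = 0; I < Half; ++I)
      V[I] |= V[I + Half];
    V.resize(Half);
  }
  uint64_t Acc = 0;
  for (uint64_t E : V)               // Final chunk: a single compare with zero.
    Acc |= E;
  return Acc == 0;
}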
22305
22306 // Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
22307// CMP(MOVMSK(PCMPEQB(X,0))).
22308static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
22309 const SDLoc &DL,
22310 const X86Subtarget &Subtarget,
22311 SelectionDAG &DAG, SDValue &X86CC) {
22312 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode")((void)0);
22313
22314 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22315 return SDValue();
22316
22317 // Check whether we're masking/truncating an OR-reduction result, in which
22318 // case track the masked bits.
22319 APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
22320 switch (Op.getOpcode()) {
22321 case ISD::TRUNCATE: {
22322 SDValue Src = Op.getOperand(0);
22323 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22324 Op.getScalarValueSizeInBits());
22325 Op = Src;
22326 break;
22327 }
22328 case ISD::AND: {
22329 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22330 Mask = Cst->getAPIntValue();
22331 Op = Op.getOperand(0);
22332 }
22333 break;
22334 }
22335 }
22336
22337 SmallVector<SDValue, 8> VecIns;
22338 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
22339 EVT VT = VecIns[0].getValueType();
22340 assert(llvm::all_of(VecIns,((void)0)
22341 [VT](SDValue V) { return VT == V.getValueType(); }) &&((void)0)
22342 "Reduction source vector mismatch")((void)0);
22343
22344 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
22345 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
22346 return SDValue();
22347
22348 // If more than one full vector is evaluated, OR them first before PTEST.
22349 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22350 Slot += 2, e += 1) {
22351 // Each iteration will OR 2 nodes and append the result until there is
22352 // only 1 node left, i.e. the final OR'd value of all vectors.
22353 SDValue LHS = VecIns[Slot];
22354 SDValue RHS = VecIns[Slot + 1];
22355 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
22356 }
22357
22358 X86::CondCode CCode;
22359 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
22360 DAG, CCode)) {
22361 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22362 return V;
22363 }
22364 }
22365
22366 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22367 ISD::NodeType BinOp;
22368 if (SDValue Match =
22369 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
22370 X86::CondCode CCode;
22371 if (SDValue V =
22372 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
22373 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22374 return V;
22375 }
22376 }
22377 }
22378
22379 return SDValue();
22380}
22381
22382 /// Return true if \c Op has a use that doesn't just read flags.
22383static bool hasNonFlagsUse(SDValue Op) {
22384 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22385 ++UI) {
22386 SDNode *User = *UI;
22387 unsigned UOpNo = UI.getOperandNo();
22388 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22389 // Look past the truncate.
22390 UOpNo = User->use_begin().getOperandNo();
22391 User = *User->use_begin();
22392 }
22393
22394 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22395 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22396 return true;
22397 }
22398 return false;
22399}
22400
22401// Transform to an x86-specific ALU node with flags if there is a chance of
22402// using an RMW op or only the flags are used. Otherwise, leave
22403// the node alone and emit a 'cmp' or 'test' instruction.
22404static bool isProfitableToUseFlagOp(SDValue Op) {
22405 for (SDNode *U : Op->uses())
22406 if (U->getOpcode() != ISD::CopyToReg &&
22407 U->getOpcode() != ISD::SETCC &&
22408 U->getOpcode() != ISD::STORE)
22409 return false;
22410
22411 return true;
22412}
22413
22414/// Emit nodes that will be selected as "test Op0,Op0", or something
22415/// equivalent.
22416static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22417 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22418 // CF and OF aren't always set the way we want. Determine which
22419 // of these we need.
22420 bool NeedCF = false;
22421 bool NeedOF = false;
22422 switch (X86CC) {
22423 default: break;
22424 case X86::COND_A: case X86::COND_AE:
22425 case X86::COND_B: case X86::COND_BE:
22426 NeedCF = true;
22427 break;
22428 case X86::COND_G: case X86::COND_GE:
22429 case X86::COND_L: case X86::COND_LE:
22430 case X86::COND_O: case X86::COND_NO: {
22431 // Check if we really need to set the
22432 // overflow flag. If NoSignedWrap is present,
22433 // it is not actually needed.
22434 switch (Op->getOpcode()) {
22435 case ISD::ADD:
22436 case ISD::SUB:
22437 case ISD::MUL:
22438 case ISD::SHL:
22439 if (Op.getNode()->getFlags().hasNoSignedWrap())
22440 break;
22441 LLVM_FALLTHROUGH[[gnu::fallthrough]];
22442 default:
22443 NeedOF = true;
22444 break;
22445 }
22446 break;
22447 }
22448 }
22449 // See if we can use the EFLAGS value from the operand instead of
22450 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22451 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22452 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22453 // Emit a CMP with 0, which is the TEST pattern.
22454 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22455 DAG.getConstant(0, dl, Op.getValueType()));
22456 }
22457 unsigned Opcode = 0;
22458 unsigned NumOperands = 0;
22459
22460 SDValue ArithOp = Op;
22461
22462 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22463 // which may be the result of a CAST. We use the variable 'Op', which is the
22464 // original, non-cast value, when we check for possible users.
22465 switch (ArithOp.getOpcode()) {
22466 case ISD::AND:
22467 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22468 // because a TEST instruction will be better.
22469 if (!hasNonFlagsUse(Op))
22470 break;
22471
22472 LLVM_FALLTHROUGH[[gnu::fallthrough]];
22473 case ISD::ADD:
22474 case ISD::SUB:
22475 case ISD::OR:
22476 case ISD::XOR:
22477 if (!isProfitableToUseFlagOp(Op))
22478 break;
22479
22480 // Otherwise use a regular EFLAGS-setting instruction.
22481 switch (ArithOp.getOpcode()) {
22482 default: llvm_unreachable("unexpected operator!")__builtin_unreachable();
22483 case ISD::ADD: Opcode = X86ISD::ADD; break;
22484 case ISD::SUB: Opcode = X86ISD::SUB; break;
22485 case ISD::XOR: Opcode = X86ISD::XOR; break;
22486 case ISD::AND: Opcode = X86ISD::AND; break;
22487 case ISD::OR: Opcode = X86ISD::OR; break;
22488 }
22489
22490 NumOperands = 2;
22491 break;
22492 case X86ISD::ADD:
22493 case X86ISD::SUB:
22494 case X86ISD::OR:
22495 case X86ISD::XOR:
22496 case X86ISD::AND:
22497 return SDValue(Op.getNode(), 1);
22498 case ISD::SSUBO:
22499 case ISD::USUBO: {
22500 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22501 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22502 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22503 Op->getOperand(1)).getValue(1);
22504 }
22505 default:
22506 break;
22507 }
22508
22509 if (Opcode == 0) {
22510 // Emit a CMP with 0, which is the TEST pattern.
22511 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22512 DAG.getConstant(0, dl, Op.getValueType()));
22513 }
22514 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22515 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22516
22517 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22518 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22519 return SDValue(New.getNode(), 1);
22520}
22521
22522/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22523/// equivalent.
22524static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22525 const SDLoc &dl, SelectionDAG &DAG,
22526 const X86Subtarget &Subtarget) {
22527 if (isNullConstant(Op1))
22528 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22529
22530 EVT CmpVT = Op0.getValueType();
22531
22532 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||((void)0)
22533 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!")((void)0);
22534
22535 // Only promote the compare up to i32 if it is a 16-bit operation
22536 // with an immediate. 16-bit immediates are to be avoided.
22537 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22538 !DAG.getMachineFunction().getFunction().hasMinSize()) {
22539 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22540 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22541 // Don't do this if the immediate can fit in 8 bits.
22542 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22543 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22544 unsigned ExtendOp =
22545 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22546 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22547 // For equality comparisons try to use SIGN_EXTEND if the input was
22548 // truncated from something with enough sign bits.
22549 if (Op0.getOpcode() == ISD::TRUNCATE) {
22550 SDValue In = Op0.getOperand(0);
22551 unsigned EffBits =
22552 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22553 if (EffBits <= 16)
22554 ExtendOp = ISD::SIGN_EXTEND;
22555 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22556 SDValue In = Op1.getOperand(0);
22557 unsigned EffBits =
22558 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22559 if (EffBits <= 16)
22560 ExtendOp = ISD::SIGN_EXTEND;
22561 }
22562 }
22563
22564 CmpVT = MVT::i32;
22565 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22566 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22567 }
22568 }
22569
22570 // Try to shrink i64 compares if the input has enough zero bits.
22571 // FIXME: Do this for non-constant compares for constant on LHS?
22572 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22573 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22574 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
22575 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22576 CmpVT = MVT::i32;
22577 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22578 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22579 }
22580
22581 // 0-x == y --> x+y == 0
22582 // 0-x != y --> x+y != 0
22583 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22584 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22585 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22586 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22587 return Add.getValue(1);
22588 }
22589
22590 // x == 0-y --> x+y == 0
22591 // x != 0-y --> x+y != 0
22592 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22593 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22594 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22595 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22596 return Add.getValue(1);
22597 }
22598
22599 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22600 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22601 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22602 return Sub.getValue(1);
22603}
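// Illustrative sketch, not from the original source: the rewrites above rely
// on a two's complement identity, namely x == 0 - y exactly when x + y == 0
// (mod 2^32), so the compare can reuse the flags of an ADD.
#include <cassert>
#include <cstdint>
static void checkNegCompareRewrite(uint32_t X, uint32_t Y) {
  bool Direct = (X == 0u - Y);       // cmp x, (0 - y)
  bool Rewritten = ((X + Y) == 0u);  // test of (x + y) against zero
  assert(Direct == Rewritten);
}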
22604
22605/// Check if replacement of SQRT with RSQRT should be disabled.
22606bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22607 EVT VT = Op.getValueType();
22608
22609 // We never want to use both SQRT and RSQRT instructions for the same input.
22610 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22611 return false;
22612
22613 if (VT.isVector())
22614 return Subtarget.hasFastVectorFSQRT();
22615 return Subtarget.hasFastScalarFSQRT();
22616}
22617
22618/// The minimum architected relative accuracy is 2^-12. We need one
22619/// Newton-Raphson step to have a good float result (24 bits of precision).
22620SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22621 SelectionDAG &DAG, int Enabled,
22622 int &RefinementSteps,
22623 bool &UseOneConstNR,
22624 bool Reciprocal) const {
22625 EVT VT = Op.getValueType();
22626
22627 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22628 // It is likely not profitable to do this for f64 because a double-precision
22629 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22630 // instructions: convert to single, rsqrtss, convert back to double, refine
22631 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22632 // along with FMA, this could be a throughput win.
22633 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22634 // after legalize types.
22635 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22636 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22637 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22638 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22639 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22640 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22641 RefinementSteps = 1;
22642
22643 UseOneConstNR = false;
22644 // There is no 512-bit FRSQRT, but there is RSQRT14.
22645 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22646 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22647 }
22648 return SDValue();
22649}
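// Illustrative sketch, not from the original source: one Newton-Raphson step
// for 1/sqrt(x), the refinement applied to the ~12-bit RSQRTSS/RSQRTPS
// estimate. The magic-constant seed below (the classic integer-halving trick)
// is only a stand-in for the hardware estimate, not what the backend emits.
#include <cstdint>
#include <cstring>
static float rsqrtWithOneNRStep(float X) {
  uint32_t I;
  std::memcpy(&I, &X, sizeof(I));
  I = 0x5f3759dfu - (I >> 1);           // Crude initial estimate y0 ~= 1/sqrt(x).
  float Y;
  std::memcpy(&Y, &I, sizeof(Y));
  return Y * (1.5f - 0.5f * X * Y * Y); // y1 = y0 * (1.5 - 0.5*x*y0*y0).
}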
22650
22651/// The minimum architected relative accuracy is 2^-12. We need one
22652/// Newton-Raphson step to have a good float result (24 bits of precision).
22653SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22654 int Enabled,
22655 int &RefinementSteps) const {
22656 EVT VT = Op.getValueType();
22657
22658 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22659 // It is likely not profitable to do this for f64 because a double-precision
22660 // reciprocal estimate with refinement on x86 prior to FMA requires
22661 // 15 instructions: convert to single, rcpss, convert back to double, refine
22662 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22663 // along with FMA, this could be a throughput win.
22664
22665 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22666 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22667 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22668 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22669 // Enable estimate codegen with 1 refinement step for vector division.
22670 // Scalar division estimates are disabled because they break too much
22671 // real-world code. These defaults are intended to match GCC behavior.
22672 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22673 return SDValue();
22674
22675 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22676 RefinementSteps = 1;
22677
22678 // There is no 512-bit FRCP, but there is RCP14.
22679 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22680 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22681 }
22682 return SDValue();
22683}
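// Illustrative sketch, not from the original source: the matching
// Newton-Raphson step for a reciprocal estimate (RCPSS/RCPPS/RCP14). Given
// y0 ~= 1/x, one step roughly doubles the number of correct bits.
static float refineRecip(float X, float Y0) {
  return Y0 * (2.0f - X * Y0);       // y1 = y0 * (2 - x*y0).
}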
22684
22685/// If we have at least two divisions that use the same divisor, convert to
22686/// multiplication by a reciprocal. This may need to be adjusted for a given
22687/// CPU if a division's cost is not at least twice the cost of a multiplication.
22688/// This is because we still need one division to calculate the reciprocal and
22689/// then we need two multiplies by that reciprocal as replacements for the
22690/// original divisions.
22691unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22692 return 2;
22693}
22694
22695SDValue
22696X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22697 SelectionDAG &DAG,
22698 SmallVectorImpl<SDNode *> &Created) const {
22699 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22700 if (isIntDivCheap(N->getValueType(0), Attr))
22701 return SDValue(N,0); // Lower SDIV as SDIV
22702
22703 assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&((void)0)
22704 "Unexpected divisor!")((void)0);
22705
22706 // Only perform this transform if CMOV is supported; otherwise the select
22707 // below will become a branch.
22708 if (!Subtarget.hasCMov())
22709 return SDValue();
22710
22711 // fold (sdiv X, pow2)
22712 EVT VT = N->getValueType(0);
22713 // FIXME: Support i8.
22714 if (VT != MVT::i16 && VT != MVT::i32 &&
22715 !(Subtarget.is64Bit() && VT == MVT::i64))
22716 return SDValue();
22717
22718 unsigned Lg2 = Divisor.countTrailingZeros();
22719
22720 // If the divisor is 2 or -2, the default expansion is better.
22721 if (Lg2 == 1)
22722 return SDValue();
22723
22724 SDLoc DL(N);
22725 SDValue N0 = N->getOperand(0);
22726 SDValue Zero = DAG.getConstant(0, DL, VT);
22727 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
22728 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
22729
22730 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
22731 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
22732 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
22733 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
22734
22735 Created.push_back(Cmp.getNode());
22736 Created.push_back(Add.getNode());
22737 Created.push_back(CMov.getNode());
22738
22739 // Divide by pow2.
22740 SDValue SRA =
22741 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
22742
22743 // If we're dividing by a positive value, we're done. Otherwise, we must
22744 // negate the result.
22745 if (Divisor.isNonNegative())
22746 return SRA;
22747
22748 Created.push_back(SRA.getNode());
22749 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
22750}
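// Illustrative sketch, not from the original source: the scalar form of the
// sequence built above for X / 2^Lg2 with round-toward-zero semantics. It
// assumes an arithmetic right shift of negative values, which is what the
// emitted SRA provides.
#include <cstdint>
static int32_t sdivPow2(int32_t N0, unsigned Lg2) { // 1 < Lg2 < 31
  int32_t Bias = (N0 < 0) ? ((1 << Lg2) - 1) : 0;   // The CMOV above.
  return (N0 + Bias) >> Lg2;                        // SRA by Lg2.
}
// E.g. sdivPow2(-7, 2) == -1, matching -7 / 4 truncated toward zero. For a
// negative power-of-two divisor, the result is additionally negated.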
22751
22752/// Result of 'and' is compared against zero. Change to a BT node if possible.
22753/// Returns the BT node and the condition code needed to use it.
22754static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
22755 const SDLoc &dl, SelectionDAG &DAG,
22756 SDValue &X86CC) {
22757 assert(And.getOpcode() == ISD::AND && "Expected AND node!")((void)0);
22758 SDValue Op0 = And.getOperand(0);
22759 SDValue Op1 = And.getOperand(1);
22760 if (Op0.getOpcode() == ISD::TRUNCATE)
22761 Op0 = Op0.getOperand(0);
22762 if (Op1.getOpcode() == ISD::TRUNCATE)
22763 Op1 = Op1.getOperand(0);
22764
22765 SDValue Src, BitNo;
22766 if (Op1.getOpcode() == ISD::SHL)
22767 std::swap(Op0, Op1);
22768 if (Op0.getOpcode() == ISD::SHL) {
22769 if (isOneConstant(Op0.getOperand(0))) {
22770 // If we looked past a truncate, check that it's only truncating away
22771 // known zeros.
22772 unsigned BitWidth = Op0.getValueSizeInBits();
22773 unsigned AndBitWidth = And.getValueSizeInBits();
22774 if (BitWidth > AndBitWidth) {
22775 KnownBits Known = DAG.computeKnownBits(Op0);
22776 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22777 return SDValue();
22778 }
22779 Src = Op1;
22780 BitNo = Op0.getOperand(1);
22781 }
22782 } else if (Op1.getOpcode() == ISD::Constant) {
22783 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22784 uint64_t AndRHSVal = AndRHS->getZExtValue();
22785 SDValue AndLHS = Op0;
22786
22787 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22788 Src = AndLHS.getOperand(0);
22789 BitNo = AndLHS.getOperand(1);
22790 } else {
22791 // Use BT if the immediate can't be encoded in a TEST instruction or we
22792 // are optimizing for size and the immediate won't fit in a byte.
22793 bool OptForSize = DAG.shouldOptForSize();
22794 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22795 isPowerOf2_64(AndRHSVal)) {
22796 Src = AndLHS;
22797 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22798 Src.getValueType());
22799 }
22800 }
22801 }
22802
22803 // No patterns found, give up.
22804 if (!Src.getNode())
22805 return SDValue();
22806
22807 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22808 // instruction. Since the shift amount is in-range-or-undefined, we know
22809 // that doing a bittest on the i32 value is ok. We extend to i32 because
22810 // the encoding for the i16 version is larger than the i32 version.
22811 // Also promote i16 to i32 for performance / code size reasons.
22812 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
22813 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
22814
22815 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22816 // shorter encoding. Since the former takes BitNo modulo 32 and the latter
22817 // takes it modulo 64, this is only valid if bit 5 of BitNo is known to be
22818 // zero.
22819 if (Src.getValueType() == MVT::i64 &&
22820 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22821 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
22822
22823 // If the operand types disagree, extend the shift amount to match. Since
22824 // BT ignores high bits (like shifts) we can use anyextend.
22825 if (Src.getValueType() != BitNo.getValueType())
22826 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
22827
22828 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
22829 dl, MVT::i8);
22830 return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
22831}
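// Illustrative sketch, not from the original source: the patterns recognized
// above all reduce to a single bit test. Both (1 << n) & x and (x >> n) & 1
// ask whether bit n of x is set, which is exactly what BT reports in CF.
#include <cstdint>
static bool testBit(uint32_t X, unsigned N) { // N < 32
  return ((X >> N) & 1u) != 0;                // BT x, n ; SETC / SETNC.
}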
22832
22833/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22834/// CMPs.
22835static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22836 SDValue &Op1, bool &IsAlwaysSignaling) {
22837 unsigned SSECC;
22838 bool Swap = false;
22839
22840 // SSE Condition code mapping:
22841 // 0 - EQ
22842 // 1 - LT
22843 // 2 - LE
22844 // 3 - UNORD
22845 // 4 - NEQ
22846 // 5 - NLT
22847 // 6 - NLE
22848 // 7 - ORD
22849 switch (SetCCOpcode) {
22850 default: llvm_unreachable("Unexpected SETCC condition")__builtin_unreachable();
22851 case ISD::SETOEQ:
22852 case ISD::SETEQ: SSECC = 0; break;
22853 case ISD::SETOGT:
22854 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];
22855 case ISD::SETLT:
22856 case ISD::SETOLT: SSECC = 1; break;
22857 case ISD::SETOGE:
22858 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];
22859 case ISD::SETLE:
22860 case ISD::SETOLE: SSECC = 2; break;
22861 case ISD::SETUO: SSECC = 3; break;
22862 case ISD::SETUNE:
22863 case ISD::SETNE: SSECC = 4; break;
22864 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];
22865 case ISD::SETUGE: SSECC = 5; break;
22866 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH[[gnu::fallthrough]];
22867 case ISD::SETUGT: SSECC = 6; break;
22868 case ISD::SETO: SSECC = 7; break;
22869 case ISD::SETUEQ: SSECC = 8; break;
22870 case ISD::SETONE: SSECC = 12; break;
22871 }
22872 if (Swap)
22873 std::swap(Op0, Op1);
22874
22875 switch (SetCCOpcode) {
22876 default:
22877 IsAlwaysSignaling = true;
22878 break;
22879 case ISD::SETEQ:
22880 case ISD::SETOEQ:
22881 case ISD::SETUEQ:
22882 case ISD::SETNE:
22883 case ISD::SETONE:
22884 case ISD::SETUNE:
22885 case ISD::SETO:
22886 case ISD::SETUO:
22887 IsAlwaysSignaling = false;
22888 break;
22889 }
22890
22891 return SSECC;
22892}
22893
22894 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
22895/// concatenate the result back.
22896static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
22897 ISD::CondCode Cond, SelectionDAG &DAG,
22898 const SDLoc &dl) {
22899 assert(VT.isInteger() && VT == LHS.getValueType() &&((void)0)
22900 VT == RHS.getValueType() && "Unsupported VTs!")((void)0);
22901
22902 SDValue CC = DAG.getCondCode(Cond);
22903
22904 // Extract the LHS Lo/Hi vectors
22905 SDValue LHS1, LHS2;
22906 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
22907
22908 // Extract the RHS Lo/Hi vectors
22909 SDValue RHS1, RHS2;
22910 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
22911
22912 // Issue the operation on the smaller types and concatenate the result back
22913 EVT LoVT, HiVT;
22914 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22915 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22916 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22917 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22918}
22919
22920static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22921
22922 SDValue Op0 = Op.getOperand(0);
22923 SDValue Op1 = Op.getOperand(1);
22924 SDValue CC = Op.getOperand(2);
22925 MVT VT = Op.getSimpleValueType();
22926 SDLoc dl(Op);
22927
22928 assert(VT.getVectorElementType() == MVT::i1 &&((void)0)
22929 "Cannot set masked compare for this operation")((void)0);
22930
22931 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22932
22933 // Prefer SETGT over SETLT.
22934 if (SetCCOpcode == ISD::SETLT) {
22935 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22936 std::swap(Op0, Op1);
22937 }
22938
22939 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22940}
22941
22942/// Given a buildvector constant, return a new vector constant with each element
22943/// incremented or decremented. If incrementing or decrementing would result in
22944/// unsigned overflow or underflow or this is not a simple vector constant,
22945/// return an empty value.
22946static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
22947 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22948 if (!BV)
22949 return SDValue();
22950
22951 MVT VT = V.getSimpleValueType();
22952 MVT EltVT = VT.getVectorElementType();
22953 unsigned NumElts = VT.getVectorNumElements();
22954 SmallVector<SDValue, 8> NewVecC;
22955 SDLoc DL(V);
22956 for (unsigned i = 0; i < NumElts; ++i) {
22957 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22958 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22959 return SDValue();
22960
22961 // Avoid overflow/underflow.
22962 const APInt &EltC = Elt->getAPIntValue();
22963 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
22964 return SDValue();
22965
22966 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22967 }
22968
22969 return DAG.getBuildVector(VT, DL, NewVecC);
22970}
22971
22972/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22973/// Op0 u<= Op1:
22974/// t = psubus Op0, Op1
22975/// pcmpeq t, <0..0>
22976static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22977 ISD::CondCode Cond, const SDLoc &dl,
22978 const X86Subtarget &Subtarget,
22979 SelectionDAG &DAG) {
22980 if (!Subtarget.hasSSE2())
22981 return SDValue();
22982
22983 MVT VET = VT.getVectorElementType();
22984 if (VET != MVT::i8 && VET != MVT::i16)
22985 return SDValue();
22986
22987 switch (Cond) {
22988 default:
22989 return SDValue();
22990 case ISD::SETULT: {
22991 // If the comparison is against a constant we can turn this into a
22992 // setule. With psubus, setule does not require a swap. This is
22993 // beneficial because the constant in the register is no longer
22994 // clobbered as the destination, so it can be hoisted out of a loop.
22995 // Only do this pre-AVX since vpcmp* is no longer destructive.
22996 if (Subtarget.hasAVX())
22997 return SDValue();
22998 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
22999 if (!ULEOp1)
23000 return SDValue();
23001 Op1 = ULEOp1;
23002 break;
23003 }
23004 case ISD::SETUGT: {
23005 // If the comparison is against a constant, we can turn this into a setuge.
23006 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23007 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23008 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23009 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23010 if (!UGEOp1)
23011 return SDValue();
23012 Op1 = Op0;
23013 Op0 = UGEOp1;
23014 break;
23015 }
23016 // Psubus is better than flip-sign because it requires no inversion.
23017 case ISD::SETUGE:
23018 std::swap(Op0, Op1);
23019 break;
23020 case ISD::SETULE:
23021 break;
23022 }
23023
23024 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23025 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23026 DAG.getConstant(0, dl, VT));
23027}
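// Illustrative sketch, not from the original source: the per-element identity
// behind the PSUBUS trick above. With unsigned saturating subtraction,
// usubsat(a, b) == 0 holds exactly when a <= b, so PSUBUS followed by a
// PCMPEQ against zero implements an unsigned <= compare.
#include <cstdint>
static uint8_t usubsat(uint8_t A, uint8_t B) {
  return A > B ? uint8_t(A - B) : uint8_t(0);
}
static bool uleViaSubus(uint8_t A, uint8_t B) {
  return usubsat(A, B) == 0;         // Equivalent to A <= B.
}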
23028
23029static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23030 SelectionDAG &DAG) {
23031 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23032 Op.getOpcode() == ISD::STRICT_FSETCCS;
23033 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23034 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23035 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23036 MVT VT = Op->getSimpleValueType(0);
23037 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23038 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23039 SDLoc dl(Op);
23040
23041 if (isFP) {
23042#ifndef NDEBUG1
23043 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23044 assert(EltVT == MVT::f32 || EltVT == MVT::f64)((void)0);
23045#endif
23046
23047 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23048 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23049
23050 // If we have a strict compare with a vXi1 result and the input is 128/256
23051 // bits we can't use a masked compare unless we have VLX. If we use a wider
23052 // compare like we do for non-strict, we might trigger spurious exceptions
23053 // from the upper elements. Instead emit an AVX compare and convert to a mask.
23054 unsigned Opc;
23055 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23056 (!IsStrict || Subtarget.hasVLX() ||
23057 Op0.getSimpleValueType().is512BitVector())) {
23058 assert(VT.getVectorNumElements() <= 16)((void)0);
23059 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23060 } else {
23061 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23062 // The SSE/AVX packed FP comparison nodes are defined with a
23063 // floating-point vector result that matches the operand type. This allows
23064 // them to work with an SSE1 target (integer vector types are not legal).
23065 VT = Op0.getSimpleValueType();
23066 }
23067
23068 SDValue Cmp;
23069 bool IsAlwaysSignaling;
23070 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23071 if (!Subtarget.hasAVX()) {
23072 // TODO: We could use following steps to handle a quiet compare with
23073 // signaling encodings.
23074 // 1. Get ordered masks from a quiet ISD::SETO
23075 // 2. Use the masks to mask potential unordered elements in operand A, B
23076 // 3. Get the compare results of masked A, B
23077 // 4. Calculate the final result using the mask and the result from 3.
23078 // But currently, we just fall back to scalar operations.
23079 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23080 return SDValue();
23081
23082 // Insert an extra signaling instruction to raise exception.
23083 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23084 SDValue SignalCmp = DAG.getNode(
23085 Opc, dl, {VT, MVT::Other},
23086 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23087 // FIXME: It seems we need to update the flags of all new strict nodes.
23088 // Otherwise, mayRaiseFPException in MI will return false due to
23089 // NoFPExcept = false by default. However, I didn't find it in other
23090 // patches.
23091 SignalCmp->setFlags(Op->getFlags());
23092 Chain = SignalCmp.getValue(1);
23093 }
23094
23095 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23096 // emit two comparisons and a logic op to tie them together.
23097 if (SSECC >= 8) {
23098 // LLVM predicate is SETUEQ or SETONE.
23099 unsigned CC0, CC1;
23100 unsigned CombineOpc;
23101 if (Cond == ISD::SETUEQ) {
23102 CC0 = 3; // UNORD
23103 CC1 = 0; // EQ
23104 CombineOpc = X86ISD::FOR;
23105 } else {
23106 assert(Cond == ISD::SETONE)((void)0);
23107 CC0 = 7; // ORD
23108 CC1 = 4; // NEQ
23109 CombineOpc = X86ISD::FAND;
23110 }
23111
23112 SDValue Cmp0, Cmp1;
23113 if (IsStrict) {
23114 Cmp0 = DAG.getNode(
23115 Opc, dl, {VT, MVT::Other},
23116 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23117 Cmp1 = DAG.getNode(
23118 Opc, dl, {VT, MVT::Other},
23119 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23120 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23121 Cmp1.getValue(1));
23122 } else {
23123 Cmp0 = DAG.getNode(
23124 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23125 Cmp1 = DAG.getNode(
23126 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23127 }
23128 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23129 } else {
23130 if (IsStrict) {
23131 Cmp = DAG.getNode(
23132 Opc, dl, {VT, MVT::Other},
23133 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23134 Chain = Cmp.getValue(1);
23135 } else
23136 Cmp = DAG.getNode(
23137 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23138 }
23139 } else {
23140 // Handle all other FP comparisons here.
23141 if (IsStrict) {
23142 // Flip already-signaling CCs before setting bit 4 of the AVX CC.
23143 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23144 Cmp = DAG.getNode(
23145 Opc, dl, {VT, MVT::Other},
23146 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23147 Chain = Cmp.getValue(1);
23148 } else
23149 Cmp = DAG.getNode(
23150 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23151 }
23152
23153 if (VT.getFixedSizeInBits() >
23154 Op.getSimpleValueType().getFixedSizeInBits()) {
23155 // We emitted a compare with an XMM/YMM result. Finish converting to a
23156 // mask register using a vptestm.
23157 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23158 Cmp = DAG.getBitcast(CastVT, Cmp);
23159 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23160 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23161 } else {
23162 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23163 // the result type of SETCC. The bitcast is expected to be optimized
23164 // away during combining/isel.
23165 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23166 }
23167
23168 if (IsStrict)
23169 return DAG.getMergeValues({Cmp, Chain}, dl);
23170
23171 return Cmp;
23172 }
23173
23174 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23175
23176 MVT VTOp0 = Op0.getSimpleValueType();
23177 (void)VTOp0;
23178 assert(VTOp0 == Op1.getSimpleValueType() &&
23179 "Expected operands with same type!");
23180 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23181 "Invalid number of packed elements for source and destination!");
23182
23183 // The non-AVX512 code below works under the assumption that source and
23184 // destination types are the same.
23185 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23186 "Value types for source and destination must be the same!");
23187
23188 // The result is boolean, but operands are int/float
23189 if (VT.getVectorElementType() == MVT::i1) {
23190 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23191 // but there is no compare instruction for i8 and i16 elements in KNL.
23192 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23193 "Unexpected operand type");
23194 return LowerIntVSETCC_AVX512(Op, DAG);
23195 }
23196
23197 // Lower using XOP integer comparisons.
23198 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23199 // Translate compare code to XOP PCOM compare mode.
23200 unsigned CmpMode = 0;
23201 switch (Cond) {
23202 default: llvm_unreachable("Unexpected SETCC condition");
23203 case ISD::SETULT:
23204 case ISD::SETLT: CmpMode = 0x00; break;
23205 case ISD::SETULE:
23206 case ISD::SETLE: CmpMode = 0x01; break;
23207 case ISD::SETUGT:
23208 case ISD::SETGT: CmpMode = 0x02; break;
23209 case ISD::SETUGE:
23210 case ISD::SETGE: CmpMode = 0x03; break;
23211 case ISD::SETEQ: CmpMode = 0x04; break;
23212 case ISD::SETNE: CmpMode = 0x05; break;
23213 }
23214
23215 // Are we comparing unsigned or signed integers?
23216 unsigned Opc =
23217 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23218
23219 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23220 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23221 }
23222
23223 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23224 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23225 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23226 SDValue BC0 = peekThroughBitcasts(Op0);
23227 if (BC0.getOpcode() == ISD::AND) {
23228 APInt UndefElts;
23229 SmallVector<APInt, 64> EltBits;
23230 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23231 VT.getScalarSizeInBits(), UndefElts,
23232 EltBits, false, false)) {
23233 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23234 Cond = ISD::SETEQ;
23235 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23236 }
23237 }
23238 }
23239 }
23240
23241 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23242 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23243 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23244 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23245 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23246 unsigned BitWidth = VT.getScalarSizeInBits();
23247 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23248
23249 SDValue Result = Op0.getOperand(0);
23250 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23251 DAG.getConstant(ShiftAmt, dl, VT));
23252 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23253 DAG.getConstant(BitWidth - 1, dl, VT));
23254 return Result;
23255 }
23256 }
23257
23258 // Break 256-bit integer vector compare into smaller ones.
23259 if (VT.is256BitVector() && !Subtarget.hasInt256())
23260 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23261
23262 if (VT == MVT::v32i16 || VT == MVT::v64i8) {
23263 assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
23264 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23265 }
23266
23267 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23268 // not-of-PCMPEQ:
23269 // X != INT_MIN --> X >s INT_MIN
23270 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23271 // +X != 0 --> +X >s 0
23272 APInt ConstValue;
23273 if (Cond == ISD::SETNE &&
23274 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23275 if (ConstValue.isMinSignedValue())
23276 Cond = ISD::SETGT;
23277 else if (ConstValue.isMaxSignedValue())
23278 Cond = ISD::SETLT;
23279 else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
23280 Cond = ISD::SETGT;
23281 }
23282
23283 // If both operands are known non-negative, then an unsigned compare is the
23284 // same as a signed compare and there's no need to flip signbits.
23285 // TODO: We could check for more general simplifications here since we're
23286 // computing known bits.
23287 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23288 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23289
23290 // Special case: Use min/max operations for unsigned compares.
23291 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23292 if (ISD::isUnsignedIntSetCC(Cond) &&
23293 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23294 TLI.isOperationLegal(ISD::UMIN, VT)) {
23295 // If we have a constant operand, increment/decrement it and change the
23296 // condition to avoid an invert.
23297 if (Cond == ISD::SETUGT) {
23298 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23299 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
23300 Op1 = UGTOp1;
23301 Cond = ISD::SETUGE;
23302 }
23303 }
23304 if (Cond == ISD::SETULT) {
23305 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23306 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
23307 Op1 = ULTOp1;
23308 Cond = ISD::SETULE;
23309 }
23310 }
23311 bool Invert = false;
23312 unsigned Opc;
23313 switch (Cond) {
23314 default: llvm_unreachable("Unexpected condition code");
23315 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
23316 case ISD::SETULE: Opc = ISD::UMIN; break;
23317 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
23318 case ISD::SETUGE: Opc = ISD::UMAX; break;
23319 }
23320
23321 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23322 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23323
23324 // If the logical-not of the result is required, perform that now.
23325 if (Invert)
23326 Result = DAG.getNOT(dl, Result, VT);
23327
23328 return Result;
23329 }
23330
23331 // Try to use SUBUS and PCMPEQ.
23332 if (FlipSigns)
23333 if (SDValue V =
23334 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23335 return V;
23336
23337 // We are handling one of the integer comparisons here. Since SSE only has
23338 // GT and EQ comparisons for integer, swapping operands and multiple
23339 // operations may be required for some comparisons.
23340 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23341 : X86ISD::PCMPGT;
23342 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23343 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23344 bool Invert = Cond == ISD::SETNE ||
23345 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23346
23347 if (Swap)
23348 std::swap(Op0, Op1);
23349
23350 // Check that the operation in question is available (most are plain SSE2,
23351 // but PCMPGTQ and PCMPEQQ have different requirements).
23352 if (VT == MVT::v2i64) {
23353 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23354 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23355
23356 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23357 // the odd elements over the even elements.
23358 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23359 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23360 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23361
23362 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23363 static const int MaskHi[] = { 1, 1, 3, 3 };
23364 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23365
23366 return DAG.getBitcast(VT, Result);
23367 }
23368
23369 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23370 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23371 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23372
23373 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23374 static const int MaskHi[] = { 1, 1, 3, 3 };
23375 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23376
23377 return DAG.getBitcast(VT, Result);
23378 }
23379
23380 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23381 // bits of the inputs before performing those operations. The lower
23382 // compare is always unsigned.
23383 SDValue SB;
23384 if (FlipSigns) {
23385 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
23386 } else {
23387 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
23388 }
23389 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23390 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23391
23392 // Cast everything to the right type.
23393 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23394 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23395
23396 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23397 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23398 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23399
23400 // Create masks for only the low parts/high parts of the 64 bit integers.
23401 static const int MaskHi[] = { 1, 1, 3, 3 };
23402 static const int MaskLo[] = { 0, 0, 2, 2 };
23403 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23404 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23405 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23406
23407 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23408 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23409
23410 if (Invert)
23411 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23412
23413 return DAG.getBitcast(VT, Result);
23414 }
23415
23416 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23417 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23418 // pcmpeqd + pshufd + pand.
23419 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23420
23421 // First cast everything to the right type.
23422 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23423 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23424
23425 // Do the compare.
23426 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23427
23428 // Make sure the lower and upper halves are both all-ones.
23429 static const int Mask[] = { 1, 0, 3, 2 };
23430 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23431 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23432
23433 if (Invert)
23434 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23435
23436 return DAG.getBitcast(VT, Result);
23437 }
23438 }
23439
23440 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23441 // bits of the inputs before performing those operations.
23442 if (FlipSigns) {
23443 MVT EltVT = VT.getVectorElementType();
23444 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23445 VT);
23446 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23447 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23448 }
23449
23450 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23451
23452 // If the logical-not of the result is required, perform that now.
23453 if (Invert)
23454 Result = DAG.getNOT(dl, Result, VT);
23455
23456 return Result;
23457}
23458
23459// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23460static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23461 const SDLoc &dl, SelectionDAG &DAG,
23462 const X86Subtarget &Subtarget,
23463 SDValue &X86CC) {
23464 // Only support equality comparisons.
23465 if (CC != ISD::SETEQ && CC != ISD::SETNE)
23466 return SDValue();
23467
23468 // Must be a bitcast from vXi1.
23469 if (Op0.getOpcode() != ISD::BITCAST)
23470 return SDValue();
23471
23472 Op0 = Op0.getOperand(0);
23473 MVT VT = Op0.getSimpleValueType();
23474 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23475 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23476 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23477 return SDValue();
23478
23479 X86::CondCode X86Cond;
23480 if (isNullConstant(Op1)) {
23481 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23482 } else if (isAllOnesConstant(Op1)) {
23483 // C flag is set for all ones.
23484 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23485 } else
23486 return SDValue();
23487
23488 // If the input is an AND, we can combine its operands into the KTEST.
23489 bool KTestable = false;
23490 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23491 KTestable = true;
23492 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23493 KTestable = true;
23494 if (!isNullConstant(Op1))
23495 KTestable = false;
23496 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23497 SDValue LHS = Op0.getOperand(0);
23498 SDValue RHS = Op0.getOperand(1);
23499 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23500 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23501 }
23502
23503 // If the input is an OR, we can combine its operands into the KORTEST.
23504 SDValue LHS = Op0;
23505 SDValue RHS = Op0;
23506 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23507 LHS = Op0.getOperand(0);
23508 RHS = Op0.getOperand(1);
23509 }
23510
23511 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23512 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23513}
23514
23515/// Emit flags for the given setcc condition and operands. Also returns the
23516/// corresponding X86 condition code constant in X86CC.
23517SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23518 ISD::CondCode CC, const SDLoc &dl,
23519 SelectionDAG &DAG,
23520 SDValue &X86CC) const {
23521 // Optimize to BT if possible.
23522 // Lower (X & (1 << N)) == 0 to BT(X, N).
23523 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23524 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23525 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
23526 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23527 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
23528 return BT;
23529 }
23530
23531 // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
23532 // TODO: We could do AND tree with all 1s as well by using the C flag.
23533 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
23534 if (SDValue CmpZ =
23535 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
23536 return CmpZ;
23537
23538 // Try to lower using KORTEST or KTEST.
23539 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23540 return Test;
23541
23542 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
23543 // these.
23544 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
23545 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23546 // If the input is a setcc, then reuse the input setcc or use a new one with
23547 // the inverted condition.
23548 if (Op0.getOpcode() == X86ISD::SETCC) {
23549 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23550
23551 X86CC = Op0.getOperand(0);
23552 if (Invert) {
23553 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23554 CCode = X86::GetOppositeBranchCondition(CCode);
23555 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23556 }
23557
23558 return Op0.getOperand(1);
23559 }
23560 }
23561
23562 // Try to use the carry flag from the add in place of a separate CMP for:
23563 // (seteq (add X, -1), -1). Similar for setne.
23564 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23565 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23566 if (isProfitableToUseFlagOp(Op0)) {
23567 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23568
23569 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23570 Op0.getOperand(1));
23571 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23572 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23573 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23574 return SDValue(New.getNode(), 1);
23575 }
23576 }
23577
23578 X86::CondCode CondCode =
23579 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23580 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23581
23582 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23583 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23584 return EFLAGS;
23585}
23586
23587SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23588
23589 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23590 Op.getOpcode() == ISD::STRICT_FSETCCS;
23591 MVT VT = Op->getSimpleValueType(0);
23592
23593 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23594
23595 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23596 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23597 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23598 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23599 SDLoc dl(Op);
23600 ISD::CondCode CC =
23601 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23602
23603 // Handle f128 first, since one possible outcome is a normal integer
23604 // comparison which gets handled by emitFlagsForSetcc.
23605 if (Op0.getValueType() == MVT::f128) {
23606 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23607 Op.getOpcode() == ISD::STRICT_FSETCCS);
23608
23609 // If softenSetCCOperands returned a scalar, use it.
23610 if (!Op1.getNode()) {
23611 assert(Op0.getValueType() == Op.getValueType() &&
23612 "Unexpected setcc expansion!");
23613 if (IsStrict)
23614 return DAG.getMergeValues({Op0, Chain}, dl);
23615 return Op0;
23616 }
23617 }
23618
23619 if (Op0.getSimpleValueType().isInteger()) {
23620 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
23621 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
23622 // this may translate to fewer uops depending on the uarch implementation. The
23623 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23624 // canonicalize to that CondCode.
23625 // NOTE: Only do this if incrementing the constant doesn't increase the bit
23626 // encoding size - so it must either already be an i8 or i32 immediate, or it
23627 // shrinks down to that. We don't do this for any i64's to avoid additional
23628 // constant materializations.
23629 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
23630 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23631 const APInt &Op1Val = Op1C->getAPIntValue();
23632 if (!Op1Val.isNullValue()) {
23633 // Ensure the constant+1 doesn't overflow.
23634 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23635 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23636 APInt Op1ValPlusOne = Op1Val + 1;
23637 if (Op1ValPlusOne.isSignedIntN(32) &&
23638 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23639 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23640 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
23641 : ISD::CondCode::SETUGE;
23642 }
23643 }
23644 }
23645 }
23646
23647 SDValue X86CC;
23648 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23649 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23650 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23651 }
23652
23653 // Handle floating point.
23654 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23655 if (CondCode == X86::COND_INVALID)
23656 return SDValue();
23657
23658 SDValue EFLAGS;
23659 if (IsStrict) {
23660 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23661 EFLAGS =
23662 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23663 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23664 Chain = EFLAGS.getValue(1);
23665 } else {
23666 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23667 }
23668
23669 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23670 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23671 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23672}
23673
23674SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23675 SDValue LHS = Op.getOperand(0);
23676 SDValue RHS = Op.getOperand(1);
23677 SDValue Carry = Op.getOperand(2);
23678 SDValue Cond = Op.getOperand(3);
23679 SDLoc DL(Op);
23680
23681 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23682 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23683
23684 // Recreate the carry if needed.
23685 EVT CarryVT = Carry.getValueType();
23686 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23687 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23688
23689 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23690 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23691 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23692}
23693
23694// This function returns three things: the arithmetic computation itself
23695// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23696// flag and the condition code define the case in which the arithmetic
23697// computation overflows.
23698static std::pair<SDValue, SDValue>
23699getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23700 assert(Op.getResNo() == 0 && "Unexpected result number!");
23701 SDValue Value, Overflow;
23702 SDValue LHS = Op.getOperand(0);
23703 SDValue RHS = Op.getOperand(1);
23704 unsigned BaseOp = 0;
23705 SDLoc DL(Op);
23706 switch (Op.getOpcode()) {
23707 default: llvm_unreachable("Unknown ovf instruction!");
23708 case ISD::SADDO:
23709 BaseOp = X86ISD::ADD;
23710 Cond = X86::COND_O;
23711 break;
23712 case ISD::UADDO:
23713 BaseOp = X86ISD::ADD;
23714 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23715 break;
23716 case ISD::SSUBO:
23717 BaseOp = X86ISD::SUB;
23718 Cond = X86::COND_O;
23719 break;
23720 case ISD::USUBO:
23721 BaseOp = X86ISD::SUB;
23722 Cond = X86::COND_B;
23723 break;
23724 case ISD::SMULO:
23725 BaseOp = X86ISD::SMUL;
23726 Cond = X86::COND_O;
23727 break;
23728 case ISD::UMULO:
23729 BaseOp = X86ISD::UMUL;
23730 Cond = X86::COND_O;
23731 break;
23732 }
23733
23734 if (BaseOp) {
23735 // Also sets EFLAGS.
23736 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23737 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23738 Overflow = Value.getValue(1);
23739 }
23740
23741 return std::make_pair(Value, Overflow);
23742}
23743
23744static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23745 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23746 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23747 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23748 // has only one use.
23749 SDLoc DL(Op);
23750 X86::CondCode Cond;
23751 SDValue Value, Overflow;
23752 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23753
23754 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23755 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23756 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23757}
23758
23759 /// Return true if opcode is an X86 logical comparison.
23760static bool isX86LogicalCmp(SDValue Op) {
23761 unsigned Opc = Op.getOpcode();
23762 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23763 Opc == X86ISD::FCMP)
23764 return true;
23765 if (Op.getResNo() == 1 &&
23766 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23767 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23768 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23769 return true;
23770
23771 return false;
23772}
23773
23774static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23775 if (V.getOpcode() != ISD::TRUNCATE)
23776 return false;
23777
23778 SDValue VOp0 = V.getOperand(0);
23779 unsigned InBits = VOp0.getValueSizeInBits();
23780 unsigned Bits = V.getValueSizeInBits();
23781 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23782}
23783
23784SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23785 bool AddTest = true;
23786 SDValue Cond = Op.getOperand(0);
23787 SDValue Op1 = Op.getOperand(1);
23788 SDValue Op2 = Op.getOperand(2);
23789 SDLoc DL(Op);
23790 MVT VT = Op1.getSimpleValueType();
23791 SDValue CC;
23792
23793 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
23794 // are available or VBLENDV if AVX is available.
23795 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23796 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23797 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23798 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23799 bool IsAlwaysSignaling;
23800 unsigned SSECC =
23801 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23802 CondOp0, CondOp1, IsAlwaysSignaling);
23803
23804 if (Subtarget.hasAVX512()) {
23805 SDValue Cmp =
23806 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23807 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23808 assert(!VT.isVector() && "Not a scalar type?");
23809 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23810 }
23811
23812 if (SSECC < 8 || Subtarget.hasAVX()) {
23813 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
23814 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23815
23816 // If we have AVX, we can use a variable vector select (VBLENDV) instead
23817 // of 3 logic instructions for size savings and potentially speed.
23818 // Unfortunately, there is no scalar form of VBLENDV.
23819
23820 // If either operand is a +0.0 constant, don't try this. We can expect to
23821 // optimize away at least one of the logic instructions later in that
23822 // case, so that sequence would be faster than a variable blend.
23823
23824 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
23825 // uses XMM0 as the selection register. That may need just as many
23826 // instructions as the AND/ANDN/OR sequence due to register moves, so
23827 // don't bother.
23828 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
23829 !isNullFPConstant(Op2)) {
23830 // Convert to vectors, do a VSELECT, and convert back to scalar.
23831 // All of the conversions should be optimized away.
23832 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
23833 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
23834 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
23835 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
23836
23837 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
23838 VCmp = DAG.getBitcast(VCmpVT, VCmp);
23839
23840 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
23841
23842 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
23843 VSel, DAG.getIntPtrConstant(0, DL));
23844 }
23845 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
23846 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
23847 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
23848 }
23849 }
23850
23851 // AVX512 fallback is to lower selects of scalar floats to masked moves.
23852 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
23853 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
23854 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23855 }
23856
23857 if (Cond.getOpcode() == ISD::SETCC) {
23858 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
23859 Cond = NewCond;
23860 // If the condition was updated, it's possible that the operands of the
23861 // select were also updated (for example, EmitTest has a RAUW). Refresh
23862 // the local references to the select operands in case they got stale.
23863 Op1 = Op.getOperand(1);
23864 Op2 = Op.getOperand(2);
23865 }
23866 }
23867
23868 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
23869 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
23870 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
23871 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
23872 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
23873 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
23874 if (Cond.getOpcode() == X86ISD::SETCC &&
23875 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
23876 isNullConstant(Cond.getOperand(1).getOperand(1))) {
23877 SDValue Cmp = Cond.getOperand(1);
23878 SDValue CmpOp0 = Cmp.getOperand(0);
23879 unsigned CondCode = Cond.getConstantOperandVal(0);
23880
23881 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
23882 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
23883 // handling to keep the CMP with 0. This should be removed by
23884 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
23885 // cttz_zero_undef.
23886 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
23887 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
23888 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
23889 };
23890 if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
23891 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
23892 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
23893 // Keep Cmp.
23894 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23895 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
23896 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
23897
23898 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23899 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
23900
23901 // Apply further optimizations for special cases
23902 // (select (x != 0), -1, 0) -> neg & sbb
23903 // (select (x == 0), 0, -1) -> neg & sbb
23904 if (isNullConstant(Y) &&
23905 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
23906 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
23907 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
23908 Zero = DAG.getConstant(0, DL, Op.getValueType());
23909 return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
23910 }
23911
23912 Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
23913 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
23914
23915 SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
23916 SDValue Res = // Res = 0 or -1.
23917 DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
23918
23919 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
23920 Res = DAG.getNOT(DL, Res, Res.getValueType());
23921
23922 return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
23923 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
23924 Cmp.getOperand(0).getOpcode() == ISD::AND &&
23925 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
23926 SDValue Src1, Src2;
23927 // Returns true if Op2 is an XOR or OR operator and one of its operands
23928 // is equal to Op1
23929 // ( a , a op b) || ( b , a op b)
23930 auto isOrXorPattern = [&]() {
23931 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
23932 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
23933 Src1 =
23934 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
23935 Src2 = Op1;
23936 return true;
23937 }
23938 return false;
23939 };
23940
23941 if (isOrXorPattern()) {
23942 SDValue Neg;
23943 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
23944 // we need a mask of all zeros or ones with the same size as the other
23945 // operands.
23946 if (CmpSz > VT.getSizeInBits())
23947 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
23948 else if (CmpSz < VT.getSizeInBits())
23949 Neg = DAG.getNode(ISD::AND, DL, VT,
23950 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
23951 DAG.getConstant(1, DL, VT));
23952 else
23953 Neg = CmpOp0;
23954 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
23955 Neg); // -(and (x, 0x1))
23956 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
23957 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
23958 }
23959 }
23960 }
23961
23962 // Look past (and (setcc_carry (cmp ...)), 1).
23963 if (Cond.getOpcode() == ISD::AND &&
23964 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
23965 isOneConstant(Cond.getOperand(1)))
23966 Cond = Cond.getOperand(0);
23967
23968 // If the condition flag is set by an X86ISD::CMP, then use it as the condition
23969 // setting operand in place of the X86ISD::SETCC.
23970 unsigned CondOpcode = Cond.getOpcode();
23971 if (CondOpcode == X86ISD::SETCC ||
23972 CondOpcode == X86ISD::SETCC_CARRY) {
23973 CC = Cond.getOperand(0);
23974
23975 SDValue Cmp = Cond.getOperand(1);
23976 bool IllegalFPCMov = false;
23977 if (VT.isFloatingPoint() && !VT.isVector() &&
23978 !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
23979 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
23980
23981 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
23982 Cmp.getOpcode() == X86ISD::BT) { // FIXME
23983 Cond = Cmp;
23984 AddTest = false;
23985 }
23986 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
23987 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
23988 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
23989 SDValue Value;
23990 X86::CondCode X86Cond;
23991 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23992
23993 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
23994 AddTest = false;
23995 }
23996
23997 if (AddTest) {
23998 // Look past the truncate if the high bits are known zero.
23999 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24000 Cond = Cond.getOperand(0);
24001
24002 // We know the result of AND is compared against zero. Try to match
24003 // it to BT.
24004 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24005 SDValue BTCC;
24006 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
24007 CC = BTCC;
24008 Cond = BT;
24009 AddTest = false;
24010 }
24011 }
24012 }
24013
24014 if (AddTest) {
24015 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24016 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24017 }
24018
24019 // a < b ? -1 : 0 -> RES = ~setcc_carry
24020 // a < b ? 0 : -1 -> RES = setcc_carry
24021 // a >= b ? -1 : 0 -> RES = setcc_carry
24022 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24023 if (Cond.getOpcode() == X86ISD::SUB) {
24024 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24025
24026 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24027 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24028 (isNullConstant(Op1) || isNullConstant(Op2))) {
24029 SDValue Res =
24030 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24031 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24032 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24033 return DAG.getNOT(DL, Res, Res.getValueType());
24034 return Res;
24035 }
24036 }
24037
24038 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24039 // widen the cmov and push the truncate through. This avoids introducing a new
24040 // branch during isel and doesn't add any extensions.
24041 if (Op.getValueType() == MVT::i8 &&
24042 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24043 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24044 if (T1.getValueType() == T2.getValueType() &&
24045 // Exclude CopyFromReg to avoid partial register stalls.
24046 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24047 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24048 CC, Cond);
24049 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24050 }
24051 }
24052
24053 // Or finally, promote i8 cmovs if we have CMOV,
24054 // or i16 cmovs if it won't prevent folding a load.
24055 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24056 // legal, but EmitLoweredSelect() can not deal with these extensions
24057 // being inserted between two CMOV's. (in i16 case too TBN)
24058 // https://bugs.llvm.org/show_bug.cgi?id=40974
24059 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
24060 (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
24061 !MayFoldLoad(Op2))) {
24062 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24063 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24064 SDValue Ops[] = { Op2, Op1, CC, Cond };
24065 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24066 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24067 }
24068
24069 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24070 // condition is true.
24071 SDValue Ops[] = { Op2, Op1, CC, Cond };
24072 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24073}
24074
24075static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24076 const X86Subtarget &Subtarget,
24077 SelectionDAG &DAG) {
24078 MVT VT = Op->getSimpleValueType(0);
24079 SDValue In = Op->getOperand(0);
24080 MVT InVT = In.getSimpleValueType();
24081 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24082 MVT VTElt = VT.getVectorElementType();
24083 SDLoc dl(Op);
24084
24085 unsigned NumElts = VT.getVectorNumElements();
24086
24087 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24088 MVT ExtVT = VT;
24089 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24090 // If v16i32 is to be avoided, we'll need to split and concatenate.
24091 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24092 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24093
24094 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24095 }
24096
24097 // Widen to 512-bits if VLX is not supported.
24098 MVT WideVT = ExtVT;
24099 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24100 NumElts *= 512 / ExtVT.getSizeInBits();
24101 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24102 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24103 In, DAG.getIntPtrConstant(0, dl));
24104 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24105 }
24106
24107 SDValue V;
24108 MVT WideEltVT = WideVT.getVectorElementType();
24109 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24110 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24111 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24112 } else {
24113 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24114 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24115 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24116 }
24117
24118 // Truncate if we had to extend i16/i8 above.
24119 if (VT != ExtVT) {
24120 WideVT = MVT::getVectorVT(VTElt, NumElts);
24121 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24122 }
24123
24124 // Extract back to 128/256-bit if we widened.
24125 if (WideVT != VT)
24126 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24127 DAG.getIntPtrConstant(0, dl));
24128
24129 return V;
24130}
24131
24132static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24133 SelectionDAG &DAG) {
24134 SDValue In = Op->getOperand(0);
24135 MVT InVT = In.getSimpleValueType();
24136
24137 if (InVT.getVectorElementType() == MVT::i1)
24138 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24139
24140 assert(Subtarget.hasAVX() && "Expected AVX support");
24141 return LowerAVXExtend(Op, DAG, Subtarget);
24142}
24143
24144// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24145// For sign extend this needs to handle all vector sizes and SSE4.1 and
24146// non-SSE4.1 targets. For zero extend this should only handle inputs of
24147// MVT::v64i8 when BWI is not supported, but AVX512 is.
24148static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24149 const X86Subtarget &Subtarget,
24150 SelectionDAG &DAG) {
24151 SDValue In = Op->getOperand(0);
24152 MVT VT = Op->getSimpleValueType(0);
24153 MVT InVT = In.getSimpleValueType();
24154
24155 MVT SVT = VT.getVectorElementType();
24156 MVT InSVT = InVT.getVectorElementType();
24157 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24158
24159 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24160 return SDValue();
24161 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24162 return SDValue();
24163 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24164 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24165 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24166 return SDValue();
24167
24168 SDLoc dl(Op);
24169 unsigned Opc = Op.getOpcode();
24170 unsigned NumElts = VT.getVectorNumElements();
24171
24172 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24173 // For 512-bit vectors, we need 128-bits or 256-bits.
24174 if (InVT.getSizeInBits() > 128) {
24175 // Input needs to be at least the same number of elements as output, and
24176 // at least 128-bits.
24177 int InSize = InSVT.getSizeInBits() * NumElts;
24178 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24179 InVT = In.getSimpleValueType();
24180 }
24181
24182 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24183 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24184 // need to be handled here for 256/512-bit results.
24185 if (Subtarget.hasInt256()) {
24186 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24187
24188 if (InVT.getVectorNumElements() != NumElts)
24189 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24190
24191 // FIXME: Apparently we create inreg operations that could be regular
24192 // extends.
24193 unsigned ExtOpc =
24194 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24195 : ISD::ZERO_EXTEND;
24196 return DAG.getNode(ExtOpc, dl, VT, In);
24197 }
24198
24199 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24200 if (Subtarget.hasAVX()) {
24201 assert(VT.is256BitVector() && "256-bit vector expected");
24202 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24203 int HalfNumElts = HalfVT.getVectorNumElements();
24204
24205 unsigned NumSrcElts = InVT.getVectorNumElements();
24206 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24207 for (int i = 0; i != HalfNumElts; ++i)
24208 HiMask[i] = HalfNumElts + i;
24209
24210 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24211 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24212 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24213 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24214 }
24215
24216 // We should only get here for sign extend.
24217 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24218 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24219
24220 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24221 SDValue Curr = In;
24222 SDValue SignExt = Curr;
24223
24224 // As SRAI is only available on i16/i32 types, we expand only up to i32
24225 // and handle i64 separately.
24226 if (InVT != MVT::v4i32) {
24227 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24228
24229 unsigned DestWidth = DestVT.getScalarSizeInBits();
24230 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24231
24232 unsigned InNumElts = InVT.getVectorNumElements();
24233 unsigned DestElts = DestVT.getVectorNumElements();
24234
24235 // Build a shuffle mask that takes each input element and places it in the
24236 // MSBs of the new element size.
24237 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24238 for (unsigned i = 0; i != DestElts; ++i)
24239 Mask[i * Scale + (Scale - 1)] = i;
24240
24241 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24242 Curr = DAG.getBitcast(DestVT, Curr);
24243
24244 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24245 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24246 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24247 }
24248
24249 if (VT == MVT::v2i64) {
24250 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24251 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24252 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24253 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24254 SignExt = DAG.getBitcast(VT, SignExt);
24255 }
24256
24257 return SignExt;
24258}
24259
24260static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24261 SelectionDAG &DAG) {
24262 MVT VT = Op->getSimpleValueType(0);
24263 SDValue In = Op->getOperand(0);
24264 MVT InVT = In.getSimpleValueType();
24265 SDLoc dl(Op);
24266
24267 if (InVT.getVectorElementType() == MVT::i1)
24268 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24269
24270 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24271 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24272 "Expected same number of elements");
24273 assert((VT.getVectorElementType() == MVT::i16 ||
24274 VT.getVectorElementType() == MVT::i32 ||
24275 VT.getVectorElementType() == MVT::i64) &&
24276 "Unexpected element type");
24277 assert((InVT.getVectorElementType() == MVT::i8 ||
24278 InVT.getVectorElementType() == MVT::i16 ||
24279 InVT.getVectorElementType() == MVT::i32) &&
24280 "Unexpected element type");
24281
24282 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24283 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24284 return splitVectorIntUnary(Op, DAG);
24285 }
24286
24287 if (Subtarget.hasInt256())
24288 return Op;
24289
24290 // Optimize vectors in AVX mode
24291 // Sign extend v8i16 to v8i32 and
24292 // v4i32 to v4i64
24293 //
24294 // Divide input vector into two parts
24295 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
24296 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
24297 // concat the vectors to original VT
24298 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24299 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24300
24301 unsigned NumElems = InVT.getVectorNumElements();
24302 SmallVector<int,8> ShufMask(NumElems, -1);
24303 for (unsigned i = 0; i != NumElems/2; ++i)
24304 ShufMask[i] = i + NumElems/2;
24305
24306 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24307 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24308
24309 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24310}
24311
24312/// Change a vector store into a pair of half-size vector stores.
24313static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24314 SDValue StoredVal = Store->getValue();
24315 assert((StoredVal.getValueType().is256BitVector() ||
24316 StoredVal.getValueType().is512BitVector()) &&
24317 "Expecting 256/512-bit op");
24318
24319 // Splitting volatile memory ops is not allowed unless the operation was not
24320 // legal to begin with. Assume the input store is legal (this transform is
24321 // only used for targets with AVX). Note: It is possible that we have an
24322 // illegal type like v2i128, and so we could allow splitting a volatile store
24323 // in that case if that is important.
24324 if (!Store->isSimple())
24325 return SDValue();
24326
24327 SDLoc DL(Store);
24328 SDValue Value0, Value1;
24329 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24330 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24331 SDValue Ptr0 = Store->getBasePtr();
24332 SDValue Ptr1 =
24333 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24334 SDValue Ch0 =
24335 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24336 Store->getOriginalAlign(),
24337 Store->getMemOperand()->getFlags());
24338 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24339 Store->getPointerInfo().getWithOffset(HalfOffset),
24340 Store->getOriginalAlign(),
24341 Store->getMemOperand()->getFlags());
24342 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24343}
24344
24345 /// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
24346/// type.
24347static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24348 SelectionDAG &DAG) {
24349 SDValue StoredVal = Store->getValue();
24350 assert(StoreVT.is128BitVector() &&
24351 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24352 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24353
24354 // Splitting volatile memory ops is not allowed unless the operation was not
24355 // legal to begin with. We are assuming the input op is legal (this transform
24356 // is only used for targets with AVX).
24357 if (!Store->isSimple())
24358 return SDValue();
24359
24360 MVT StoreSVT = StoreVT.getScalarType();
24361 unsigned NumElems = StoreVT.getVectorNumElements();
24362 unsigned ScalarSize = StoreSVT.getStoreSize();
24363
24364 SDLoc DL(Store);
24365 SmallVector<SDValue, 4> Stores;
24366 for (unsigned i = 0; i != NumElems; ++i) {
24367 unsigned Offset = i * ScalarSize;
24368 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24369 TypeSize::Fixed(Offset), DL);
24370 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24371 DAG.getIntPtrConstant(i, DL));
24372 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24373 Store->getPointerInfo().getWithOffset(Offset),
24374 Store->getOriginalAlign(),
24375 Store->getMemOperand()->getFlags());
24376 Stores.push_back(Ch);
24377 }
24378 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24379}
24380
24381static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24382 SelectionDAG &DAG) {
24383 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24384 SDLoc dl(St);
24385 SDValue StoredVal = St->getValue();
24386
24387 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24388 if (StoredVal.getValueType().isVector() &&
24389 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24390 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24391 assert(NumElts <= 8 && "Unexpected VT");
24392 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24393 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24394 "Expected AVX512F without AVX512DQI");
24395
24396 // We must pad with zeros to ensure we store zeroes to any unused bits.
24397 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24398 DAG.getUNDEF(MVT::v16i1), StoredVal,
24399 DAG.getIntPtrConstant(0, dl));
24400 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24401 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24402 // Make sure we store zeros in the extra bits.
24403 if (NumElts < 8)
24404 StoredVal = DAG.getZeroExtendInReg(
24405 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24406
24407 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24408 St->getPointerInfo(), St->getOriginalAlign(),
24409 St->getMemOperand()->getFlags());
24410 }
24411
24412 if (St->isTruncatingStore())
24413 return SDValue();
24414
24415 // If this is a 256-bit store of concatenated ops, we are better off splitting
24416 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24417 // and each half can execute independently. Some cores would split the op into
24418 // halves anyway, so the concat (vinsertf128) is purely an extra op.
24419 MVT StoreVT = StoredVal.getSimpleValueType();
24420 if (StoreVT.is256BitVector() ||
24421 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24422 !Subtarget.hasBWI())) {
24423 SmallVector<SDValue, 4> CatOps;
24424 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
24425 return splitVectorStore(St, DAG);
24426 return SDValue();
24427 }
24428
24429 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24430 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
24431 "Unexpected VT");
24432 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24433 TargetLowering::TypeWidenVector && "Unexpected type action!");
24434
24435 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24436 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24437 DAG.getUNDEF(StoreVT));
24438
24439 if (Subtarget.hasSSE2()) {
24440 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24441 // and store it.
24442 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24443 MVT CastVT = MVT::getVectorVT(StVT, 2);
24444 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24445 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24446 DAG.getIntPtrConstant(0, dl));
24447
24448 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24449 St->getPointerInfo(), St->getOriginalAlign(),
24450 St->getMemOperand()->getFlags());
24451 }
24452 assert(Subtarget.hasSSE1() && "Expected SSE");
24453 SDVTList Tys = DAG.getVTList(MVT::Other);
24454 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24455 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24456 St->getMemOperand());
24457}
24458
24459// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24460// may emit an illegal shuffle but the expansion is still better than scalar
24461// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24462 // we'll emit a shuffle and an arithmetic shift.
24463// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24464// TODO: It is possible to support ZExt by zeroing the undef values during
24465// the shuffle phase or after the shuffle.
24466static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24467 SelectionDAG &DAG) {
24468 MVT RegVT = Op.getSimpleValueType();
24469 assert(RegVT.isVector() && "We only custom lower vector loads.");
24470 assert(RegVT.isInteger() &&
24471 "We only custom lower integer vector loads.");
24472
24473 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24474 SDLoc dl(Ld);
24475
24476 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24477 if (RegVT.getVectorElementType() == MVT::i1) {
24478 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24479 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24480 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24481 "Expected AVX512F without AVX512DQI");
24482
24483 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24484 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24485 Ld->getMemOperand()->getFlags());
24486
24487 // Replace chain users with the new chain.
24488 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!")((void)0);
24489
24490 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24491 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24492 DAG.getBitcast(MVT::v16i1, Val),
24493 DAG.getIntPtrConstant(0, dl));
24494 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24495 }
24496
24497 return SDValue();
24498}
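
The i1-vector path above reduces to a single scalar byte load whose low bits carry the mask elements. A standalone sketch of that behaviour follows; the helper loadV4i1 is a hypothetical name, not part of this file.

#include <array>
#include <cstdint>

// Emulate loading a v4i1 mask stored in memory as one byte: the lowering
// performs a single i8 load and then extracts the low lanes.
std::array<bool, 4> loadV4i1(const uint8_t *p) {
  uint8_t bits = *p;               // the single scalar i8 load
  std::array<bool, 4> mask{};
  for (int i = 0; i < 4; ++i)
    mask[i] = (bits >> i) & 1;     // element i lives in bit i
  return mask;
}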
24499
24500/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24501/// each of which has no other use apart from the AND / OR.
24502static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24503 Opc = Op.getOpcode();
24504 if (Opc != ISD::OR && Opc != ISD::AND)
24505 return false;
24506 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24507 Op.getOperand(0).hasOneUse() &&
24508 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24509 Op.getOperand(1).hasOneUse());
24510}
24511
24512SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24513 SDValue Chain = Op.getOperand(0);
24514 SDValue Cond = Op.getOperand(1);
24515 SDValue Dest = Op.getOperand(2);
24516 SDLoc dl(Op);
24517
24518 if (Cond.getOpcode() == ISD::SETCC &&
24519 Cond.getOperand(0).getValueType() != MVT::f128) {
24520 SDValue LHS = Cond.getOperand(0);
24521 SDValue RHS = Cond.getOperand(1);
24522 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24523
24524 // Special case for
24525 // setcc([su]{add,sub,mul}o == 0)
24526 // setcc([su]{add,sub,mul}o != 1)
24527 if (ISD::isOverflowIntrOpRes(LHS) &&
24528 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24529 (isNullConstant(RHS) || isOneConstant(RHS))) {
24530 SDValue Value, Overflow;
24531 X86::CondCode X86Cond;
24532 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24533
24534 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24535 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24536
24537 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24538 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24539 Overflow);
24540 }
24541
24542 if (LHS.getSimpleValueType().isInteger()) {
24543 SDValue CCVal;
24544 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24545 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24546 EFLAGS);
24547 }
24548
24549 if (CC == ISD::SETOEQ) {
24550 // For FCMP_OEQ, we can emit
24551 // two branches instead of an explicit AND instruction with a
24552 // separate test. However, we only do this if this block doesn't
24553 // have a fall-through edge, because this requires an explicit
24554 // jmp when the condition is false.
24555 if (Op.getNode()->hasOneUse()) {
24556 SDNode *User = *Op.getNode()->use_begin();
24557 // Look for an unconditional branch following this conditional branch.
24558 // We need this because we need to reverse the successors in order
24559 // to implement FCMP_OEQ.
24560 if (User->getOpcode() == ISD::BR) {
24561 SDValue FalseBB = User->getOperand(1);
24562 SDNode *NewBR =
24563 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24564 assert(NewBR == User)((void)0);
24565 (void)NewBR;
24566 Dest = FalseBB;
24567
24568 SDValue Cmp =
24569 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24570 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24571 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24572 CCVal, Cmp);
24573 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24574 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24575 Cmp);
24576 }
24577 }
24578 } else if (CC == ISD::SETUNE) {
24579 // For FCMP_UNE, we can emit
24580 // two branches instead of an explicit OR instruction with a
24581 // separate test.
24582 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24583 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24584 Chain =
24585 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24586 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24587 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24588 Cmp);
24589 } else {
24590 X86::CondCode X86Cond =
24591 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24592 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24593 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24594 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24595 Cmp);
24596 }
24597 }
24598
24599 if (ISD::isOverflowIntrOpRes(Cond)) {
24600 SDValue Value, Overflow;
24601 X86::CondCode X86Cond;
24602 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24603
24604 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24605 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24606 Overflow);
24607 }
24608
24609 // Look past the truncate if the high bits are known zero.
24610 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24611 Cond = Cond.getOperand(0);
24612
24613 EVT CondVT = Cond.getValueType();
24614
24615 // Add an AND with 1 if we don't already have one.
24616 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24617 Cond =
24618 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24619
24620 SDValue LHS = Cond;
24621 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24622
24623 SDValue CCVal;
24624 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24625 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24626 EFLAGS);
24627}
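
For the FCMP_OEQ and FCMP_UNE cases above, the two emitted branches follow from the ucomiss/ucomisd flag encoding: ZF is set for equal or unordered, PF only for unordered. A scalar sketch of the OEQ decomposition; takesTrueBlockForOEQ is a hypothetical name and this is an illustration, not the DAG lowering itself.

#include <cmath>

// Ordered-equal is "ZF set and PF clear", which the lowering implements as:
// branch to the false block on COND_NE, then on COND_P, else fall through.
bool takesTrueBlockForOEQ(double a, double b) {
  bool unordered = std::isnan(a) || std::isnan(b);
  bool zf = unordered || (a == b);  // ucomisd sets ZF on equal or unordered
  bool pf = unordered;              // ucomisd sets PF only on unordered
  if (!zf) return false;            // COND_NE taken -> false block
  if (pf)  return false;            // COND_P taken  -> false block
  return true;
}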
24628
24629// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24630// Calls to _alloca are needed to probe the stack when allocating more than 4k
24631// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24632// that the guard pages used by the OS virtual memory manager are allocated in
24633// correct sequence.
24634SDValue
24635X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24636 SelectionDAG &DAG) const {
24637 MachineFunction &MF = DAG.getMachineFunction();
24638 bool SplitStack = MF.shouldSplitStack();
24639 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24640 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24641 SplitStack || EmitStackProbeCall;
24642 SDLoc dl(Op);
24643
24644 // Get the inputs.
24645 SDNode *Node = Op.getNode();
24646 SDValue Chain = Op.getOperand(0);
24647 SDValue Size = Op.getOperand(1);
24648 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24649 EVT VT = Node->getValueType(0);
24650
24651 // Chain the dynamic stack allocation so that it doesn't modify the stack
24652 // pointer when other instructions are using the stack.
24653 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24654
24655 bool Is64Bit = Subtarget.is64Bit();
24656 MVT SPTy = getPointerTy(DAG.getDataLayout());
24657
24658 SDValue Result;
24659 if (!Lower) {
24660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24661 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24662 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"((void)0)
24663 " not tell us which reg is the stack pointer!")((void)0);
24664
24665 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24666 const Align StackAlign = TFI.getStackAlign();
24667 if (hasInlineStackProbe(MF)) {
24668 MachineRegisterInfo &MRI = MF.getRegInfo();
24669
24670 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24671 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24672 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24673 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24674 DAG.getRegister(Vreg, SPTy));
24675 } else {
24676 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24677 Chain = SP.getValue(1);
24678 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24679 }
24680 if (Alignment && *Alignment > StackAlign)
24681 Result =
24682 DAG.getNode(ISD::AND, dl, VT, Result,
24683 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24684 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24685 } else if (SplitStack) {
24686 MachineRegisterInfo &MRI = MF.getRegInfo();
24687
24688 if (Is64Bit) {
24689 // The 64-bit implementation of segmented stacks needs to clobber both r10
24690 // and r11. This makes it impossible to use it along with nested parameters.
24691 const Function &F = MF.getFunction();
24692 for (const auto &A : F.args()) {
24693 if (A.hasNestAttr())
24694 report_fatal_error("Cannot use segmented stacks with functions that "
24695 "have nested arguments.");
24696 }
24697 }
24698
24699 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24700 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24701 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24702 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24703 DAG.getRegister(Vreg, SPTy));
24704 } else {
24705 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24706 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
24707 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
24708
24709 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24710 Register SPReg = RegInfo->getStackRegister();
24711 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24712 Chain = SP.getValue(1);
24713
24714 if (Alignment) {
24715 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24716 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24717 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24718 }
24719
24720 Result = SP;
24721 }
24722
24723 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
24724 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
24725
24726 SDValue Ops[2] = {Result, Chain};
24727 return DAG.getMergeValues(Ops, dl);
24728}
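
The over-alignment step above, an AND with ~(Alignment - 1), is the usual round-down trick for power-of-two alignments. A minimal sketch, assuming a power-of-two alignment; alignDown is a hypothetical helper.

#include <cstdint>

// Round a freshly computed stack pointer down to a power-of-two boundary,
// e.g. alignDown(0x1007, 16) == 0x1000.
uint64_t alignDown(uint64_t sp, uint64_t alignment) {
  return sp & ~(alignment - 1ULL);
}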
24729
24730SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24731 MachineFunction &MF = DAG.getMachineFunction();
24732 auto PtrVT = getPointerTy(MF.getDataLayout());
24733 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24734
24735 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24736 SDLoc DL(Op);
24737
24738 if (!Subtarget.is64Bit() ||
24739 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24740 // vastart just stores the address of the VarArgsFrameIndex slot into the
24741 // memory location argument.
24742 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24743 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24744 MachinePointerInfo(SV));
24745 }
24746
24747 // __va_list_tag:
24748 // gp_offset (0 - 6 * 8)
24749 // fp_offset (48 - 48 + 8 * 16)
24750 // overflow_arg_area (point to parameters coming in memory).
24751 // reg_save_area
24752 SmallVector<SDValue, 8> MemOps;
24753 SDValue FIN = Op.getOperand(1);
24754 // Store gp_offset
24755 SDValue Store = DAG.getStore(
24756 Op.getOperand(0), DL,
24757 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24758 MachinePointerInfo(SV));
24759 MemOps.push_back(Store);
24760
24761 // Store fp_offset
24762 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24763 Store = DAG.getStore(
24764 Op.getOperand(0), DL,
24765 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24766 MachinePointerInfo(SV, 4));
24767 MemOps.push_back(Store);
24768
24769 // Store ptr to overflow_arg_area
24770 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24771 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24772 Store =
24773 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24774 MemOps.push_back(Store);
24775
24776 // Store ptr to reg_save_area.
24777 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24778 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24779 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24780 Store = DAG.getStore(
24781 Op.getOperand(0), DL, RSFIN, FIN,
24782 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24783 MemOps.push_back(Store);
24784 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24785}
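
The four stores above fill in the SysV x86-64 __va_list_tag at the offsets passed to MachinePointerInfo. A sketch of that record for an LP64 target; the layout follows the ABI and the struct name VaListTag is illustrative, not taken from this file.

#include <cstddef>
#include <cstdint>

// SysV x86-64 (LP64) va_list record as populated by LowerVASTART.
struct VaListTag {
  uint32_t gp_offset;       // +0:  next GPR slot, 0..48 in steps of 8
  uint32_t fp_offset;       // +4:  next XMM slot, 48..176 in steps of 16
  void *overflow_arg_area;  // +8:  arguments passed on the stack
  void *reg_save_area;      // +16: register save area in the frame
};

static_assert(offsetof(VaListTag, fp_offset) == 4, "fp_offset at +4");
static_assert(offsetof(VaListTag, overflow_arg_area) == 8, "overflow at +8");
static_assert(offsetof(VaListTag, reg_save_area) == 16, "reg_save_area at +16");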
24786
24787SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24788 assert(Subtarget.is64Bit() &&((void)0)
24789 "LowerVAARG only handles 64-bit va_arg!")((void)0);
24790 assert(Op.getNumOperands() == 4)((void)0);
24791
24792 MachineFunction &MF = DAG.getMachineFunction();
24793 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24794 // The Win64 ABI uses char* instead of a structure.
24795 return DAG.expandVAArg(Op.getNode());
24796
24797 SDValue Chain = Op.getOperand(0);
24798 SDValue SrcPtr = Op.getOperand(1);
24799 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24800 unsigned Align = Op.getConstantOperandVal(3);
24801 SDLoc dl(Op);
24802
24803 EVT ArgVT = Op.getNode()->getValueType(0);
24804 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24805 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24806 uint8_t ArgMode;
24807
24808 // Decide which area this value should be read from.
24809 // TODO: Implement the AMD64 ABI in its entirety. This simple
24810 // selection mechanism works only for the basic types.
24811 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented")((void)0);
24812 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24813 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
24814 } else {
24815 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&((void)0)
24816 "Unhandled argument type in LowerVAARG")((void)0);
24817 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
24818 }
24819
24820 if (ArgMode == 2) {
24821 // Sanity Check: Make sure using fp_offset makes sense.
24822 assert(!Subtarget.useSoftFloat() &&((void)0)
24823 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&((void)0)
24824 Subtarget.hasSSE1())((void)0);
24825 }
24826
24827 // Insert VAARG node into the DAG
24828 // VAARG returns two values: Variable Argument Address, Chain
24829 SDValue InstOps[] = {Chain, SrcPtr,
24830 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24831 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24832 DAG.getTargetConstant(Align, dl, MVT::i32)};
24833 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24834 SDValue VAARG = DAG.getMemIntrinsicNode(
24835 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
24836 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24837 /*Alignment=*/None,
24838 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24839 Chain = VAARG.getValue(1);
24840
24841 // Load the next argument and return it
24842 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24843}
24844
24845static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24846 SelectionDAG &DAG) {
24847 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24848 // where a va_list is still an i8*.
24849 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!")((void)0);
24850 if (Subtarget.isCallingConvWin64(
24851 DAG.getMachineFunction().getFunction().getCallingConv()))
24852 // Probably a Win64 va_copy.
24853 return DAG.expandVACopy(Op.getNode());
24854
24855 SDValue Chain = Op.getOperand(0);
24856 SDValue DstPtr = Op.getOperand(1);
24857 SDValue SrcPtr = Op.getOperand(2);
24858 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24859 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24860 SDLoc DL(Op);
24861
24862 return DAG.getMemcpy(
24863 Chain, DL, DstPtr, SrcPtr,
24864 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
24865 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
24866 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24867}
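
Because the va_list record is plain data, va_copy above becomes a fixed-size memcpy: 24 bytes on LP64, 16 otherwise, and a generic expansion on Win64 where a va_list is just a char*. A trivial sketch of the LP64 case; vaCopyLP64 is a hypothetical name.

#include <cstring>

// Copy a SysV x86-64 (LP64) va_list record, matching the 24-byte memcpy
// emitted above.
void vaCopyLP64(void *dst, const void *src) {
  std::memcpy(dst, src, 24);
}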
24868
24869// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
24870static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24871 switch (Opc) {
24872 case ISD::SHL:
24873 case X86ISD::VSHL:
24874 case X86ISD::VSHLI:
24875 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24876 case ISD::SRL:
24877 case X86ISD::VSRL:
24878 case X86ISD::VSRLI:
24879 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24880 case ISD::SRA:
24881 case X86ISD::VSRA:
24882 case X86ISD::VSRAI:
24883 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24884 }
24885 llvm_unreachable("Unknown target vector shift node")__builtin_unreachable();
24886}
24887
24888/// Handle vector element shifts where the shift amount is a constant.
24889/// Takes immediate version of shift as input.
24890static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24891 SDValue SrcOp, uint64_t ShiftAmt,
24892 SelectionDAG &DAG) {
24893 MVT ElementType = VT.getVectorElementType();
24894
24895 // Bitcast the source vector to the output type, this is mainly necessary for
24896 // vXi8/vXi64 shifts.
24897 if (VT != SrcOp.getSimpleValueType())
24898 SrcOp = DAG.getBitcast(VT, SrcOp);
24899
24900 // Fold this packed shift into its first operand if ShiftAmt is 0.
24901 if (ShiftAmt == 0)
24902 return SrcOp;
24903
24904 // Check for ShiftAmt >= element width
24905 if (ShiftAmt >= ElementType.getSizeInBits()) {
24906 if (Opc == X86ISD::VSRAI)
24907 ShiftAmt = ElementType.getSizeInBits() - 1;
24908 else
24909 return DAG.getConstant(0, dl, VT);
24910 }
24911
24912 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)((void)0)
24913 && "Unknown target vector shift-by-constant node")((void)0);
24914
24915 // Fold this packed vector shift into a build vector if SrcOp is a
24916 // vector of Constants or UNDEFs.
24917 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24918 SmallVector<SDValue, 8> Elts;
24919 unsigned NumElts = SrcOp->getNumOperands();
24920
24921 switch (Opc) {
24922 default: llvm_unreachable("Unknown opcode!")__builtin_unreachable();
24923 case X86ISD::VSHLI:
24924 for (unsigned i = 0; i != NumElts; ++i) {
24925 SDValue CurrentOp = SrcOp->getOperand(i);
24926 if (CurrentOp->isUndef()) {
24927 // Must produce 0s in the correct bits.
24928 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24929 continue;
24930 }
24931 auto *ND = cast<ConstantSDNode>(CurrentOp);
24932 const APInt &C = ND->getAPIntValue();
24933 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
24934 }
24935 break;
24936 case X86ISD::VSRLI:
24937 for (unsigned i = 0; i != NumElts; ++i) {
24938 SDValue CurrentOp = SrcOp->getOperand(i);
24939 if (CurrentOp->isUndef()) {
24940 // Must produce 0s in the correct bits.
24941 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24942 continue;
24943 }
24944 auto *ND = cast<ConstantSDNode>(CurrentOp);
24945 const APInt &C = ND->getAPIntValue();
24946 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
24947 }
24948 break;
24949 case X86ISD::VSRAI:
24950 for (unsigned i = 0; i != NumElts; ++i) {
24951 SDValue CurrentOp = SrcOp->getOperand(i);
24952 if (CurrentOp->isUndef()) {
24953 // All shifted in bits must be the same so use 0.
24954 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24955 continue;
24956 }
24957 auto *ND = cast<ConstantSDNode>(CurrentOp);
24958 const APInt &C = ND->getAPIntValue();
24959 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
24960 }
24961 break;
24962 }
24963
24964 return DAG.getBuildVector(VT, dl, Elts);
24965 }
24966
24967 return DAG.getNode(Opc, dl, VT, SrcOp,
24968 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24969}
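
The build-vector folding above applies the shift to every constant lane; an out-of-range amount yields zero for logical shifts and a clamped sign fill for arithmetic ones. A sketch for the logical-left case on 32-bit lanes; foldVshlI32 is a hypothetical helper.

#include <cstdint>
#include <vector>

// Fold a uniform logical left shift over a vector of 32-bit constants, the
// way the VSHLI case rewrites a constant build_vector.
std::vector<uint32_t> foldVshlI32(const std::vector<uint32_t> &lanes,
                                  uint64_t amt) {
  std::vector<uint32_t> out;
  out.reserve(lanes.size());
  for (uint32_t v : lanes)
    out.push_back(amt >= 32 ? 0u : (v << amt));  // amt >= width gives zeros
  return out;
}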
24970
24971/// Handle vector element shifts where the shift amount may or may not be a
24972/// constant. Takes immediate version of shift as input.
24973static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24974 SDValue SrcOp, SDValue ShAmt,
24975 const X86Subtarget &Subtarget,
24976 SelectionDAG &DAG) {
24977 MVT SVT = ShAmt.getSimpleValueType();
24978 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!")((void)0);
24979
24980 // Catch shift-by-constant.
24981 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
24982 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
24983 CShAmt->getZExtValue(), DAG);
24984
24985 // Change opcode to non-immediate version.
24986 Opc = getTargetVShiftUniformOpcode(Opc, true);
24987
24988 // Need to build a vector containing shift amount.
24989 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
24990 // +====================+============+=======================================+
24991 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
24992 // +====================+============+=======================================+
24993 // | i64 | Yes, No | Use ShAmt as lowest elt |
24994 // | i32 | Yes | zero-extend in-reg |
24995 // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
24996 // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
24997 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
24998 // +====================+============+=======================================+
24999
25000 if (SVT == MVT::i64)
25001 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
25002 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
25003 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25004 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
25005 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
25006 ShAmt = ShAmt.getOperand(0);
25007 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
25008 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
25009 if (Subtarget.hasSSE41())
25010 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25011 MVT::v2i64, ShAmt);
25012 else {
25013 SDValue ByteShift = DAG.getTargetConstant(
25014 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25015 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25016 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25017 ByteShift);
25018 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25019 ByteShift);
25020 }
25021 } else if (Subtarget.hasSSE41() &&
25022 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25023 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25024 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25025 MVT::v2i64, ShAmt);
25026 } else {
25027 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
25028 DAG.getUNDEF(SVT)};
25029 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
25030 }
25031
25032 // The return type has to be a 128-bit type with the same element
25033 // type as the input type.
25034 MVT EltVT = VT.getVectorElementType();
25035 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25036
25037 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25038 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25039}
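
The table above is about placing the scalar amount in the low 64 bits of an XMM register with the upper bits known zero, since the packed uniform shifts only read those low 64 bits. A small sketch with SSE2 intrinsics; shiftLeftUniform32 is a hypothetical name and an SSE2 target is assumed.

#include <emmintrin.h>

// MOVD zero-extends the scalar count into lane 0, and PSLLD consumes only
// the low 64 bits of the count register.
__m128i shiftLeftUniform32(__m128i v, int amt) {
  __m128i cnt = _mm_cvtsi32_si128(amt);  // count in the low 32 bits, rest zero
  return _mm_sll_epi32(v, cnt);
}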
25040
25041/// Return Mask with the necessary casting or extending
25042/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25043static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25044 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25045 const SDLoc &dl) {
25046
25047 if (isAllOnesConstant(Mask))
25048 return DAG.getConstant(1, dl, MaskVT);
25049 if (X86::isZeroNode(Mask))
25050 return DAG.getConstant(0, dl, MaskVT);
25051
25052 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!")((void)0);
25053
25054 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25055 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!")((void)0);
25056 assert(Subtarget.hasBWI() && "Expected AVX512BW target!")((void)0);
25057 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25058 SDValue Lo, Hi;
25059 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25060 DAG.getConstant(0, dl, MVT::i32));
25061 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25062 DAG.getConstant(1, dl, MVT::i32));
25063
25064 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25065 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25066
25067 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25068 } else {
25069 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25070 Mask.getSimpleValueType().getSizeInBits());
25071 // In the case where MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
25072 // are extracted by EXTRACT_SUBVECTOR.
25073 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25074 DAG.getBitcast(BitcastVT, Mask),
25075 DAG.getIntPtrConstant(0, dl));
25076 }
25077}
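
The 32-bit-mode branch above avoids an illegal i64 bitcast by splitting the mask into two i32 halves that later become the low and high v32i1 parts. A scalar sketch of that split; splitMask64 is a hypothetical helper.

#include <cstdint>
#include <utility>

// Split a 64-element k-mask into the two 32-bit halves that are bitcast to
// v32i1 and concatenated to v64i1 above.
std::pair<uint32_t, uint32_t> splitMask64(uint64_t mask) {
  uint32_t lo = static_cast<uint32_t>(mask);        // elements 0..31
  uint32_t hi = static_cast<uint32_t>(mask >> 32);  // elements 32..63
  return {lo, hi};
}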
25078
25079/// Return (and \p Op, \p Mask) for compare instructions or
25080/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25081/// necessary casting or extending for \p Mask when lowering masking intrinsics
25082static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25083 SDValue PreservedSrc,
25084 const X86Subtarget &Subtarget,
25085 SelectionDAG &DAG) {
25086 MVT VT = Op.getSimpleValueType();
25087 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25088 unsigned OpcodeSelect = ISD::VSELECT;
25089 SDLoc dl(Op);
25090
25091 if (isAllOnesConstant(Mask))
25092 return Op;
25093
25094 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25095
25096 if (PreservedSrc.isUndef())
25097 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25098 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25099}
25100
25101/// Creates an SDNode for a predicated scalar operation.
25102/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25103 /// The mask comes in as MVT::i8 and should be transformed
25104 /// to MVT::v1i1 while lowering masking intrinsics.
25105/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25106/// "X86select" instead of "vselect". We just can't create the "vselect" node
25107/// for a scalar instruction.
25108static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25109 SDValue PreservedSrc,
25110 const X86Subtarget &Subtarget,
25111 SelectionDAG &DAG) {
25112
25113 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
7.1
'MaskConst' is null
8
Taking false branch
25114 if (MaskConst->getZExtValue() & 0x1)
25115 return Op;
25116
25117 MVT VT = Op.getSimpleValueType();
25118 SDLoc dl(Op);
25119
25120 assert(Mask.getValueType() == MVT::i8 && "Unexpect type")((void)0);
25121 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25122 DAG.getBitcast(MVT::v8i1, Mask),
25123 DAG.getIntPtrConstant(0, dl));
25124 if (Op.getOpcode() == X86ISD::FSETCCM ||
9
Assuming the condition is false
12
Taking false branch
25125 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
10
Assuming the condition is false
25126 Op.getOpcode() == X86ISD::VFPCLASSS)
11
Assuming the condition is false
25127 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25128
25129 if (PreservedSrc.isUndef())
13
Calling 'SDValue::isUndef'
25130 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25131 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25132}
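
Semantically the node built above is a one-element select on bit 0 of the i8 mask, with an undefined pass-through replaced by zero. A scalar sketch of that behaviour; scalarMaskSelect is a hypothetical name.

#include <cstdint>

// Keep the operation's result when bit 0 of the mask is set, otherwise keep
// the pass-through value (zero if the pass-through was undef).
double scalarMaskSelect(uint8_t mask, double opResult, double passThru) {
  return (mask & 0x1) ? opResult : passThru;
}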
25133
25134static int getSEHRegistrationNodeSize(const Function *Fn) {
25135 if (!Fn->hasPersonalityFn())
25136 report_fatal_error(
25137 "querying registration node size for function without personality");
25138 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25139 // WinEHStatePass for the full struct definition.
25140 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25141 case EHPersonality::MSVC_X86SEH: return 24;
25142 case EHPersonality::MSVC_CXX: return 16;
25143 default: break;
25144 }
25145 report_fatal_error(
25146 "can only recover FP for 32-bit MSVC EH personality functions");
25147}
25148
25149/// When the MSVC runtime transfers control to us, either to an outlined
25150/// function or when returning to a parent frame after catching an exception, we
25151/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25152/// Here's the math:
25153/// RegNodeBase = EntryEBP - RegNodeSize
25154/// ParentFP = RegNodeBase - ParentFrameOffset
25155/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25156/// subtracting the offset (negative on x86) takes us back to the parent FP.
25157static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25158 SDValue EntryEBP) {
25159 MachineFunction &MF = DAG.getMachineFunction();
25160 SDLoc dl;
25161
25162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25163 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25164
25165 // It's possible that the parent function no longer has a personality function
25166 // if the exceptional code was optimized away, in which case we just return
25167 // the incoming EBP.
25168 if (!Fn->hasPersonalityFn())
25169 return EntryEBP;
25170
25171 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25172 // registration, or the .set_setframe offset.
25173 MCSymbol *OffsetSym =
25174 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25175 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25176 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25177 SDValue ParentFrameOffset =
25178 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25179
25180 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25181 // prologue to RBP in the parent function.
25182 const X86Subtarget &Subtarget =
25183 static_cast<const X86Subtarget &>(DAG.getSubtarget());
25184 if (Subtarget.is64Bit())
25185 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25186
25187 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25188 // RegNodeBase = EntryEBP - RegNodeSize
25189 // ParentFP = RegNodeBase - ParentFrameOffset
25190 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25191 DAG.getConstant(RegNodeSize, dl, PtrVT));
25192 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25193}
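
The x86-32 math in the comment above is two subtractions; on x86-64 the recorded offset is instead added to the incoming RSP. A sketch of the 32-bit path, using the register-node sizes returned by getSEHRegistrationNodeSize; recoverParentFP is a hypothetical name.

#include <cstdint>

// ParentFP = (EntryEBP - RegNodeSize) - ParentFrameOffset, where RegNodeSize
// is 24 for MSVC SEH and 16 for MSVC C++ EH, and the offset is negative.
uint32_t recoverParentFP(uint32_t entryEBP, uint32_t regNodeSize,
                         int32_t parentFrameOffset) {
  uint32_t regNodeBase = entryEBP - regNodeSize;
  return regNodeBase - parentFrameOffset;
}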
25194
25195SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25196 SelectionDAG &DAG) const {
25197 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25198 auto isRoundModeCurDirection = [](SDValue Rnd) {
25199 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25200 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25201
25202 return false;
25203 };
25204 auto isRoundModeSAE = [](SDValue Rnd) {
25205 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25206 unsigned RC = C->getZExtValue();
25207 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25208 // Clear the NO_EXC bit and check remaining bits.
25209 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25210 // As a convenience we allow no other bits or explicitly
25211 // current direction.
25212 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25213 }
25214 }
25215
25216 return false;
25217 };
25218 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25219 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25220 RC = C->getZExtValue();
25221 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25222 // Clear the NO_EXC bit and check remaining bits.
25223 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25224 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25225 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25226 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25227 RC == X86::STATIC_ROUNDING::TO_ZERO;
25228 }
25229 }
25230
25231 return false;
25232 };
25233
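
All three rounding-mode helpers above decode the same immediate: a NO_EXC bit plus a rounding-direction field. A standalone sketch of the isRoundModeSAE check; the concrete values 0x8 for NO_EXC and 0x4 for CUR_DIRECTION are assumptions about X86::STATIC_ROUNDING, not taken from this file.

#include <cstdint>

// SAE means the NO_EXC bit is set and whatever remains is either empty or
// the CUR_DIRECTION value. (Bit values are assumed, see the note above.)
bool isSAE(uint32_t rc, uint32_t noExcBit = 0x8, uint32_t curDirection = 0x4) {
  if ((rc & noExcBit) == 0)
    return false;                // exceptions are not suppressed
  rc ^= noExcBit;                // clear NO_EXC and inspect the rest
  return rc == 0 || rc == curDirection;
}
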
25234 SDLoc dl(Op);
25235 unsigned IntNo = Op.getConstantOperandVal(0);
25236 MVT VT = Op.getSimpleValueType();
25237 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25238
25239 // Propagate flags from original node to transformed node(s).
25240 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25241
25242 if (IntrData) {
0.1
'IntrData' is non-null
1
Taking true branch
25243 switch(IntrData->Type) {
2
Control jumps to 'case FPCLASSS:' at line 25566
25244 case INTR_TYPE_1OP: {
25245 // We specify 2 possible opcodes for intrinsics with rounding modes.
25246 // First, we check if the intrinsic may have non-default rounding mode,
25247 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25248 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25249 if (IntrWithRoundingModeOpcode != 0) {
25250 SDValue Rnd = Op.getOperand(2);
25251 unsigned RC = 0;
25252 if (isRoundModeSAEToX(Rnd, RC))
25253 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25254 Op.getOperand(1),
25255 DAG.getTargetConstant(RC, dl, MVT::i32));
25256 if (!isRoundModeCurDirection(Rnd))
25257 return SDValue();
25258 }
25259 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25260 Op.getOperand(1));
25261 }
25262 case INTR_TYPE_1OP_SAE: {
25263 SDValue Sae = Op.getOperand(2);
25264
25265 unsigned Opc;
25266 if (isRoundModeCurDirection(Sae))
25267 Opc = IntrData->Opc0;
25268 else if (isRoundModeSAE(Sae))
25269 Opc = IntrData->Opc1;
25270 else
25271 return SDValue();
25272
25273 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25274 }
25275 case INTR_TYPE_2OP: {
25276 SDValue Src2 = Op.getOperand(2);
25277
25278 // We specify 2 possible opcodes for intrinsics with rounding modes.
25279 // First, we check if the intrinsic may have non-default rounding mode,
25280 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25281 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25282 if (IntrWithRoundingModeOpcode != 0) {
25283 SDValue Rnd = Op.getOperand(3);
25284 unsigned RC = 0;
25285 if (isRoundModeSAEToX(Rnd, RC))
25286 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25287 Op.getOperand(1), Src2,
25288 DAG.getTargetConstant(RC, dl, MVT::i32));
25289 if (!isRoundModeCurDirection(Rnd))
25290 return SDValue();
25291 }
25292
25293 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25294 Op.getOperand(1), Src2);
25295 }
25296 case INTR_TYPE_2OP_SAE: {
25297 SDValue Sae = Op.getOperand(3);
25298
25299 unsigned Opc;
25300 if (isRoundModeCurDirection(Sae))
25301 Opc = IntrData->Opc0;
25302 else if (isRoundModeSAE(Sae))
25303 Opc = IntrData->Opc1;
25304 else
25305 return SDValue();
25306
25307 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25308 Op.getOperand(2));
25309 }
25310 case INTR_TYPE_3OP:
25311 case INTR_TYPE_3OP_IMM8: {
25312 SDValue Src1 = Op.getOperand(1);
25313 SDValue Src2 = Op.getOperand(2);
25314 SDValue Src3 = Op.getOperand(3);
25315
25316 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25317 Src3.getValueType() != MVT::i8) {
25318 Src3 = DAG.getTargetConstant(
25319 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25320 }
25321
25322 // We specify 2 possible opcodes for intrinsics with rounding modes.
25323 // First, we check if the intrinsic may have non-default rounding mode,
25324 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25325 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25326 if (IntrWithRoundingModeOpcode != 0) {
25327 SDValue Rnd = Op.getOperand(4);
25328 unsigned RC = 0;
25329 if (isRoundModeSAEToX(Rnd, RC))
25330 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25331 Src1, Src2, Src3,
25332 DAG.getTargetConstant(RC, dl, MVT::i32));
25333 if (!isRoundModeCurDirection(Rnd))
25334 return SDValue();
25335 }
25336
25337 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25338 {Src1, Src2, Src3});
25339 }
25340 case INTR_TYPE_4OP_IMM8: {
25341 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant)((void)0);
25342 SDValue Src4 = Op.getOperand(4);
25343 if (Src4.getValueType() != MVT::i8) {
25344 Src4 = DAG.getTargetConstant(
25345 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25346 }
25347
25348 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25349 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25350 Src4);
25351 }
25352 case INTR_TYPE_1OP_MASK: {
25353 SDValue Src = Op.getOperand(1);
25354 SDValue PassThru = Op.getOperand(2);
25355 SDValue Mask = Op.getOperand(3);
25356 // We add rounding mode to the Node when
25357 // - RC Opcode is specified and
25358 // - RC is not "current direction".
25359 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25360 if (IntrWithRoundingModeOpcode != 0) {
25361 SDValue Rnd = Op.getOperand(4);
25362 unsigned RC = 0;
25363 if (isRoundModeSAEToX(Rnd, RC))
25364 return getVectorMaskingNode(
25365 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25366 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25367 Mask, PassThru, Subtarget, DAG);
25368 if (!isRoundModeCurDirection(Rnd))
25369 return SDValue();
25370 }
25371 return getVectorMaskingNode(
25372 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25373 Subtarget, DAG);
25374 }
25375 case INTR_TYPE_1OP_MASK_SAE: {
25376 SDValue Src = Op.getOperand(1);
25377 SDValue PassThru = Op.getOperand(2);
25378 SDValue Mask = Op.getOperand(3);
25379 SDValue Rnd = Op.getOperand(4);
25380
25381 unsigned Opc;
25382 if (isRoundModeCurDirection(Rnd))
25383 Opc = IntrData->Opc0;
25384 else if (isRoundModeSAE(Rnd))
25385 Opc = IntrData->Opc1;
25386 else
25387 return SDValue();
25388
25389 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25390 Subtarget, DAG);
25391 }
25392 case INTR_TYPE_SCALAR_MASK: {
25393 SDValue Src1 = Op.getOperand(1);
25394 SDValue Src2 = Op.getOperand(2);
25395 SDValue passThru = Op.getOperand(3);
25396 SDValue Mask = Op.getOperand(4);
25397 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25398 // There are 2 kinds of intrinsics in this group:
25399 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25400 // (2) With rounding mode and sae - 7 operands.
25401 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25402 if (Op.getNumOperands() == (5U + HasRounding)) {
25403 if (HasRounding) {
25404 SDValue Rnd = Op.getOperand(5);
25405 unsigned RC = 0;
25406 if (isRoundModeSAEToX(Rnd, RC))
25407 return getScalarMaskingNode(
25408 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25409 DAG.getTargetConstant(RC, dl, MVT::i32)),
25410 Mask, passThru, Subtarget, DAG);
25411 if (!isRoundModeCurDirection(Rnd))
25412 return SDValue();
25413 }
25414 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25415 Src2),
25416 Mask, passThru, Subtarget, DAG);
25417 }
25418
25419 assert(Op.getNumOperands() == (6U + HasRounding) &&((void)0)
25420 "Unexpected intrinsic form")((void)0);
25421 SDValue RoundingMode = Op.getOperand(5);
25422 unsigned Opc = IntrData->Opc0;
25423 if (HasRounding) {
25424 SDValue Sae = Op.getOperand(6);
25425 if (isRoundModeSAE(Sae))
25426 Opc = IntrWithRoundingModeOpcode;
25427 else if (!isRoundModeCurDirection(Sae))
25428 return SDValue();
25429 }
25430 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25431 Src2, RoundingMode),
25432 Mask, passThru, Subtarget, DAG);
25433 }
25434 case INTR_TYPE_SCALAR_MASK_RND: {
25435 SDValue Src1 = Op.getOperand(1);
25436 SDValue Src2 = Op.getOperand(2);
25437 SDValue passThru = Op.getOperand(3);
25438 SDValue Mask = Op.getOperand(4);
25439 SDValue Rnd = Op.getOperand(5);
25440
25441 SDValue NewOp;
25442 unsigned RC = 0;
25443 if (isRoundModeCurDirection(Rnd))
25444 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25445 else if (isRoundModeSAEToX(Rnd, RC))
25446 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25447 DAG.getTargetConstant(RC, dl, MVT::i32));
25448 else
25449 return SDValue();
25450
25451 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25452 }
25453 case INTR_TYPE_SCALAR_MASK_SAE: {
25454 SDValue Src1 = Op.getOperand(1);
25455 SDValue Src2 = Op.getOperand(2);
25456 SDValue passThru = Op.getOperand(3);
25457 SDValue Mask = Op.getOperand(4);
25458 SDValue Sae = Op.getOperand(5);
25459 unsigned Opc;
25460 if (isRoundModeCurDirection(Sae))
25461 Opc = IntrData->Opc0;
25462 else if (isRoundModeSAE(Sae))
25463 Opc = IntrData->Opc1;
25464 else
25465 return SDValue();
25466
25467 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25468 Mask, passThru, Subtarget, DAG);
25469 }
25470 case INTR_TYPE_2OP_MASK: {
25471 SDValue Src1 = Op.getOperand(1);
25472 SDValue Src2 = Op.getOperand(2);
25473 SDValue PassThru = Op.getOperand(3);
25474 SDValue Mask = Op.getOperand(4);
25475 SDValue NewOp;
25476 if (IntrData->Opc1 != 0) {
25477 SDValue Rnd = Op.getOperand(5);
25478 unsigned RC = 0;
25479 if (isRoundModeSAEToX(Rnd, RC))
25480 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25481 DAG.getTargetConstant(RC, dl, MVT::i32));
25482 else if (!isRoundModeCurDirection(Rnd))
25483 return SDValue();
25484 }
25485 if (!NewOp)
25486 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25487 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25488 }
25489 case INTR_TYPE_2OP_MASK_SAE: {
25490 SDValue Src1 = Op.getOperand(1);
25491 SDValue Src2 = Op.getOperand(2);
25492 SDValue PassThru = Op.getOperand(3);
25493 SDValue Mask = Op.getOperand(4);
25494
25495 unsigned Opc = IntrData->Opc0;
25496 if (IntrData->Opc1 != 0) {
25497 SDValue Sae = Op.getOperand(5);
25498 if (isRoundModeSAE(Sae))
25499 Opc = IntrData->Opc1;
25500 else if (!isRoundModeCurDirection(Sae))
25501 return SDValue();
25502 }
25503
25504 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25505 Mask, PassThru, Subtarget, DAG);
25506 }
25507 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25508 SDValue Src1 = Op.getOperand(1);
25509 SDValue Src2 = Op.getOperand(2);
25510 SDValue Src3 = Op.getOperand(3);
25511 SDValue PassThru = Op.getOperand(4);
25512 SDValue Mask = Op.getOperand(5);
25513 SDValue Sae = Op.getOperand(6);
25514 unsigned Opc;
25515 if (isRoundModeCurDirection(Sae))
25516 Opc = IntrData->Opc0;
25517 else if (isRoundModeSAE(Sae))
25518 Opc = IntrData->Opc1;
25519 else
25520 return SDValue();
25521
25522 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25523 Mask, PassThru, Subtarget, DAG);
25524 }
25525 case INTR_TYPE_3OP_MASK_SAE: {
25526 SDValue Src1 = Op.getOperand(1);
25527 SDValue Src2 = Op.getOperand(2);
25528 SDValue Src3 = Op.getOperand(3);
25529 SDValue PassThru = Op.getOperand(4);
25530 SDValue Mask = Op.getOperand(5);
25531
25532 unsigned Opc = IntrData->Opc0;
25533 if (IntrData->Opc1 != 0) {
25534 SDValue Sae = Op.getOperand(6);
25535 if (isRoundModeSAE(Sae))
25536 Opc = IntrData->Opc1;
25537 else if (!isRoundModeCurDirection(Sae))
25538 return SDValue();
25539 }
25540 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25541 Mask, PassThru, Subtarget, DAG);
25542 }
25543 case BLENDV: {
25544 SDValue Src1 = Op.getOperand(1);
25545 SDValue Src2 = Op.getOperand(2);
25546 SDValue Src3 = Op.getOperand(3);
25547
25548 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25549 Src3 = DAG.getBitcast(MaskVT, Src3);
25550
25551 // Reverse the operands to match VSELECT order.
25552 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25553 }
25554 case VPERM_2OP : {
25555 SDValue Src1 = Op.getOperand(1);
25556 SDValue Src2 = Op.getOperand(2);
25557
25558 // Swap Src1 and Src2 in the node creation
25559 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25560 }
25561 case IFMA_OP:
25562 // NOTE: We need to swizzle the operands to pass the multiply operands
25563 // first.
25564 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25565 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25566 case FPCLASSS: {
25567 SDValue Src1 = Op.getOperand(1);
25568 SDValue Imm = Op.getOperand(2);
25569 SDValue Mask = Op.getOperand(3);
25570 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25571 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
3
Calling defaulted default constructor for 'SDValue'
6
Returning from default constructor for 'SDValue'
7
Calling 'getScalarMaskingNode'
25572 Subtarget, DAG);
25573 // Need to fill with zeros to ensure the bitcast will produce zeroes
25574 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25575 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25576 DAG.getConstant(0, dl, MVT::v8i1),
25577 FPclassMask, DAG.getIntPtrConstant(0, dl));
25578 return DAG.getBitcast(MVT::i8, Ins);
25579 }
25580
25581 case CMP_MASK_CC: {
25582 MVT MaskVT = Op.getSimpleValueType();
25583 SDValue CC = Op.getOperand(3);
25584 SDValue Mask = Op.getOperand(4);
25585 // We specify 2 possible opcodes for intrinsics with rounding modes.
25586 // First, we check if the intrinsic may have non-default rounding mode,
25587 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25588 if (IntrData->Opc1 != 0) {
25589 SDValue Sae = Op.getOperand(5);
25590 if (isRoundModeSAE(Sae))
25591 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25592 Op.getOperand(2), CC, Mask, Sae);
25593 if (!isRoundModeCurDirection(Sae))
25594 return SDValue();
25595 }
25596 //default rounding mode
25597 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25598 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25599 }
25600 case CMP_MASK_SCALAR_CC: {
25601 SDValue Src1 = Op.getOperand(1);
25602 SDValue Src2 = Op.getOperand(2);
25603 SDValue CC = Op.getOperand(3);
25604 SDValue Mask = Op.getOperand(4);
25605
25606 SDValue Cmp;
25607 if (IntrData->Opc1 != 0) {
25608 SDValue Sae = Op.getOperand(5);
25609 if (isRoundModeSAE(Sae))
25610 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25611 else if (!isRoundModeCurDirection(Sae))
25612 return SDValue();
25613 }
25614 //default rounding mode
25615 if (!Cmp.getNode())
25616 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25617
25618 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25619 Subtarget, DAG);
25620 // Need to fill with zeros to ensure the bitcast will produce zeroes
25621 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25622 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25623 DAG.getConstant(0, dl, MVT::v8i1),
25624 CmpMask, DAG.getIntPtrConstant(0, dl));
25625 return DAG.getBitcast(MVT::i8, Ins);
25626 }
25627 case COMI: { // Comparison intrinsics
25628 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25629 SDValue LHS = Op.getOperand(1);
25630 SDValue RHS = Op.getOperand(2);
25631 // Some conditions require the operands to be swapped.
25632 if (CC == ISD::SETLT || CC == ISD::SETLE)
25633 std::swap(LHS, RHS);
25634
25635 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25636 SDValue SetCC;
25637 switch (CC) {
25638 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25639 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25640 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25641 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25642 break;
25643 }
25644 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25645 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25646 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25647 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25648 break;
25649 }
25650 case ISD::SETGT: // (CF = 0 and ZF = 0)
25651 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25652 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25653 break;
25654 }
25655 case ISD::SETGE: // CF = 0
25656 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25657 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25658 break;
25659 default:
25660 llvm_unreachable("Unexpected illegal condition!")__builtin_unreachable();
25661 }
25662 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25663 }
25664 case COMI_RM: { // Comparison intrinsics with Sae
25665 SDValue LHS = Op.getOperand(1);
25666 SDValue RHS = Op.getOperand(2);
25667 unsigned CondVal = Op.getConstantOperandVal(3);
25668 SDValue Sae = Op.getOperand(4);
25669
25670 SDValue FCmp;
25671 if (isRoundModeCurDirection(Sae))
25672 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25673 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25674 else if (isRoundModeSAE(Sae))
25675 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25676 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25677 else
25678 return SDValue();
25679 // Need to fill with zeros to ensure the bitcast will produce zeroes
25680 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25681 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25682 DAG.getConstant(0, dl, MVT::v16i1),
25683 FCmp, DAG.getIntPtrConstant(0, dl));
25684 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25685 DAG.getBitcast(MVT::i16, Ins));
25686 }
25687 case VSHIFT:
25688 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25689 Op.getOperand(1), Op.getOperand(2), Subtarget,
25690 DAG);
25691 case COMPRESS_EXPAND_IN_REG: {
25692 SDValue Mask = Op.getOperand(3);
25693 SDValue DataToCompress = Op.getOperand(1);
25694 SDValue PassThru = Op.getOperand(2);
25695 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25696 return Op.getOperand(1);
25697
25698 // Avoid false dependency.
25699 if (PassThru.isUndef())
25700 PassThru = DAG.getConstant(0, dl, VT);
25701
25702 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25703 Mask);
25704 }
25705 case FIXUPIMM:
25706 case FIXUPIMM_MASKZ: {
25707 SDValue Src1 = Op.getOperand(1);
25708 SDValue Src2 = Op.getOperand(2);
25709 SDValue Src3 = Op.getOperand(3);
25710 SDValue Imm = Op.getOperand(4);
25711 SDValue Mask = Op.getOperand(5);
25712 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25713 ? Src1
25714 : getZeroVector(VT, Subtarget, DAG, dl);
25715
25716 unsigned Opc = IntrData->Opc0;
25717 if (IntrData->Opc1 != 0) {
25718 SDValue Sae = Op.getOperand(6);
25719 if (isRoundModeSAE(Sae))
25720 Opc = IntrData->Opc1;
25721 else if (!isRoundModeCurDirection(Sae))
25722 return SDValue();
25723 }
25724
25725 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25726
25727 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25728 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25729
25730 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25731 }
25732 case ROUNDP: {
25733 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode")((void)0);
25734 // Clear the upper bits of the rounding immediate so that the legacy
25735 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25736 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25737 SDValue RoundingMode =
25738 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25739 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25740 Op.getOperand(1), RoundingMode);
25741 }
25742 case ROUNDS: {
25743 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode")((void)0);
25744 // Clear the upper bits of the rounding immediate so that the legacy
25745 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25746 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25747 SDValue RoundingMode =
25748 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25749 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25750 Op.getOperand(1), Op.getOperand(2), RoundingMode);
25751 }
25752 case BEXTRI: {
25753 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode")((void)0);
25754
25755 uint64_t Imm = Op.getConstantOperandVal(2);
25756 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25757 Op.getValueType());
25758 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25759 Op.getOperand(1), Control);
25760 }
25761 // ADC/ADCX/SBB
25762 case ADX: {
25763 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25764 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25765
25766 SDValue Res;
25767 // If the carry in is zero, then we should just use ADD/SUB instead of
25768 // ADC/SBB.
25769 if (isNullConstant(Op.getOperand(1))) {
25770 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25771 Op.getOperand(3));
25772 } else {
25773 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25774 DAG.getConstant(-1, dl, MVT::i8));
25775 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25776 Op.getOperand(3), GenCF.getValue(1));
25777 }
25778 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25779 SDValue Results[] = { SetCC, Res };
25780 return DAG.getMergeValues(Results, dl);
25781 }
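
The ADX case above relies on add-with-carry degenerating to a plain ADD when the carry-in is known zero; otherwise CF is first re-materialized by adding 255 to the non-zero carry byte before the ADC. A scalar sketch of the adc semantics being lowered; adc32 is a hypothetical name.

#include <cstdint>

// 32-bit add-with-carry as the intrinsic defines it: a result plus a carry-out
// byte, matching the SETB on the flag value above.
uint32_t adc32(uint8_t carryIn, uint32_t a, uint32_t b, uint8_t *carryOut) {
  uint64_t wide = uint64_t(a) + uint64_t(b) + (carryIn ? 1u : 0u);
  *carryOut = static_cast<uint8_t>((wide >> 32) & 1);  // carry out of bit 31
  return static_cast<uint32_t>(wide);
}
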
25782 case CVTPD2PS_MASK:
25783 case CVTPD2DQ_MASK:
25784 case CVTQQ2PS_MASK:
25785 case TRUNCATE_TO_REG: {
25786 SDValue Src = Op.getOperand(1);
25787 SDValue PassThru = Op.getOperand(2);
25788 SDValue Mask = Op.getOperand(3);
25789
25790 if (isAllOnesConstant(Mask))
25791 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25792
25793 MVT SrcVT = Src.getSimpleValueType();
25794 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25795 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25796 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25797 {Src, PassThru, Mask});
25798 }
25799 case CVTPS2PH_MASK: {
25800 SDValue Src = Op.getOperand(1);
25801 SDValue Rnd = Op.getOperand(2);
25802 SDValue PassThru = Op.getOperand(3);
25803 SDValue Mask = Op.getOperand(4);
25804
25805 if (isAllOnesConstant(Mask))
25806 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
25807
25808 MVT SrcVT = Src.getSimpleValueType();
25809 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25810 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25811 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
25812 PassThru, Mask);
25813
25814 }
25815 case CVTNEPS2BF16_MASK: {
25816 SDValue Src = Op.getOperand(1);
25817 SDValue PassThru = Op.getOperand(2);
25818 SDValue Mask = Op.getOperand(3);
25819
25820 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25821 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25822
25823 // Break false dependency.
25824 if (PassThru.isUndef())
25825 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25826
25827 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25828 Mask);
25829 }
25830 default:
25831 break;
25832 }
25833 }
25834
25835 switch (IntNo) {
25836 default: return SDValue(); // Don't custom lower most intrinsics.
25837
25838 // ptest and testp intrinsics. The intrinsic these come from are designed to
25839 // return an integer value, not just an instruction so lower it to the ptest
25840 // or testp pattern and a setcc for the result.
25841 case Intrinsic::x86_avx512_ktestc_b:
25842 case Intrinsic::x86_avx512_ktestc_w:
25843 case Intrinsic::x86_avx512_ktestc_d:
25844 case Intrinsic::x86_avx512_ktestc_q:
25845 case Intrinsic::x86_avx512_ktestz_b:
25846 case Intrinsic::x86_avx512_ktestz_w:
25847 case Intrinsic::x86_avx512_ktestz_d:
25848 case Intrinsic::x86_avx512_ktestz_q:
25849 case Intrinsic::x86_sse41_ptestz:
25850 case Intrinsic::x86_sse41_ptestc:
25851 case Intrinsic::x86_sse41_ptestnzc:
25852 case Intrinsic::x86_avx_ptestz_256:
25853 case Intrinsic::x86_avx_ptestc_256:
25854 case Intrinsic::x86_avx_ptestnzc_256:
25855 case Intrinsic::x86_avx_vtestz_ps:
25856 case Intrinsic::x86_avx_vtestc_ps:
25857 case Intrinsic::x86_avx_vtestnzc_ps:
25858 case Intrinsic::x86_avx_vtestz_pd:
25859 case Intrinsic::x86_avx_vtestc_pd:
25860 case Intrinsic::x86_avx_vtestnzc_pd:
25861 case Intrinsic::x86_avx_vtestz_ps_256:
25862 case Intrinsic::x86_avx_vtestc_ps_256:
25863 case Intrinsic::x86_avx_vtestnzc_ps_256:
25864 case Intrinsic::x86_avx_vtestz_pd_256:
25865 case Intrinsic::x86_avx_vtestc_pd_256:
25866 case Intrinsic::x86_avx_vtestnzc_pd_256: {
25867 unsigned TestOpc = X86ISD::PTEST;
25868 X86::CondCode X86CC;
25869 switch (IntNo) {
25870 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")__builtin_unreachable();
25871 case Intrinsic::x86_avx512_ktestc_b:
25872 case Intrinsic::x86_avx512_ktestc_w:
25873 case Intrinsic::x86_avx512_ktestc_d:
25874 case Intrinsic::x86_avx512_ktestc_q:
25875 // CF = 1
25876 TestOpc = X86ISD::KTEST;
25877 X86CC = X86::COND_B;
25878 break;
25879 case Intrinsic::x86_avx512_ktestz_b:
25880 case Intrinsic::x86_avx512_ktestz_w:
25881 case Intrinsic::x86_avx512_ktestz_d:
25882 case Intrinsic::x86_avx512_ktestz_q:
25883 TestOpc = X86ISD::KTEST;
25884 X86CC = X86::COND_E;
25885 break;
25886 case Intrinsic::x86_avx_vtestz_ps:
25887 case Intrinsic::x86_avx_vtestz_pd:
25888 case Intrinsic::x86_avx_vtestz_ps_256:
25889 case Intrinsic::x86_avx_vtestz_pd_256:
25890 TestOpc = X86ISD::TESTP;
25891 LLVM_FALLTHROUGH;
25892 case Intrinsic::x86_sse41_ptestz:
25893 case Intrinsic::x86_avx_ptestz_256:
25894 // ZF = 1
25895 X86CC = X86::COND_E;
25896 break;
25897 case Intrinsic::x86_avx_vtestc_ps:
25898 case Intrinsic::x86_avx_vtestc_pd:
25899 case Intrinsic::x86_avx_vtestc_ps_256:
25900 case Intrinsic::x86_avx_vtestc_pd_256:
25901 TestOpc = X86ISD::TESTP;
25902 LLVM_FALLTHROUGH;
25903 case Intrinsic::x86_sse41_ptestc:
25904 case Intrinsic::x86_avx_ptestc_256:
25905 // CF = 1
25906 X86CC = X86::COND_B;
25907 break;
25908 case Intrinsic::x86_avx_vtestnzc_ps:
25909 case Intrinsic::x86_avx_vtestnzc_pd:
25910 case Intrinsic::x86_avx_vtestnzc_ps_256:
25911 case Intrinsic::x86_avx_vtestnzc_pd_256:
25912 TestOpc = X86ISD::TESTP;
25913 LLVM_FALLTHROUGH;
25914 case Intrinsic::x86_sse41_ptestnzc:
25915 case Intrinsic::x86_avx_ptestnzc_256:
25916 // ZF and CF = 0
25917 X86CC = X86::COND_A;
25918 break;
25919 }
25920
25921 SDValue LHS = Op.getOperand(1);
25922 SDValue RHS = Op.getOperand(2);
25923 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25924 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25925 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25926 }
25927
25928 case Intrinsic::x86_sse42_pcmpistria128:
25929 case Intrinsic::x86_sse42_pcmpestria128:
25930 case Intrinsic::x86_sse42_pcmpistric128:
25931 case Intrinsic::x86_sse42_pcmpestric128:
25932 case Intrinsic::x86_sse42_pcmpistrio128:
25933 case Intrinsic::x86_sse42_pcmpestrio128:
25934 case Intrinsic::x86_sse42_pcmpistris128:
25935 case Intrinsic::x86_sse42_pcmpestris128:
25936 case Intrinsic::x86_sse42_pcmpistriz128:
25937 case Intrinsic::x86_sse42_pcmpestriz128: {
25938 unsigned Opcode;
25939 X86::CondCode X86CC;
25940 switch (IntNo) {
25941 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
25942 case Intrinsic::x86_sse42_pcmpistria128:
25943 Opcode = X86ISD::PCMPISTR;
25944 X86CC = X86::COND_A;
25945 break;
25946 case Intrinsic::x86_sse42_pcmpestria128:
25947 Opcode = X86ISD::PCMPESTR;
25948 X86CC = X86::COND_A;
25949 break;
25950 case Intrinsic::x86_sse42_pcmpistric128:
25951 Opcode = X86ISD::PCMPISTR;
25952 X86CC = X86::COND_B;
25953 break;
25954 case Intrinsic::x86_sse42_pcmpestric128:
25955 Opcode = X86ISD::PCMPESTR;
25956 X86CC = X86::COND_B;
25957 break;
25958 case Intrinsic::x86_sse42_pcmpistrio128:
25959 Opcode = X86ISD::PCMPISTR;
25960 X86CC = X86::COND_O;
25961 break;
25962 case Intrinsic::x86_sse42_pcmpestrio128:
25963 Opcode = X86ISD::PCMPESTR;
25964 X86CC = X86::COND_O;
25965 break;
25966 case Intrinsic::x86_sse42_pcmpistris128:
25967 Opcode = X86ISD::PCMPISTR;
25968 X86CC = X86::COND_S;
25969 break;
25970 case Intrinsic::x86_sse42_pcmpestris128:
25971 Opcode = X86ISD::PCMPESTR;
25972 X86CC = X86::COND_S;
25973 break;
25974 case Intrinsic::x86_sse42_pcmpistriz128:
25975 Opcode = X86ISD::PCMPISTR;
25976 X86CC = X86::COND_E;
25977 break;
25978 case Intrinsic::x86_sse42_pcmpestriz128:
25979 Opcode = X86ISD::PCMPESTR;
25980 X86CC = X86::COND_E;
25981 break;
25982 }
25983 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25984 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25985 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25986 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25987 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25988 }
25989
25990 case Intrinsic::x86_sse42_pcmpistri128:
25991 case Intrinsic::x86_sse42_pcmpestri128: {
25992 unsigned Opcode;
25993 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
25994 Opcode = X86ISD::PCMPISTR;
25995 else
25996 Opcode = X86ISD::PCMPESTR;
25997
25998 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25999 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26000 return DAG.getNode(Opcode, dl, VTs, NewOps);
26001 }
26002
26003 case Intrinsic::x86_sse42_pcmpistrm128:
26004 case Intrinsic::x86_sse42_pcmpestrm128: {
26005 unsigned Opcode;
26006 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26007 Opcode = X86ISD::PCMPISTR;
26008 else
26009 Opcode = X86ISD::PCMPESTR;
26010
26011 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26012 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26013 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26014 }
26015
26016 case Intrinsic::eh_sjlj_lsda: {
26017 MachineFunction &MF = DAG.getMachineFunction();
26018 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26019 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26020 auto &Context = MF.getMMI().getContext();
26021 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26022 Twine(MF.getFunctionNumber()));
26023 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26024 DAG.getMCSymbol(S, PtrVT));
26025 }
26026
26027 case Intrinsic::x86_seh_lsda: {
26028 // Compute the symbol for the LSDA. We know it'll get emitted later.
26029 MachineFunction &MF = DAG.getMachineFunction();
26030 SDValue Op1 = Op.getOperand(1);
26031 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26032 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26033 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26034
26035 // Generate a simple absolute symbol reference. This intrinsic is only
26036 // supported on 32-bit Windows, which isn't PIC.
26037 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26038 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26039 }
26040
26041 case Intrinsic::eh_recoverfp: {
26042 SDValue FnOp = Op.getOperand(1);
26043 SDValue IncomingFPOp = Op.getOperand(2);
26044 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26045 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26046 if (!Fn)
26047 report_fatal_error(
26048 "llvm.eh.recoverfp must take a function as the first argument");
26049 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26050 }
26051
26052 case Intrinsic::localaddress: {
26053 // Returns one of the stack, base, or frame pointer registers, depending on
26054 // which is used to reference local variables.
26055 MachineFunction &MF = DAG.getMachineFunction();
26056 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26057 unsigned Reg;
26058 if (RegInfo->hasBasePointer(MF))
26059 Reg = RegInfo->getBaseRegister();
26060 else { // Handles the SP or FP case.
26061 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26062 if (CantUseFP)
26063 Reg = RegInfo->getPtrSizedStackRegister(MF);
26064 else
26065 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26066 }
26067 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26068 }
26069 case Intrinsic::swift_async_context_addr: {
26070 auto &MF = DAG.getMachineFunction();
26071 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26072 if (Subtarget.is64Bit()) {
26073 MF.getFrameInfo().setFrameAddressIsTaken(true);
26074 X86FI->setHasSwiftAsyncContext(true);
26075 return SDValue(
26076 DAG.getMachineNode(
26077 X86::SUB64ri8, dl, MVT::i64,
26078 DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
26079 DAG.getTargetConstant(8, dl, MVT::i32)),
26080 0);
26081 } else {
26082 // 32-bit so no special extended frame, create or reuse an existing stack
26083 // slot.
26084 if (!X86FI->getSwiftAsyncContextFrameIdx())
26085 X86FI->setSwiftAsyncContextFrameIdx(
26086 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26087 return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26088 }
26089 }
26090 case Intrinsic::x86_avx512_vp2intersect_q_512:
26091 case Intrinsic::x86_avx512_vp2intersect_q_256:
26092 case Intrinsic::x86_avx512_vp2intersect_q_128:
26093 case Intrinsic::x86_avx512_vp2intersect_d_512:
26094 case Intrinsic::x86_avx512_vp2intersect_d_256:
26095 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26096 MVT MaskVT = Op.getSimpleValueType();
26097
26098 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26099 SDLoc DL(Op);
26100
26101 SDValue Operation =
26102 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26103 Op->getOperand(1), Op->getOperand(2));
26104
26105 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26106 MaskVT, Operation);
26107 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26108 MaskVT, Operation);
26109 return DAG.getMergeValues({Result0, Result1}, DL);
26110 }
26111 case Intrinsic::x86_mmx_pslli_w:
26112 case Intrinsic::x86_mmx_pslli_d:
26113 case Intrinsic::x86_mmx_pslli_q:
26114 case Intrinsic::x86_mmx_psrli_w:
26115 case Intrinsic::x86_mmx_psrli_d:
26116 case Intrinsic::x86_mmx_psrli_q:
26117 case Intrinsic::x86_mmx_psrai_w:
26118 case Intrinsic::x86_mmx_psrai_d: {
26119 SDLoc DL(Op);
26120 SDValue ShAmt = Op.getOperand(2);
26121 // If the argument is a constant, convert it to a target constant.
26122 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26123 // Clamp out-of-bounds shift amounts since they will otherwise be masked
26124 // to 8 bits, which may make them no longer out of bounds.
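// For example, a shift amount of 256 would be masked down to 0 by the 8-bit
// immediate, silently turning an out-of-bounds shift into a no-op; clamping
// it to 255 keeps it out of bounds as intended.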
26125 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26126 if (ShiftAmount == 0)
26127 return Op.getOperand(1);
26128
26129 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26130 Op.getOperand(0), Op.getOperand(1),
26131 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26132 }
26133
26134 unsigned NewIntrinsic;
26135 switch (IntNo) {
26136 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26137 case Intrinsic::x86_mmx_pslli_w:
26138 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26139 break;
26140 case Intrinsic::x86_mmx_pslli_d:
26141 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26142 break;
26143 case Intrinsic::x86_mmx_pslli_q:
26144 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26145 break;
26146 case Intrinsic::x86_mmx_psrli_w:
26147 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26148 break;
26149 case Intrinsic::x86_mmx_psrli_d:
26150 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26151 break;
26152 case Intrinsic::x86_mmx_psrli_q:
26153 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26154 break;
26155 case Intrinsic::x86_mmx_psrai_w:
26156 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26157 break;
26158 case Intrinsic::x86_mmx_psrai_d:
26159 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26160 break;
26161 }
26162
26163 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
26164 // the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an
26165 // MMX register.
26166 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26167 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26168 DAG.getTargetConstant(NewIntrinsic, DL,
26169 getPointerTy(DAG.getDataLayout())),
26170 Op.getOperand(1), ShAmt);
26171 }
26172 }
26173}
26174
26175static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26176 SDValue Src, SDValue Mask, SDValue Base,
26177 SDValue Index, SDValue ScaleOp, SDValue Chain,
26178 const X86Subtarget &Subtarget) {
26179 SDLoc dl(Op);
26180 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26181 // Scale must be constant.
26182 if (!C)
26183 return SDValue();
26184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26185 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26186 TLI.getPointerTy(DAG.getDataLayout()));
26187 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26188 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26189 // If source is undef or we know it won't be used, use a zero vector
26190 // to break register dependency.
26191 // TODO: use undef instead and let BreakFalseDeps deal with it?
26192 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26193 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26194
26195 // Cast mask to an integer type.
26196 Mask = DAG.getBitcast(MaskVT, Mask);
26197
26198 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26199
26200 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26201 SDValue Res =
26202 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26203 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26204 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26205}
26206
26207static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26208 SDValue Src, SDValue Mask, SDValue Base,
26209 SDValue Index, SDValue ScaleOp, SDValue Chain,
26210 const X86Subtarget &Subtarget) {
26211 MVT VT = Op.getSimpleValueType();
26212 SDLoc dl(Op);
26213 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26214 // Scale must be constant.
26215 if (!C)
26216 return SDValue();
26217 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26218 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26219 TLI.getPointerTy(DAG.getDataLayout()));
26220 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26221 VT.getVectorNumElements());
26222 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26223
26224 // We support two versions of the gather intrinsics. One with scalar mask and
26225 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26226 if (Mask.getValueType() != MaskVT)
26227 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26228
26229 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26230 // If source is undef or we know it won't be used, use a zero vector
26231 // to break register dependency.
26232 // TODO: use undef instead and let BreakFalseDeps deal with it?
26233 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26234 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26235
26236 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26237
26238 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26239 SDValue Res =
26240 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26241 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26242 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26243}
26244
26245static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26246 SDValue Src, SDValue Mask, SDValue Base,
26247 SDValue Index, SDValue ScaleOp, SDValue Chain,
26248 const X86Subtarget &Subtarget) {
26249 SDLoc dl(Op);
26250 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26251 // Scale must be constant.
26252 if (!C)
26253 return SDValue();
26254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26255 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26256 TLI.getPointerTy(DAG.getDataLayout()));
26257 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26258 Src.getSimpleValueType().getVectorNumElements());
26259 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26260
26261 // We support two versions of the scatter intrinsics. One with scalar mask and
26262 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26263 if (Mask.getValueType() != MaskVT)
26264 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26265
26266 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26267
26268 SDVTList VTs = DAG.getVTList(MVT::Other);
26269 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26270 SDValue Res =
26271 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26272 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26273 return Res;
26274}
26275
26276static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26277 SDValue Mask, SDValue Base, SDValue Index,
26278 SDValue ScaleOp, SDValue Chain,
26279 const X86Subtarget &Subtarget) {
26280 SDLoc dl(Op);
26281 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26282 // Scale must be constant.
26283 if (!C)
26284 return SDValue();
26285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26286 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26287 TLI.getPointerTy(DAG.getDataLayout()));
26288 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26289 SDValue Segment = DAG.getRegister(0, MVT::i32);
26290 MVT MaskVT =
26291 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26292 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26293 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26294 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26295 return SDValue(Res, 0);
26296}
26297
26298/// Handles the lowering of builtin intrinsics with chain that return their
26299/// value into registers EDX:EAX.
26300 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
26301/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26302/// TargetOpcode.
26303/// Returns a Glue value which can be used to add extra copy-from-reg if the
26304 /// expanded intrinsic implicitly defines extra registers (i.e. not just
26305/// EDX:EAX).
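/// For example, when this helper expands RDTSC (SrcReg == 0), the counter is
/// returned in EDX:EAX; on 64-bit targets the two halves read from RDX:RAX
/// are merged into one i64 as (HI << 32) | LO, while 32-bit targets build an
/// i64 pair from EDX and EAX instead.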
26306static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26307 SelectionDAG &DAG,
26308 unsigned TargetOpcode,
26309 unsigned SrcReg,
26310 const X86Subtarget &Subtarget,
26311 SmallVectorImpl<SDValue> &Results) {
26312 SDValue Chain = N->getOperand(0);
26313 SDValue Glue;
26314
26315 if (SrcReg) {
26316 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26317 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26318 Glue = Chain.getValue(1);
26319 }
26320
26321 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26322 SDValue N1Ops[] = {Chain, Glue};
26323 SDNode *N1 = DAG.getMachineNode(
26324 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26325 Chain = SDValue(N1, 0);
26326
26327 // Reads the content of XCR and returns it in registers EDX:EAX.
26328 SDValue LO, HI;
26329 if (Subtarget.is64Bit()) {
26330 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26331 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26332 LO.getValue(2));
26333 } else {
26334 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26335 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26336 LO.getValue(2));
26337 }
26338 Chain = HI.getValue(1);
26339 Glue = HI.getValue(2);
26340
26341 if (Subtarget.is64Bit()) {
26342 // Merge the two 32-bit values into a 64-bit one.
26343 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26344 DAG.getConstant(32, DL, MVT::i8));
26345 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26346 Results.push_back(Chain);
26347 return Glue;
26348 }
26349
26350 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26351 SDValue Ops[] = { LO, HI };
26352 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26353 Results.push_back(Pair);
26354 Results.push_back(Chain);
26355 return Glue;
26356}
26357
26358/// Handles the lowering of builtin intrinsics that read the time stamp counter
26359/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26360/// READCYCLECOUNTER nodes.
26361static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26362 SelectionDAG &DAG,
26363 const X86Subtarget &Subtarget,
26364 SmallVectorImpl<SDValue> &Results) {
26365 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26366 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26367 // and the EAX register is loaded with the low-order 32 bits.
26368 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26369 /* NoRegister */0, Subtarget,
26370 Results);
26371 if (Opcode != X86::RDTSCP)
26372 return;
26373
26374 SDValue Chain = Results[1];
26375 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
26376 // the ECX register. Add 'ecx' explicitly to the chain.
26377 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26378 Results[1] = ecx;
26379 Results.push_back(ecx.getValue(1));
26380}
26381
26382static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26383 SelectionDAG &DAG) {
26384 SmallVector<SDValue, 3> Results;
26385 SDLoc DL(Op);
26386 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26387 Results);
26388 return DAG.getMergeValues(Results, DL);
26389}
26390
26391static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26392 MachineFunction &MF = DAG.getMachineFunction();
26393 SDValue Chain = Op.getOperand(0);
26394 SDValue RegNode = Op.getOperand(2);
26395 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26396 if (!EHInfo)
26397 report_fatal_error("EH registrations only live in functions using WinEH");
26398
26399 // Cast the operand to an alloca, and remember the frame index.
26400 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26401 if (!FINode)
26402 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26403 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26404
26405 // Return the chain operand without making any DAG nodes.
26406 return Chain;
26407}
26408
26409static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26410 MachineFunction &MF = DAG.getMachineFunction();
26411 SDValue Chain = Op.getOperand(0);
26412 SDValue EHGuard = Op.getOperand(2);
26413 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26414 if (!EHInfo)
26415 report_fatal_error("EHGuard only live in functions using WinEH");
26416
26417 // Cast the operand to an alloca, and remember the frame index.
26418 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26419 if (!FINode)
26420 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26421 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26422
26423 // Return the chain operand without making any DAG nodes.
26424 return Chain;
26425}
26426
26427/// Emit Truncating Store with signed or unsigned saturation.
26428static SDValue
26429EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
26430 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26431 SelectionDAG &DAG) {
26432 SDVTList VTs = DAG.getVTList(MVT::Other);
26433 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26434 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26435 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26436 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26437}
26438
26439/// Emit Masked Truncating Store with signed or unsigned saturation.
26440static SDValue
26441EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
26442 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26443 MachineMemOperand *MMO, SelectionDAG &DAG) {
26444 SDVTList VTs = DAG.getVTList(MVT::Other);
26445 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26446 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26447 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26448}
26449
26450static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26451 SelectionDAG &DAG) {
26452 unsigned IntNo = Op.getConstantOperandVal(1);
26453 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26454 if (!IntrData) {
26455 switch (IntNo) {
26456 case llvm::Intrinsic::x86_seh_ehregnode:
26457 return MarkEHRegistrationNode(Op, DAG);
26458 case llvm::Intrinsic::x86_seh_ehguard:
26459 return MarkEHGuard(Op, DAG);
26460 case llvm::Intrinsic::x86_rdpkru: {
26461 SDLoc dl(Op);
26462 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26463 // Create a RDPKRU node and pass 0 to the ECX parameter.
26464 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26465 DAG.getConstant(0, dl, MVT::i32));
26466 }
26467 case llvm::Intrinsic::x86_wrpkru: {
26468 SDLoc dl(Op);
26469 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26470 // to the EDX and ECX parameters.
26471 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26472 Op.getOperand(0), Op.getOperand(2),
26473 DAG.getConstant(0, dl, MVT::i32),
26474 DAG.getConstant(0, dl, MVT::i32));
26475 }
26476 case llvm::Intrinsic::x86_flags_read_u32:
26477 case llvm::Intrinsic::x86_flags_read_u64:
26478 case llvm::Intrinsic::x86_flags_write_u32:
26479 case llvm::Intrinsic::x86_flags_write_u64: {
26480 // We need a frame pointer because this will get lowered to a PUSH/POP
26481 // sequence.
26482 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26483 MFI.setHasCopyImplyingStackAdjustment(true);
26484 // Don't do anything here, we will expand these intrinsics out later
26485 // during FinalizeISel in EmitInstrWithCustomInserter.
26486 return Op;
26487 }
26488 case Intrinsic::x86_lwpins32:
26489 case Intrinsic::x86_lwpins64:
26490 case Intrinsic::x86_umwait:
26491 case Intrinsic::x86_tpause: {
26492 SDLoc dl(Op);
26493 SDValue Chain = Op->getOperand(0);
26494 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26495 unsigned Opcode;
26496
26497 switch (IntNo) {
26498 default: llvm_unreachable("Impossible intrinsic");
26499 case Intrinsic::x86_umwait:
26500 Opcode = X86ISD::UMWAIT;
26501 break;
26502 case Intrinsic::x86_tpause:
26503 Opcode = X86ISD::TPAUSE;
26504 break;
26505 case Intrinsic::x86_lwpins32:
26506 case Intrinsic::x86_lwpins64:
26507 Opcode = X86ISD::LWPINS;
26508 break;
26509 }
26510
26511 SDValue Operation =
26512 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26513 Op->getOperand(3), Op->getOperand(4));
26514 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26515 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26516 Operation.getValue(1));
26517 }
26518 case Intrinsic::x86_enqcmd:
26519 case Intrinsic::x86_enqcmds: {
26520 SDLoc dl(Op);
26521 SDValue Chain = Op.getOperand(0);
26522 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26523 unsigned Opcode;
26524 switch (IntNo) {
26525 default: llvm_unreachable("Impossible intrinsic!");
26526 case Intrinsic::x86_enqcmd:
26527 Opcode = X86ISD::ENQCMD;
26528 break;
26529 case Intrinsic::x86_enqcmds:
26530 Opcode = X86ISD::ENQCMDS;
26531 break;
26532 }
26533 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26534 Op.getOperand(3));
26535 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26536 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26537 Operation.getValue(1));
26538 }
26539 case Intrinsic::x86_aesenc128kl:
26540 case Intrinsic::x86_aesdec128kl:
26541 case Intrinsic::x86_aesenc256kl:
26542 case Intrinsic::x86_aesdec256kl: {
26543 SDLoc DL(Op);
26544 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26545 SDValue Chain = Op.getOperand(0);
26546 unsigned Opcode;
26547
26548 switch (IntNo) {
26549 default: llvm_unreachable("Impossible intrinsic");
26550 case Intrinsic::x86_aesenc128kl:
26551 Opcode = X86ISD::AESENC128KL;
26552 break;
26553 case Intrinsic::x86_aesdec128kl:
26554 Opcode = X86ISD::AESDEC128KL;
26555 break;
26556 case Intrinsic::x86_aesenc256kl:
26557 Opcode = X86ISD::AESENC256KL;
26558 break;
26559 case Intrinsic::x86_aesdec256kl:
26560 Opcode = X86ISD::AESDEC256KL;
26561 break;
26562 }
26563
26564 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26565 MachineMemOperand *MMO = MemIntr->getMemOperand();
26566 EVT MemVT = MemIntr->getMemoryVT();
26567 SDValue Operation = DAG.getMemIntrinsicNode(
26568 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26569 MMO);
26570 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26571
26572 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26573 {ZF, Operation.getValue(0), Operation.getValue(2)});
26574 }
26575 case Intrinsic::x86_aesencwide128kl:
26576 case Intrinsic::x86_aesdecwide128kl:
26577 case Intrinsic::x86_aesencwide256kl:
26578 case Intrinsic::x86_aesdecwide256kl: {
26579 SDLoc DL(Op);
26580 SDVTList VTs = DAG.getVTList(
26581 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26582 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26583 SDValue Chain = Op.getOperand(0);
26584 unsigned Opcode;
26585
26586 switch (IntNo) {
26587 default: llvm_unreachable("Impossible intrinsic");
26588 case Intrinsic::x86_aesencwide128kl:
26589 Opcode = X86ISD::AESENCWIDE128KL;
26590 break;
26591 case Intrinsic::x86_aesdecwide128kl:
26592 Opcode = X86ISD::AESDECWIDE128KL;
26593 break;
26594 case Intrinsic::x86_aesencwide256kl:
26595 Opcode = X86ISD::AESENCWIDE256KL;
26596 break;
26597 case Intrinsic::x86_aesdecwide256kl:
26598 Opcode = X86ISD::AESDECWIDE256KL;
26599 break;
26600 }
26601
26602 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26603 MachineMemOperand *MMO = MemIntr->getMemOperand();
26604 EVT MemVT = MemIntr->getMemoryVT();
26605 SDValue Operation = DAG.getMemIntrinsicNode(
26606 Opcode, DL, VTs,
26607 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26608 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26609 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26610 MemVT, MMO);
26611 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26612
26613 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26614 {ZF, Operation.getValue(1), Operation.getValue(2),
26615 Operation.getValue(3), Operation.getValue(4),
26616 Operation.getValue(5), Operation.getValue(6),
26617 Operation.getValue(7), Operation.getValue(8),
26618 Operation.getValue(9)});
26619 }
26620 case Intrinsic::x86_testui: {
26621 SDLoc dl(Op);
26622 SDValue Chain = Op.getOperand(0);
26623 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26624 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26625 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26626 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26627 Operation.getValue(1));
26628 }
26629 }
26630 return SDValue();
26631 }
26632
26633 SDLoc dl(Op);
26634 switch(IntrData->Type) {
26635 default: llvm_unreachable("Unknown Intrinsic Type");
26636 case RDSEED:
26637 case RDRAND: {
26638 // Emit the node with the right value type.
26639 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26640 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26641
26642 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
26643 // Otherwise return the value from Rand, which is always 0, cast to i32.
26644 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26645 DAG.getConstant(1, dl, Op->getValueType(1)),
26646 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26647 SDValue(Result.getNode(), 1)};
26648 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26649
26650 // Return { result, isValid, chain }.
26651 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26652 SDValue(Result.getNode(), 2));
26653 }
26654 case GATHER_AVX2: {
26655 SDValue Chain = Op.getOperand(0);
26656 SDValue Src = Op.getOperand(2);
26657 SDValue Base = Op.getOperand(3);
26658 SDValue Index = Op.getOperand(4);
26659 SDValue Mask = Op.getOperand(5);
26660 SDValue Scale = Op.getOperand(6);
26661 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26662 Scale, Chain, Subtarget);
26663 }
26664 case GATHER: {
26665 //gather(v1, mask, index, base, scale);
26666 SDValue Chain = Op.getOperand(0);
26667 SDValue Src = Op.getOperand(2);
26668 SDValue Base = Op.getOperand(3);
26669 SDValue Index = Op.getOperand(4);
26670 SDValue Mask = Op.getOperand(5);
26671 SDValue Scale = Op.getOperand(6);
26672 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26673 Chain, Subtarget);
26674 }
26675 case SCATTER: {
26676 //scatter(base, mask, index, v1, scale);
26677 SDValue Chain = Op.getOperand(0);
26678 SDValue Base = Op.getOperand(2);
26679 SDValue Mask = Op.getOperand(3);
26680 SDValue Index = Op.getOperand(4);
26681 SDValue Src = Op.getOperand(5);
26682 SDValue Scale = Op.getOperand(6);
26683 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26684 Scale, Chain, Subtarget);
26685 }
26686 case PREFETCH: {
26687 const APInt &HintVal = Op.getConstantOperandAPInt(6);
26688 assert((HintVal == 2 || HintVal == 3) &&
26689 "Wrong prefetch hint in intrinsic: should be 2 or 3");
26690 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26691 SDValue Chain = Op.getOperand(0);
26692 SDValue Mask = Op.getOperand(2);
26693 SDValue Index = Op.getOperand(3);
26694 SDValue Base = Op.getOperand(4);
26695 SDValue Scale = Op.getOperand(5);
26696 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26697 Subtarget);
26698 }
26699 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26700 case RDTSC: {
26701 SmallVector<SDValue, 2> Results;
26702 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26703 Results);
26704 return DAG.getMergeValues(Results, dl);
26705 }
26706 // Read Performance Monitoring Counters.
26707 case RDPMC:
26708 // GetExtended Control Register.
26709 case XGETBV: {
26710 SmallVector<SDValue, 2> Results;
26711
26712 // RDPMC uses ECX to select the index of the performance counter to read.
26713 // XGETBV uses ECX to select the index of the XCR register to return.
26714 // The result is stored into registers EDX:EAX.
26715 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26716 Subtarget, Results);
26717 return DAG.getMergeValues(Results, dl);
26718 }
26719 // XTEST intrinsics.
26720 case XTEST: {
26721 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26722 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26723
26724 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26725 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26726 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26727 Ret, SDValue(InTrans.getNode(), 1));
26728 }
26729 case TRUNCATE_TO_MEM_VI8:
26730 case TRUNCATE_TO_MEM_VI16:
26731 case TRUNCATE_TO_MEM_VI32: {
26732 SDValue Mask = Op.getOperand(4);
26733 SDValue DataToTruncate = Op.getOperand(3);
26734 SDValue Addr = Op.getOperand(2);
26735 SDValue Chain = Op.getOperand(0);
26736
26737 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26738 assert(MemIntr && "Expected MemIntrinsicSDNode!");
26739
26740 EVT MemVT = MemIntr->getMemoryVT();
26741
26742 uint16_t TruncationOp = IntrData->Opc0;
26743 switch (TruncationOp) {
26744 case X86ISD::VTRUNC: {
26745 if (isAllOnesConstant(Mask)) // return just a truncate store
26746 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26747 MemIntr->getMemOperand());
26748
26749 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26750 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26751 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26752
26753 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26754 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26755 true /* truncating */);
26756 }
26757 case X86ISD::VTRUNCUS:
26758 case X86ISD::VTRUNCS: {
26759 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26760 if (isAllOnesConstant(Mask))
26761 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26762 MemIntr->getMemOperand(), DAG);
26763
26764 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26765 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26766
26767 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26768 VMask, MemVT, MemIntr->getMemOperand(), DAG);
26769 }
26770 default:
26771 llvm_unreachable("Unsupported truncstore intrinsic");
26772 }
26773 }
26774 }
26775}
26776
26777SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26778 SelectionDAG &DAG) const {
26779 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26780 MFI.setReturnAddressIsTaken(true);
26781
26782 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26783 return SDValue();
26784
26785 unsigned Depth = Op.getConstantOperandVal(0);
26786 SDLoc dl(Op);
26787 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26788
26789 if (Depth > 0) {
26790 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26791 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26792 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26793 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26794 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26795 MachinePointerInfo());
26796 }
26797
26798 // Just load the return address.
26799 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26800 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26801 MachinePointerInfo());
26802}
26803
26804SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26805 SelectionDAG &DAG) const {
26806 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26807 return getReturnAddressFrameIndex(DAG);
26808}
26809
26810SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
26811 MachineFunction &MF = DAG.getMachineFunction();
26812 MachineFrameInfo &MFI = MF.getFrameInfo();
26813 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26814 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26815 EVT VT = Op.getValueType();
26816
26817 MFI.setFrameAddressIsTaken(true);
26818
26819 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
26820 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
26821 // is not possible to crawl up the stack without looking at the unwind codes
26822 // simultaneously.
26823 int FrameAddrIndex = FuncInfo->getFAIndex();
26824 if (!FrameAddrIndex) {
26825 // Set up a frame object for the return address.
26826 unsigned SlotSize = RegInfo->getSlotSize();
26827 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
26828 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
26829 FuncInfo->setFAIndex(FrameAddrIndex);
26830 }
26831 return DAG.getFrameIndex(FrameAddrIndex, VT);
26832 }
26833
26834 unsigned FrameReg =
26835 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
26836 SDLoc dl(Op); // FIXME probably not meaningful
26837 unsigned Depth = Op.getConstantOperandVal(0);
26838 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
26839 (FrameReg == X86::EBP && VT == MVT::i32)) &&
26840 "Invalid Frame Register!");
26841 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
26842 while (Depth--)
26843 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
26844 MachinePointerInfo());
26845 return FrameAddr;
26846}
26847
26848// FIXME? Maybe this could be a TableGen attribute on some registers and
26849// this table could be generated automatically from RegInfo.
26850Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
26851 const MachineFunction &MF) const {
26852 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26853
26854 Register Reg = StringSwitch<unsigned>(RegName)
26855 .Case("esp", X86::ESP)
26856 .Case("rsp", X86::RSP)
26857 .Case("ebp", X86::EBP)
26858 .Case("rbp", X86::RBP)
26859 .Default(0);
26860
26861 if (Reg == X86::EBP || Reg == X86::RBP) {
26862 if (!TFI.hasFP(MF))
26863 report_fatal_error("register " + StringRef(RegName) +
26864 " is allocatable: function has no frame pointer");
26865#ifndef NDEBUG
26866 else {
26867 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26868 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
26869 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
26870 "Invalid Frame Register!");
26871 }
26872#endif
26873 }
26874
26875 if (Reg)
26876 return Reg;
26877
26878 report_fatal_error("Invalid register name global variable");
26879}
26880
26881SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
26882 SelectionDAG &DAG) const {
26883 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26884 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
26885}
26886
26887Register X86TargetLowering::getExceptionPointerRegister(
26888 const Constant *PersonalityFn) const {
26889 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
26890 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26891
26892 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
26893}
26894
26895Register X86TargetLowering::getExceptionSelectorRegister(
26896 const Constant *PersonalityFn) const {
26897 // Funclet personalities don't use selectors (the runtime does the selection).
26898 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
26899 return X86::NoRegister;
26900 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26901}
26902
26903bool X86TargetLowering::needsFixedCatchObjects() const {
26904 return Subtarget.isTargetWin64();
26905}
26906
26907SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
26908 SDValue Chain = Op.getOperand(0);
26909 SDValue Offset = Op.getOperand(1);
26910 SDValue Handler = Op.getOperand(2);
26911 SDLoc dl (Op);
26912
26913 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26914 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26915 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
26916 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
26917 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
26918 "Invalid Frame Register!");
26919 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
26920 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
26921
26922 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
26923 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
26924 dl));
26925 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
26926 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
26927 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
26928
26929 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
26930 DAG.getRegister(StoreAddrReg, PtrVT));
26931}
26932
26933SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
26934 SelectionDAG &DAG) const {
26935 SDLoc DL(Op);
26936 // If the subtarget is not 64-bit, we may need the global base reg
26937 // after isel expands the pseudo, i.e., after the CGBR pass has run.
26938 // Therefore, ask for the GlobalBaseReg now, so that the pass
26939 // inserts the code for us in case we need it.
26940 // Otherwise, we will end up in a situation where we will
26941 // reference a virtual register that is not defined!
26942 if (!Subtarget.is64Bit()) {
26943 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26944 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
26945 }
26946 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
26947 DAG.getVTList(MVT::i32, MVT::Other),
26948 Op.getOperand(0), Op.getOperand(1));
26949}
26950
26951SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
26952 SelectionDAG &DAG) const {
26953 SDLoc DL(Op);
26954 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
26955 Op.getOperand(0), Op.getOperand(1));
26956}
26957
26958SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
26959 SelectionDAG &DAG) const {
26960 SDLoc DL(Op);
26961 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
26962 Op.getOperand(0));
26963}
26964
26965static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
26966 return Op.getOperand(0);
26967}
26968
26969SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
26970 SelectionDAG &DAG) const {
26971 SDValue Root = Op.getOperand(0);
26972 SDValue Trmp = Op.getOperand(1); // trampoline
26973 SDValue FPtr = Op.getOperand(2); // nested function
26974 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
26975 SDLoc dl (Op);
26976
26977 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26978 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26979
26980 if (Subtarget.is64Bit()) {
26981 SDValue OutChains[6];
26982
26983 // Large code-model.
26984 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
26985 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
26986
26987 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
26988 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
26989
26990 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
26991
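    // Taken together, the stores below emit a 23-byte trampoline:
    //   49 BB <FPtr:8>   movabsq $FPtr, %r11   (opcode at offset 0, imm at 2)
    //   49 BA <Nest:8>   movabsq $Nest, %r10   (opcode at offset 10, imm at 12)
    //   49 FF E3         jmpq   *%r11          (opcode at offset 20, ModRM at 22)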
26992 // Load the pointer to the nested function into R11.
26993 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
26994 SDValue Addr = Trmp;
26995 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26996 Addr, MachinePointerInfo(TrmpAddr));
26997
26998 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26999 DAG.getConstant(2, dl, MVT::i64));
27000 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27001 MachinePointerInfo(TrmpAddr, 2), Align(2));
27002
27003 // Load the 'nest' parameter value into R10.
27004 // R10 is specified in X86CallingConv.td
27005 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27006 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27007 DAG.getConstant(10, dl, MVT::i64));
27008 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27009 Addr, MachinePointerInfo(TrmpAddr, 10));
27010
27011 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27012 DAG.getConstant(12, dl, MVT::i64));
27013 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27014 MachinePointerInfo(TrmpAddr, 12), Align(2));
27015
27016 // Jump to the nested function.
27017 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27018 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27019 DAG.getConstant(20, dl, MVT::i64));
27020 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27021 Addr, MachinePointerInfo(TrmpAddr, 20));
27022
27023 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27024 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27025 DAG.getConstant(22, dl, MVT::i64));
27026 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27027 Addr, MachinePointerInfo(TrmpAddr, 22));
27028
27029 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27030 } else {
27031 const Function *Func =
27032 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27033 CallingConv::ID CC = Func->getCallingConv();
27034 unsigned NestReg;
27035
27036 switch (CC) {
27037 default:
27038 llvm_unreachable("Unsupported calling convention");
27039 case CallingConv::C:
27040 case CallingConv::X86_StdCall: {
27041 // Pass 'nest' parameter in ECX.
27042 // Must be kept in sync with X86CallingConv.td
27043 NestReg = X86::ECX;
27044
27045 // Check that ECX wasn't needed by an 'inreg' parameter.
27046 FunctionType *FTy = Func->getFunctionType();
27047 const AttributeList &Attrs = Func->getAttributes();
27048
27049 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27050 unsigned InRegCount = 0;
27051 unsigned Idx = 1;
27052
27053 for (FunctionType::param_iterator I = FTy->param_begin(),
27054 E = FTy->param_end(); I != E; ++I, ++Idx)
27055 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
27056 const DataLayout &DL = DAG.getDataLayout();
27057 // FIXME: should only count parameters that are lowered to integers.
27058 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27059 }
27060
27061 if (InRegCount > 2) {
27062 report_fatal_error("Nest register in use - reduce number of inreg"
27063 " parameters!");
27064 }
27065 }
27066 break;
27067 }
27068 case CallingConv::X86_FastCall:
27069 case CallingConv::X86_ThisCall:
27070 case CallingConv::Fast:
27071 case CallingConv::Tail:
27072 case CallingConv::SwiftTail:
27073 // Pass 'nest' parameter in EAX.
27074 // Must be kept in sync with X86CallingConv.td
27075 NestReg = X86::EAX;
27076 break;
27077 }
27078
27079 SDValue OutChains[4];
27080 SDValue Addr, Disp;
27081
27082 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27083 DAG.getConstant(10, dl, MVT::i32));
27084 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27085
27086 // This is storing the opcode for MOV32ri.
27087 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27088 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27089 OutChains[0] =
27090 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27091 Trmp, MachinePointerInfo(TrmpAddr));
27092
27093 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27094 DAG.getConstant(1, dl, MVT::i32));
27095 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27096 MachinePointerInfo(TrmpAddr, 1), Align(1));
27097
27098 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27099 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27100 DAG.getConstant(5, dl, MVT::i32));
27101 OutChains[2] =
27102 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27103 MachinePointerInfo(TrmpAddr, 5), Align(1));
27104
27105 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27106 DAG.getConstant(6, dl, MVT::i32));
27107 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27108 MachinePointerInfo(TrmpAddr, 6), Align(1));
27109
27110 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27111 }
27112}
27113
27114SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
27115 SelectionDAG &DAG) const {
27116 /*
27117 The rounding mode is in bits 11:10 of FPSR, and has the following
27118 settings:
27119 00 Round to nearest
27120 01 Round to -inf
27121 10 Round to +inf
27122 11 Round to 0
27123
27124 FLT_ROUNDS, on the other hand, expects the following:
27125 -1 Undefined
27126 0 Round to 0
27127 1 Round to nearest
27128 2 Round to +inf
27129 3 Round to -inf
27130
27131 To perform the conversion, we use a packed lookup table of the four 2-bit
27132 values that we can index by FPSR[11:10]
27133 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27134
27135 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27136 */
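  // Worked example: if FPSR[11:10] == 01 (round to -inf), then
  // FPSR & 0xc00 == 0x400, (0x400 >> 9) == 2, and (0x2d >> 2) & 3 == 3,
  // which is exactly the FLT_ROUNDS encoding for "round to -inf".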
27137
27138 MachineFunction &MF = DAG.getMachineFunction();
27139 MVT VT = Op.getSimpleValueType();
27140 SDLoc DL(Op);
27141
27142 // Save FP Control Word to stack slot
27143 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27144 SDValue StackSlot =
27145 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27146
27147 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27148
27149 SDValue Chain = Op.getOperand(0);
27150 SDValue Ops[] = {Chain, StackSlot};
27151 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27152 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27153 Align(2), MachineMemOperand::MOStore);
27154
27155 // Load FP Control Word from stack slot
27156 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27157 Chain = CWD.getValue(1);
27158
27159 // Mask and turn the control bits into a shift for the lookup table.
27160 SDValue Shift =
27161 DAG.getNode(ISD::SRL, DL, MVT::i16,
27162 DAG.getNode(ISD::AND, DL, MVT::i16,
27163 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27164 DAG.getConstant(9, DL, MVT::i8));
27165 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27166
27167 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27168 SDValue RetVal =
27169 DAG.getNode(ISD::AND, DL, MVT::i32,
27170 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27171 DAG.getConstant(3, DL, MVT::i32));
27172
27173 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27174
27175 return DAG.getMergeValues({RetVal, Chain}, DL);
27176}
27177
27178SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27179 SelectionDAG &DAG) const {
27180 MachineFunction &MF = DAG.getMachineFunction();
27181 SDLoc DL(Op);
27182 SDValue Chain = Op.getNode()->getOperand(0);
27183
27184 // The FP control word may be set only from data in memory, so we need to
27185 // allocate stack space to save/load the FP control word.
27186 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27187 SDValue StackSlot =
27188 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27189 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27190 MachineMemOperand *MMO =
27191 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27192
27193 // Store FP control word into memory.
27194 SDValue Ops[] = {Chain, StackSlot};
27195 Chain = DAG.getMemIntrinsicNode(
27196 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27197
27198 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27199 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27200 Chain = CWD.getValue(1);
27201 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27202 DAG.getConstant(0xf3ff, DL, MVT::i16));
27203
27204 // Calculate new rounding mode.
27205 SDValue NewRM = Op.getNode()->getOperand(1);
27206 SDValue RMBits;
27207 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27208 uint64_t RM = CVal->getZExtValue();
27209 int FieldVal;
27210 switch (static_cast<RoundingMode>(RM)) {
27211 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27212 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27213 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27214 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27215 default:
27216 llvm_unreachable("rounding mode is not supported by X86 hardware");
27217 }
27218 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27219 } else {
27220 // Need to convert argument into bits of control word:
27221 // 0 Round to 0 -> 11
27222 // 1 Round to nearest -> 00
27223 // 2 Round to +inf -> 10
27224 // 3 Round to -inf -> 01
27225 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
27226 // To make the conversion, put all these values into a value 0xc9 and shift
27227 // it left depending on the rounding mode:
27228 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27229 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27230 // ...
27231 // (0xc9 << (2 * NewRM + 4)) & 0xc00
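    // For example, NewRM == 2 (round to +inf) gives a shift of 2*2+4 == 8, so
    // (0xc9 << 8) & 0xc00 == 0x800, i.e. bits 11:10 == 10, the x87 encoding
    // for rounding toward +inf.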
27232 SDValue ShiftValue =
27233 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27234 DAG.getNode(ISD::ADD, DL, MVT::i32,
27235 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27236 DAG.getConstant(1, DL, MVT::i8)),
27237 DAG.getConstant(4, DL, MVT::i32)));
27238 SDValue Shifted =
27239 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27240 ShiftValue);
27241 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27242 DAG.getConstant(0xc00, DL, MVT::i16));
27243 }
27244
27245 // Update rounding mode bits and store the new FP Control Word into stack.
27246 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27247 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
27248
27249 // Load FP control word from the slot.
27250 SDValue OpsLD[] = {Chain, StackSlot};
27251 MachineMemOperand *MMOL =
27252 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27253 Chain = DAG.getMemIntrinsicNode(
27254 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27255
27256 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27257 // same way but in bits 14:13.
27258 if (Subtarget.hasSSE1()) {
27259 // Store MXCSR into memory.
27260 Chain = DAG.getNode(
27261 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27262 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27263 StackSlot);
27264
27265 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27266 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27267 Chain = CWD.getValue(1);
27268 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27269 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27270
27271 // Shift X87 RM bits from 11:10 to 14:13.
27272 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27273 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27274 DAG.getConstant(3, DL, MVT::i8));
27275
27276 // Update rounding mode bits and store the new FP Control Word into stack.
27277 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27278 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
27279
27280 // Load MXCSR from the slot.
27281 Chain = DAG.getNode(
27282 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27283 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27284 StackSlot);
27285 }
27286
27287 return Chain;
27288}
27289
27290/// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
27291//
27292// i8/i16 vectors are implemented using the dword LZCNT vector instruction
27293// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27294// split the vector, perform the operation on its Lo and Hi parts and
27295// concatenate the results.
27296static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27297 const X86Subtarget &Subtarget) {
27298 assert(Op.getOpcode() == ISD::CTLZ);
27299 SDLoc dl(Op);
27300 MVT VT = Op.getSimpleValueType();
27301 MVT EltVT = VT.getVectorElementType();
27302 unsigned NumElems = VT.getVectorNumElements();
27303
27304 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27305 "Unsupported element type");
27306
27307 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27308 if (NumElems > 16 ||
27309 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27310 return splitVectorIntUnary(Op, DAG);
27311
27312 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27313 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27314 "Unsupported value type for operation");
27315
27316 // Use the natively supported vector instruction vplzcntd.
27317 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27318 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27319 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27320 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27321
27322 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27323}
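As a scalar analogue of the lowering above (a host-side sketch, not part of the listed source), counting leading zeros of an i8 value via a 32-bit count amounts to subtracting the 32 - 8 extra leading zeros introduced by the zero extension:

#include <cassert>
#include <cstdint>

// Portable 32-bit leading-zero count (defined for 0 as well).
static unsigned clz32(uint32_t X) {
  unsigned N = 0;
  for (uint32_t Bit = 0x80000000u; Bit && !(X & Bit); Bit >>= 1)
    ++N;
  return N;
}

static unsigned ctlz8(uint8_t X) {
  // zext to i32, count there, then drop the 24 leading zeros the zext added.
  return clz32(X) - (32 - 8);
}

int main() {
  assert(ctlz8(0) == 8);
  assert(ctlz8(1) == 7);
  assert(ctlz8(0x80) == 0);
  assert(ctlz8(0x10) == 3);
  return 0;
}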
27324
27325// Lower CTLZ using a PSHUFB lookup table implementation.
27326static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27327 const X86Subtarget &Subtarget,
27328 SelectionDAG &DAG) {
27329 MVT VT = Op.getSimpleValueType();
27330 int NumElts = VT.getVectorNumElements();
27331 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27332 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27333
27334 // Per-nibble leading zero PSHUFB lookup table.
27335 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27336 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27337 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27338 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27339
27340 SmallVector<SDValue, 64> LUTVec;
27341 for (int i = 0; i < NumBytes; ++i)
27342 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27343 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27344
27345 // Begin by bitcasting the input to byte vector, then split those bytes
27346 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27347 // If the hi input nibble is zero then we add both results together, otherwise
27348 // we just take the hi result (by masking the lo result to zero before the
27349 // add).
27350 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27351 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27352
27353 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27354 SDValue Lo = Op0;
27355 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27356 SDValue HiZ;
27357 if (CurrVT.is512BitVector()) {
27358 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27359 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27360 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27361 } else {
27362 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27363 }
27364
27365 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27366 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27367 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27368 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27369
27370 // Merge result back from vXi8 back to VT, working on the lo/hi halves
27371 // of the current vector width in the same way we did for the nibbles.
27372 // If the upper half of the input element is zero then add the halves'
27373 // leading zero counts together, otherwise just use the upper half's.
27374 // Double the width of the result until we are at target width.
27375 while (CurrVT != VT) {
27376 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27377 int CurrNumElts = CurrVT.getVectorNumElements();
27378 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27379 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27380 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27381
27382 // Check if the upper half of the input element is zero.
27383 if (CurrVT.is512BitVector()) {
27384 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27385 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27386 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27387 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27388 } else {
27389 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27390 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27391 }
27392 HiZ = DAG.getBitcast(NextVT, HiZ);
27393
27394 // Move the upper/lower halves to the lower bits as we'll be extending to
27395 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27396 // together.
27397 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27398 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27399 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27400 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27401 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27402 CurrVT = NextVT;
27403 }
27404
27405 return Res;
27406}
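The per-nibble LUT strategy is easier to see on a single byte. This standalone sketch (illustrative only, not part of the listed source) models the PSHUFB lookups and the HiZ masking: the low-nibble count only participates when the high nibble is zero.

#include <cassert>
#include <cstdint>

static unsigned ctlz8_lut(uint8_t X) {
  // Per-nibble leading zero counts, same table as the vector code.
  static const unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                   0, 0, 0, 0, 0, 0, 0, 0};
  unsigned Hi = X >> 4;
  unsigned Lo = X & 0xf;
  // If the high nibble is zero its count is 4 and the low nibble's count is
  // added; otherwise the high nibble's count alone is the answer.
  return LUT[Hi] + (Hi == 0 ? LUT[Lo] : 0);
}

int main() {
  for (unsigned X = 0; X < 256; ++X) {
    unsigned Ref = 0; // reference count via bit scan
    for (unsigned Bit = 0x80; Bit && !(X & Bit); Bit >>= 1)
      ++Ref;
    assert(ctlz8_lut(static_cast<uint8_t>(X)) == Ref);
  }
  return 0;
}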
27407
27408static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27409 const X86Subtarget &Subtarget,
27410 SelectionDAG &DAG) {
27411 MVT VT = Op.getSimpleValueType();
27412
27413 if (Subtarget.hasCDI() &&
27414 // vXi8 vectors need to be promoted to 512-bits for vXi32.
27415 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27416 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27417
27418 // Decompose 256-bit ops into smaller 128-bit ops.
27419 if (VT.is256BitVector() && !Subtarget.hasInt256())
27420 return splitVectorIntUnary(Op, DAG);
27421
27422 // Decompose 512-bit ops into smaller 256-bit ops.
27423 if (VT.is512BitVector() && !Subtarget.hasBWI())
27424 return splitVectorIntUnary(Op, DAG);
27425
27426 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27427 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27428}
27429
27430static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27431 SelectionDAG &DAG) {
27432 MVT VT = Op.getSimpleValueType();
27433 MVT OpVT = VT;
27434 unsigned NumBits = VT.getSizeInBits();
27435 SDLoc dl(Op);
27436 unsigned Opc = Op.getOpcode();
27437
27438 if (VT.isVector())
27439 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27440
27441 Op = Op.getOperand(0);
27442 if (VT == MVT::i8) {
27443 // Zero extend to i32 since there is not an i8 bsr.
27444 OpVT = MVT::i32;
27445 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27446 }
27447
27448 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27449 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27450 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27451
27452 if (Opc == ISD::CTLZ) {
27453 // If src is zero (i.e. bsr sets ZF), returns NumBits.
27454 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27455 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27456 Op.getValue(1)};
27457 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
27458 }
27459
27460 // Finally xor with NumBits-1.
27461 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27462 DAG.getConstant(NumBits - 1, dl, OpVT));
27463
27464 if (VT == MVT::i8)
27465 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27466 return Op;
27467}
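A scalar model of the BSR-based path (not part of the listed source): for a power-of-two bit width N, (N - 1) - bsr(x) equals bsr(x) ^ (N - 1), and feeding 2*N - 1 through the XOR for a zero input yields N, matching the CMOV above.

#include <cassert>
#include <cstdint>

static unsigned bsr32(uint32_t X) { // undefined for X == 0, like the instruction
  unsigned Idx = 31;
  while (!(X & (1u << Idx)))
    --Idx;
  return Idx;
}

static unsigned ctlz32(uint32_t X) {
  unsigned R = (X == 0) ? (32 + 32 - 1) : bsr32(X); // the CMOV on ZF
  return R ^ 31;                                    // XOR with NumBits - 1
}

int main() {
  assert(ctlz32(0) == 32);
  assert(ctlz32(1) == 31);
  assert(ctlz32(0x80000000u) == 0);
  assert(ctlz32(0x00010000u) == 15);
  return 0;
}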
27468
27469static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27470 SelectionDAG &DAG) {
27471 MVT VT = Op.getSimpleValueType();
27472 unsigned NumBits = VT.getScalarSizeInBits();
27473 SDValue N0 = Op.getOperand(0);
27474 SDLoc dl(Op);
27475
27476 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
27477 "Only scalar CTTZ requires custom lowering");
27478
27479 // Issue a bsf (scan bits forward) which also sets EFLAGS.
27480 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27481 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27482
27483 // If src is zero (i.e. bsf sets ZF), returns NumBits.
27484 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27485 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27486 Op.getValue(1)};
27487 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27488}
27489
27490static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27491 const X86Subtarget &Subtarget) {
27492 MVT VT = Op.getSimpleValueType();
27493 if (VT == MVT::i16 || VT == MVT::i32)
27494 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
27495
27496 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27497 return splitVectorIntBinary(Op, DAG);
27498
27499 assert(Op.getSimpleValueType().is256BitVector() &&
27500 Op.getSimpleValueType().isInteger() &&
27501 "Only handle AVX 256-bit vector integer operation");
27502 return splitVectorIntBinary(Op, DAG);
27503}
27504
27505static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
27506 const X86Subtarget &Subtarget) {
27507 MVT VT = Op.getSimpleValueType();
27508 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
27509 unsigned Opcode = Op.getOpcode();
27510 SDLoc DL(Op);
27511
27512 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
27513 (VT.is256BitVector() && !Subtarget.hasInt256())) {
27514 assert(Op.getSimpleValueType().isInteger() &&
27515 "Only handle AVX vector integer operation");
27516 return splitVectorIntBinary(Op, DAG);
27517 }
27518
27519 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
27520 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27521 EVT SetCCResultType =
27522 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27523
27524 if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
27525 // usubsat X, Y --> (X >u Y) ? X - Y : 0
27526 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
27527 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
27528 // TODO: Move this to DAGCombiner?
27529 if (SetCCResultType == VT &&
27530 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
27531 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
27532 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
27533 }
27534
27535 // Use default expansion.
27536 return SDValue();
27537}
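The USUBSAT expansion can be checked on scalars. In this sketch (illustrative, with a hypothetical usubsat8 helper), the all-ones/all-zeros compare result plays the role of the vector setcc, so the select collapses to an AND exactly as in the code above.

#include <cassert>
#include <cstdint>

static uint8_t usubsat8(uint8_t X, uint8_t Y) {
  uint8_t Sub = static_cast<uint8_t>(X - Y);
  uint8_t Mask = (X > Y) ? 0xff : 0x00; // vector setcc yields all-ones / all-zeros
  return Sub & Mask;                    // select folded to a single AND
}

int main() {
  assert(usubsat8(10, 3) == 7);
  assert(usubsat8(3, 10) == 0);
  assert(usubsat8(0, 0) == 0);
  assert(usubsat8(255, 1) == 254);
  return 0;
}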
27538
27539static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
27540 SelectionDAG &DAG) {
27541 MVT VT = Op.getSimpleValueType();
27542 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
27543 // Since X86 does not have CMOV for 8-bit integer, we don't convert
27544 // 8-bit integer abs to NEG and CMOV.
27545 SDLoc DL(Op);
27546 SDValue N0 = Op.getOperand(0);
27547 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
27548 DAG.getConstant(0, DL, VT), N0);
27549 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
27550 SDValue(Neg.getNode(), 1)};
27551 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
27552 }
27553
27554 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
27555 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
27556 SDLoc DL(Op);
27557 SDValue Src = Op.getOperand(0);
27558 SDValue Sub =
27559 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
27560 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
27561 }
27562
27563 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
27564 assert(VT.isInteger() &&
27565 "Only handle AVX 256-bit vector integer operation");
27566 return splitVectorIntUnary(Op, DAG);
27567 }
27568
27569 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27570 return splitVectorIntUnary(Op, DAG);
27571
27572 // Default to expand.
27573 return SDValue();
27574}
27575
27576static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
27577 MVT VT = Op.getSimpleValueType();
27578
27579 // For AVX1 cases, split to use legal ops (everything but v4i64).
27580 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
27581 return splitVectorIntBinary(Op, DAG);
27582
27583 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27584 return splitVectorIntBinary(Op, DAG);
27585
27586 // Default to expand.
27587 return SDValue();
27588}
27589
27590static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
27591 SelectionDAG &DAG) {
27592 SDLoc dl(Op);
27593 MVT VT = Op.getSimpleValueType();
27594
27595 // Decompose 256-bit ops into 128-bit ops.
27596 if (VT.is256BitVector() && !Subtarget.hasInt256())
27597 return splitVectorIntBinary(Op, DAG);
27598
27599 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27600 return splitVectorIntBinary(Op, DAG);
27601
27602 SDValue A = Op.getOperand(0);
27603 SDValue B = Op.getOperand(1);
27604
27605 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
27606 // vector pairs, multiply and truncate.
27607 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
27608 unsigned NumElts = VT.getVectorNumElements();
27609
27610 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27611 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27612 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
27613 return DAG.getNode(
27614 ISD::TRUNCATE, dl, VT,
27615 DAG.getNode(ISD::MUL, dl, ExVT,
27616 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
27617 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
27618 }
27619
27620 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27621
27622 // Extract the lo/hi parts to any extend to i16.
27623 // We're going to mask off the low byte of each result element of the
27624 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
27625 // element.
27626 SDValue Undef = DAG.getUNDEF(VT);
27627 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
27628 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
27629
27630 SDValue BLo, BHi;
27631 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27632 // If the RHS is a constant, manually unpackl/unpackh.
27633 SmallVector<SDValue, 16> LoOps, HiOps;
27634 for (unsigned i = 0; i != NumElts; i += 16) {
27635 for (unsigned j = 0; j != 8; ++j) {
27636 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
27637 MVT::i16));
27638 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
27639 MVT::i16));
27640 }
27641 }
27642
27643 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27644 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27645 } else {
27646 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
27647 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
27648 }
27649
27650 // Multiply, mask the lower 8 bits of the lo/hi results and pack.
27651 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
27652 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
27653 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
27654 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
27655 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27656 }
27657
27658 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
27659 if (VT == MVT::v4i32) {
27660 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
27661 "Should not custom lower when pmulld is available!");
27662
27663 // Extract the odd parts.
27664 static const int UnpackMask[] = { 1, -1, 3, -1 };
27665 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
27666 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
27667
27668 // Multiply the even parts.
27669 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27670 DAG.getBitcast(MVT::v2i64, A),
27671 DAG.getBitcast(MVT::v2i64, B));
27672 // Now multiply odd parts.
27673 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27674 DAG.getBitcast(MVT::v2i64, Aodds),
27675 DAG.getBitcast(MVT::v2i64, Bodds));
27676
27677 Evens = DAG.getBitcast(VT, Evens);
27678 Odds = DAG.getBitcast(VT, Odds);
27679
27680 // Merge the two vectors back together with a shuffle. This expands into 2
27681 // shuffles.
27682 static const int ShufMask[] = { 0, 4, 2, 6 };
27683 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
27684 }
27685
27686 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
27687 "Only know how to lower V2I64/V4I64/V8I64 multiply");
27688 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
27689
27690 // Ahi = psrlqi(a, 32);
27691 // Bhi = psrlqi(b, 32);
27692 //
27693 // AloBlo = pmuludq(a, b);
27694 // AloBhi = pmuludq(a, Bhi);
27695 // AhiBlo = pmuludq(Ahi, b);
27696 //
27697 // Hi = psllqi(AloBhi + AhiBlo, 32);
27698 // return AloBlo + Hi;
27699 KnownBits AKnown = DAG.computeKnownBits(A);
27700 KnownBits BKnown = DAG.computeKnownBits(B);
27701
27702 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
27703 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
27704 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
27705
27706 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
27707 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
27708 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
27709
27710 SDValue Zero = DAG.getConstant(0, dl, VT);
27711
27712 // Only multiply lo/hi halves that aren't known to be zero.
27713 SDValue AloBlo = Zero;
27714 if (!ALoIsZero && !BLoIsZero)
27715 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
27716
27717 SDValue AloBhi = Zero;
27718 if (!ALoIsZero && !BHiIsZero) {
27719 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
27720 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
27721 }
27722
27723 SDValue AhiBlo = Zero;
27724 if (!AHiIsZero && !BLoIsZero) {
27725 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
27726 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
27727 }
27728
27729 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
27730 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
27731
27732 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
27733}
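The PMULUDQ decomposition is ordinary 64-bit schoolbook multiplication on 32-bit halves, where the ahi*bhi term is dropped because it only affects bits above 63. A host-side check (not part of the listed source):

#include <cassert>
#include <cstdint>

static uint64_t mul64_from_32(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;             // pmuludq(a, b)
  uint64_t AloBhi = ALo * BHi;             // pmuludq(a, b >> 32)
  uint64_t AhiBlo = AHi * BLo;             // pmuludq(a >> 32, b)
  return AloBlo + ((AloBhi + AhiBlo) << 32);
}

int main() {
  const uint64_t Tests[] = {0, 1, 0xffffffffull, 0x100000000ull,
                            0xdeadbeefcafebabeull, 0xffffffffffffffffull};
  for (uint64_t A : Tests)
    for (uint64_t B : Tests)
      assert(mul64_from_32(A, B) == A * B); // low 64 bits agree
  return 0;
}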
27734
27735static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
27736 MVT VT, bool IsSigned,
27737 const X86Subtarget &Subtarget,
27738 SelectionDAG &DAG,
27739 SDValue *Low = nullptr) {
27740 unsigned NumElts = VT.getVectorNumElements();
27741
27742 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
27743 // to a vXi16 type. Do the multiplies, shift the results and pack the half
27744 // lane results back together.
27745
27746 // We'll take different approaches for signed and unsigned.
27747 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
27748 // and use pmullw to calculate the full 16-bit product.
27749 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
27750 // shift them left into the upper byte of each word. This allows us to use
27751 // pmulhw to calculate the full 16-bit product. This trick means we don't
27752 // need to sign extend the bytes to use pmullw.
27753
27754 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27755 SDValue Zero = DAG.getConstant(0, dl, VT);
27756
27757 SDValue ALo, AHi;
27758 if (IsSigned) {
27759 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
27760 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
27761 } else {
27762 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
27763 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
27764 }
27765
27766 SDValue BLo, BHi;
27767 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27768 // If the RHS is a constant, manually unpackl/unpackh and extend.
27769 SmallVector<SDValue, 16> LoOps, HiOps;
27770 for (unsigned i = 0; i != NumElts; i += 16) {
27771 for (unsigned j = 0; j != 8; ++j) {
27772 SDValue LoOp = B.getOperand(i + j);
27773 SDValue HiOp = B.getOperand(i + j + 8);
27774
27775 if (IsSigned) {
27776 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
27777 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
27778 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
27779 DAG.getConstant(8, dl, MVT::i16));
27780 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
27781 DAG.getConstant(8, dl, MVT::i16));
27782 } else {
27783 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
27784 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
27785 }
27786
27787 LoOps.push_back(LoOp);
27788 HiOps.push_back(HiOp);
27789 }
27790 }
27791
27792 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27793 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27794 } else if (IsSigned) {
27795 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
27796 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
27797 } else {
27798 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
27799 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
27800 }
27801
27802 // Multiply, lshr the upper 8 bits down to the lower 8 bits of the lo/hi results and
27803 // pack back to vXi8.
27804 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
27805 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
27806 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
27807
27808 if (Low) {
27809 // Mask the lower bits and pack the results to rejoin the halves.
27810 SDValue Mask = DAG.getConstant(255, dl, ExVT);
27811 SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
27812 SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
27813 *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
27814 }
27815
27816 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
27817 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
27818
27819 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
27820 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27821}
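The signed path relies on a small identity: if each byte is placed in the high byte of a 16-bit word, the high half of the signed 16x16 product equals the full signed 8x8 product, which is exactly what pmulhw returns. A standalone check (illustrative only; assumes the usual two's-complement narrowing, guaranteed since C++20):

#include <cassert>
#include <cstdint>

static int16_t mul8_via_mulhw(int8_t A, int8_t B) {
  // Put each byte into the upper byte of a word (punpck*bw with zeros in
  // the low byte); narrowing to int16_t is two's-complement.
  int16_t AW = static_cast<int16_t>(
      static_cast<uint16_t>(static_cast<uint8_t>(A)) << 8);
  int16_t BW = static_cast<int16_t>(
      static_cast<uint16_t>(static_cast<uint8_t>(B)) << 8);
  // pmulhw: high 16 bits of the 32-bit signed product.
  return static_cast<int16_t>((static_cast<int32_t>(AW) * BW) >> 16);
}

int main() {
  for (int A = -128; A <= 127; ++A)
    for (int B = -128; B <= 127; ++B)
      assert(mul8_via_mulhw(static_cast<int8_t>(A), static_cast<int8_t>(B)) ==
             static_cast<int16_t>(A * B));
  return 0;
}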
27822
27823static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
27824 SelectionDAG &DAG) {
27825 SDLoc dl(Op);
27826 MVT VT = Op.getSimpleValueType();
27827 bool IsSigned = Op->getOpcode() == ISD::MULHS;
27828 unsigned NumElts = VT.getVectorNumElements();
27829 SDValue A = Op.getOperand(0);
27830 SDValue B = Op.getOperand(1);
27831
27832 // Decompose 256-bit ops into 128-bit ops.
27833 if (VT.is256BitVector() && !Subtarget.hasInt256())
27834 return splitVectorIntBinary(Op, DAG);
27835
27836 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27837 return splitVectorIntBinary(Op, DAG);
27838
27839 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
27840 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
27841 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
27842 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
27843
27844 // PMULxD operations multiply each even value (starting at 0) of LHS with
27845 // the related value of RHS and produce a widened result.
27846 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27847 // => <2 x i64> <ae|cg>
27848 //
27849 // In other words, to have all the results, we need to perform two PMULxD:
27850 // 1. one with the even values.
27851 // 2. one with the odd values.
27852 // To achieve #2, we need to place the odd values at an even position.
27853 //
27854 // Place the odd value at an even position (basically, shift all values 1
27855 // step to the left):
27856 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
27857 9, -1, 11, -1, 13, -1, 15, -1};
27858 // <a|b|c|d> => <b|undef|d|undef>
27859 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
27860 makeArrayRef(&Mask[0], NumElts));
27861 // <e|f|g|h> => <f|undef|h|undef>
27862 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
27863 makeArrayRef(&Mask[0], NumElts));
27864
27865 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
27866 // ints.
27867 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
27868 unsigned Opcode =
27869 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
27870 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27871 // => <2 x i64> <ae|cg>
27872 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27873 DAG.getBitcast(MulVT, A),
27874 DAG.getBitcast(MulVT, B)));
27875 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
27876 // => <2 x i64> <bf|dh>
27877 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27878 DAG.getBitcast(MulVT, Odd0),
27879 DAG.getBitcast(MulVT, Odd1)));
27880
27881 // Shuffle it back into the right order.
27882 SmallVector<int, 16> ShufMask(NumElts);
27883 for (int i = 0; i != (int)NumElts; ++i)
27884 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
27885
27886 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
27887
27888 // If we have a signed multiply but no PMULDQ fix up the result of an
27889 // unsigned multiply.
27890 if (IsSigned && !Subtarget.hasSSE41()) {
27891 SDValue Zero = DAG.getConstant(0, dl, VT);
27892 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
27893 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
27894 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
27895 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
27896
27897 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
27898 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
27899 }
27900
27901 return Res;
27902 }
27903
27904 // Only i8 vectors should need custom lowering after this.
27905 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
27906 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
27907 "Unsupported vector type");
27908
27909 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
27910 // logical shift down the upper half and pack back to i8.
27911
27912 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
27913 // and then ashr/lshr the upper bits down to the lower bits before multiply.
27914
27915 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27916 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27917 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27918 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27919 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27920 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27921 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27922 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27923 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27924 }
27925
27926 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
27927}
27928
27929// Custom lowering for SMULO/UMULO.
27930static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
27931 SelectionDAG &DAG) {
27932 MVT VT = Op.getSimpleValueType();
27933
27934 // Scalars defer to LowerXALUO.
27935 if (!VT.isVector())
27936 return LowerXALUO(Op, DAG);
27937
27938 SDLoc dl(Op);
27939 bool IsSigned = Op->getOpcode() == ISD::SMULO;
27940 SDValue A = Op.getOperand(0);
27941 SDValue B = Op.getOperand(1);
27942 EVT OvfVT = Op->getValueType(1);
27943
27944 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
27945 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
27946 // Extract the LHS Lo/Hi vectors
27947 SDValue LHSLo, LHSHi;
27948 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
27949
27950 // Extract the RHS Lo/Hi vectors
27951 SDValue RHSLo, RHSHi;
27952 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
27953
27954 EVT LoOvfVT, HiOvfVT;
27955 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
27956 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
27957 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
27958
27959 // Issue the split operations.
27960 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
27961 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
27962
27963 // Join the separate data results and the overflow results.
27964 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
27965 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
27966 Hi.getValue(1));
27967
27968 return DAG.getMergeValues({Res, Ovf}, dl);
27969 }
27970
27971 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27972 EVT SetccVT =
27973 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27974
27975 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27976 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27977 unsigned NumElts = VT.getVectorNumElements();
27978 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27979 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27980 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27981 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27982 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27983
27984 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27985
27986 SDValue Ovf;
27987 if (IsSigned) {
27988 SDValue High, LowSign;
27989 if (OvfVT.getVectorElementType() == MVT::i1 &&
27990 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
27991 // Rather than truncating, try to do the compare on vXi16 or vXi32.
27992 // Shift the high down filling with sign bits.
27993 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
27994 // Fill all 16 bits with the sign bit from the low.
27995 LowSign =
27996 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
27997 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
27998 15, DAG);
27999 SetccVT = OvfVT;
28000 if (!Subtarget.hasBWI()) {
28001 // We can't do a vXi16 compare so sign extend to v16i32.
28002 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28003 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28004 }
28005 } else {
28006 // Otherwise do the compare at vXi8.
28007 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28008 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28009 LowSign =
28010 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28011 }
28012
28013 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28014 } else {
28015 SDValue High =
28016 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28017 if (OvfVT.getVectorElementType() == MVT::i1 &&
28018 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28019 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28020 SetccVT = OvfVT;
28021 if (!Subtarget.hasBWI()) {
28022 // We can't do a vXi16 compare so sign extend to v16i32.
28023 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28024 }
28025 } else {
28026 // Otherwise do the compare at vXi8.
28027 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28028 }
28029
28030 Ovf =
28031 DAG.getSetCC(dl, SetccVT, High,
28032 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28033 }
28034
28035 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28036
28037 return DAG.getMergeValues({Low, Ovf}, dl);
28038 }
28039
28040 SDValue Low;
28041 SDValue High =
28042 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28043
28044 SDValue Ovf;
28045 if (IsSigned) {
28046 // SMULO overflows if the high bits don't match the sign of the low.
28047 SDValue LowSign =
28048 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28049 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28050 } else {
28051 // UMULO overflows if the high bits are non-zero.
28052 Ovf =
28053 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28054 }
28055
28056 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28057
28058 return DAG.getMergeValues({Low, Ovf}, dl);
28059}
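The overflow tests used above have simple scalar counterparts (sketch only, hypothetical helper names): signed multiply overflows an i8 when the high byte of the widened product differs from the sign-extension of the low byte, and unsigned multiply overflows when the high byte is non-zero.

#include <cassert>
#include <cstdint>

static bool smulo8(int8_t A, int8_t B, int8_t &Low) {
  int16_t Mul = static_cast<int16_t>(A) * static_cast<int16_t>(B);
  Low = static_cast<int8_t>(Mul);
  int8_t High = static_cast<int8_t>(Mul >> 8);
  int8_t LowSign = static_cast<int8_t>(Low >> 7); // 0 or -1 (arithmetic shift)
  return High != LowSign;
}

static bool umulo8(uint8_t A, uint8_t B, uint8_t &Low) {
  uint16_t Mul = static_cast<uint16_t>(A) * B;
  Low = static_cast<uint8_t>(Mul);
  return (Mul >> 8) != 0;
}

int main() {
  int8_t SL;
  uint8_t UL;
  assert(!smulo8(10, 12, SL) && SL == 120);
  assert(smulo8(16, 8, SL));        // 128 does not fit in i8
  assert(smulo8(-128, -1, SL));     // 128 does not fit in i8
  assert(!umulo8(16, 15, UL) && UL == 240);
  assert(umulo8(16, 16, UL));       // 256 does not fit in u8
  return 0;
}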
28060
28061SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28062 assert(Subtarget.isTargetWin64() && "Unexpected target");
28063 EVT VT = Op.getValueType();
28064 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28065 "Unexpected return type for lowering");
28066
28067 RTLIB::Libcall LC;
28068 bool isSigned;
28069 switch (Op->getOpcode()) {
28070 default: llvm_unreachable("Unexpected request for libcall!");
28071 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28072 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28073 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28074 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28075 }
28076
28077 SDLoc dl(Op);
28078 SDValue InChain = DAG.getEntryNode();
28079
28080 TargetLowering::ArgListTy Args;
28081 TargetLowering::ArgListEntry Entry;
28082 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28083 EVT ArgVT = Op->getOperand(i).getValueType();
28084 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28085 "Unexpected argument type for lowering");
28086 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28087 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28088 MachinePointerInfo MPI =
28089 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28090 Entry.Node = StackPtr;
28091 InChain =
28092 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28093 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28094 Entry.Ty = PointerType::get(ArgTy,0);
28095 Entry.IsSExt = false;
28096 Entry.IsZExt = false;
28097 Args.push_back(Entry);
28098 }
28099
28100 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28101 getPointerTy(DAG.getDataLayout()));
28102
28103 TargetLowering::CallLoweringInfo CLI(DAG);
28104 CLI.setDebugLoc(dl)
28105 .setChain(InChain)
28106 .setLibCallee(
28107 getLibcallCallingConv(LC),
28108 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28109 std::move(Args))
28110 .setInRegister()
28111 .setSExtResult(isSigned)
28112 .setZExtResult(!isSigned);
28113
28114 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28115 return DAG.getBitcast(VT, CallInfo.first);
28116}
28117
28118// Return true if the required (according to Opcode) shift-imm form is natively
28119// supported by the Subtarget
28120static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
28121 unsigned Opcode) {
28122 if (VT.getScalarSizeInBits() < 16)
28123 return false;
28124
28125 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
28126 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
28127 return true;
28128
28129 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
28130 (VT.is256BitVector() && Subtarget.hasInt256());
28131
28132 bool AShift = LShift && (Subtarget.hasAVX512() ||
28133 (VT != MVT::v2i64 && VT != MVT::v4i64));
28134 return (Opcode == ISD::SRA) ? AShift : LShift;
28135}
28136
28137// The shift amount is a variable, but it is the same for all vector lanes.
28138// These instructions are defined together with shift-immediate.
28139static
28140bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
28141 unsigned Opcode) {
28142 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
28143}
28144
28145// Return true if the required (according to Opcode) variable-shift form is
28146// natively supported by the Subtarget
28147static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
28148 unsigned Opcode) {
28149
28150 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
28151 return false;
28152
28153 // vXi16 supported only on AVX-512, BWI
28154 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
28155 return false;
28156
28157 if (Subtarget.hasAVX512())
28158 return true;
28159
28160 bool LShift = VT.is128BitVector() || VT.is256BitVector();
28161 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
28162 return (Opcode == ISD::SRA) ? AShift : LShift;
28163}
28164
28165static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
28166 const X86Subtarget &Subtarget) {
28167 MVT VT = Op.getSimpleValueType();
28168 SDLoc dl(Op);
28169 SDValue R = Op.getOperand(0);
28170 SDValue Amt = Op.getOperand(1);
28171 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28172
28173 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28174 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
28175 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28176 SDValue Ex = DAG.getBitcast(ExVT, R);
28177
28178 // ashr(R, 63) === cmp_slt(R, 0)
28179 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28180 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
28181 "Unsupported PCMPGT op");
28182 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28183 }
28184
28185 if (ShiftAmt >= 32) {
28186 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28187 SDValue Upper =
28188 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28189 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28190 ShiftAmt - 32, DAG);
28191 if (VT == MVT::v2i64)
28192 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28193 if (VT == MVT::v4i64)
28194 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28195 {9, 1, 11, 3, 13, 5, 15, 7});
28196 } else {
28197 // SRA upper i32, SRL whole i64 and select lower i32.
28198 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28199 ShiftAmt, DAG);
28200 SDValue Lower =
28201 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28202 Lower = DAG.getBitcast(ExVT, Lower);
28203 if (VT == MVT::v2i64)
28204 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28205 if (VT == MVT::v4i64)
28206 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28207 {8, 1, 10, 3, 12, 5, 14, 7});
28208 }
28209 return DAG.getBitcast(VT, Ex);
28210 };
28211
28212 // Optimize shl/srl/sra with constant shift amount.
28213 APInt APIntShiftAmt;
28214 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28215 return SDValue();
28216
28217 // If the shift amount is out of range, return undef.
28218 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28219 return DAG.getUNDEF(VT);
28220
28221 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28222
28223 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
28224 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28225
28226 // i64 SRA needs to be performed as partial shifts.
28227 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28228 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28229 Op.getOpcode() == ISD::SRA)
28230 return ArithmeticShiftRight64(ShiftAmt);
28231
28232 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28233 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28234 unsigned NumElts = VT.getVectorNumElements();
28235 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28236
28237 // Simple i8 add case
28238 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
28239 return DAG.getNode(ISD::ADD, dl, VT, R, R);
28240
28241 // ashr(R, 7) === cmp_slt(R, 0)
28242 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28243 SDValue Zeros = DAG.getConstant(0, dl, VT);
28244 if (VT.is512BitVector()) {
28245 assert(VT == MVT::v64i8 && "Unexpected element type!");
28246 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28247 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28248 }
28249 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28250 }
28251
28252 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28253 if (VT == MVT::v16i8 && Subtarget.hasXOP())
28254 return SDValue();
28255
28256 if (Op.getOpcode() == ISD::SHL) {
28257 // Make a large shift.
28258 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28259 ShiftAmt, DAG);
28260 SHL = DAG.getBitcast(VT, SHL);
28261 // Zero out the rightmost bits.
28262 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28263 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28264 }
28265 if (Op.getOpcode() == ISD::SRL) {
28266 // Make a large shift.
28267 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28268 ShiftAmt, DAG);
28269 SRL = DAG.getBitcast(VT, SRL);
28270 // Zero out the leftmost bits.
28271 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28272 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28273 }
28274 if (Op.getOpcode() == ISD::SRA) {
28275 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
28276 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28277
28278 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28279 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28280 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28281 return Res;
28282 }
28283 llvm_unreachable("Unknown shift opcode.");
28284 }
28285
28286 return SDValue();
28287}
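The xor/sub mask trick used for the vXi8 arithmetic shift above is the usual sign-re-extension identity. A brute-force scalar check (not part of the listed source; relies on the host compiler's two's-complement shifts and narrowing):

#include <cassert>
#include <cstdint>

static int8_t ashr8(int8_t X, unsigned Amt) {
  uint8_t Res = static_cast<uint8_t>(X) >> Amt; // logical shift
  uint8_t Mask = 0x80u >> Amt;                  // sign bit after the shift
  // xor/sub re-extends the shifted-down sign bit; narrowing is modular.
  return static_cast<int8_t>(static_cast<uint8_t>((Res ^ Mask) - Mask));
}

int main() {
  for (int X = -128; X <= 127; ++X)
    for (unsigned Amt = 0; Amt < 8; ++Amt) {
      int Ref = X >> Amt; // reference arithmetic shift
      assert(ashr8(static_cast<int8_t>(X), Amt) == static_cast<int8_t>(Ref));
    }
  return 0;
}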
28288
28289static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
28290 const X86Subtarget &Subtarget) {
28291 MVT VT = Op.getSimpleValueType();
28292 SDLoc dl(Op);
28293 SDValue R = Op.getOperand(0);
28294 SDValue Amt = Op.getOperand(1);
28295 unsigned Opcode = Op.getOpcode();
28296 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28297 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
28298
28299 if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
28300 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
28301 MVT EltVT = VT.getVectorElementType();
28302 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
28303 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
28304 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
28305 else if (EltVT.bitsLT(MVT::i32))
28306 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28307
28308 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
28309 }
28310
28311 // vXi8 shifts - shift as v8i16 + mask result.
28312 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28313 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28314 VT == MVT::v64i8) &&
28315 !Subtarget.hasXOP()) {
28316 unsigned NumElts = VT.getVectorNumElements();
28317 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28318 if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28319 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28320 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28321 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28322
28323 // Create the mask using vXi16 shifts. For shift-rights we need to move
28324 // the upper byte down before splatting the vXi8 mask.
28325 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28326 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28327 BaseShAmt, Subtarget, DAG);
28328 if (Opcode != ISD::SHL)
28329 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28330 8, DAG);
28331 BitMask = DAG.getBitcast(VT, BitMask);
28332 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28333 SmallVector<int, 64>(NumElts, 0));
28334
28335 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28336 DAG.getBitcast(ExtVT, R), BaseShAmt,
28337 Subtarget, DAG);
28338 Res = DAG.getBitcast(VT, Res);
28339 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28340
28341 if (Opcode == ISD::SRA) {
28342 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28343 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
28344 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28345 SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
28346 BaseShAmt, Subtarget, DAG);
28347 SignMask = DAG.getBitcast(VT, SignMask);
28348 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28349 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28350 }
28351 return Res;
28352 }
28353 }
28354 }
28355
28356 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
28357 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
28358 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
28359 Amt = Amt.getOperand(0);
28360 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
28361 std::vector<SDValue> Vals(Ratio);
28362 for (unsigned i = 0; i != Ratio; ++i)
28363 Vals[i] = Amt.getOperand(i);
28364 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
28365 for (unsigned j = 0; j != Ratio; ++j)
28366 if (Vals[j] != Amt.getOperand(i + j))
28367 return SDValue();
28368 }
28369
28370 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
28371 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
28372 }
28373 return SDValue();
28374}
28375
28376// Convert a shift/rotate left amount to a multiplication scale factor.
28377static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28378 const X86Subtarget &Subtarget,
28379 SelectionDAG &DAG) {
28380 MVT VT = Amt.getSimpleValueType();
28381 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28382 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28383 (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
28384 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
28385 return SDValue();
28386
28387 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
28388 SmallVector<SDValue, 8> Elts;
28389 MVT SVT = VT.getVectorElementType();
28390 unsigned SVTBits = SVT.getSizeInBits();
28391 APInt One(SVTBits, 1);
28392 unsigned NumElems = VT.getVectorNumElements();
28393
28394 for (unsigned i = 0; i != NumElems; ++i) {
28395 SDValue Op = Amt->getOperand(i);
28396 if (Op->isUndef()) {
28397 Elts.push_back(Op);
28398 continue;
28399 }
28400
28401 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
28402 APInt C(SVTBits, ND->getZExtValue());
28403 uint64_t ShAmt = C.getZExtValue();
28404 if (ShAmt >= SVTBits) {
28405 Elts.push_back(DAG.getUNDEF(SVT));
28406 continue;
28407 }
28408 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
28409 }
28410 return DAG.getBuildVector(VT, dl, Elts);
28411 }
28412
28413 // If the target doesn't support variable shifts, use either FP conversion
28414 // or integer multiplication to avoid shifting each element individually.
28415 if (VT == MVT::v4i32) {
28416 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
28417 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
28418 DAG.getConstant(0x3f800000U, dl, VT));
28419 Amt = DAG.getBitcast(MVT::v4f32, Amt);
28420 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
28421 }
28422
28423 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
28424 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
28425 SDValue Z = DAG.getConstant(0, dl, VT);
28426 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
28427 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
28428 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
28429 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
28430 if (Subtarget.hasSSE41())
28431 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28432
28433 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
28434 DAG.getBitcast(VT, Hi),
28435 {0, 2, 4, 6, 8, 10, 12, 14});
28436 }
28437
28438 return SDValue();
28439}
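The v4i32 path builds 2^amt by writing amt into the exponent field of a float: (amt << 23) + 0x3f800000 is the bit pattern of 2^amt, so converting back to an integer yields the multiplication scale factor. A host-side sketch (illustrative; checked only for amounts up to 30, where the result fits in a signed 32-bit integer):

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t scale_for_shift(uint32_t Amt) {
  uint32_t Bits = (Amt << 23) + 0x3f800000u; // 0x3f800000 is 1.0f
  float F;
  std::memcpy(&F, &Bits, sizeof(F));         // bitcast to float
  return static_cast<uint32_t>(F);           // FP_TO_SINT (exact here)
}

int main() {
  for (uint32_t Amt = 0; Amt <= 30; ++Amt)
    assert(scale_for_shift(Amt) == (1u << Amt));
  return 0;
}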
28440
28441static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
28442 SelectionDAG &DAG) {
28443 MVT VT = Op.getSimpleValueType();
28444 SDLoc dl(Op);
28445 SDValue R = Op.getOperand(0);
28446 SDValue Amt = Op.getOperand(1);
28447 unsigned EltSizeInBits = VT.getScalarSizeInBits();
28448 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28449
28450 unsigned Opc = Op.getOpcode();
28451 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
28452 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
28453
28454 assert(VT.isVector() && "Custom lowering only for vector shifts!");
28455 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
28456
28457 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
28458 return V;
28459
28460 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
28461 return V;
28462
28463 if (SupportedVectorVarShift(VT, Subtarget, Opc))
28464 return Op;
28465
28466 // XOP has 128-bit variable logical/arithmetic shifts.
28467 // +ve/-ve Amt = shift left/right.
28468 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
28469 VT == MVT::v8i16 || VT == MVT::v16i8)) {
28470 if (Opc == ISD::SRL || Opc == ISD::SRA) {
28471 SDValue Zero = DAG.getConstant(0, dl, VT);
28472 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
28473 }
28474 if (Opc == ISD::SHL || Opc == ISD::SRL)
28475 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
28476 if (Opc == ISD::SRA)
28477 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
28478 }
28479
28480 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
28481 // shifts per-lane and then shuffle the partial results back together.
28482 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
28483 // Splat the shift amounts so the scalar shifts above will catch it.
28484 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
28485 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
28486 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
28487 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
28488 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
28489 }
28490
28491 // i64 vector arithmetic shift can be emulated with the transform:
28492 // M = lshr(SIGN_MASK, Amt)
28493 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
28494 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
28495 Opc == ISD::SRA) {
28496 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
28497 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
28498 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28499 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
28500 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
28501 return R;
28502 }
28503
28504 // If possible, lower this shift as a sequence of two shifts by
28505 // constant plus a BLENDing shuffle instead of scalarizing it.
28506 // Example:
28507 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
28508 //
28509 // Could be rewritten as:
28510 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
28511 //
28512 // The advantage is that the two shifts from the example would be
28513 // lowered as X86ISD::VSRLI nodes in parallel before blending.
28514 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
28515 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28516 SDValue Amt1, Amt2;
28517 unsigned NumElts = VT.getVectorNumElements();
28518 SmallVector<int, 8> ShuffleMask;
28519 for (unsigned i = 0; i != NumElts; ++i) {
28520 SDValue A = Amt->getOperand(i);
28521 if (A.isUndef()) {
28522 ShuffleMask.push_back(SM_SentinelUndef);
28523 continue;
28524 }
28525 if (!Amt1 || Amt1 == A) {
28526 ShuffleMask.push_back(i);
28527 Amt1 = A;
28528 continue;
28529 }
28530 if (!Amt2 || Amt2 == A) {
28531 ShuffleMask.push_back(i + NumElts);
28532 Amt2 = A;
28533 continue;
28534 }
28535 break;
28536 }
28537
28538 // Only perform this blend if we can perform it without loading a mask.
28539 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
28540 (VT != MVT::v16i16 ||
28541 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
28542 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
28543 canWidenShuffleElements(ShuffleMask))) {
28544 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
28545 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
28546 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
28547 Cst2->getAPIntValue().ult(EltSizeInBits)) {
28548 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28549 Cst1->getZExtValue(), DAG);
28550 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28551 Cst2->getZExtValue(), DAG);
28552 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
28553 }
28554 }
28555 }
28556
28557 // If possible, lower this packed shift into a vector multiply instead of
28558 // expanding it into a sequence of scalar shifts.
28559 if (Opc == ISD::SHL)
28560 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
28561 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
28562
28563 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
28564 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
28565 if (Opc == ISD::SRL && ConstantAmt &&
28566 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28567 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28568 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28569 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28570 SDValue Zero = DAG.getConstant(0, dl, VT);
28571 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
28572 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
28573 return DAG.getSelect(dl, VT, ZAmt, R, Res);
28574 }
28575 }
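The MULHU rewrite above follows from (x * 2^(16 - amt)) >> 16 == x >> amt for non-zero amt; the amt == 0 lanes are blended back to x because 2^16 does not fit in the i16 scale vector. A scalar sketch (not part of the listed source):

#include <cassert>
#include <cstdint>

static uint16_t srl16_via_mulhu(uint16_t X, unsigned Amt) {
  if (Amt == 0)
    return X; // the lowering selects the original value for amt == 0 lanes
  uint16_t Scale = static_cast<uint16_t>(1u << (16 - Amt));
  return static_cast<uint16_t>((static_cast<uint32_t>(X) * Scale) >> 16);
}

int main() {
  const uint16_t Tests[] = {0, 1, 0x8000, 0xffff, 0x1234};
  for (uint16_t X : Tests)
    for (unsigned Amt = 0; Amt < 16; ++Amt)
      assert(srl16_via_mulhu(X, Amt) == static_cast<uint16_t>(X >> Amt));
  return 0;
}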
28576
28577 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
28578 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
28579 // TODO: Special case handling for shift by 0/1, really we can afford either
28580 // of these cases in pre-SSE41/XOP/AVX512 but not both.
28581 if (Opc == ISD::SRA && ConstantAmt &&
28582 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
28583 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
28584 !Subtarget.hasAVX512()) ||
28585 DAG.isKnownNeverZero(Amt))) {
28586 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28587 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28588 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28589 SDValue Amt0 =
28590 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
28591 SDValue Amt1 =
28592 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
28593 SDValue Sra1 =
28594 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
28595 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
28596 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
28597 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
28598 }
28599 }
28600
28601 // v4i32 Non Uniform Shifts.
28602 // If the shift amount is constant we can shift each lane using the SSE2
28603 // immediate shifts, else we need to zero-extend each lane to the lower i64
28604 // and shift using the SSE2 variable shifts.
28605 // The separate results can then be blended together.
28606 if (VT == MVT::v4i32) {
28607 SDValue Amt0, Amt1, Amt2, Amt3;
28608 if (ConstantAmt) {
28609 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
28610 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
28611 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
28612 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
28613 } else {
28614 // The SSE2 shifts use the lower i64 as the same shift amount for
28615 // all lanes and the upper i64 is ignored. On AVX we're better off
28616 // just zero-extending, but for SSE just duplicating the top 16-bits is
28617 // cheaper and has the same effect for out of range values.
28618 if (Subtarget.hasAVX()) {
28619 SDValue Z = DAG.getConstant(0, dl, VT);
28620 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
28621 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
28622 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
28623 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
28624 } else {
28625 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
28626 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28627 {4, 5, 6, 7, -1, -1, -1, -1});
28628 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28629 {0, 1, 1, 1, -1, -1, -1, -1});
28630 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28631 {2, 3, 3, 3, -1, -1, -1, -1});
28632 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28633 {0, 1, 1, 1, -1, -1, -1, -1});
28634 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28635 {2, 3, 3, 3, -1, -1, -1, -1});
28636 }
28637 }
28638
28639 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
28640 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
28641 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
28642 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
28643 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
28644
28645 // Merge the shifted lane results optimally with/without PBLENDW.
28646 // TODO - ideally shuffle combining would handle this.
28647 if (Subtarget.hasSSE41()) {
28648 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
28649 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
28650 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
28651 }
28652 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
28653 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
28654 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
28655 }
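// Illustrative lane bookkeeping for the v4i32 blend above (assumed layout):
// R0..R3 each hold R shifted by lane 0..3's amount in every lane, so only
// lane i of Ri is wanted. With SSE41, R02 = {R0[0], -, R2[2], -} and
// R13 = {-, R1[1], -, R3[3]}; the final {0, 5, 2, 7} shuffle interleaves them
// into {R0[0], R1[1], R2[2], R3[3]}, i.e. each lane shifted by its own amount.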
28656
28657 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
28658 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
28659 // make the existing SSE solution better.
28660 // NOTE: We honor the preferred vector width before promoting to 512-bits.
28661 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
28662 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
28663 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
28664 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
28665 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
28666 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
28667 "Unexpected vector type");
28668 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
28669 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
28670 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28671 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
28672 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
28673 return DAG.getNode(ISD::TRUNCATE, dl, VT,
28674 DAG.getNode(Opc, dl, ExtVT, R, Amt));
28675 }
28676
28677 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
28678 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
28679 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
28680 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28681 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28682 !Subtarget.hasXOP()) {
28683 int NumElts = VT.getVectorNumElements();
28684 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
28685
28686 // Extend constant shift amount to vXi16 (it doesn't matter if the type
28687 // isn't legal).
28688 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28689 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
28690 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
28691 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
28692 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
28693 "Constant build vector expected");
28694
28695 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
28696 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
28697 : DAG.getZExtOrTrunc(R, dl, ExVT);
28698 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
28699 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
28700 return DAG.getZExtOrTrunc(R, dl, VT);
28701 }
28702
28703 SmallVector<SDValue, 16> LoAmt, HiAmt;
28704 for (int i = 0; i != NumElts; i += 16) {
28705 for (int j = 0; j != 8; ++j) {
28706 LoAmt.push_back(Amt.getOperand(i + j));
28707 HiAmt.push_back(Amt.getOperand(i + j + 8));
28708 }
28709 }
28710
28711 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
28712 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
28713 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
28714
28715 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
28716 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
28717 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
28718 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
28719 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
28720 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
28721 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
28722 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
28723 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
28724 }
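// Worked example (illustrative, assumed values) for the constant vXi8 shift
// by multiplication above: take byte x = 0x90 (-112) and ISD::SRA by s = 3.
//   Amt lane       = 1 << (8 - s) = 32
//   unpack + VSRAI by 8 sign-extends x into an i16 lane: 0xFF90
//   0xFF90 * 32    = 0xF200 (mod 2^16)
//   VSRLI by 8     = 0x00F2, and 0xF2 = -14 = (-112) >>s 3
// PACKUS then narrows the i16 lanes (whose high bytes are already zero) back
// to vXi8.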
28725
28726 if (VT == MVT::v16i8 ||
28727 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
28728 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
28729 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
28730
28731 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28732 if (VT.is512BitVector()) {
28733 // On AVX512BW targets we make use of the fact that VSELECT lowers
28734 // to a masked blend which selects bytes based just on the sign bit
28735 // extracted to a mask.
28736 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
28737 V0 = DAG.getBitcast(VT, V0);
28738 V1 = DAG.getBitcast(VT, V1);
28739 Sel = DAG.getBitcast(VT, Sel);
28740 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
28741 ISD::SETGT);
28742 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
28743 } else if (Subtarget.hasSSE41()) {
28744 // On SSE41 targets we can use PBLENDVB which selects bytes based just
28745 // on the sign bit.
28746 V0 = DAG.getBitcast(VT, V0);
28747 V1 = DAG.getBitcast(VT, V1);
28748 Sel = DAG.getBitcast(VT, Sel);
28749 return DAG.getBitcast(SelVT,
28750 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
28751 }
28752 // On pre-SSE41 targets we test for the sign bit by comparing to
28753 // zero - a negative value will set all bits of the lanes to true
28754 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28755 SDValue Z = DAG.getConstant(0, dl, SelVT);
28756 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
28757 return DAG.getSelect(dl, SelVT, C, V0, V1);
28758 };
28759
28760 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28761 // We can safely do this using i16 shifts as we're only interested in
28762 // the 3 lower bits of each byte.
28763 Amt = DAG.getBitcast(ExtVT, Amt);
28764 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
28765 Amt = DAG.getBitcast(VT, Amt);
28766
28767 if (Opc == ISD::SHL || Opc == ISD::SRL) {
28768 // r = VSELECT(r, shift(r, 4), a);
28769 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
28770 R = SignBitSelect(VT, Amt, M, R);
28771
28772 // a += a
28773 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28774
28775 // r = VSELECT(r, shift(r, 2), a);
28776 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
28777 R = SignBitSelect(VT, Amt, M, R);
28778
28779 // a += a
28780 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28781
28782 // return VSELECT(r, shift(r, 1), a);
28783 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
28784 R = SignBitSelect(VT, Amt, M, R);
28785 return R;
28786 }
28787
28788 if (Opc == ISD::SRA) {
28789 // For SRA we need to unpack each byte to the higher byte of a i16 vector
28790 // so we can correctly sign extend. We don't care what happens to the
28791 // lower byte.
28792 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28793 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28794 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
28795 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
28796 ALo = DAG.getBitcast(ExtVT, ALo);
28797 AHi = DAG.getBitcast(ExtVT, AHi);
28798 RLo = DAG.getBitcast(ExtVT, RLo);
28799 RHi = DAG.getBitcast(ExtVT, RHi);
28800
28801 // r = VSELECT(r, shift(r, 4), a);
28802 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
28803 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
28804 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28805 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28806
28807 // a += a
28808 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28809 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28810
28811 // r = VSELECT(r, shift(r, 2), a);
28812 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
28813 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
28814 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28815 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28816
28817 // a += a
28818 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28819 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28820
28821 // r = VSELECT(r, shift(r, 1), a);
28822 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
28823 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
28824 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28825 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28826
28827 // Logical shift the result back to the lower byte, leaving a zero upper
28828 // byte meaning that we can safely pack with PACKUSWB.
28829 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
28830 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
28831 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28832 }
28833 }
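// Illustrative walk-through of the variable vXi8 shift above (assumed amount):
// Amt << 5 moves bit 2 of the 3-bit shift amount into each byte's sign bit, so
// the sign-bit blends conditionally apply shift-by-4, then (after Amt += Amt)
// shift-by-2, then shift-by-1. E.g. amount 5 = 0b101: the by-4 and by-1 stages
// are taken and the by-2 stage is skipped, for a total shift of 5.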
28834
28835 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
28836 MVT ExtVT = MVT::v8i32;
28837 SDValue Z = DAG.getConstant(0, dl, VT);
28838 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
28839 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
28840 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
28841 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
28842 ALo = DAG.getBitcast(ExtVT, ALo);
28843 AHi = DAG.getBitcast(ExtVT, AHi);
28844 RLo = DAG.getBitcast(ExtVT, RLo);
28845 RHi = DAG.getBitcast(ExtVT, RHi);
28846 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
28847 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
28848 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
28849 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
28850 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28851 }
28852
28853 if (VT == MVT::v8i16) {
28854 // If we have a constant shift amount, the non-SSE41 path is best as
28855 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
28856 bool UseSSE41 = Subtarget.hasSSE41() &&
28857 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28858
28859 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
28860 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
28861 // the sign bit.
28862 if (UseSSE41) {
28863 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
28864 V0 = DAG.getBitcast(ExtVT, V0);
28865 V1 = DAG.getBitcast(ExtVT, V1);
28866 Sel = DAG.getBitcast(ExtVT, Sel);
28867 return DAG.getBitcast(
28868 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
28869 }
28870 // On pre-SSE41 targets we splat the sign bit - a negative value will
28871 // set all bits of the lanes to true and VSELECT uses that in
28872 // its OR(AND(V0,C),AND(V1,~C)) lowering.
28873 SDValue C =
28874 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
28875 return DAG.getSelect(dl, VT, C, V0, V1);
28876 };
28877
28878 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
28879 if (UseSSE41) {
28880 // On SSE41 targets we need to replicate the shift mask in both
28881 // bytes for PBLENDVB.
28882 Amt = DAG.getNode(
28883 ISD::OR, dl, VT,
28884 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
28885 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
28886 } else {
28887 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
28888 }
28889
28890 // r = VSELECT(r, shift(r, 8), a);
28891 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
28892 R = SignBitSelect(Amt, M, R);
28893
28894 // a += a
28895 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28896
28897 // r = VSELECT(r, shift(r, 4), a);
28898 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
28899 R = SignBitSelect(Amt, M, R);
28900
28901 // a += a
28902 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28903
28904 // r = VSELECT(r, shift(r, 2), a);
28905 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
28906 R = SignBitSelect(Amt, M, R);
28907
28908 // a += a
28909 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28910
28911 // return VSELECT(r, shift(r, 1), a);
28912 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
28913 R = SignBitSelect(Amt, M, R);
28914 return R;
28915 }
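// Illustrative note for the v8i16 path above (assumed amount): Amt << 12 puts
// bit 3 of the 4-bit amount into the sign bit, so the four blend stages apply
// shifts of 8, 4, 2 and 1 as the amount is doubled each step. E.g. amount
// 11 = 0b1011 takes the 8, 2 and 1 stages, for a total shift of 11.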
28916
28917 // Decompose 256-bit shifts into 128-bit shifts.
28918 if (VT.is256BitVector())
28919 return splitVectorIntBinary(Op, DAG);
28920
28921 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28922 return splitVectorIntBinary(Op, DAG);
28923
28924 return SDValue();
28925}
28926
28927static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
28928 SelectionDAG &DAG) {
28929 MVT VT = Op.getSimpleValueType();
28930 assert(VT.isVector() && "Custom lowering only for vector rotates!");
28931
28932 SDLoc DL(Op);
28933 SDValue R = Op.getOperand(0);
28934 SDValue Amt = Op.getOperand(1);
28935 unsigned Opcode = Op.getOpcode();
28936 unsigned EltSizeInBits = VT.getScalarSizeInBits();
28937 int NumElts = VT.getVectorNumElements();
28938
28939 // Check for constant splat rotation amount.
28940 APInt CstSplatValue;
28941 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
28942
28943 // Check for splat rotate by zero.
28944 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
28945 return R;
28946
28947 // AVX512 implicitly uses modulo rotation amounts.
28948 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
28949 // Attempt to rotate by immediate.
28950 if (IsCstSplat) {
28951 unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
28952 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28953 return DAG.getNode(RotOpc, DL, VT, R,
28954 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28955 }
28956
28957 // Else, fall-back on VPROLV/VPRORV.
28958 return Op;
28959 }
28960
28961 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
28962 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
28963 unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
28964 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
28965 }
28966
28967 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
28968
28969 // XOP has 128-bit vector variable + immediate rotates.
28970 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
28971 // XOP implicitly uses modulo rotation amounts.
28972 if (Subtarget.hasXOP()) {
28973 if (VT.is256BitVector())
28974 return splitVectorIntBinary(Op, DAG);
28975 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
28976
28977 // Attempt to rotate by immediate.
28978 if (IsCstSplat) {
28979 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28980 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
28981 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28982 }
28983
28984 // Use general rotate by variable (per-element).
28985 return Op;
28986 }
28987
28988 // Split 256-bit integers on pre-AVX2 targets.
28989 if (VT.is256BitVector() && !Subtarget.hasAVX2())
28990 return splitVectorIntBinary(Op, DAG);
28991
28992 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
28993 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
28994 VT == MVT::v32i16) &&
28995 Subtarget.hasAVX2())) &&
28996 "Only vXi32/vXi16/vXi8 vector rotates supported");
28997
28998 // Rotate by a uniform constant - expand back to shifts.
28999 if (IsCstSplat)
29000 return SDValue();
29001
29002 bool IsSplatAmt = DAG.isSplatValue(Amt);
29003
29004 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
29005 // the amount bit.
29006 if (EltSizeInBits == 8 && !IsSplatAmt) {
29007 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
29008 return SDValue();
29009
29010 // We don't need ModuloAmt here as we just peek at individual bits.
29011 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29012
29013 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29014 if (Subtarget.hasSSE41()) {
29015 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29016 // on the sign bit.
29017 V0 = DAG.getBitcast(VT, V0);
29018 V1 = DAG.getBitcast(VT, V1);
29019 Sel = DAG.getBitcast(VT, Sel);
29020 return DAG.getBitcast(SelVT,
29021 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
29022 }
29023 // On pre-SSE41 targets we test for the sign bit by comparing to
29024 // zero - a negative value will set all bits of the lanes to true
29025 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29026 SDValue Z = DAG.getConstant(0, DL, SelVT);
29027 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
29028 return DAG.getSelect(DL, SelVT, C, V0, V1);
29029 };
29030
29031 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29032 // We can safely do this using i16 shifts as we're only interested in
29033 // the 3 lower bits of each byte.
29034 Amt = DAG.getBitcast(ExtVT, Amt);
29035 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
29036 Amt = DAG.getBitcast(VT, Amt);
29037
29038 // r = VSELECT(r, rot(r, 4), a);
29039 SDValue M;
29040 M = DAG.getNode(
29041 ISD::OR, DL, VT,
29042 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
29043 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
29044 R = SignBitSelect(VT, Amt, M, R);
29045
29046 // a += a
29047 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29048
29049 // r = VSELECT(r, rot(r, 2), a);
29050 M = DAG.getNode(
29051 ISD::OR, DL, VT,
29052 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
29053 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
29054 R = SignBitSelect(VT, Amt, M, R);
29055
29056 // a += a
29057 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29058
29059 // return VSELECT(r, rot(r, 1), a);
29060 M = DAG.getNode(
29061 ISD::OR, DL, VT,
29062 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
29063 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
29064 return SignBitSelect(VT, Amt, M, R);
29065 }
29066
29067 // ISD::ROT* uses modulo rotate amounts.
29068 if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) {
29069 // If the amount is a splat, perform the modulo BEFORE the splat;
29070 // this helps LowerScalarVariableShift to remove the splat later.
29071 Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt);
29072 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29073 DAG.getConstant(EltSizeInBits - 1, DL, VT));
29074 Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT),
29075 SmallVector<int>(NumElts, 0));
29076 } else {
29077 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29078 DAG.getConstant(EltSizeInBits - 1, DL, VT));
29079 }
29080
29081 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29082 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
29083 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
29084
29085 // Fallback for splats + all supported variable shifts.
29086 // Fallback for non-constant AVX2 vXi16 as well.
29087 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
29088 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
29089 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
29090 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
29091 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
29092 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
29093 }
29094
29095 // As with shifts, convert the rotation amount to a multiplication factor.
29096 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
29097 assert(Scale && "Failed to convert ROTL amount to scale");
29098
29099 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
29100 if (EltSizeInBits == 16) {
29101 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
29102 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
29103 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29104 }
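// Worked example (illustrative, assumed values) for the vXi16 rotate above:
// rotl16(x, s) == low16(x * 2^s) | high16(x * 2^s). For x = 0x1234, s = 4:
//   x * 16 = 0x00012340, MUL gives 0x2340, MULHU gives 0x0001,
//   OR = 0x2341 = rotl16(0x1234, 4). For s = 0 the scale is 1 and MULHU is 0.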
29105
29106 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
29107 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
29108 // that can then be OR'd with the lower 32-bits.
29109 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
29110 static const int OddMask[] = {1, -1, 3, -1};
29111 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
29112 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
29113
29114 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29115 DAG.getBitcast(MVT::v2i64, R),
29116 DAG.getBitcast(MVT::v2i64, Scale));
29117 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29118 DAG.getBitcast(MVT::v2i64, R13),
29119 DAG.getBitcast(MVT::v2i64, Scale13));
29120 Res02 = DAG.getBitcast(VT, Res02);
29121 Res13 = DAG.getBitcast(VT, Res13);
29122
29123 return DAG.getNode(ISD::OR, DL, VT,
29124 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
29125 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
29126}
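// Illustrative lane bookkeeping for the v4i32 rotate above (assumed layout):
// PMULUDQ multiplies lanes 0 and 2; R13/Scale13 move lanes 1 and 3 into the
// even positions for the second PMULUDQ. Each v2i64 product holds x << s in
// its low 32 bits and x >> (32 - s) in its high 32 bits, so the {0,4,2,6} and
// {1,5,3,7} shuffles gather the low and high halves and the final OR forms
// the rotate in every lane.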
29127
29128/// Returns true if the operand type is exactly twice the native width, and
29129/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
29130/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
29131/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
29132bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
29133 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
29134
29135 if (OpWidth == 64)
29136 return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
29137 if (OpWidth == 128)
29138 return Subtarget.hasCmpxchg16b();
29139
29140 return false;
29141}
29142
29143bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29144 Type *MemType = SI->getValueOperand()->getType();
29145
29146 bool NoImplicitFloatOps =
29147 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29148 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29149 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29150 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29151 return false;
29152
29153 return needsCmpXchgNb(MemType);
29154}
29155
29156// Note: this turns large loads into lock cmpxchg8b/16b.
29157// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
29158TargetLowering::AtomicExpansionKind
29159X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29160 Type *MemType = LI->getType();
29161
29162 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
29163 // can use movq to do the load. If we have X87 we can load into an 80-bit
29164 // X87 register and store it to a stack temporary.
29165 bool NoImplicitFloatOps =
29166 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29167 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29168 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29169 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29170 return AtomicExpansionKind::None;
29171
29172 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29173 : AtomicExpansionKind::None;
29174}
29175
29176TargetLowering::AtomicExpansionKind
29177X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29178 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29179 Type *MemType = AI->getType();
29180
29181 // If the operand is too big, we must see if cmpxchg8/16b is available
29182 // and default to library calls otherwise.
29183 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
29184 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29185 : AtomicExpansionKind::None;
29186 }
29187
29188 AtomicRMWInst::BinOp Op = AI->getOperation();
29189 switch (Op) {
29190 default:
29191 llvm_unreachable("Unknown atomic operation");
29192 case AtomicRMWInst::Xchg:
29193 case AtomicRMWInst::Add:
29194 case AtomicRMWInst::Sub:
29195 // It's better to use xadd, xsub or xchg for these in all cases.
29196 return AtomicExpansionKind::None;
29197 case AtomicRMWInst::Or:
29198 case AtomicRMWInst::And:
29199 case AtomicRMWInst::Xor:
29200 // If the atomicrmw's result isn't actually used, we can just add a "lock"
29201 // prefix to a normal instruction for these operations.
29202 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
29203 : AtomicExpansionKind::None;
29204 case AtomicRMWInst::Nand:
29205 case AtomicRMWInst::Max:
29206 case AtomicRMWInst::Min:
29207 case AtomicRMWInst::UMax:
29208 case AtomicRMWInst::UMin:
29209 case AtomicRMWInst::FAdd:
29210 case AtomicRMWInst::FSub:
29211 // These always require a non-trivial set of data operations on x86. We must
29212 // use a cmpxchg loop.
29213 return AtomicExpansionKind::CmpXChg;
29214 }
29215}
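// Illustrative mapping of the decisions above (hypothetical atomicrmw inputs,
// assuming a word-sized i32 operand):
//   add/sub/xchg                  -> None: selected directly (xadd/xchg/etc.)
//   and/or/xor with unused result -> None: later emitted as one lock-prefixed op
//   and/or/xor with a used result -> CmpXChg: expanded to a cmpxchg loop
//   nand/min/max/umin/umax/fadd   -> CmpXChg: always expanded to a cmpxchg loop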
29216
29217LoadInst *
29218X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
29219 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29220 Type *MemType = AI->getType();
29221 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
29222 // there is no benefit in turning such RMWs into loads, and it is actually
29223 // harmful as it introduces an mfence.
29224 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
29225 return nullptr;
29226
29227 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
29228 // lowering available in lowerAtomicArith.
29229 // TODO: push more cases through this path.
29230 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
29231 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
29232 AI->use_empty())
29233 return nullptr;
29234
29235 IRBuilder<> Builder(AI);
29236 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29237 auto SSID = AI->getSyncScopeID();
29238 // We must restrict the ordering to avoid generating loads with Release or
29239 // ReleaseAcquire orderings.
29240 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
29241
29242 // Before the load we need a fence. Here is an example lifted from
29243 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
29244 // is required:
29245 // Thread 0:
29246 // x.store(1, relaxed);
29247 // r1 = y.fetch_add(0, release);
29248 // Thread 1:
29249 // y.fetch_add(42, acquire);
29250 // r2 = x.load(relaxed);
29251 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
29252 // lowered to just a load without a fence. A mfence flushes the store buffer,
29253 // making the optimization clearly correct.
29254 // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
29255 // otherwise; we might be able to be more aggressive on relaxed idempotent
29256 // rmw. In practice, they do not look useful, so we don't try to be
29257 // especially clever.
29258 if (SSID == SyncScope::SingleThread)
29259 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
29260 // the IR level, so we must wrap it in an intrinsic.
29261 return nullptr;
29262
29263 if (!Subtarget.hasMFence())
29264 // FIXME: it might make sense to use a locked operation here but on a
29265 // different cache-line to prevent cache-line bouncing. In practice it
29266 // is probably a small win, and x86 processors without mfence are rare
29267 // enough that we do not bother.
29268 return nullptr;
29269
29270 Function *MFence =
29271 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
29272 Builder.CreateCall(MFence, {});
29273
29274 // Finally we can emit the atomic load.
29275 LoadInst *Loaded = Builder.CreateAlignedLoad(
29276 AI->getType(), AI->getPointerOperand(), AI->getAlign());
29277 Loaded->setAtomic(Order, SSID);
29278 AI->replaceAllUsesWith(Loaded);
29279 AI->eraseFromParent();
29280 return Loaded;
29281}
29282
29283bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
29284 if (!SI.isUnordered())
29285 return false;
29286 return ExperimentalUnorderedISEL;
29287}
29288bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
29289 if (!LI.isUnordered())
29290 return false;
29291 return ExperimentalUnorderedISEL;
29292}
29293
29294
29295/// Emit a locked operation on a stack location which does not change any
29296/// memory location, but does involve a lock prefix. Location is chosen to be
29297/// a) very likely accessed only by a single thread to minimize cache traffic,
29298/// and b) definitely dereferenceable. Returns the new Chain result.
29299static SDValue emitLockedStackOp(SelectionDAG &DAG,
29300 const X86Subtarget &Subtarget, SDValue Chain,
29301 const SDLoc &DL) {
29302 // Implementation notes:
29303 // 1) LOCK prefix creates a full read/write reordering barrier for memory
29304 // operations issued by the current processor. As such, the location
29305 // referenced is not relevant for the ordering properties of the instruction.
29306 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
29307 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
29308 // 2) Using an immediate operand appears to be the best encoding choice
29309 // here since it doesn't require an extra register.
29310 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
29311 // is small enough it might just be measurement noise.)
29312 // 4) When choosing offsets, there are several contributing factors:
29313 // a) If there's no redzone, we default to TOS. (We could allocate a cache
29314 // line aligned stack object to improve this case.)
29315 // b) To minimize our chances of introducing a false dependence, we prefer
29316 // to offset the stack usage from TOS slightly.
29317 // c) To minimize concerns about cross thread stack usage - in particular,
29318 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
29319 // captures state in the TOS frame and accesses it from many threads -
29320 // we want to use an offset such that the offset is in a distinct cache
29321 // line from the TOS frame.
29322 //
29323 // For a general discussion of the tradeoffs and benchmark results, see:
29324 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
29325
29326 auto &MF = DAG.getMachineFunction();
29327 auto &TFL = *Subtarget.getFrameLowering();
29328 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
29329
29330 if (Subtarget.is64Bit()) {
29331 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29332 SDValue Ops[] = {
29333 DAG.getRegister(X86::RSP, MVT::i64), // Base
29334 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29335 DAG.getRegister(0, MVT::i64), // Index
29336 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29337 DAG.getRegister(0, MVT::i16), // Segment.
29338 Zero,
29339 Chain};
29340 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29341 MVT::Other, Ops);
29342 return SDValue(Res, 1);
29343 }
29344
29345 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29346 SDValue Ops[] = {
29347 DAG.getRegister(X86::ESP, MVT::i32), // Base
29348 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29349 DAG.getRegister(0, MVT::i32), // Index
29350 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29351 DAG.getRegister(0, MVT::i16), // Segment.
29352 Zero,
29353 Chain
29354 };
29355 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29356 MVT::Other, Ops);
29357 return SDValue(Res, 1);
29358}
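// Illustrative result (assumed frame layout): the machine node built above
// typically selects to something like "lock orl $0, -64(%rsp)" on x86-64 with
// a 128-byte red zone, or "lock orl $0, (%esp)" on 32-bit targets - a locked
// read-modify-write that leaves memory unchanged but orders like a full
// fence, without using MFENCE.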
29359
29360static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
29361 SelectionDAG &DAG) {
29362 SDLoc dl(Op);
29363 AtomicOrdering FenceOrdering =
29364 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
29365 SyncScope::ID FenceSSID =
29366 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
29367
29368 // The only fence that needs an instruction is a sequentially-consistent
29369 // cross-thread fence.
29370 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
29371 FenceSSID == SyncScope::System) {
29372 if (Subtarget.hasMFence())
29373 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
29374
29375 SDValue Chain = Op.getOperand(0);
29376 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
29377 }
29378
29379 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29380 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
29381}
29382
29383static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
29384 SelectionDAG &DAG) {
29385 MVT T = Op.getSimpleValueType();
29386 SDLoc DL(Op);
29387 unsigned Reg = 0;
29388 unsigned size = 0;
29389 switch(T.SimpleTy) {
29390 default: llvm_unreachable("Invalid value type!");
29391 case MVT::i8: Reg = X86::AL; size = 1; break;
29392 case MVT::i16: Reg = X86::AX; size = 2; break;
29393 case MVT::i32: Reg = X86::EAX; size = 4; break;
29394 case MVT::i64:
29395 assert(Subtarget.is64Bit() && "Node not type legal!");
29396 Reg = X86::RAX; size = 8;
29397 break;
29398 }
29399 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
29400 Op.getOperand(2), SDValue());
29401 SDValue Ops[] = { cpIn.getValue(0),
29402 Op.getOperand(1),
29403 Op.getOperand(3),
29404 DAG.getTargetConstant(size, DL, MVT::i8),
29405 cpIn.getValue(1) };
29406 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29407 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
29408 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
29409 Ops, T, MMO);
29410
29411 SDValue cpOut =
29412 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
29413 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
29414 MVT::i32, cpOut.getValue(2));
29415 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
29416
29417 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29418 cpOut, Success, EFLAGS.getValue(1));
29419}
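// Illustrative shape of the sequence built above for an i32 cmpxchg (register
// names per the switch): the expected value is copied into EAX, the
// LCMPXCHG_DAG node selects to a "lock cmpxchg" against the memory operand,
// the old value is read back from EAX, and success is derived from ZF via the
// X86::COND_E setcc.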
29420
29421// Create MOVMSKB, taking into account whether we need to split for AVX1.
29422static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
29423 const X86Subtarget &Subtarget) {
29424 MVT InVT = V.getSimpleValueType();
29425
29426 if (InVT == MVT::v64i8) {
29427 SDValue Lo, Hi;
29428 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29429 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
29430 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
29431 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
29432 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
29433 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
29434 DAG.getConstant(32, DL, MVT::i8));
29435 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
29436 }
29437 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29438 SDValue Lo, Hi;
29439 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29440 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29441 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29442 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29443 DAG.getConstant(16, DL, MVT::i8));
29444 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29445 }
29446
29447 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29448}
29449
29450static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
29451 SelectionDAG &DAG) {
29452 SDValue Src = Op.getOperand(0);
29453 MVT SrcVT = Src.getSimpleValueType();
29454 MVT DstVT = Op.getSimpleValueType();
29455
29456 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
29457 // half to v32i1 and concatenating the result.
29458 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
29459 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29460 assert(Subtarget.hasBWI() && "Expected BWI target");
29461 SDLoc dl(Op);
29462 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29463 DAG.getIntPtrConstant(0, dl));
29464 Lo = DAG.getBitcast(MVT::v32i1, Lo);
29465 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29466 DAG.getIntPtrConstant(1, dl));
29467 Hi = DAG.getBitcast(MVT::v32i1, Hi);
29468 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
29469 }
29470
29471 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
29472 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
29473 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
29474 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
29475 SDLoc DL(Op);
29476 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
29477 V = getPMOVMSKB(DL, V, DAG, Subtarget);
29478 return DAG.getZExtOrTrunc(V, DL, DstVT);
29479 }
29480
29481 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
29482 SrcVT == MVT::i64) && "Unexpected VT!");
29483
29484 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29485 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
29486 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
29487 // This conversion needs to be expanded.
29488 return SDValue();
29489
29490 SDLoc dl(Op);
29491 if (SrcVT.isVector()) {
29492 // Widen the input vector in the case of MVT::v2i32.
29493 // Example: from MVT::v2i32 to MVT::v4i32.
29494 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
29495 SrcVT.getVectorNumElements() * 2);
29496 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
29497 DAG.getUNDEF(SrcVT));
29498 } else {
29499 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
29500 "Unexpected source type in LowerBITCAST");
29501 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
29502 }
29503
29504 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
29505 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
29506
29507 if (DstVT == MVT::x86mmx)
29508 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
29509
29510 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
29511 DAG.getIntPtrConstant(0, dl));
29512}
29513
29514/// Compute the horizontal sum of bytes in V for the elements of VT.
29515///
29516/// Requires V to be a byte vector and VT to be an integer vector type with
29517/// wider elements than V's type. The width of the elements of VT determines
29518/// how many bytes of V are summed horizontally to produce each element of the
29519/// result.
29520static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
29521 const X86Subtarget &Subtarget,
29522 SelectionDAG &DAG) {
29523 SDLoc DL(V);
29524 MVT ByteVecVT = V.getSimpleValueType();
29525 MVT EltVT = VT.getVectorElementType();
29526 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
29527 "Expected value to have byte element type.");
29528 assert(EltVT != MVT::i8 &&
29529 "Horizontal byte sum only makes sense for wider elements!");
29530 unsigned VecSize = VT.getSizeInBits();
29531 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
29532
29533 // The PSADBW instruction horizontally adds all bytes and leaves the result in
29534 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
29535 if (EltVT == MVT::i64) {
29536 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
29537 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29538 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
29539 return DAG.getBitcast(VT, V);
29540 }
29541
29542 if (EltVT == MVT::i32) {
29543 // We unpack the low half and high half into i32s interleaved with zeros so
29544 // that we can use PSADBW to horizontally sum them. The most useful part of
29545 // this is that it lines up the results of two PSADBW instructions to be
29546 // two v2i64 vectors which concatenated are the 4 population counts. We can
29547 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
29548 SDValue Zeros = DAG.getConstant(0, DL, VT);
29549 SDValue V32 = DAG.getBitcast(VT, V);
29550 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
29551 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
29552
29553 // Do the horizontal sums into two v2i64s.
29554 Zeros = DAG.getConstant(0, DL, ByteVecVT);
29555 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29556 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29557 DAG.getBitcast(ByteVecVT, Low), Zeros);
29558 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29559 DAG.getBitcast(ByteVecVT, High), Zeros);
29560
29561 // Merge them together.
29562 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
29563 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
29564 DAG.getBitcast(ShortVecVT, Low),
29565 DAG.getBitcast(ShortVecVT, High));
29566
29567 return DAG.getBitcast(VT, V);
29568 }
29569
29570 // The only element type left is i16.
29571 assert(EltVT == MVT::i16 && "Unknown how to handle type");
29572
29573 // To obtain pop count for each i16 element starting from the pop count for
29574 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
29575 // right by 8. It is important to shift as i16s because an i8 vector shift isn't
29576 // directly supported.
29577 SDValue ShifterV = DAG.getConstant(8, DL, VT);
29578 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29579 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
29580 DAG.getBitcast(ByteVecVT, V));
29581 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29582}
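// Worked example (illustrative, assumed values) for the i16 path above: if one
// i16 lane holds per-byte counts lo = 3 and hi = 2, then V = 0x0203,
// Shl = 0x0300, the byte-wise ADD gives 0x0503, and the final i16 SRL by 8
// yields 0x0005 = 3 + 2, the pop count of the original 16-bit element.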
29583
29584static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
29585 const X86Subtarget &Subtarget,
29586 SelectionDAG &DAG) {
29587 MVT VT = Op.getSimpleValueType();
29588 MVT EltVT = VT.getVectorElementType();
29589 int NumElts = VT.getVectorNumElements();
29590 (void)EltVT;
29591 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
29592
29593 // Implement a lookup table in register by using an algorithm based on:
29594 // http://wm.ite.pl/articles/sse-popcount.html
29595 //
29596 // The general idea is that every lower byte nibble in the input vector is an
29597 // index into an in-register pre-computed pop count table. We then split up the
29598 // input vector into two new ones: (1) a vector with only the shifted-right
29599 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
29600 // masked out higher ones) for each byte. PSHUFB is used separately with both
29601 // to index the in-register table. Next, both are added and the result is an
29602 // i8 vector where each element contains the pop count for the input byte.
29603 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
29604 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
29605 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
29606 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
29607
29608 SmallVector<SDValue, 64> LUTVec;
29609 for (int i = 0; i < NumElts; ++i)
29610 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29611 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
29612 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
29613
29614 // High nibbles
29615 SDValue FourV = DAG.getConstant(4, DL, VT);
29616 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
29617
29618 // Low nibbles
29619 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
29620
29621 // The input vector is used as the shuffle mask that index elements into the
29622 // LUT. After counting low and high nibbles, add the vector to obtain the
29623 // final pop count per i8 element.
29624 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
29625 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
29626 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
29627}
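// Worked example (illustrative): for the byte 0xB7 = 0b10110111, the high
// nibble 0xB indexes LUT[11] = 3 and the low nibble 0x7 indexes LUT[7] = 3,
// and 3 + 3 = 6 = popcount(0xB7). A scalar model of the per-byte computation
// the two PSHUFB lookups perform (hypothetical helper, not part of this file):
//
//   static unsigned PopCount8ViaLUT(uint8_t X) {
//     static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
//                                     1, 2, 2, 3, 2, 3, 3, 4};
//     return LUT[X >> 4] + LUT[X & 0x0F];
//   }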
29628
29629// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
29630// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
29631static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29632 SelectionDAG &DAG) {
29633 MVT VT = Op.getSimpleValueType();
29634 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
29635 "Unknown CTPOP type to handle");
29636 SDLoc DL(Op.getNode());
29637 SDValue Op0 = Op.getOperand(0);
29638
29639 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
29640 if (Subtarget.hasVPOPCNTDQ()) {
29641 unsigned NumElems = VT.getVectorNumElements();
29642 assert((VT.getVectorElementType() == MVT::i8 ||
29643 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
29644 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
29645 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29646 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
29647 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
29648 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
29649 }
29650 }
29651
29652 // Decompose 256-bit ops into smaller 128-bit ops.
29653 if (VT.is256BitVector() && !Subtarget.hasInt256())
29654 return splitVectorIntUnary(Op, DAG);
29655
29656 // Decompose 512-bit ops into smaller 256-bit ops.
29657 if (VT.is512BitVector() && !Subtarget.hasBWI())
29658 return splitVectorIntUnary(Op, DAG);
29659
29660 // For element types greater than i8, do vXi8 pop counts and a bytesum.
29661 if (VT.getScalarType() != MVT::i8) {
29662 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
29663 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
29664 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
29665 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
29666 }
29667
29668 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
29669 if (!Subtarget.hasSSSE3())
29670 return SDValue();
29671
29672 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
29673}
29674
29675static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29676 SelectionDAG &DAG) {
29677 assert(Op.getSimpleValueType().isVector() &&
29678 "We only do custom lowering for vector population count.");
29679 return LowerVectorCTPOP(Op, Subtarget, DAG);
29680}
29681
29682static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
29683 MVT VT = Op.getSimpleValueType();
29684 SDValue In = Op.getOperand(0);
29685 SDLoc DL(Op);
29686
29687 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
29688 // perform the BITREVERSE.
29689 if (!VT.isVector()) {
29690 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
29691 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
29692 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
29693 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
29694 DAG.getIntPtrConstant(0, DL));
29695 }
29696
29697 int NumElts = VT.getVectorNumElements();
29698 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
29699
29700 // Decompose 256-bit ops into smaller 128-bit ops.
29701 if (VT.is256BitVector())
29702 return splitVectorIntUnary(Op, DAG);
29703
29704 assert(VT.is128BitVector() &&
29705 "Only 128-bit vector bitreverse lowering supported.");
29706
29707 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
29708 // perform the BSWAP in the shuffle.
29709 // It's best to shuffle using the second operand as this will implicitly allow
29710 // memory folding for multiple vectors.
29711 SmallVector<SDValue, 16> MaskElts;
29712 for (int i = 0; i != NumElts; ++i) {
29713 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
29714 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
29715 int PermuteByte = SourceByte | (2 << 5);
29716 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
29717 }
29718 }
29719
29720 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
29721 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
29722 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
29723 Res, Mask);
29724 return DAG.getBitcast(VT, Res);
29725}
29726
29727static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
29728 SelectionDAG &DAG) {
29729 MVT VT = Op.getSimpleValueType();
29730
29731 if (Subtarget.hasXOP() && !VT.is512BitVector())
29732 return LowerBITREVERSE_XOP(Op, DAG);
29733
29734 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
29735
29736 SDValue In = Op.getOperand(0);
29737 SDLoc DL(Op);
29738
29739 assert(VT.getScalarType() == MVT::i8 &&
29740 "Only byte vector BITREVERSE supported");
29741
29742 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
29743 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
29744 return splitVectorIntUnary(Op, DAG);
29745
29746 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
29747 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
29748 return splitVectorIntUnary(Op, DAG);
29749
29750 unsigned NumElts = VT.getVectorNumElements();
29751
29752 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
29753 if (Subtarget.hasGFNI()) {
29754 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29755 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
29756 Matrix = DAG.getBitcast(VT, Matrix);
29757 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
29758 DAG.getTargetConstant(0, DL, MVT::i8));
29759 }
29760
29761 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its two
29762 // nibbles, and a PSHUFB lookup finds the bitreverse of each 0-15 value
29763 // (moved to the other nibble).
29764 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
29765 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
29766 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
29767
29768 const int LoLUT[16] = {
29769 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
29770 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
29771 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
29772 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
29773 const int HiLUT[16] = {
29774 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
29775 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
29776 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
29777 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
29778
29779 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
29780 for (unsigned i = 0; i < NumElts; ++i) {
29781 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
29782 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
29783 }
29784
29785 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
29786 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
29787 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
29788 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
29789 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29790}
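// Worked example (illustrative, assumed value) for the nibble LUTs above: for
// the byte 0x1E = 0b00011110, the low nibble 0xE maps to LoLUT[0xE] = 0x70
// (its reversal placed in the high nibble) and the high nibble 0x1 maps to
// HiLUT[0x1] = 0x08 (its reversal placed in the low nibble); OR-ing gives
// 0x78 = 0b01111000, which is 0x1E with its bits reversed.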
29791
29792static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
29793 SelectionDAG &DAG) {
29794 SDLoc DL(Op);
29795 SDValue X = Op.getOperand(0);
29796 MVT VT = Op.getSimpleValueType();
29797
29798 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
29799 if (VT == MVT::i8 ||
29800 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
29801 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29802 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
29803 DAG.getConstant(0, DL, MVT::i8));
29804 // Copy the inverse of the parity flag into a register with setcc.
29805 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29806 // Extend to the original type.
29807 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29808 }
29809
29810 if (VT == MVT::i64) {
29811 // Xor the high and low 32-bit halves together using a 32-bit operation.
29812 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
29813 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
29814 DAG.getConstant(32, DL, MVT::i8)));
29815 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
29816 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
29817 }
29818
29819 if (VT != MVT::i16) {
29820 // Xor the high and low 16-bits together using a 32-bit operation.
29821 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
29822 DAG.getConstant(16, DL, MVT::i8));
29823 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
29824 } else {
29825 // If the input is 16-bits, we need to extend to use an i32 shift below.
29826 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
29827 }
29828
29829 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
29830 // This should allow an h-reg to be used to save a shift.
29831 SDValue Hi = DAG.getNode(
29832 ISD::TRUNCATE, DL, MVT::i8,
29833 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
29834 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29835 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
29836 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
29837
29838 // Copy the inverse of the parity flag into a register with setcc.
29839 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29840 // Extend to the original type.
29841 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29842}
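// Worked example (illustrative, assumed value) for the i32 parity fold above:
// X = 0x0F0F0001 has 9 set bits, so its parity is 1.
//   X ^= X >> 16      -> low 16 bits become 0x0F0E
//   low byte ^ byte 1 -> 0x0E ^ 0x0F = 0x01 (one set bit, odd parity)
// An odd-parity result leaves PF clear, so the X86::COND_NP setcc produces 1,
// matching the expected ISD::PARITY result.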
29843
29844static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
29845 const X86Subtarget &Subtarget) {
29846 unsigned NewOpc = 0;
29847 switch (N->getOpcode()) {
29848 case ISD::ATOMIC_LOAD_ADD:
29849 NewOpc = X86ISD::LADD;
29850 break;
29851 case ISD::ATOMIC_LOAD_SUB:
29852 NewOpc = X86ISD::LSUB;
29853 break;
29854 case ISD::ATOMIC_LOAD_OR:
29855 NewOpc = X86ISD::LOR;
29856 break;
29857 case ISD::ATOMIC_LOAD_XOR:
29858 NewOpc = X86ISD::LXOR;
29859 break;
29860 case ISD::ATOMIC_LOAD_AND:
29861 NewOpc = X86ISD::LAND;
29862 break;
29863 default:
29864 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
29865 }
29866
29867 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
29868
29869 return DAG.getMemIntrinsicNode(
29870 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
29871 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
29872 /*MemVT=*/N->getSimpleValueType(0), MMO);
29873}
29874
29875/// Lower atomic_load_ops into LOCK-prefixed operations.
29876static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
29877 const X86Subtarget &Subtarget) {
29878 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
29879 SDValue Chain = N->getOperand(0);
29880 SDValue LHS = N->getOperand(1);
29881 SDValue RHS = N->getOperand(2);
29882 unsigned Opc = N->getOpcode();
29883 MVT VT = N->getSimpleValueType(0);
29884 SDLoc DL(N);
29885
29886 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
29887 // can only be lowered when the result is unused. They should have already
29888 // been transformed into a cmpxchg loop in AtomicExpand.
29889 if (N->hasAnyUseOfValue(0)) {
29890 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
29891 // select LXADD if LOCK_SUB can't be selected.
29892 if (Opc == ISD::ATOMIC_LOAD_SUB) {
29893 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
29894 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
29895 RHS, AN->getMemOperand());
29896 }
29897 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
29898 "Used AtomicRMW ops other than Add should have been expanded!");
29899 return N;
29900 }
29901
29902 // Specialized lowering for the canonical form of an idempotent atomicrmw.
29903 // The core idea here is that since the memory location isn't actually
29904 // changing, all we need is a lowering for the *ordering* impacts of the
29905 // atomicrmw. As such, we can choose a different operation and memory
29906 // location to minimize impact on other code.
29907 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
29908 // On X86, the only ordering which actually requires an instruction is
29909 // seq_cst which isn't SingleThread, everything just needs to be preserved
29910 // during codegen and then dropped. Note that we expect (but don't assume)
29911 // that orderings other than seq_cst and acq_rel have been canonicalized to
29912 // a store or load.
29913 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
29914 AN->getSyncScopeID() == SyncScope::System) {
29915 // Prefer a locked operation against a stack location to minimize cache
29916 // traffic. This assumes that stack locations are very likely to be
29917 // accessed only by the owning thread.
29918 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
29919 assert(!N->hasAnyUseOfValue(0));
29920 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29921 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29922 DAG.getUNDEF(VT), NewChain);
29923 }
29924 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29925 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
29926 assert(!N->hasAnyUseOfValue(0));
29927 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29928 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29929 DAG.getUNDEF(VT), NewChain);
29930 }
29931
29932 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
29933 // RAUW the chain, but don't worry about the result, as it's unused.
29934 assert(!N->hasAnyUseOfValue(0));
29935 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29936 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29937 DAG.getUNDEF(VT), LockOp.getValue(1));
29938}
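// Illustrative mapping for the lowering above (a sketch; the IR lines are
// hypothetical examples, not taken from this file):
//   atomicrmw add ptr %p, i32 %v seq_cst   ; result used   -> LOCK XADD
//   atomicrmw sub ptr %p, i32 %v seq_cst   ; result used   -> rewritten as add of -%v
//   atomicrmw or  ptr %p, i32 0  seq_cst   ; result unused -> locked op on a stack
//                                            slot via emitLockedStackOp
//   any other atomicrmw with a used result -> already expanded to a cmpxchg loop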
29939
29940static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
29941 const X86Subtarget &Subtarget) {
29942 auto *Node = cast<AtomicSDNode>(Op.getNode());
29943 SDLoc dl(Node);
29944 EVT VT = Node->getMemoryVT();
29945
29946 bool IsSeqCst =
29947 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
29948 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
29949
29950 // If this store is not sequentially consistent and the type is legal
29951 // we can just keep it.
29952 if (!IsSeqCst && IsTypeLegal)
29953 return Op;
29954
29955 if (VT == MVT::i64 && !IsTypeLegal) {
29956 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
29957 // is enabled.
29958 bool NoImplicitFloatOps =
29959 DAG.getMachineFunction().getFunction().hasFnAttribute(
29960 Attribute::NoImplicitFloat);
29961 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
29962 SDValue Chain;
29963 if (Subtarget.hasSSE1()) {
29964 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
29965 Node->getOperand(2));
29966 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29967 SclToVec = DAG.getBitcast(StVT, SclToVec);
29968 SDVTList Tys = DAG.getVTList(MVT::Other);
29969 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
29970 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
29971 MVT::i64, Node->getMemOperand());
29972 } else if (Subtarget.hasX87()) {
29973 // First load this into an 80-bit X87 register using a stack temporary.
29974 // This will put the whole integer into the significand.
29975 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29976 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29977 MachinePointerInfo MPI =
29978 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29979 Chain =
29980 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
29981 MPI, MaybeAlign(), MachineMemOperand::MOStore);
29982 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29983 SDValue LdOps[] = {Chain, StackPtr};
29984 SDValue Value =
29985 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
29986 /*Align*/ None, MachineMemOperand::MOLoad);
29987 Chain = Value.getValue(1);
29988
29989 // Now use an FIST to do the atomic store.
29990 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
29991 Chain =
29992 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
29993 StoreOps, MVT::i64, Node->getMemOperand());
29994 }
29995
29996 if (Chain) {
29997 // If this is a sequentially consistent store, also emit an appropriate
29998 // barrier.
29999 if (IsSeqCst)
30000 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
30001
30002 return Chain;
30003 }
30004 }
30005 }
30006
30007 // Convert seq_cst store -> xchg
30008 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
30009 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
30010 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
30011 Node->getMemoryVT(),
30012 Node->getOperand(0),
30013 Node->getOperand(1), Node->getOperand(2),
30014 Node->getMemOperand());
30015 return Swap.getValue(1);
30016}
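// Rough summary of LowerATOMIC_STORE (sketch): a seq_cst store is rewritten as
// an ATOMIC_SWAP so the implicit LOCK of XCHG provides the fence; an illegal
// i64 store on 32-bit targets instead goes through SSE (MOVQ/MOVLPS-style
// VEXTRACT_STORE) or the x87 FILD/FIST pair, with an extra locked stack op
// added when the store is sequentially consistent.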
30017
30018static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
30019 SDNode *N = Op.getNode();
30020 MVT VT = N->getSimpleValueType(0);
30021 unsigned Opc = Op.getOpcode();
30022
30023 // Let legalize expand this if it isn't a legal type yet.
30024 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30025 return SDValue();
30026
30027 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30028 SDLoc DL(N);
30029
30030 // Set the carry flag.
30031 SDValue Carry = Op.getOperand(2);
30032 EVT CarryVT = Carry.getValueType();
30033 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
30034 Carry, DAG.getAllOnesConstant(DL, CarryVT));
30035
30036 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
30037 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
30038 Op.getOperand(0), Op.getOperand(1),
30039 Carry.getValue(1));
30040
30041 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
30042 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
30043 Sum.getValue(1), DL, DAG);
30044 if (N->getValueType(1) == MVT::i1)
30045 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
30046
30047 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
30048}
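// Sketch of the carry handling above: the incoming carry value (of arbitrary
// width) is folded into EFLAGS by adding all-ones to it, which sets CF exactly
// when the carry was nonzero; the main ADC/SBB then consumes that flag, and the
// second result is re-materialized with SETO (signed) or SETB (unsigned).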
30049
30050static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
30051 SelectionDAG &DAG) {
30052 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
30053
30054 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
30055 // which returns the values as { float, float } (in XMM0) or
30056 // { double, double } (which is returned in XMM0, XMM1).
30057 SDLoc dl(Op);
30058 SDValue Arg = Op.getOperand(0);
30059 EVT ArgVT = Arg.getValueType();
30060 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30061
30062 TargetLowering::ArgListTy Args;
30063 TargetLowering::ArgListEntry Entry;
30064
30065 Entry.Node = Arg;
30066 Entry.Ty = ArgTy;
30067 Entry.IsSExt = false;
30068 Entry.IsZExt = false;
30069 Args.push_back(Entry);
30070
30071 bool isF64 = ArgVT == MVT::f64;
30072 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
30073 // the small struct {f32, f32} is returned in (eax, edx). For f64,
30074 // the results are returned via SRet in memory.
30075 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30076 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
30077 const char *LibcallName = TLI.getLibcallName(LC);
30078 SDValue Callee =
30079 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
30080
30081 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
30082 : (Type *)FixedVectorType::get(ArgTy, 4);
30083
30084 TargetLowering::CallLoweringInfo CLI(DAG);
30085 CLI.setDebugLoc(dl)
30086 .setChain(DAG.getEntryNode())
30087 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
30088
30089 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
30090
30091 if (isF64)
30092 // Returned in xmm0 and xmm1.
30093 return CallResult.first;
30094
30095 // Returned in bits 0:31 and 32:63 of xmm0.
30096 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30097 CallResult.first, DAG.getIntPtrConstant(0, dl));
30098 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30099 CallResult.first, DAG.getIntPtrConstant(1, dl));
30100 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
30101 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
30102}
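// Sketch: on x86-64 Darwin a paired sin/cos of the same argument becomes one
// __sincos_stret call; for f64 the { double, double } result arrives in
// XMM0/XMM1 and is returned directly, while for f32 both values are packed
// into XMM0 and extracted as vector elements 0 and 1 above.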
30103
30104/// Widen a vector input to a vector of NVT. The
30105/// input vector must have the same element type as NVT.
30106static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
30107 bool FillWithZeroes = false) {
30108 // Check if InOp already has the right width.
30109 MVT InVT = InOp.getSimpleValueType();
30110 if (InVT == NVT)
30111 return InOp;
30112
30113 if (InOp.isUndef())
30114 return DAG.getUNDEF(NVT);
30115
30116 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
30117 "input and widen element type must match");
30118
30119 unsigned InNumElts = InVT.getVectorNumElements();
30120 unsigned WidenNumElts = NVT.getVectorNumElements();
30121 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
30122 "Unexpected request for vector widening");
30123
30124 SDLoc dl(InOp);
30125 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
30126 InOp.getNumOperands() == 2) {
30127 SDValue N1 = InOp.getOperand(1);
30128 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
30129 N1.isUndef()) {
30130 InOp = InOp.getOperand(0);
30131 InVT = InOp.getSimpleValueType();
30132 InNumElts = InVT.getVectorNumElements();
30133 }
30134 }
30135 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
30136 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
30137 SmallVector<SDValue, 16> Ops;
30138 for (unsigned i = 0; i < InNumElts; ++i)
30139 Ops.push_back(InOp.getOperand(i));
30140
30141 EVT EltVT = InOp.getOperand(0).getValueType();
30142
30143 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
30144 DAG.getUNDEF(EltVT);
30145 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
30146 Ops.push_back(FillVal);
30147 return DAG.getBuildVector(NVT, dl, Ops);
30148 }
30149 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
30150 DAG.getUNDEF(NVT);
30151 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
30152 InOp, DAG.getIntPtrConstant(0, dl));
30153}
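// Example of ExtendToType (hypothetical types): widening a v2i32 value to
// v8i32 inserts it at index 0 of an undef v8i32 (or an all-zero one when
// FillWithZeroes), while constant build_vectors are rebuilt element by element
// with the fill value appended so constant folding still applies.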
30154
30155static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
30156 SelectionDAG &DAG) {
30157 assert(Subtarget.hasAVX512() &&
30158 "MGATHER/MSCATTER are supported on AVX-512 arch only");
30159
30160 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
30161 SDValue Src = N->getValue();
30162 MVT VT = Src.getSimpleValueType();
30163 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
30164 SDLoc dl(Op);
30165
30166 SDValue Scale = N->getScale();
30167 SDValue Index = N->getIndex();
30168 SDValue Mask = N->getMask();
30169 SDValue Chain = N->getChain();
30170 SDValue BasePtr = N->getBasePtr();
30171
30172 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
30173 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
30174 // If the index is v2i64 and we have VLX we can use xmm for data and index.
30175 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
30176 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30177 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
30178 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
30179 SDVTList VTs = DAG.getVTList(MVT::Other);
30180 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30181 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30182 N->getMemoryVT(), N->getMemOperand());
30183 }
30184 return SDValue();
30185 }
30186
30187 MVT IndexVT = Index.getSimpleValueType();
30188
30189 // If the index is v2i32, we're being called by type legalization and we
30190 // should just let the default handling take care of it.
30191 if (IndexVT == MVT::v2i32)
30192 return SDValue();
30193
30194 // If we don't have VLX and neither the source data nor the index is 512
30195 // bits, we need to widen until one is.
30196 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
30197 !Index.getSimpleValueType().is512BitVector()) {
30198 // Determine how much we need to widen by to get a 512-bit type.
30199 unsigned Factor = std::min(512/VT.getSizeInBits(),
30200 512/IndexVT.getSizeInBits());
30201 unsigned NumElts = VT.getVectorNumElements() * Factor;
30202
30203 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30204 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30205 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30206
30207 Src = ExtendToType(Src, VT, DAG);
30208 Index = ExtendToType(Index, IndexVT, DAG);
30209 Mask = ExtendToType(Mask, MaskVT, DAG, true);
30210 }
30211
30212 SDVTList VTs = DAG.getVTList(MVT::Other);
30213 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30214 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30215 N->getMemoryVT(), N->getMemOperand());
30216}
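// Sketch of the no-VLX scatter path: the data, index and mask are widened by a
// common factor until either the data or the index reaches 512 bits; the mask
// is padded with zeros (FillWithZeroes) so the extra lanes are never stored.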
30217
30218static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
30219 SelectionDAG &DAG) {
30220
30221 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
30222 MVT VT = Op.getSimpleValueType();
30223 MVT ScalarVT = VT.getScalarType();
30224 SDValue Mask = N->getMask();
30225 MVT MaskVT = Mask.getSimpleValueType();
30226 SDValue PassThru = N->getPassThru();
30227 SDLoc dl(Op);
30228
30229 // Handle AVX masked loads which don't support passthru other than 0.
30230 if (MaskVT.getVectorElementType() != MVT::i1) {
30231 // We also allow undef in the isel pattern.
30232 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
30233 return Op;
30234
30235 SDValue NewLoad = DAG.getMaskedLoad(
30236 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30237 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
30238 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
30239 N->isExpandingLoad());
30240 // Emit a blend.
30241 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
30242 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
30243 }
30244
30245 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
30246 "Expanding masked load is supported on AVX-512 target only!");
30247
30248 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
30249 "Expanding masked load is supported for 32 and 64-bit types only!");
30250
30251 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30252 "Cannot lower masked load op.");
30253
30254 assert((ScalarVT.getSizeInBits() >= 32 ||
30255 (Subtarget.hasBWI() &&
30256 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30257 "Unsupported masked load op.");
30258
30259 // This operation is legal for targets with VLX, but without
30260 // VLX the vector should be widened to 512 bits.
30261 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
30262 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30263 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
30264
30265 // Mask element has to be i1.
30266 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30267 "Unexpected mask type");
30268
30269 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30270
30271 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30272 SDValue NewLoad = DAG.getMaskedLoad(
30273 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30274 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
30275 N->getExtensionType(), N->isExpandingLoad());
30276
30277 SDValue Extract =
30278 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
30279 DAG.getIntPtrConstant(0, dl));
30280 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
30281 return DAG.getMergeValues(RetOps, dl);
30282}
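// Sketch: with AVX-style vector masks (element type != i1) a nonzero passthru
// is handled by loading with a zero passthru and blending the result with the
// original passthru via VSELECT; with AVX-512 but no VLX the load is widened to
// 512 bits (mask zero-padded) and the original width extracted afterwards.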
30283
30284static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
30285 SelectionDAG &DAG) {
30286 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
30287 SDValue DataToStore = N->getValue();
30288 MVT VT = DataToStore.getSimpleValueType();
30289 MVT ScalarVT = VT.getScalarType();
30290 SDValue Mask = N->getMask();
30291 SDLoc dl(Op);
30292
30293 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
30294 "Compressing masked store is supported on AVX-512 target only!");
30295
30296 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
30297 "Compressing masked store is supported for 32 and 64-bit types only!");
30298
30299 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30300 "Cannot lower masked store op.");
30301
30302 assert((ScalarVT.getSizeInBits() >= 32 ||
30303 (Subtarget.hasBWI() &&
30304 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30305 "Unsupported masked store op.");
30306
30307 // This operation is legal for targets with VLX, but without
30308 // VLX the vector should be widened to 512 bits.
30309 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
30310 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30311
30312 // Mask element has to be i1.
30313 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30314 "Unexpected mask type");
30315
30316 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30317
30318 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
30319 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30320 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
30321 N->getOffset(), Mask, N->getMemoryVT(),
30322 N->getMemOperand(), N->getAddressingMode(),
30323 N->isTruncatingStore(), N->isCompressingStore());
30324}
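// Sketch: the store mirrors the load path; data and mask are widened to 512
// bits, and because the padded mask lanes are zero the extra elements are
// never written to memory.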
30325
30326static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
30327 SelectionDAG &DAG) {
30328 assert(Subtarget.hasAVX2() &&
30329 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
30330
30331 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
30332 SDLoc dl(Op);
30333 MVT VT = Op.getSimpleValueType();
30334 SDValue Index = N->getIndex();
30335 SDValue Mask = N->getMask();
30336 SDValue PassThru = N->getPassThru();
30337 MVT IndexVT = Index.getSimpleValueType();
30338
30339 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
30340
30341 // If the index is v2i32, we're being called by type legalization.
30342 if (IndexVT == MVT::v2i32)
30343 return SDValue();
30344
30345 // If we don't have VLX and neither the passthru nor the index is 512 bits,
30346 // we need to widen until one is.
30347 MVT OrigVT = VT;
30348 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30349 !IndexVT.is512BitVector()) {
30350 // Determine how much we need to widen by to get a 512-bit type.
30351 unsigned Factor = std::min(512/VT.getSizeInBits(),
30352 512/IndexVT.getSizeInBits());
30353
30354 unsigned NumElts = VT.getVectorNumElements() * Factor;
30355
30356 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30357 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30358 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30359
30360 PassThru = ExtendToType(PassThru, VT, DAG);
30361 Index = ExtendToType(Index, IndexVT, DAG);
30362 Mask = ExtendToType(Mask, MaskVT, DAG, true);
30363 }
30364
30365 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
30366 N->getScale() };
30367 SDValue NewGather = DAG.getMemIntrinsicNode(
30368 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
30369 N->getMemOperand());
30370 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
30371 NewGather, DAG.getIntPtrConstant(0, dl));
30372 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
30373}
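// Sketch: gathers use the same widening recipe; passthru, index and mask are
// widened, a wide X86ISD::MGATHER is emitted, and the original-width result is
// extracted from subvector 0 before being merged with the chain.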
30374
30375static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
30376 SDLoc dl(Op);
30377 SDValue Src = Op.getOperand(0);
30378 MVT DstVT = Op.getSimpleValueType();
30379
30380 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
30381 unsigned SrcAS = N->getSrcAddressSpace();
30382
30383 assert(SrcAS != N->getDestAddressSpace() &&
30384 "addrspacecast must be between different address spaces");
30385
30386 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
30387 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
30388 } else if (DstVT == MVT::i64) {
30389 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
30390 } else if (DstVT == MVT::i32) {
30391 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
30392 } else {
30393 report_fatal_error("Bad address space in addrspacecast");
30394 }
30395 return Op;
30396}
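// Sketch of the address-space rules above: PTR32_UPTR -> 64-bit zero-extends,
// any other 32-bit pointer -> 64-bit sign-extends, 64-bit -> 32-bit truncates,
// and every remaining combination is rejected with a fatal error.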
30397
30398SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
30399 SelectionDAG &DAG) const {
30400 // TODO: Eventually, the lowering of these nodes should be informed by or
30401 // deferred to the GC strategy for the function in which they appear. For
30402 // now, however, they must be lowered to something. Since they are logically
30403 // no-ops in the case of a null GC strategy (or a GC strategy which does not
30404 // require special handling for these nodes), lower them as literal NOOPs for
30405 // the time being.
30406 SmallVector<SDValue, 2> Ops;
30407
30408 Ops.push_back(Op.getOperand(0));
30409 if (Op->getGluedNode())
30410 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
30411
30412 SDLoc OpDL(Op);
30413 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
30414 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
30415
30416 return NOOP;
30417}
30418
30419// Custom split CVTPS2PH with wide types.
30420static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
30421 SDLoc dl(Op);
30422 EVT VT = Op.getValueType();
30423 SDValue Lo, Hi;
30424 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
30425 EVT LoVT, HiVT;
30426 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30427 SDValue RC = Op.getOperand(1);
30428 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
30429 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
30430 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30431}
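// Sketch: a CVTPS2PH whose result type is wider than the hardware handles is
// split into low/high halves, each half converted with the same rounding
// control operand RC, and the two halves reassembled with CONCAT_VECTORS.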
30432
30433/// Provide custom lowering hooks for some operations.
30434SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
30435 switch (Op.getOpcode()) {
30436 default: llvm_unreachable("Should not custom lower this!");
30437 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
30438 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
30439 return LowerCMP_SWAP(Op, Subtarget, DAG);
30440 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
30441 case ISD::ATOMIC_LOAD_ADD:
30442 case ISD::ATOMIC_LOAD_SUB:
30443 case ISD::ATOMIC_LOAD_OR:
30444 case ISD::ATOMIC_LOAD_XOR:
30445 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
30446 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
30447 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
30448 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
30449 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
30450 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
30451 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
30452 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
30453 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
30454 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
30455 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
30456 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
30457 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
30458 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
30459 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
30460 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
30461 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
30462 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
30463 case ISD::SHL_PARTS:
30464 case ISD::SRA_PARTS:
30465 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
30466 case ISD::FSHL:
30467 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
30468 case ISD::STRICT_SINT_TO_FP:
30469 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
30470 case ISD::STRICT_UINT_TO_FP:
30471 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
30472 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
30473 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
30474 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
30475 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
30476 case ISD::ZERO_EXTEND_VECTOR_INREG:
30477 case ISD::SIGN_EXTEND_VECTOR_INREG:
30478 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
30479 case ISD::FP_TO_SINT:
30480 case ISD::STRICT_FP_TO_SINT:
30481 case ISD::FP_TO_UINT:
30482 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
30483 case ISD::FP_TO_SINT_SAT:
30484 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
30485 case ISD::FP_EXTEND:
30486 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
30487 case ISD::FP_ROUND:
30488 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
30489 case ISD::FP16_TO_FP:
30490 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
30491 case ISD::FP_TO_FP16:
30492 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
30493 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
30494 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
30495 case ISD::FADD:
30496 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
30497 case ISD::FROUND: return LowerFROUND(Op, DAG);
30498 case ISD::FABS:
30499 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
30500 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
30501 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
30502 case ISD::LRINT:
30503 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
30504 case ISD::SETCC:
30505 case ISD::STRICT_FSETCC:
30506 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
30507 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
30508 case ISD::SELECT: return LowerSELECT(Op, DAG);
30509 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
30510 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
30511 case ISD::VASTART: return LowerVASTART(Op, DAG);
30512 case ISD::VAARG: return LowerVAARG(Op, DAG);
30513 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
30514 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
30515 case ISD::INTRINSIC_VOID:
30516 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
30517 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
30518 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
30519 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
30520 case ISD::FRAME_TO_ARGS_OFFSET:
30521 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
30522 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
30523 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
30524 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
30525 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
30526 case ISD::EH_SJLJ_SETUP_DISPATCH:
30527 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
30528 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
30529 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
30530 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
30531 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
30532 case ISD::CTLZ:
30533 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
30534 case ISD::CTTZ:
30535 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
30536 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
30537 case ISD::MULHS:
30538 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
30539 case ISD::ROTL:
30540 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
30541 case ISD::SRA:
30542 case ISD::SRL:
30543 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
30544 case ISD::SADDO:
30545 case ISD::UADDO:
30546 case ISD::SSUBO:
30547 case ISD::USUBO: return LowerXALUO(Op, DAG);
30548 case ISD::SMULO:
30549 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
30550 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
30551 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
30552 case ISD::SADDO_CARRY:
30553 case ISD::SSUBO_CARRY:
30554 case ISD::ADDCARRY:
30555 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
30556 case ISD::ADD:
30557 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
30558 case ISD::UADDSAT:
30559 case ISD::SADDSAT:
30560 case ISD::USUBSAT:
30561 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
30562 case ISD::SMAX:
30563 case ISD::SMIN:
30564 case ISD::UMAX:
30565 case ISD::UMIN: return LowerMINMAX(Op, DAG);
30566 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
30567 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
30568 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
30569 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
30570 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
30571 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
30572 case ISD::GC_TRANSITION_START:
30573 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
30574 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
30575 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
30576 }
30577}
30578
30579/// Replace a node with an illegal result type with a new node built out of
30580/// custom code.
30581void X86TargetLowering::ReplaceNodeResults(SDNode *N,
30582 SmallVectorImpl<SDValue>&Results,
30583 SelectionDAG &DAG) const {
30584 SDLoc dl(N);
30585 switch (N->getOpcode()) {
30586 default:
30587#ifndef NDEBUG
30588 dbgs() << "ReplaceNodeResults: ";
30589 N->dump(&DAG);
30590#endif
30591 llvm_unreachable("Do not know how to custom type legalize this operation!");
30592 case X86ISD::CVTPH2PS: {
30593 EVT VT = N->getValueType(0);
30594 SDValue Lo, Hi;
30595 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
30596 EVT LoVT, HiVT;
30597 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30598 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
30599 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
30600 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30601 Results.push_back(Res);
30602 return;
30603 }
30604 case X86ISD::STRICT_CVTPH2PS: {
30605 EVT VT = N->getValueType(0);
30606 SDValue Lo, Hi;
30607 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
30608 EVT LoVT, HiVT;
30609 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30610 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
30611 {N->getOperand(0), Lo});
30612 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
30613 {N->getOperand(0), Hi});
30614 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
30615 Lo.getValue(1), Hi.getValue(1));
30616 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30617 Results.push_back(Res);
30618 Results.push_back(Chain);
30619 return;
30620 }
30621 case X86ISD::CVTPS2PH:
30622 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
30623 return;
30624 case ISD::CTPOP: {
30625 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
30626 // Use a v2i64 if possible.
30627 bool NoImplicitFloatOps =
30628 DAG.getMachineFunction().getFunction().hasFnAttribute(
30629 Attribute::NoImplicitFloat);
30630 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
30631 SDValue Wide =
30632 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
30633 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
30634 // Bit count should fit in 32 bits; extract it as that and then zero
30635 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
30636 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
30637 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
30638 DAG.getIntPtrConstant(0, dl));
30639 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
30640 Results.push_back(Wide);
30641 }
30642 return;
30643 }
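// Sketch: on targets where v2i64 is legal, a scalar i64 CTPOP is performed by
// moving the value into a v2i64, using the vector CTPOP lowering, extracting
// the low 32 bits (the population count always fits), and zero-extending back
// to i64.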
30644 case ISD::MUL: {
30645 EVT VT = N->getValueType(0);
30646 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30647 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
30648 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
30649 // elements are needed.
30650 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30651 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
30652 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
30653 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
30654 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30655 unsigned NumConcats = 16 / VT.getVectorNumElements();
30656 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30657 ConcatOps[0] = Res;
30658 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
30659 Results.push_back(Res);
30660 return;
30661 }
30662 case X86ISD::VPMADDWD:
30663 case X86ISD::AVG: {
30664 // Legalize types for X86ISD::AVG/VPMADDWD by widening.
30665 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30666
30667 EVT VT = N->getValueType(0);
30668 EVT InVT = N->getOperand(0).getValueType();
30669 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
30670 "Expected a VT that divides into 128 bits.");
30671 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30672 "Unexpected type action!");
30673 unsigned NumConcat = 128 / InVT.getSizeInBits();
30674
30675 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
30676 InVT.getVectorElementType(),
30677 NumConcat * InVT.getVectorNumElements());
30678 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
30679 VT.getVectorElementType(),
30680 NumConcat * VT.getVectorNumElements());
30681
30682 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
30683 Ops[0] = N->getOperand(0);
30684 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30685 Ops[0] = N->getOperand(1);
30686 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30687
30688 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
30689 Results.push_back(Res);
30690 return;
30691 }
30692 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
30693 case X86ISD::FMINC:
30694 case X86ISD::FMIN:
30695 case X86ISD::FMAXC:
30696 case X86ISD::FMAX: {
30697 EVT VT = N->getValueType(0);
30698 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
30699 SDValue UNDEF = DAG.getUNDEF(VT);
30700 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30701 N->getOperand(0), UNDEF);
30702 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30703 N->getOperand(1), UNDEF);
30704 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
30705 return;
30706 }
30707 case ISD::SDIV:
30708 case ISD::UDIV:
30709 case ISD::SREM:
30710 case ISD::UREM: {
30711 EVT VT = N->getValueType(0);
30712 if (VT.isVector()) {
30713 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30714 "Unexpected type action!");
30715 // If this RHS is a constant splat vector we can widen this and let
30716 // division/remainder by constant optimize it.
30717 // TODO: Can we do something for non-splat?
30718 APInt SplatVal;
30719 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
30720 unsigned NumConcats = 128 / VT.getSizeInBits();
30721 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
30722 Ops0[0] = N->getOperand(0);
30723 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
30724 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
30725 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
30726 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
30727 Results.push_back(Res);
30728 }
30729 return;
30730 }
30731
30732 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
30733 Results.push_back(V);
30734 return;
30735 }
30736 case ISD::TRUNCATE: {
30737 MVT VT = N->getSimpleValueType(0);
30738 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
30739 return;
30740
30741 // The generic legalizer will try to widen the input type to the same
30742 // number of elements as the widened result type. But this isn't always
30743 // the best thing so do some custom legalization to avoid some cases.
30744 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
30745 SDValue In = N->getOperand(0);
30746 EVT InVT = In.getValueType();
30747
30748 unsigned InBits = InVT.getSizeInBits();
30749 if (128 % InBits == 0) {
30750 // 128-bit and smaller inputs should avoid truncate altogether and
30751 // just use a build_vector that will become a shuffle.
30752 // TODO: Widen and use a shuffle directly?
30753 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
30754 EVT EltVT = VT.getVectorElementType();
30755 unsigned WidenNumElts = WidenVT.getVectorNumElements();
30756 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
30757 // Use the original element count so we don't do more scalar opts than
30758 // necessary.
30759 unsigned MinElts = VT.getVectorNumElements();
30760 for (unsigned i=0; i < MinElts; ++i) {
30761 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
30762 DAG.getIntPtrConstant(i, dl));
30763 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
30764 }
30765 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
30766 return;
30767 }
30768 // With AVX512 there are some cases that can use a target specific
30769 // truncate node to go from 256/512 to less than 128 with zeros in the
30770 // upper elements of the 128 bit result.
30771 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
30772 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
30773 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
30774 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30775 return;
30776 }
30777 // There's one case we can widen to 512 bits and use VTRUNC.
30778 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
30779 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
30780 DAG.getUNDEF(MVT::v4i64));
30781 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30782 return;
30783 }
30784 }
30785 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
30786 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
30787 isTypeLegal(MVT::v4i64)) {
30788 // Input needs to be split and the output needs to be widened. Let's use two
30789 // VTRUNCs, and shuffle their results together into the wider type.
30790 SDValue Lo, Hi;
30791 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
30792
30793 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
30794 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
30795 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
30796 { 0, 1, 2, 3, 16, 17, 18, 19,
30797 -1, -1, -1, -1, -1, -1, -1, -1 });
30798 Results.push_back(Res);
30799 return;
30800 }
30801
30802 return;
30803 }
30804 case ISD::ANY_EXTEND:
30805 // Right now, only MVT::v8i8 has Custom action for an illegal type.
30806 // It's intended to custom handle the input type.
30807 assert(N->getValueType(0) == MVT::v8i8 &&
30808 "Do not know how to legalize this Node");
30809 return;
30810 case ISD::SIGN_EXTEND:
30811 case ISD::ZERO_EXTEND: {
30812 EVT VT = N->getValueType(0);
30813 SDValue In = N->getOperand(0);
30814 EVT InVT = In.getValueType();
30815 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
30816 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
30817 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
30818 "Unexpected type action!");
30819 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
30820 // Custom split this so we can extend i8/i16->i32 invec. This is better
30821 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
30822 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
30823 // we allow the sra from the extend to i32 to be shared by the split.
30824 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
30825
30826 // Fill a vector with sign bits for each element.
30827 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
30828 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
30829
30830 // Create an unpackl and unpackh to interleave the sign bits then bitcast
30831 // to v2i64.
30832 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30833 {0, 4, 1, 5});
30834 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
30835 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30836 {2, 6, 3, 7});
30837 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
30838
30839 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30840 Results.push_back(Res);
30841 return;
30842 }
30843
30844 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
30845 if (!InVT.is128BitVector()) {
30846 // Not a 128 bit vector, but maybe type legalization will promote
30847 // it to 128 bits.
30848 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
30849 return;
30850 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
30851 if (!InVT.is128BitVector())
30852 return;
30853
30854 // Promote the input to 128 bits. Type legalization will turn this into
30855 // zext_inreg/sext_inreg.
30856 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
30857 }
30858
30859 // Perform custom splitting instead of the two stage extend we would get
30860 // by default.
30861 EVT LoVT, HiVT;
30862 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
30863 assert(isTypeLegal(LoVT) && "Split VT not legal?");
30864
30865 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
30866
30867 // We need to shift the input over by half the number of elements.
30868 unsigned NumElts = InVT.getVectorNumElements();
30869 unsigned HalfNumElts = NumElts / 2;
30870 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
30871 for (unsigned i = 0; i != HalfNumElts; ++i)
30872 ShufMask[i] = i + HalfNumElts;
30873
30874 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
30875 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
30876
30877 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30878 Results.push_back(Res);
30879 }
30880 return;
30881 }
30882 case ISD::FP_TO_SINT:
30883 case ISD::STRICT_FP_TO_SINT:
30884 case ISD::FP_TO_UINT:
30885 case ISD::STRICT_FP_TO_UINT: {
30886 bool IsStrict = N->isStrictFPOpcode();
30887 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
30888 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
30889 EVT VT = N->getValueType(0);
30890 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30891 EVT SrcVT = Src.getValueType();
30892
30893 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
30894 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30895 "Unexpected type action!");
30896
30897 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
30898 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
30899 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
30900 VT.getVectorNumElements());
30901 SDValue Res;
30902 SDValue Chain;
30903 if (IsStrict) {
30904 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
30905 {N->getOperand(0), Src});
30906 Chain = Res.getValue(1);
30907 } else
30908 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
30909
30910 // Preserve what we know about the size of the original result. If the
30911 // result is v2i32, we have to manually widen the assert.
30912 if (PromoteVT == MVT::v2i32)
30913 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
30914 DAG.getUNDEF(MVT::v2i32));
30915
30916 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
30917 Res.getValueType(), Res,
30918 DAG.getValueType(VT.getVectorElementType()));
30919
30920 if (PromoteVT == MVT::v2i32)
30921 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
30922 DAG.getIntPtrConstant(0, dl));
30923
30924 // Truncate back to the original width.
30925 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30926
30927 // Now widen to 128 bits.
30928 unsigned NumConcats = 128 / VT.getSizeInBits();
30929 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
30930 VT.getVectorNumElements() * NumConcats);
30931 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30932 ConcatOps[0] = Res;
30933 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
30934 Results.push_back(Res);
30935 if (IsStrict)
30936 Results.push_back(Chain);
30937 return;
30938 }
30939
30940
30941 if (VT == MVT::v2i32) {
30942 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
30943 "Strict unsigned conversion requires AVX512");
30944 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30945 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30946 "Unexpected type action!");
30947 if (Src.getValueType() == MVT::v2f64) {
30948 if (!IsSigned && !Subtarget.hasAVX512()) {
30949 SDValue Res =
30950 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
30951 Results.push_back(Res);
30952 return;
30953 }
30954
30955 unsigned Opc;
30956 if (IsStrict)
30957 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30958 else
30959 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30960
30961 // If we have VLX we can emit a target specific FP_TO_UINT node.
30962 if (!IsSigned && !Subtarget.hasVLX()) {
30963 // Otherwise we can defer to the generic legalizer which will widen
30964 // the input as well. This will be further widened during op
30965 // legalization to v8i32<-v8f64.
30966 // For strict nodes we'll need to widen ourselves.
30967 // FIXME: Fix the type legalizer to safely widen strict nodes?
30968 if (!IsStrict)
30969 return;
30970 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
30971 DAG.getConstantFP(0.0, dl, MVT::v2f64));
30972 Opc = N->getOpcode();
30973 }
30974 SDValue Res;
30975 SDValue Chain;
30976 if (IsStrict) {
30977 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
30978 {N->getOperand(0), Src});
30979 Chain = Res.getValue(1);
30980 } else {
30981 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
30982 }
30983 Results.push_back(Res);
30984 if (IsStrict)
30985 Results.push_back(Chain);
30986 return;
30987 }
30988
30989 // Custom widen strict v2f32->v2i32 by padding with zeros.
30990 // FIXME: Should generic type legalizer do this?
30991 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
30992 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
30993 DAG.getConstantFP(0.0, dl, MVT::v2f32));
30994 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
30995 {N->getOperand(0), Src});
30996 Results.push_back(Res);
30997 Results.push_back(Res.getValue(1));
30998 return;
30999 }
31000
31001 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
31002 // so early out here.
31003 return;
31004 }
31005
31006 assert(!VT.isVector() && "Vectors should have been handled above!");
31007
31008 if (Subtarget.hasDQI() && VT == MVT::i64 &&
31009 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
31010 assert(!Subtarget.is64Bit() && "i64 should be legal");
31011 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
31012 // If we use a 128-bit result we might need to use a target specific node.
31013 unsigned SrcElts =
31014 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
31015 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
31016 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
31017 unsigned Opc = N->getOpcode();
31018 if (NumElts != SrcElts) {
31019 if (IsStrict)
31020 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31021 else
31022 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31023 }
31024
31025 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
31026 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
31027 DAG.getConstantFP(0.0, dl, VecInVT), Src,
31028 ZeroIdx);
31029 SDValue Chain;
31030 if (IsStrict) {
31031 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
31032 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
31033 Chain = Res.getValue(1);
31034 } else
31035 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
31036 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
31037 Results.push_back(Res);
31038 if (IsStrict)
31039 Results.push_back(Chain);
31040 return;
31041 }
31042
31043 SDValue Chain;
31044 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
31045 Results.push_back(V);
31046 if (IsStrict)
31047 Results.push_back(Chain);
31048 }
31049 return;
31050 }
31051 case ISD::LRINT:
31052 case ISD::LLRINT: {
31053 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
31054 Results.push_back(V);
31055 return;
31056 }
31057
31058 case ISD::SINT_TO_FP:
31059 case ISD::STRICT_SINT_TO_FP:
31060 case ISD::UINT_TO_FP:
31061 case ISD::STRICT_UINT_TO_FP: {
31062 bool IsStrict = N->isStrictFPOpcode();
31063 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
31064 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
31065 EVT VT = N->getValueType(0);
31066 if (VT != MVT::v2f32)
31067 return;
31068 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31069 EVT SrcVT = Src.getValueType();
31070 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
31071 if (IsStrict) {
31072 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
31073 : X86ISD::STRICT_CVTUI2P;
31074 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
31075 {N->getOperand(0), Src});
31076 Results.push_back(Res);
31077 Results.push_back(Res.getValue(1));
31078 } else {
31079 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
31080 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
31081 }
31082 return;
31083 }
31084 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
31085 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
31086 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
31087 SDValue One = DAG.getConstant(1, dl, SrcVT);
31088 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
31089 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
31090 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
31091 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
31092 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
31093 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
31094 for (int i = 0; i != 2; ++i) {
31095 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
31096 SignSrc, DAG.getIntPtrConstant(i, dl));
31097 if (IsStrict)
31098 SignCvts[i] =
31099 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
31100 {N->getOperand(0), Elt});
31101 else
31102 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
31103 };
31104 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
31105 SDValue Slow, Chain;
31106 if (IsStrict) {
31107 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31108 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
31109 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
31110 {Chain, SignCvt, SignCvt});
31111 Chain = Slow.getValue(1);
31112 } else {
31113 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
31114 }
31115 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
31116 IsNeg =
31117 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
31118 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
31119 Results.push_back(Cvt);
31120 if (IsStrict)
31121 Results.push_back(Chain);
31122 return;
31123 }
31124
31125 if (SrcVT != MVT::v2i32)
31126 return;
31127
31128 if (IsSigned || Subtarget.hasAVX512()) {
31129 if (!IsStrict)
31130 return;
31131
31132 // Custom widen strict v2i32->v2f32 to avoid scalarization.
31133 // FIXME: Should generic type legalizer do this?
31134 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
31135 DAG.getConstant(0, dl, MVT::v2i32));
31136 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
31137 {N->getOperand(0), Src});
31138 Results.push_back(Res);
31139 Results.push_back(Res.getValue(1));
31140 return;
31141 }
31142
31143 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31144 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
31145 SDValue VBias =
31146 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
31147 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
31148 DAG.getBitcast(MVT::v2i64, VBias));
31149 Or = DAG.getBitcast(MVT::v2f64, Or);
31150 if (IsStrict) {
31151 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
31152 {N->getOperand(0), Or, VBias});
31153 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
31154 {MVT::v4f32, MVT::Other},
31155 {Sub.getValue(1), Sub});
31156 Results.push_back(Res);
31157 Results.push_back(Res.getValue(1));
31158 } else {
31159 // TODO: Are there any fast-math-flags to propagate here?
31160 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
31161 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
31162 }
31163 return;
31164 }
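// Sketch of the SSE2 unsigned v2i32 -> v2f32 path above: the input is
// zero-extended to v2i64 and OR'ed with the bit pattern 0x4330000000000000
// (the double 2^52), placing the integer in the mantissa of an exact double;
// subtracting the same bias (VBias) yields the exact v2f64 values, which are
// then narrowed to v4f32 with VFPROUND.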
31165 case ISD::STRICT_FP_ROUND:
31166 case ISD::FP_ROUND: {
31167 bool IsStrict = N->isStrictFPOpcode();
31168 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31169 if (!isTypeLegal(Src.getValueType()))
31170 return;
31171 SDValue V;
31172 if (IsStrict)
31173 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
31174 {N->getOperand(0), N->getOperand(1)});
31175 else
31176 V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
31177 Results.push_back(V);
31178 if (IsStrict)
31179 Results.push_back(V.getValue(1));
31180 return;
31181 }
31182 case ISD::FP_EXTEND:
31183 case ISD::STRICT_FP_EXTEND: {
31184 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
31185 // No other ValueType for FP_EXTEND should reach this point.
31186 assert(N->getValueType(0) == MVT::v2f32 &&
31187 "Do not know how to legalize this Node");
31188 return;
31189 }
31190 case ISD::INTRINSIC_W_CHAIN: {
31191 unsigned IntNo = N->getConstantOperandVal(1);
31192 switch (IntNo) {
31193 default : llvm_unreachable("Do not know how to custom type "
31194 "legalize this intrinsic operation!");
31195 case Intrinsic::x86_rdtsc:
31196 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
31197 Results);
31198 case Intrinsic::x86_rdtscp:
31199 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
31200 Results);
31201 case Intrinsic::x86_rdpmc:
31202 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
31203 Results);
31204 return;
31205 case Intrinsic::x86_xgetbv:
31206 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
31207 Results);
31208 return;
31209 }
31210 }
31211 case ISD::READCYCLECOUNTER: {
31212 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
31213 }
31214 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
31215 EVT T = N->getValueType(0);
31216 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
31217 bool Regs64bit = T == MVT::i128;
31218 assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
31219 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
31220 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
31221 SDValue cpInL, cpInH;
31222 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31223 DAG.getConstant(0, dl, HalfT));
31224 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31225 DAG.getConstant(1, dl, HalfT));
31226 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
31227 Regs64bit ? X86::RAX : X86::EAX,
31228 cpInL, SDValue());
31229 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
31230 Regs64bit ? X86::RDX : X86::EDX,
31231 cpInH, cpInL.getValue(1));
31232 SDValue swapInL, swapInH;
31233 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31234 DAG.getConstant(0, dl, HalfT));
31235 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31236 DAG.getConstant(1, dl, HalfT));
31237 swapInH =
31238 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
31239 swapInH, cpInH.getValue(1));
31240
31241 // In 64-bit mode we might need the base pointer in RBX, but we can't know
31242 // until later. So we keep the RBX input in a vreg and use a custom
31243 // inserter.
31244 // Since RBX will be a reserved register, the register allocator will not
31245 // automatically make sure its value is properly saved and restored around
31246 // this live-range.
31247 SDValue Result;
31248 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31249 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
31250 if (Regs64bit) {
31251 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
31252 swapInH.getValue(1)};
31253 Result =
31254 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
31255 } else {
31256 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
31257 swapInH.getValue(1));
31258 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
31259 swapInL.getValue(1)};
31260 Result =
31261 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
31262 }
31263
31264 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
31265 Regs64bit ? X86::RAX : X86::EAX,
31266 HalfT, Result.getValue(1));
31267 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
31268 Regs64bit ? X86::RDX : X86::EDX,
31269 HalfT, cpOutL.getValue(2));
31270 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
31271
31272 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
31273 MVT::i32, cpOutH.getValue(2));
31274 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
31275 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
31276
31277 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
31278 Results.push_back(Success);
31279 Results.push_back(EFLAGS.getValue(1));
31280 return;
31281 }
31282 case ISD::ATOMIC_LOAD: {
31283 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31284 bool NoImplicitFloatOps =
31285 DAG.getMachineFunction().getFunction().hasFnAttribute(
31286 Attribute::NoImplicitFloat);
31287 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31288 auto *Node = cast<AtomicSDNode>(N);
31289 if (Subtarget.hasSSE1()) {
31290 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
31291 // Then extract the lower 64-bits.
31292 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31293 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
31294 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31295 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31296 MVT::i64, Node->getMemOperand());
31297 if (Subtarget.hasSSE2()) {
31298 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
31299 DAG.getIntPtrConstant(0, dl));
31300 Results.push_back(Res);
31301 Results.push_back(Ld.getValue(1));
31302 return;
31303 }
31304 // We use an alternative sequence for SSE1 that extracts as v2f32 and
31305 // then casts to i64. This avoids a 128-bit stack temporary being
31306 // created by type legalization if we were to cast v4f32->v2i64.
31307 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
31308 DAG.getIntPtrConstant(0, dl));
31309 Res = DAG.getBitcast(MVT::i64, Res);
31310 Results.push_back(Res);
31311 Results.push_back(Ld.getValue(1));
31312 return;
31313 }
31314 if (Subtarget.hasX87()) {
31315 // First load this into an 80-bit X87 register. This will put the whole
31316 // integer into the significand.
31317 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31318 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31319 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
31320 dl, Tys, Ops, MVT::i64,
31321 Node->getMemOperand());
31322 SDValue Chain = Result.getValue(1);
31323
31324 // Now store the X87 register to a stack temporary and convert to i64.
31325 // This store is not atomic and doesn't need to be.
31326 // FIXME: We don't need a stack temporary if the result of the load
31327 // is already being stored. We could just directly store there.
31328 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31329 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31330 MachinePointerInfo MPI =
31331 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31332 SDValue StoreOps[] = { Chain, Result, StackPtr };
31333 Chain = DAG.getMemIntrinsicNode(
31334 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
31335 MPI, None /*Align*/, MachineMemOperand::MOStore);
31336
31337 // Finally load the value back from the stack temporary and return it.
31338 // This load is not atomic and doesn't need to be.
31339 // This load will be further type legalized.
31340 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
31341 Results.push_back(Result);
31342 Results.push_back(Result.getValue(1));
31343 return;
31344 }
31345 }
31346 // TODO: Use MOVLPS when SSE1 is available?
31347 // Delegate to generic TypeLegalization. Situations we can really handle
31348 // should have already been dealt with by AtomicExpandPass.cpp.
31349 break;
31350 }
31351 case ISD::ATOMIC_SWAP:
31352 case ISD::ATOMIC_LOAD_ADD:
31353 case ISD::ATOMIC_LOAD_SUB:
31354 case ISD::ATOMIC_LOAD_AND:
31355 case ISD::ATOMIC_LOAD_OR:
31356 case ISD::ATOMIC_LOAD_XOR:
31357 case ISD::ATOMIC_LOAD_NAND:
31358 case ISD::ATOMIC_LOAD_MIN:
31359 case ISD::ATOMIC_LOAD_MAX:
31360 case ISD::ATOMIC_LOAD_UMIN:
31361 case ISD::ATOMIC_LOAD_UMAX:
31362 // Delegate to generic TypeLegalization. Situations we can really handle
31363 // should have already been dealt with by AtomicExpandPass.cpp.
31364 break;
31365
31366 case ISD::BITCAST: {
31367 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31368 EVT DstVT = N->getValueType(0);
31369 EVT SrcVT = N->getOperand(0).getValueType();
31370
31371 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
31372 // we can split using the k-register rather than memory.
31373 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
31374 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31375 SDValue Lo, Hi;
31376 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31377 Lo = DAG.getBitcast(MVT::i32, Lo);
31378 Hi = DAG.getBitcast(MVT::i32, Hi);
31379 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
31380 Results.push_back(Res);
31381 return;
31382 }
31383
31384 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
31385 // FIXME: Use v4f32 for SSE1?
31386 assert(Subtarget.hasSSE2() && "Requires SSE2");
31387 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
31388        "Unexpected type action!");
31389 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
31390 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
31391 N->getOperand(0));
31392 Res = DAG.getBitcast(WideVT, Res);
31393 Results.push_back(Res);
31394 return;
31395 }
31396
31397 return;
31398 }
31399 case ISD::MGATHER: {
31400 EVT VT = N->getValueType(0);
31401 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
31402 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
31403 auto *Gather = cast<MaskedGatherSDNode>(N);
31404 SDValue Index = Gather->getIndex();
31405 if (Index.getValueType() != MVT::v2i64)
31406 return;
31407 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31408        "Unexpected type action!");
31409 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31410 SDValue Mask = Gather->getMask();
31411 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31412 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
31413 Gather->getPassThru(),
31414 DAG.getUNDEF(VT));
31415 if (!Subtarget.hasVLX()) {
31416 // We need to widen the mask, but the instruction will only use 2
31417 // of its elements. So we can use undef.
31418 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
31419 DAG.getUNDEF(MVT::v2i1));
31420 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
31421 }
31422 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
31423 Gather->getBasePtr(), Index, Gather->getScale() };
31424 SDValue Res = DAG.getMemIntrinsicNode(
31425 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
31426 Gather->getMemoryVT(), Gather->getMemOperand());
31427 Results.push_back(Res);
31428 Results.push_back(Res.getValue(1));
31429 return;
31430 }
31431 return;
31432 }
31433 case ISD::LOAD: {
31434 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
31435 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
31436 // cast since type legalization will try to use an i64 load.
31437 MVT VT = N->getSimpleValueType(0);
31438 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
31439 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31440        "Unexpected type action!");
31441 if (!ISD::isNON_EXTLoad(N))
31442 return;
31443 auto *Ld = cast<LoadSDNode>(N);
31444 if (Subtarget.hasSSE2()) {
31445 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
31446 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
31447 Ld->getPointerInfo(), Ld->getOriginalAlign(),
31448 Ld->getMemOperand()->getFlags());
31449 SDValue Chain = Res.getValue(1);
31450 MVT VecVT = MVT::getVectorVT(LdVT, 2);
31451 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
31452 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31453 Res = DAG.getBitcast(WideVT, Res);
31454 Results.push_back(Res);
31455 Results.push_back(Chain);
31456 return;
31457 }
31458 assert(Subtarget.hasSSE1() && "Expected SSE");
31459 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
31460 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
31461 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31462 MVT::i64, Ld->getMemOperand());
31463 Results.push_back(Res);
31464 Results.push_back(Res.getValue(1));
31465 return;
31466 }
31467 case ISD::ADDRSPACECAST: {
31468 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
31469 Results.push_back(V);
31470 return;
31471 }
31472 case ISD::BITREVERSE:
31473 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31474 assert(Subtarget.hasXOP() && "Expected XOP");
31475 // We can use VPPERM by copying to a vector register and back. We'll need
31476 // to move the scalar in two i32 pieces.
31477 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
31478 return;
31479 }
31480}
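
// For illustration only: a minimal standalone sketch (not taken from this
// file) of the half-splitting convention that the ATOMIC_CMP_SWAP_WITH_SUCCESS
// expansion and the v64i1 BITCAST split above rely on. A wide value is split
// into low/high halves (EXTRACT_ELEMENT with constants 0 and 1) so each half
// can be placed in one register of a pair, and the result is reassembled from
// the halves with BUILD_PAIR. The helper names here are hypothetical.
#include <cstdint>

static inline void splitHalves(uint64_t V, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(V);        // EXTRACT_ELEMENT ..., 0
  Hi = static_cast<uint32_t>(V >> 32);  // EXTRACT_ELEMENT ..., 1
}

static inline uint64_t joinHalves(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;  // BUILD_PAIR
}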
31481
31482const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
31483 switch ((X86ISD::NodeType)Opcode) {
31484 case X86ISD::FIRST_NUMBER: break;
31485#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
31486 NODE_NAME_CASE(BSF)
31487 NODE_NAME_CASE(BSR)
31488 NODE_NAME_CASE(FSHL)
31489 NODE_NAME_CASE(FSHR)
31490 NODE_NAME_CASE(FAND)
31491 NODE_NAME_CASE(FANDN)
31492 NODE_NAME_CASE(FOR)
31493 NODE_NAME_CASE(FXOR)
31494 NODE_NAME_CASE(FILD)
31495 NODE_NAME_CASE(FIST)
31496 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
31497 NODE_NAME_CASE(FLD)
31498 NODE_NAME_CASE(FST)
31499 NODE_NAME_CASE(CALL)
31500 NODE_NAME_CASE(CALL_RVMARKER)
31501 NODE_NAME_CASE(BT)
31502 NODE_NAME_CASE(CMP)
31503 NODE_NAME_CASE(FCMP)
31504 NODE_NAME_CASE(STRICT_FCMP)
31505 NODE_NAME_CASE(STRICT_FCMPS)
31506 NODE_NAME_CASE(COMI)
31507 NODE_NAME_CASE(UCOMI)
31508 NODE_NAME_CASE(CMPM)
31509 NODE_NAME_CASE(CMPMM)
31510 NODE_NAME_CASE(STRICT_CMPM)
31511 NODE_NAME_CASE(CMPMM_SAE)
31512 NODE_NAME_CASE(SETCC)
31513 NODE_NAME_CASE(SETCC_CARRY)
31514 NODE_NAME_CASE(FSETCC)
31515 NODE_NAME_CASE(FSETCCM)
31516 NODE_NAME_CASE(FSETCCM_SAE)
31517 NODE_NAME_CASE(CMOV)
31518 NODE_NAME_CASE(BRCOND)
31519 NODE_NAME_CASE(RET_FLAG)
31520 NODE_NAME_CASE(IRET)
31521 NODE_NAME_CASE(REP_STOS)
31522 NODE_NAME_CASE(REP_MOVS)
31523 NODE_NAME_CASE(GlobalBaseReg)
31524 NODE_NAME_CASE(Wrapper)
31525 NODE_NAME_CASE(WrapperRIP)
31526 NODE_NAME_CASE(MOVQ2DQ)
31527 NODE_NAME_CASE(MOVDQ2Q)
31528 NODE_NAME_CASE(MMX_MOVD2W)
31529 NODE_NAME_CASE(MMX_MOVW2D)
31530 NODE_NAME_CASE(PEXTRB)
31531 NODE_NAME_CASE(PEXTRW)
31532 NODE_NAME_CASE(INSERTPS)
31533 NODE_NAME_CASE(PINSRB)
31534 NODE_NAME_CASE(PINSRW)
31535 NODE_NAME_CASE(PSHUFB)
31536 NODE_NAME_CASE(ANDNP)
31537 NODE_NAME_CASE(BLENDI)
31538 NODE_NAME_CASE(BLENDV)
31539 NODE_NAME_CASE(HADD)
31540 NODE_NAME_CASE(HSUB)
31541 NODE_NAME_CASE(FHADD)
31542 NODE_NAME_CASE(FHSUB)
31543 NODE_NAME_CASE(CONFLICT)
31544 NODE_NAME_CASE(FMAX)
31545 NODE_NAME_CASE(FMAXS)
31546 NODE_NAME_CASE(FMAX_SAE)
31547 NODE_NAME_CASE(FMAXS_SAE)
31548 NODE_NAME_CASE(FMIN)
31549 NODE_NAME_CASE(FMINS)
31550 NODE_NAME_CASE(FMIN_SAE)
31551 NODE_NAME_CASE(FMINS_SAE)
31552 NODE_NAME_CASE(FMAXC)
31553 NODE_NAME_CASE(FMINC)
31554 NODE_NAME_CASE(FRSQRT)
31555 NODE_NAME_CASE(FRCP)
31556 NODE_NAME_CASE(EXTRQI)
31557 NODE_NAME_CASE(INSERTQI)
31558 NODE_NAME_CASE(TLSADDR)
31559 NODE_NAME_CASE(TLSBASEADDR)
31560 NODE_NAME_CASE(TLSCALL)
31561 NODE_NAME_CASE(EH_SJLJ_SETJMP)
31562 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
31563 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
31564 NODE_NAME_CASE(EH_RETURN)
31565 NODE_NAME_CASE(TC_RETURN)
31566 NODE_NAME_CASE(FNSTCW16m)
31567 NODE_NAME_CASE(FLDCW16m)
31568 NODE_NAME_CASE(LCMPXCHG_DAG)
31569 NODE_NAME_CASE(LCMPXCHG8_DAG)
31570 NODE_NAME_CASE(LCMPXCHG16_DAG)
31571 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
31572 NODE_NAME_CASE(LADD)
31573 NODE_NAME_CASE(LSUB)
31574 NODE_NAME_CASE(LOR)
31575 NODE_NAME_CASE(LXOR)
31576 NODE_NAME_CASE(LAND)
31577 NODE_NAME_CASE(VZEXT_MOVL)
31578 NODE_NAME_CASE(VZEXT_LOAD)
31579 NODE_NAME_CASE(VEXTRACT_STORE)
31580 NODE_NAME_CASE(VTRUNC)
31581 NODE_NAME_CASE(VTRUNCS)
31582 NODE_NAME_CASE(VTRUNCUS)
31583 NODE_NAME_CASE(VMTRUNC)
31584 NODE_NAME_CASE(VMTRUNCS)
31585 NODE_NAME_CASE(VMTRUNCUS)
31586 NODE_NAME_CASE(VTRUNCSTORES)
31587 NODE_NAME_CASE(VTRUNCSTOREUS)
31588 NODE_NAME_CASE(VMTRUNCSTORES)
31589 NODE_NAME_CASE(VMTRUNCSTOREUS)
31590 NODE_NAME_CASE(VFPEXT)
31591 NODE_NAME_CASE(STRICT_VFPEXT)
31592 NODE_NAME_CASE(VFPEXT_SAE)
31593 NODE_NAME_CASE(VFPEXTS)
31594 NODE_NAME_CASE(VFPEXTS_SAE)
31595 NODE_NAME_CASE(VFPROUND)
31596 NODE_NAME_CASE(STRICT_VFPROUND)
31597 NODE_NAME_CASE(VMFPROUND)
31598 NODE_NAME_CASE(VFPROUND_RND)
31599 NODE_NAME_CASE(VFPROUNDS)
31600 NODE_NAME_CASE(VFPROUNDS_RND)
31601 NODE_NAME_CASE(VSHLDQ)
31602 NODE_NAME_CASE(VSRLDQ)
31603 NODE_NAME_CASE(VSHL)
31604 NODE_NAME_CASE(VSRL)
31605 NODE_NAME_CASE(VSRA)
31606 NODE_NAME_CASE(VSHLI)
31607 NODE_NAME_CASE(VSRLI)
31608 NODE_NAME_CASE(VSRAI)
31609 NODE_NAME_CASE(VSHLV)
31610 NODE_NAME_CASE(VSRLV)
31611 NODE_NAME_CASE(VSRAV)
31612 NODE_NAME_CASE(VROTLI)
31613 NODE_NAME_CASE(VROTRI)
31614 NODE_NAME_CASE(VPPERM)
31615 NODE_NAME_CASE(CMPP)
31616 NODE_NAME_CASE(STRICT_CMPP)
31617 NODE_NAME_CASE(PCMPEQ)
31618 NODE_NAME_CASE(PCMPGT)
31619 NODE_NAME_CASE(PHMINPOS)
31620 NODE_NAME_CASE(ADD)
31621 NODE_NAME_CASE(SUB)
31622 NODE_NAME_CASE(ADC)
31623 NODE_NAME_CASE(SBB)
31624 NODE_NAME_CASE(SMUL)
31625 NODE_NAME_CASE(UMUL)
31626 NODE_NAME_CASE(OR)
31627 NODE_NAME_CASE(XOR)
31628 NODE_NAME_CASE(AND)
31629 NODE_NAME_CASE(BEXTR)
31630 NODE_NAME_CASE(BEXTRI)
31631 NODE_NAME_CASE(BZHI)
31632 NODE_NAME_CASE(PDEP)
31633 NODE_NAME_CASE(PEXT)
31634 NODE_NAME_CASE(MUL_IMM)
31635 NODE_NAME_CASE(MOVMSK)
31636 NODE_NAME_CASE(PTEST)
31637 NODE_NAME_CASE(TESTP)
31638 NODE_NAME_CASE(KORTEST)
31639 NODE_NAME_CASE(KTEST)
31640 NODE_NAME_CASE(KADD)
31641 NODE_NAME_CASE(KSHIFTL)
31642 NODE_NAME_CASE(KSHIFTR)
31643 NODE_NAME_CASE(PACKSS)
31644 NODE_NAME_CASE(PACKUS)
31645 NODE_NAME_CASE(PALIGNR)
31646 NODE_NAME_CASE(VALIGN)
31647 NODE_NAME_CASE(VSHLD)
31648 NODE_NAME_CASE(VSHRD)
31649 NODE_NAME_CASE(VSHLDV)
31650 NODE_NAME_CASE(VSHRDV)
31651 NODE_NAME_CASE(PSHUFD)
31652 NODE_NAME_CASE(PSHUFHW)
31653 NODE_NAME_CASE(PSHUFLW)
31654 NODE_NAME_CASE(SHUFP)
31655 NODE_NAME_CASE(SHUF128)
31656 NODE_NAME_CASE(MOVLHPS)
31657 NODE_NAME_CASE(MOVHLPS)
31658 NODE_NAME_CASE(MOVDDUP)
31659 NODE_NAME_CASE(MOVSHDUP)
31660 NODE_NAME_CASE(MOVSLDUP)
31661 NODE_NAME_CASE(MOVSD)
31662 NODE_NAME_CASE(MOVSS)
31663 NODE_NAME_CASE(UNPCKL)
31664 NODE_NAME_CASE(UNPCKH)
31665 NODE_NAME_CASE(VBROADCAST)
31666 NODE_NAME_CASE(VBROADCAST_LOAD)
31667 NODE_NAME_CASE(VBROADCASTM)
31668 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
31669 NODE_NAME_CASE(VPERMILPV)
31670 NODE_NAME_CASE(VPERMILPI)
31671 NODE_NAME_CASE(VPERM2X128)
31672 NODE_NAME_CASE(VPERMV)
31673 NODE_NAME_CASE(VPERMV3)
31674 NODE_NAME_CASE(VPERMI)
31675 NODE_NAME_CASE(VPTERNLOG)
31676 NODE_NAME_CASE(VFIXUPIMM)
31677 NODE_NAME_CASE(VFIXUPIMM_SAE)
31678 NODE_NAME_CASE(VFIXUPIMMS)
31679 NODE_NAME_CASE(VFIXUPIMMS_SAE)
31680 NODE_NAME_CASE(VRANGE)
31681 NODE_NAME_CASE(VRANGE_SAE)
31682 NODE_NAME_CASE(VRANGES)
31683 NODE_NAME_CASE(VRANGES_SAE)
31684 NODE_NAME_CASE(PMULUDQ)
31685 NODE_NAME_CASE(PMULDQ)
31686 NODE_NAME_CASE(PSADBW)
31687 NODE_NAME_CASE(DBPSADBW)
31688 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
31689 NODE_NAME_CASE(VAARG_64)
31690 NODE_NAME_CASE(VAARG_X32)
31691 NODE_NAME_CASE(WIN_ALLOCA)
31692 NODE_NAME_CASE(MEMBARRIER)
31693 NODE_NAME_CASE(MFENCE)
31694 NODE_NAME_CASE(SEG_ALLOCA)
31695 NODE_NAME_CASE(PROBED_ALLOCA)
31696 NODE_NAME_CASE(RDRAND)
31697 NODE_NAME_CASE(RDSEED)
31698 NODE_NAME_CASE(RDPKRU)
31699 NODE_NAME_CASE(WRPKRU)
31700 NODE_NAME_CASE(VPMADDUBSW)
31701 NODE_NAME_CASE(VPMADDWD)
31702 NODE_NAME_CASE(VPSHA)
31703 NODE_NAME_CASE(VPSHL)
31704 NODE_NAME_CASE(VPCOM)
31705 NODE_NAME_CASE(VPCOMU)
31706 NODE_NAME_CASE(VPERMIL2)
31707 NODE_NAME_CASE(FMSUB)
31708 NODE_NAME_CASE(STRICT_FMSUB)
31709 NODE_NAME_CASE(FNMADD)
31710 NODE_NAME_CASE(STRICT_FNMADD)
31711 NODE_NAME_CASE(FNMSUB)
31712 NODE_NAME_CASE(STRICT_FNMSUB)
31713 NODE_NAME_CASE(FMADDSUB)
31714 NODE_NAME_CASE(FMSUBADD)
31715 NODE_NAME_CASE(FMADD_RND)
31716 NODE_NAME_CASE(FNMADD_RND)
31717 NODE_NAME_CASE(FMSUB_RND)
31718 NODE_NAME_CASE(FNMSUB_RND)
31719 NODE_NAME_CASE(FMADDSUB_RND)
31720 NODE_NAME_CASE(FMSUBADD_RND)
31721 NODE_NAME_CASE(VPMADD52H)
31722 NODE_NAME_CASE(VPMADD52L)
31723 NODE_NAME_CASE(VRNDSCALE)
31724 NODE_NAME_CASE(STRICT_VRNDSCALE)
31725 NODE_NAME_CASE(VRNDSCALE_SAE)
31726 NODE_NAME_CASE(VRNDSCALES)
31727 NODE_NAME_CASE(VRNDSCALES_SAE)
31728 NODE_NAME_CASE(VREDUCE)
31729 NODE_NAME_CASE(VREDUCE_SAE)
31730 NODE_NAME_CASE(VREDUCES)
31731 NODE_NAME_CASE(VREDUCES_SAE)
31732 NODE_NAME_CASE(VGETMANT)
31733 NODE_NAME_CASE(VGETMANT_SAE)
31734 NODE_NAME_CASE(VGETMANTS)
31735 NODE_NAME_CASE(VGETMANTS_SAE)
31736 NODE_NAME_CASE(PCMPESTR)
31737 NODE_NAME_CASE(PCMPISTR)
31738 NODE_NAME_CASE(XTEST)
31739 NODE_NAME_CASE(COMPRESS)
31740 NODE_NAME_CASE(EXPAND)
31741 NODE_NAME_CASE(SELECTS)
31742 NODE_NAME_CASE(ADDSUB)
31743 NODE_NAME_CASE(RCP14)
31744 NODE_NAME_CASE(RCP14S)
31745 NODE_NAME_CASE(RCP28)
31746 NODE_NAME_CASE(RCP28_SAE)
31747 NODE_NAME_CASE(RCP28S)
31748 NODE_NAME_CASE(RCP28S_SAE)
31749 NODE_NAME_CASE(EXP2)
31750 NODE_NAME_CASE(EXP2_SAE)
31751 NODE_NAME_CASE(RSQRT14)
31752 NODE_NAME_CASE(RSQRT14S)
31753 NODE_NAME_CASE(RSQRT28)
31754 NODE_NAME_CASE(RSQRT28_SAE)
31755 NODE_NAME_CASE(RSQRT28S)
31756 NODE_NAME_CASE(RSQRT28S_SAE)
31757 NODE_NAME_CASE(FADD_RND)
31758 NODE_NAME_CASE(FADDS)
31759 NODE_NAME_CASE(FADDS_RND)
31760 NODE_NAME_CASE(FSUB_RND)
31761 NODE_NAME_CASE(FSUBS)
31762 NODE_NAME_CASE(FSUBS_RND)
31763 NODE_NAME_CASE(FMUL_RND)
31764 NODE_NAME_CASE(FMULS)
31765 NODE_NAME_CASE(FMULS_RND)
31766 NODE_NAME_CASE(FDIV_RND)
31767 NODE_NAME_CASE(FDIVS)
31768 NODE_NAME_CASE(FDIVS_RND)
31769 NODE_NAME_CASE(FSQRT_RND)
31770 NODE_NAME_CASE(FSQRTS)
31771 NODE_NAME_CASE(FSQRTS_RND)
31772 NODE_NAME_CASE(FGETEXP)
31773 NODE_NAME_CASE(FGETEXP_SAE)
31774 NODE_NAME_CASE(FGETEXPS)
31775 NODE_NAME_CASE(FGETEXPS_SAE)
31776 NODE_NAME_CASE(SCALEF)
31777 NODE_NAME_CASE(SCALEF_RND)
31778 NODE_NAME_CASE(SCALEFS)
31779 NODE_NAME_CASE(SCALEFS_RND)
31780 NODE_NAME_CASE(AVG)
31781 NODE_NAME_CASE(MULHRS)
31782 NODE_NAME_CASE(SINT_TO_FP_RND)
31783 NODE_NAME_CASE(UINT_TO_FP_RND)
31784 NODE_NAME_CASE(CVTTP2SI)
31785 NODE_NAME_CASE(CVTTP2UI)
31786 NODE_NAME_CASE(STRICT_CVTTP2SI)
31787 NODE_NAME_CASE(STRICT_CVTTP2UI)
31788 NODE_NAME_CASE(MCVTTP2SI)
31789 NODE_NAME_CASE(MCVTTP2UI)
31790 NODE_NAME_CASE(CVTTP2SI_SAE)
31791 NODE_NAME_CASE(CVTTP2UI_SAE)
31792 NODE_NAME_CASE(CVTTS2SI)
31793 NODE_NAME_CASE(CVTTS2UI)
31794 NODE_NAME_CASE(CVTTS2SI_SAE)
31795 NODE_NAME_CASE(CVTTS2UI_SAE)
31796 NODE_NAME_CASE(CVTSI2P)
31797 NODE_NAME_CASE(CVTUI2P)
31798 NODE_NAME_CASE(STRICT_CVTSI2P)
31799 NODE_NAME_CASE(STRICT_CVTUI2P)
31800 NODE_NAME_CASE(MCVTSI2P)
31801 NODE_NAME_CASE(MCVTUI2P)
31802 NODE_NAME_CASE(VFPCLASS)
31803 NODE_NAME_CASE(VFPCLASSS)
31804 NODE_NAME_CASE(MULTISHIFT)
31805 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
31806 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
31807 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
31808 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
31809 NODE_NAME_CASE(CVTPS2PH)
31810 NODE_NAME_CASE(STRICT_CVTPS2PH)
31811 NODE_NAME_CASE(MCVTPS2PH)
31812 NODE_NAME_CASE(CVTPH2PS)
31813 NODE_NAME_CASE(STRICT_CVTPH2PS)
31814 NODE_NAME_CASE(CVTPH2PS_SAE)
31815 NODE_NAME_CASE(CVTP2SI)
31816 NODE_NAME_CASE(CVTP2UI)
31817 NODE_NAME_CASE(MCVTP2SI)
31818 NODE_NAME_CASE(MCVTP2UI)
31819 NODE_NAME_CASE(CVTP2SI_RND)
31820 NODE_NAME_CASE(CVTP2UI_RND)
31821 NODE_NAME_CASE(CVTS2SI)
31822 NODE_NAME_CASE(CVTS2UI)
31823 NODE_NAME_CASE(CVTS2SI_RND)
31824 NODE_NAME_CASE(CVTS2UI_RND)
31825 NODE_NAME_CASE(CVTNE2PS2BF16)
31826 NODE_NAME_CASE(CVTNEPS2BF16)
31827 NODE_NAME_CASE(MCVTNEPS2BF16)
31828 NODE_NAME_CASE(DPBF16PS)
31829 NODE_NAME_CASE(LWPINS)
31830 NODE_NAME_CASE(MGATHER)
31831 NODE_NAME_CASE(MSCATTER)
31832 NODE_NAME_CASE(VPDPBUSD)
31833 NODE_NAME_CASE(VPDPBUSDS)
31834 NODE_NAME_CASE(VPDPWSSD)
31835 NODE_NAME_CASE(VPDPWSSDS)
31836 NODE_NAME_CASE(VPSHUFBITQMB)
31837 NODE_NAME_CASE(GF2P8MULB)
31838 NODE_NAME_CASE(GF2P8AFFINEQB)
31839 NODE_NAME_CASE(GF2P8AFFINEINVQB)
31840 NODE_NAME_CASE(NT_CALL)
31841 NODE_NAME_CASE(NT_BRIND)
31842 NODE_NAME_CASE(UMWAIT)
31843 NODE_NAME_CASE(TPAUSE)
31844 NODE_NAME_CASE(ENQCMD)
31845 NODE_NAME_CASE(ENQCMDS)
31846 NODE_NAME_CASE(VP2INTERSECT)
31847 NODE_NAME_CASE(AESENC128KL)
31848 NODE_NAME_CASE(AESDEC128KL)
31849 NODE_NAME_CASE(AESENC256KL)
31850 NODE_NAME_CASE(AESDEC256KL)
31851 NODE_NAME_CASE(AESENCWIDE128KL)
31852 NODE_NAME_CASE(AESDECWIDE128KL)
31853 NODE_NAME_CASE(AESENCWIDE256KL)
31854 NODE_NAME_CASE(AESDECWIDE256KL)
31855 NODE_NAME_CASE(TESTUI)
31856 }
31857 return nullptr;
31858#undef NODE_NAME_CASE
31859}
31860
31861/// Return true if the addressing mode represented by AM is legal for this
31862/// target, for a load/store of the specified type.
31863bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
31864 const AddrMode &AM, Type *Ty,
31865 unsigned AS,
31866 Instruction *I) const {
31867 // X86 supports extremely general addressing modes.
31868 CodeModel::Model M = getTargetMachine().getCodeModel();
31869
31870 // X86 allows a sign-extended 32-bit immediate field as a displacement.
31871 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
31872 return false;
31873
31874 if (AM.BaseGV) {
31875 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
31876
31877 // If a reference to this global requires an extra load, we can't fold it.
31878 if (isGlobalStubReference(GVFlags))
31879 return false;
31880
31881 // If BaseGV requires a register for the PIC base, we cannot also have a
31882 // BaseReg specified.
31883 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
31884 return false;
31885
31886 // If lower 4G is not available, then we must use rip-relative addressing.
31887 if ((M != CodeModel::Small || isPositionIndependent()) &&
31888 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
31889 return false;
31890 }
31891
31892 switch (AM.Scale) {
31893 case 0:
31894 case 1:
31895 case 2:
31896 case 4:
31897 case 8:
31898 // These scales always work.
31899 break;
31900 case 3:
31901 case 5:
31902 case 9:
31903 // These scales are formed with basereg+scalereg. Only accept if there is
31904 // no basereg yet.
31905 if (AM.HasBaseReg)
31906 return false;
31907 break;
31908 default: // Other stuff never works.
31909 return false;
31910 }
31911
31912 return true;
31913}
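
// For illustration only: a hedged, standalone restatement (not taken from
// this file) of the scale rule enforced by isLegalAddressingMode above. The
// x86 addressing form is [Base + Index*Scale + Disp32]; scales 1, 2, 4 and 8
// are directly encodable in the SIB byte, while 3, 5 and 9 are only reachable
// as Base + Index*{2,4,8} with Base == Index, which consumes the base-register
// slot. The function name is hypothetical.
static bool isScaleEncodable(unsigned Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0: case 1: case 2: case 4: case 8:
    return true;                 // always encodable
  case 3: case 5: case 9:
    return !HasBaseReg;          // needs the base slot for the extra Index
  default:
    return false;                // no other scale exists on x86
  }
}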
31914
31915bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
31916 unsigned Bits = Ty->getScalarSizeInBits();
31917
31918 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
31919 // particularly cheaper than those without.
31920 if (Bits == 8)
31921 return false;
31922
31923 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
31924 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
31925 if (Subtarget.hasXOP() &&
31926 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
31927 return false;
31928
31929 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
31930 // shifts just as cheap as scalar ones.
31931 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
31932 return false;
31933
31934 // AVX512BW has shifts such as vpsllvw.
31935 if (Subtarget.hasBWI() && Bits == 16)
31936 return false;
31937
31938 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
31939 // fully general vector.
31940 return true;
31941}
31942
31943bool X86TargetLowering::isBinOp(unsigned Opcode) const {
31944 switch (Opcode) {
31945 // These are non-commutative binops.
31946 // TODO: Add more X86ISD opcodes once we have test coverage.
31947 case X86ISD::ANDNP:
31948 case X86ISD::PCMPGT:
31949 case X86ISD::FMAX:
31950 case X86ISD::FMIN:
31951 case X86ISD::FANDN:
31952 return true;
31953 }
31954
31955 return TargetLoweringBase::isBinOp(Opcode);
31956}
31957
31958bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
31959 switch (Opcode) {
31960 // TODO: Add more X86ISD opcodes once we have test coverage.
31961 case X86ISD::PCMPEQ:
31962 case X86ISD::PMULDQ:
31963 case X86ISD::PMULUDQ:
31964 case X86ISD::FMAXC:
31965 case X86ISD::FMINC:
31966 case X86ISD::FAND:
31967 case X86ISD::FOR:
31968 case X86ISD::FXOR:
31969 return true;
31970 }
31971
31972 return TargetLoweringBase::isCommutativeBinOp(Opcode);
31973}
31974
31975bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
31976 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31977 return false;
31978 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
31979 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
31980 return NumBits1 > NumBits2;
31981}
31982
31983bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
31984 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31985 return false;
31986
31987 if (!isTypeLegal(EVT::getEVT(Ty1)))
31988 return false;
31989
31990 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
31991
31992 // Assuming the caller doesn't have a zeroext or signext return parameter,
31993 // truncation all the way down to i1 is valid.
31994 return true;
31995}
31996
31997bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
31998 return isInt<32>(Imm);
31999}
32000
32001bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
32002 // Can also use sub to handle negated immediates.
32003 return isInt<32>(Imm);
32004}
32005
32006bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
32007 return isInt<32>(Imm);
32008}
32009
32010bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
32011 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
32012 return false;
32013 unsigned NumBits1 = VT1.getSizeInBits();
32014 unsigned NumBits2 = VT2.getSizeInBits();
32015 return NumBits1 > NumBits2;
32016}
32017
32018bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
32019 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32020 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
32021}
32022
32023bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
32024 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32025 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
32026}
32027
32028bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
32029 EVT VT1 = Val.getValueType();
32030 if (isZExtFree(VT1, VT2))
32031 return true;
32032
32033 if (Val.getOpcode() != ISD::LOAD)
32034 return false;
32035
32036 if (!VT1.isSimple() || !VT1.isInteger() ||
32037 !VT2.isSimple() || !VT2.isInteger())
32038 return false;
32039
32040 switch (VT1.getSimpleVT().SimpleTy) {
32041 default: break;
32042 case MVT::i8:
32043 case MVT::i16:
32044 case MVT::i32:
32045 // X86 has 8, 16, and 32-bit zero-extending loads.
32046 return true;
32047 }
32048
32049 return false;
32050}
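
// For illustration only: a small standalone example (not taken from this
// file) of why the i32 -> i64 zero extension treated as free above needs no
// extra code on x86-64: writing a 32-bit register implicitly clears bits
// 63:32, so the widened value is already correct. The function name is
// hypothetical.
#include <cstdint>

static inline uint64_t widenU32(uint32_t V) {
  return V;   // typically compiles to a plain 32-bit register move
}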
32051
32052bool X86TargetLowering::shouldSinkOperands(Instruction *I,
32053 SmallVectorImpl<Use *> &Ops) const {
32054 // A uniform shift amount in a vector shift or funnel shift may be much
32055 // cheaper than a generic variable vector shift, so make that pattern visible
32056 // to SDAG by sinking the shuffle instruction next to the shift.
32057 int ShiftAmountOpNum = -1;
32058 if (I->isShift())
32059 ShiftAmountOpNum = 1;
32060 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
32061 if (II->getIntrinsicID() == Intrinsic::fshl ||
32062 II->getIntrinsicID() == Intrinsic::fshr)
32063 ShiftAmountOpNum = 2;
32064 }
32065
32066 if (ShiftAmountOpNum == -1)
32067 return false;
32068
32069 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
32070 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
32071 isVectorShiftByScalarCheap(I->getType())) {
32072 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
32073 return true;
32074 }
32075
32076 return false;
32077}
32078
32079bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
32080 if (!Subtarget.is64Bit())
32081 return false;
32082 return TargetLowering::shouldConvertPhiType(From, To);
32083}
32084
32085bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
32086 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
32087 return false;
32088
32089 EVT SrcVT = ExtVal.getOperand(0).getValueType();
32090
32091 // There is no extending load for vXi1.
32092 if (SrcVT.getScalarType() == MVT::i1)
32093 return false;
32094
32095 return true;
32096}
32097
32098bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
32099 EVT VT) const {
32100 if (!Subtarget.hasAnyFMA())
32101 return false;
32102
32103 VT = VT.getScalarType();
32104
32105 if (!VT.isSimple())
32106 return false;
32107
32108 switch (VT.getSimpleVT().SimpleTy) {
32109 case MVT::f32:
32110 case MVT::f64:
32111 return true;
32112 default:
32113 break;
32114 }
32115
32116 return false;
32117}
32118
32119bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
32120 // i16 instructions are longer (0x66 prefix) and potentially slower.
32121 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
32122}
32123
32124/// Targets can use this to indicate that they only support *some*
32125/// VECTOR_SHUFFLE operations, those with specific masks.
32126/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
32127/// are assumed to be legal.
32128bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
32129 if (!VT.isSimple())
32130 return false;
32131
32132 // Not for i1 vectors
32133 if (VT.getSimpleVT().getScalarType() == MVT::i1)
32134 return false;
32135
32136 // Very little shuffling can be done for 64-bit vectors right now.
32137 if (VT.getSimpleVT().getSizeInBits() == 64)
32138 return false;
32139
32140 // We only care that the types being shuffled are legal. The lowering can
32141 // handle any possible shuffle mask that results.
32142 return isTypeLegal(VT.getSimpleVT());
32143}
32144
32145bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
32146 EVT VT) const {
32147 // Don't convert an 'and' into a shuffle that we don't directly support.
32148 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
32149 if (!Subtarget.hasAVX2())
32150 if (VT == MVT::v32i8 || VT == MVT::v16i16)
32151 return false;
32152
32153 // Just delegate to the generic legality, clear masks aren't special.
32154 return isShuffleMaskLegal(Mask, VT);
32155}
32156
32157bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
32158 // If the subtarget is using thunks, we need to not generate jump tables.
32159 if (Subtarget.useIndirectThunkBranches())
32160 return false;
32161
32162 // Otherwise, fallback on the generic logic.
32163 return TargetLowering::areJTsAllowed(Fn);
32164}
32165
32166//===----------------------------------------------------------------------===//
32167// X86 Scheduler Hooks
32168//===----------------------------------------------------------------------===//
32169
32170 // Returns true if EFLAGS is consumed after this iterator in the rest of the
32171// basic block or any successors of the basic block.
32172static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
32173 MachineBasicBlock *BB) {
32174 // Scan forward through BB for a use/def of EFLAGS.
32175 for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
32176 miI != miE; ++miI) {
32177 const MachineInstr& mi = *miI;
32178 if (mi.readsRegister(X86::EFLAGS))
32179 return true;
32180 // If we found a def, we can stop searching.
32181 if (mi.definesRegister(X86::EFLAGS))
32182 return false;
32183 }
32184
32185 // If we hit the end of the block, check whether EFLAGS is live into a
32186 // successor.
32187 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
32188 sEnd = BB->succ_end();
32189 sItr != sEnd; ++sItr) {
32190 MachineBasicBlock* succ = *sItr;
32191 if (succ->isLiveIn(X86::EFLAGS))
32192 return true;
32193 }
32194
32195 return false;
32196}
32197
32198/// Utility function to emit xbegin specifying the start of an RTM region.
32199static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
32200 const TargetInstrInfo *TII) {
32201 const DebugLoc &DL = MI.getDebugLoc();
32202
32203 const BasicBlock *BB = MBB->getBasicBlock();
32204 MachineFunction::iterator I = ++MBB->getIterator();
32205
32206 // For the v = xbegin(), we generate
32207 //
32208 // thisMBB:
32209 // xbegin sinkMBB
32210 //
32211 // mainMBB:
32212 // s0 = -1
32213 //
32214 // fallBB:
32215 // eax = # XABORT_DEF
32216 // s1 = eax
32217 //
32218 // sinkMBB:
32219 // v = phi(s0/mainBB, s1/fallBB)
32220
32221 MachineBasicBlock *thisMBB = MBB;
32222 MachineFunction *MF = MBB->getParent();
32223 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
32224 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32225 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32226 MF->insert(I, mainMBB);
32227 MF->insert(I, fallMBB);
32228 MF->insert(I, sinkMBB);
32229
32230 if (isEFLAGSLiveAfter(MI, MBB)) {
32231 mainMBB->addLiveIn(X86::EFLAGS);
32232 fallMBB->addLiveIn(X86::EFLAGS);
32233 sinkMBB->addLiveIn(X86::EFLAGS);
32234 }
32235
32236 // Transfer the remainder of BB and its successor edges to sinkMBB.
32237 sinkMBB->splice(sinkMBB->begin(), MBB,
32238 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32239 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32240
32241 MachineRegisterInfo &MRI = MF->getRegInfo();
32242 Register DstReg = MI.getOperand(0).getReg();
32243 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
32244 Register mainDstReg = MRI.createVirtualRegister(RC);
32245 Register fallDstReg = MRI.createVirtualRegister(RC);
32246
32247 // thisMBB:
32248 // xbegin fallMBB
32249 // # fallthrough to mainMBB
32250 // # abort jumps to fallMBB
32251 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
32252 thisMBB->addSuccessor(mainMBB);
32253 thisMBB->addSuccessor(fallMBB);
32254
32255 // mainMBB:
32256 // mainDstReg := -1
32257 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
32258 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32259 mainMBB->addSuccessor(sinkMBB);
32260
32261 // fallMBB:
32262 // ; pseudo instruction to model hardware's definition from XABORT
32263 // EAX := XABORT_DEF
32264 // fallDstReg := EAX
32265 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
32266 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
32267 .addReg(X86::EAX);
32268 fallMBB->addSuccessor(sinkMBB);
32269
32270 // sinkMBB:
32271 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
32272 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
32273 .addReg(mainDstReg).addMBB(mainMBB)
32274 .addReg(fallDstReg).addMBB(fallMBB);
32275
32276 MI.eraseFromParent();
32277 return sinkMBB;
32278}
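
// For illustration only: a hedged sketch (not taken from this file) of the
// value produced by the block structure emitted above for v = xbegin(): the
// fall-through path (mainMBB) yields -1, the abort path (fallMBB) yields the
// status the hardware leaves in EAX, and the PHI in sinkMBB selects between
// the two. The function and parameter names are hypothetical.
static int xbeginResult(bool Aborted, int AbortStatusFromEAX) {
  return Aborted ? AbortStatusFromEAX /* s1 via fallMBB */
                 : -1                 /* s0 via mainMBB */;
}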
32279
32280MachineBasicBlock *
32281X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
32282 MachineBasicBlock *MBB) const {
32283 // Emit va_arg instruction on X86-64.
32284
32285 // Operands to this pseudo-instruction:
32286 // 0 ) Output : destination address (reg)
32287 // 1-5) Input : va_list address (addr, i64mem)
32288 // 6 ) ArgSize : Size (in bytes) of vararg type
32289 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
32290 // 8 ) Align : Alignment of type
32291 // 9 ) EFLAGS (implicit-def)
32292
32293 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
32294 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
32295
32296 Register DestReg = MI.getOperand(0).getReg();
32297 MachineOperand &Base = MI.getOperand(1);
32298 MachineOperand &Scale = MI.getOperand(2);
32299 MachineOperand &Index = MI.getOperand(3);
32300 MachineOperand &Disp = MI.getOperand(4);
32301 MachineOperand &Segment = MI.getOperand(5);
32302 unsigned ArgSize = MI.getOperand(6).getImm();
32303 unsigned ArgMode = MI.getOperand(7).getImm();
32304 Align Alignment = Align(MI.getOperand(8).getImm());
32305
32306 MachineFunction *MF = MBB->getParent();
32307
32308 // Memory Reference
32309 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
32310
32311 MachineMemOperand *OldMMO = MI.memoperands().front();
32312
32313 // Clone the MMO into two separate MMOs for loading and storing
32314 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
32315 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
32316 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
32317 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
32318
32319 // Machine Information
32320 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32321 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
32322 const TargetRegisterClass *AddrRegClass =
32323 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
32324 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
32325 const DebugLoc &DL = MI.getDebugLoc();
32326
32327 // struct va_list {
32328 // i32 gp_offset
32329 // i32 fp_offset
32330 // i64 overflow_area (address)
32331 // i64 reg_save_area (address)
32332 // }
32333 // sizeof(va_list) = 24
32334 // alignment(va_list) = 8
32335
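  // For illustration only: a hedged sketch (not taken from this file) of the
  // System V x86-64 va_list record that the field offsets used below refer to
  // (0, 4, 8 and 16 on LP64; on the x32 ABI pointers are 4 bytes, putting
  // reg_save_area at offset 12). The struct name is hypothetical.
  //
  //   struct VaListSketch {
  //     unsigned gp_offset;      // offset 0, bumped by 8 per GP register used
  //     unsigned fp_offset;      // offset 4, bumped by 16 per XMM register used
  //     void    *overflow_area;  // offset 8, stack area for spilled arguments
  //     void    *reg_save_area;  // offset 16 on LP64, saved register dump
  //   };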
32336 unsigned TotalNumIntRegs = 6;
32337 unsigned TotalNumXMMRegs = 8;
32338 bool UseGPOffset = (ArgMode == 1);
32339 bool UseFPOffset = (ArgMode == 2);
32340 unsigned MaxOffset = TotalNumIntRegs * 8 +
32341 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
32342
32343 /* Align ArgSize to a multiple of 8 */
32344 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
32345 bool NeedsAlign = (Alignment > 8);
32346
32347 MachineBasicBlock *thisMBB = MBB;
32348 MachineBasicBlock *overflowMBB;
32349 MachineBasicBlock *offsetMBB;
32350 MachineBasicBlock *endMBB;
32351
32352 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
32353 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
32354 unsigned OffsetReg = 0;
32355
32356 if (!UseGPOffset && !UseFPOffset) {
32357 // If we only pull from the overflow region, we don't create a branch.
32358 // We don't need to alter control flow.
32359 OffsetDestReg = 0; // unused
32360 OverflowDestReg = DestReg;
32361
32362 offsetMBB = nullptr;
32363 overflowMBB = thisMBB;
32364 endMBB = thisMBB;
32365 } else {
32366 // First emit code to check if gp_offset (or fp_offset) is below the bound.
32367 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
32368 // If not, pull from overflow_area. (branch to overflowMBB)
32369 //
32370 // thisMBB
32371 // | .
32372 // | .
32373 // offsetMBB overflowMBB
32374 // | .
32375 // | .
32376 // endMBB
32377
32378 // Registers for the PHI in endMBB
32379 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
32380 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
32381
32382 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32383 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32384 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32385 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32386
32387 MachineFunction::iterator MBBIter = ++MBB->getIterator();
32388
32389 // Insert the new basic blocks
32390 MF->insert(MBBIter, offsetMBB);
32391 MF->insert(MBBIter, overflowMBB);
32392 MF->insert(MBBIter, endMBB);
32393
32394 // Transfer the remainder of MBB and its successor edges to endMBB.
32395 endMBB->splice(endMBB->begin(), thisMBB,
32396 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
32397 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
32398
32399 // Make offsetMBB and overflowMBB successors of thisMBB
32400 thisMBB->addSuccessor(offsetMBB);
32401 thisMBB->addSuccessor(overflowMBB);
32402
32403 // endMBB is a successor of both offsetMBB and overflowMBB
32404 offsetMBB->addSuccessor(endMBB);
32405 overflowMBB->addSuccessor(endMBB);
32406
32407 // Load the offset value into a register
32408 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32409 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
32410 .add(Base)
32411 .add(Scale)
32412 .add(Index)
32413 .addDisp(Disp, UseFPOffset ? 4 : 0)
32414 .add(Segment)
32415 .setMemRefs(LoadOnlyMMO);
32416
32417 // Check if there is enough room left to pull this argument.
32418 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
32419 .addReg(OffsetReg)
32420 .addImm(MaxOffset + 8 - ArgSizeA8);
32421
32422 // Branch to "overflowMBB" if offset >= max
32423 // Fall through to "offsetMBB" otherwise
32424 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
32425 .addMBB(overflowMBB).addImm(X86::COND_AE);
32426 }
32427
32428 // In offsetMBB, emit code to use the reg_save_area.
32429 if (offsetMBB) {
32430 assert(OffsetReg != 0);
32431
32432 // Read the reg_save_area address.
32433 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
32434 BuildMI(
32435 offsetMBB, DL,
32436 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32437 RegSaveReg)
32438 .add(Base)
32439 .add(Scale)
32440 .add(Index)
32441 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
32442 .add(Segment)
32443 .setMemRefs(LoadOnlyMMO);
32444
32445 if (Subtarget.isTarget64BitLP64()) {
32446 // Zero-extend the offset
32447 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
32448 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
32449 .addImm(0)
32450 .addReg(OffsetReg)
32451 .addImm(X86::sub_32bit);
32452
32453 // Add the offset to the reg_save_area to get the final address.
32454 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
32455 .addReg(OffsetReg64)
32456 .addReg(RegSaveReg);
32457 } else {
32458 // Add the offset to the reg_save_area to get the final address.
32459 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
32460 .addReg(OffsetReg)
32461 .addReg(RegSaveReg);
32462 }
32463
32464 // Compute the offset for the next argument
32465 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32466 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
32467 .addReg(OffsetReg)
32468 .addImm(UseFPOffset ? 16 : 8);
32469
32470 // Store it back into the va_list.
32471 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
32472 .add(Base)
32473 .add(Scale)
32474 .add(Index)
32475 .addDisp(Disp, UseFPOffset ? 4 : 0)
32476 .add(Segment)
32477 .addReg(NextOffsetReg)
32478 .setMemRefs(StoreOnlyMMO);
32479
32480 // Jump to endMBB
32481 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
32482 .addMBB(endMBB);
32483 }
32484
32485 //
32486 // Emit code to use overflow area
32487 //
32488
32489 // Load the overflow_area address into a register.
32490 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
32491 BuildMI(overflowMBB, DL,
32492 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32493 OverflowAddrReg)
32494 .add(Base)
32495 .add(Scale)
32496 .add(Index)
32497 .addDisp(Disp, 8)
32498 .add(Segment)
32499 .setMemRefs(LoadOnlyMMO);
32500
32501 // If we need to align it, do so. Otherwise, just copy the address
32502 // to OverflowDestReg.
32503 if (NeedsAlign) {
32504 // Align the overflow address
32505 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
32506
32507 // aligned_addr = (addr + (align-1)) & ~(align-1)
32508 BuildMI(
32509 overflowMBB, DL,
32510 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32511 TmpReg)
32512 .addReg(OverflowAddrReg)
32513 .addImm(Alignment.value() - 1);
32514
32515 BuildMI(
32516 overflowMBB, DL,
32517 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
32518 OverflowDestReg)
32519 .addReg(TmpReg)
32520 .addImm(~(uint64_t)(Alignment.value() - 1));
32521 } else {
32522 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
32523 .addReg(OverflowAddrReg);
32524 }
32525
32526 // Compute the next overflow address after this argument.
32527 // (the overflow address should be kept 8-byte aligned)
32528 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
32529 BuildMI(
32530 overflowMBB, DL,
32531 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32532 NextAddrReg)
32533 .addReg(OverflowDestReg)
32534 .addImm(ArgSizeA8);
32535
32536 // Store the new overflow address.
32537 BuildMI(overflowMBB, DL,
32538 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
32539 .add(Base)
32540 .add(Scale)
32541 .add(Index)
32542 .addDisp(Disp, 8)
32543 .add(Segment)
32544 .addReg(NextAddrReg)
32545 .setMemRefs(StoreOnlyMMO);
32546
32547 // If we branched, emit the PHI to the front of endMBB.
32548 if (offsetMBB) {
32549 BuildMI(*endMBB, endMBB->begin(), DL,
32550 TII->get(X86::PHI), DestReg)
32551 .addReg(OffsetDestReg).addMBB(offsetMBB)
32552 .addReg(OverflowDestReg).addMBB(overflowMBB);
32553 }
32554
32555 // Erase the pseudo instruction
32556 MI.eraseFromParent();
32557
32558 return endMBB;
32559}
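
// For illustration only: a tiny standalone example (not taken from this file)
// of the round-up-to-a-power-of-two trick used twice above, for ArgSizeA8 and
// for the aligned overflow address: (x + (a-1)) & ~(a-1). The helper name is
// hypothetical.
#include <cstdint>

static inline uint64_t alignUpPow2(uint64_t X, uint64_t A) {
  return (X + (A - 1)) & ~(A - 1);   // e.g. alignUpPow2(13, 8) == 16
}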
32560
32561// The EFLAGS operand of SelectItr might be missing a kill marker
32562// because there were multiple uses of EFLAGS, and ISel didn't know
32563// which to mark. Figure out whether SelectItr should have had a
32564// kill marker, and set it if it should. Returns the correct kill
32565// marker value.
32566static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
32567 MachineBasicBlock* BB,
32568 const TargetRegisterInfo* TRI) {
32569 if (isEFLAGSLiveAfter(SelectItr, BB))
32570 return false;
32571
32572 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
32573 // out. SelectMI should have a kill flag on EFLAGS.
32574 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
32575 return true;
32576}
32577
32578// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
32579// together with other CMOV pseudo-opcodes into a single basic-block with
32580 // a conditional jump around it.
32581static bool isCMOVPseudo(MachineInstr &MI) {
32582 switch (MI.getOpcode()) {
32583 case X86::CMOV_FR32:
32584 case X86::CMOV_FR32X:
32585 case X86::CMOV_FR64:
32586 case X86::CMOV_FR64X:
32587 case X86::CMOV_GR8:
32588 case X86::CMOV_GR16:
32589 case X86::CMOV_GR32:
32590 case X86::CMOV_RFP32:
32591 case X86::CMOV_RFP64:
32592 case X86::CMOV_RFP80:
32593 case X86::CMOV_VR64:
32594 case X86::CMOV_VR128:
32595 case X86::CMOV_VR128X:
32596 case X86::CMOV_VR256:
32597 case X86::CMOV_VR256X:
32598 case X86::CMOV_VR512:
32599 case X86::CMOV_VK1:
32600 case X86::CMOV_VK2:
32601 case X86::CMOV_VK4:
32602 case X86::CMOV_VK8:
32603 case X86::CMOV_VK16:
32604 case X86::CMOV_VK32:
32605 case X86::CMOV_VK64:
32606 return true;
32607
32608 default:
32609 return false;
32610 }
32611}
32612
32613// Helper function, which inserts PHI functions into SinkMBB:
32614// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
32615// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
32616// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
32617// the last PHI function inserted.
32618static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
32619 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
32620 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
32621 MachineBasicBlock *SinkMBB) {
32622 MachineFunction *MF = TrueMBB->getParent();
32623 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
32624 const DebugLoc &DL = MIItBegin->getDebugLoc();
32625
32626 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
32627 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32628
32629 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
32630
32631 // As we are creating the PHIs, we have to be careful if there is more than
32632 // one. Later CMOVs may reference the results of earlier CMOVs, but later
32633 // PHIs have to reference the individual true/false inputs from earlier PHIs.
32634 // That also means that PHI construction must work forward from earlier to
32635 // later, and that the code must maintain a mapping from earlier PHI's
32636 // destination registers, and the registers that went into the PHI.
32637 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
32638 MachineInstrBuilder MIB;
32639
32640 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
32641 Register DestReg = MIIt->getOperand(0).getReg();
32642 Register Op1Reg = MIIt->getOperand(1).getReg();
32643 Register Op2Reg = MIIt->getOperand(2).getReg();
32644
32645 // If this CMOV we are generating is the opposite condition from
32646 // the jump we generated, then we have to swap the operands for the
32647 // PHI that is going to be generated.
32648 if (MIIt->getOperand(3).getImm() == OppCC)
32649 std::swap(Op1Reg, Op2Reg);
32650
32651 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
32652 Op1Reg = RegRewriteTable[Op1Reg].first;
32653
32654 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
32655 Op2Reg = RegRewriteTable[Op2Reg].second;
32656
32657 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
32658 .addReg(Op1Reg)
32659 .addMBB(FalseMBB)
32660 .addReg(Op2Reg)
32661 .addMBB(TrueMBB);
32662
32663 // Add this PHI to the rewrite table.
32664 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
32665 }
32666
32667 return MIB;
32668}
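
// For illustration only: a hedged, standalone sketch (not taken from this
// file) of the renaming performed through RegRewriteTable above. When a later
// CMOV consumes the result of an earlier CMOV, the PHI built for it must read
// the earlier PHI's original false-side input instead, because a PHI operand
// may not refer to a PHI result defined earlier in the same block. The helper
// name and the use of std::unordered_map in place of DenseMap are assumptions
// made for this sketch.
#include <unordered_map>
#include <utility>

using Reg = unsigned;

static Reg rewriteFalseSide(
    const std::unordered_map<Reg, std::pair<Reg, Reg>> &Table, Reg R) {
  auto It = Table.find(R);
  return It == Table.end() ? R : It->second.first; // .first = false-side input
}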
32669
32670 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
32671MachineBasicBlock *
32672X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
32673 MachineInstr &SecondCascadedCMOV,
32674 MachineBasicBlock *ThisMBB) const {
32675 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32676 const DebugLoc &DL = FirstCMOV.getDebugLoc();
32677
32678 // We lower cascaded CMOVs such as
32679 //
32680 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
32681 //
32682 // to two successive branches.
32683 //
32684 // Without this, we would add a PHI between the two jumps, which ends up
32685 // creating a few copies all around. For instance, for
32686 //
32687 // (sitofp (zext (fcmp une)))
32688 //
32689 // we would generate:
32690 //
32691 // ucomiss %xmm1, %xmm0
32692 // movss <1.0f>, %xmm0
32693 // movaps %xmm0, %xmm1
32694 // jne .LBB5_2
32695 // xorps %xmm1, %xmm1
32696 // .LBB5_2:
32697 // jp .LBB5_4
32698 // movaps %xmm1, %xmm0
32699 // .LBB5_4:
32700 // retq
32701 //
32702 // because this custom-inserter would have generated:
32703 //
32704 // A
32705 // | \
32706 // | B
32707 // | /
32708 // C
32709 // | \
32710 // | D
32711 // | /
32712 // E
32713 //
32714 // A: X = ...; Y = ...
32715 // B: empty
32716 // C: Z = PHI [X, A], [Y, B]
32717 // D: empty
32718 // E: PHI [X, C], [Z, D]
32719 //
32720 // If we lower both CMOVs in a single step, we can instead generate:
32721 //
32722 // A
32723 // | \
32724 // | C
32725 // | /|
32726 // |/ |
32727 // | |
32728 // | D
32729 // | /
32730 // E
32731 //
32732 // A: X = ...; Y = ...
32733 // D: empty
32734 // E: PHI [X, A], [X, C], [Y, D]
32735 //
32736 // Which, in our sitofp/fcmp example, gives us something like:
32737 //
32738 // ucomiss %xmm1, %xmm0
32739 // movss <1.0f>, %xmm0
32740 // jne .LBB5_4
32741 // jp .LBB5_4
32742 // xorps %xmm0, %xmm0
32743 // .LBB5_4:
32744 // retq
32745 //
32746
32747 // We lower cascaded CMOV into two successive branches to the same block.
32748 // EFLAGS is used by both, so mark it as live in the second.
32749 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32750 MachineFunction *F = ThisMBB->getParent();
32751 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32752 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32753 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32754
32755 MachineFunction::iterator It = ++ThisMBB->getIterator();
32756 F->insert(It, FirstInsertedMBB);
32757 F->insert(It, SecondInsertedMBB);
32758 F->insert(It, SinkMBB);
32759
32760 // For a cascaded CMOV, we lower it to two successive branches to
32761 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
32762 // the FirstInsertedMBB.
32763 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
32764
32765 // If the EFLAGS register isn't dead in the terminator, then claim that it's
32766 // live into the sink and copy blocks.
32767 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32768 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
32769 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
32770 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
32771 SinkMBB->addLiveIn(X86::EFLAGS);
32772 }
32773
32774 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32775 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
32776 std::next(MachineBasicBlock::iterator(FirstCMOV)),
32777 ThisMBB->end());
32778 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32779
32780 // Fallthrough block for ThisMBB.
32781 ThisMBB->addSuccessor(FirstInsertedMBB);
32782 // The true block target of the first branch is always SinkMBB.
32783 ThisMBB->addSuccessor(SinkMBB);
32784 // Fallthrough block for FirstInsertedMBB.
32785 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
32786 // The true block for the branch of FirstInsertedMBB.
32787 FirstInsertedMBB->addSuccessor(SinkMBB);
32788 // This is fallthrough.
32789 SecondInsertedMBB->addSuccessor(SinkMBB);
32790
32791 // Create the conditional branch instructions.
32792 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
32793 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
32794
32795 X86::CondCode SecondCC =
32796 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
32797 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
32798
32799 // SinkMBB:
32800 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
32801 Register DestReg = FirstCMOV.getOperand(0).getReg();
32802 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
32803 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
32804 MachineInstrBuilder MIB =
32805 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
32806 .addReg(Op1Reg)
32807 .addMBB(SecondInsertedMBB)
32808 .addReg(Op2Reg)
32809 .addMBB(ThisMBB);
32810
32811 // SecondInsertedMBB provides the same incoming value as
32812 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
32813 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
32814 // Copy the PHI result to the register defined by the second CMOV.
32815 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
32816 TII->get(TargetOpcode::COPY),
32817 SecondCascadedCMOV.getOperand(0).getReg())
32818 .addReg(FirstCMOV.getOperand(0).getReg());
32819
32820 // Now remove the CMOVs.
32821 FirstCMOV.eraseFromParent();
32822 SecondCascadedCMOV.eraseFromParent();
32823
32824 return SinkMBB;
32825}
32826
32827MachineBasicBlock *
32828X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
32829 MachineBasicBlock *ThisMBB) const {
32830 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32831 const DebugLoc &DL = MI.getDebugLoc();
32832
32833 // To "insert" a SELECT_CC instruction, we actually have to insert the
32834 // diamond control-flow pattern. The incoming instruction knows the
32835 // destination vreg to set, the condition code register to branch on, the
32836 // true/false values to select between and a branch opcode to use.
32837
32838 // ThisMBB:
32839 // ...
32840 // TrueVal = ...
32841 // cmpTY ccX, r1, r2
32842 // bCC copy1MBB
32843 // fallthrough --> FalseMBB
32844
32845 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
32846 // as described above, by inserting a BB, and then making a PHI at the join
32847 // point to select the true and false operands of the CMOV in the PHI.
32848 //
32849 // The code also handles two different cases of multiple CMOV opcodes
32850 // in a row.
32851 //
32852 // Case 1:
32853 // In this case, there are multiple CMOVs in a row, all of which are based on
32854 // the same condition setting (or the exact opposite condition setting).
32855 // In this case we can lower all the CMOVs using a single inserted BB, and
32856 // then make a number of PHIs at the join point to model the CMOVs. The only
32857 // trickiness here, is that in a case like:
32858 //
32859 // t2 = CMOV cond1 t1, f1
32860 // t3 = CMOV cond1 t2, f2
32861 //
32862 // when rewriting this into PHIs, we have to perform some renaming on the
32863 // temps since you cannot have a PHI operand refer to a PHI result earlier
32864 // in the same block. The "simple" but wrong lowering would be:
32865 //
32866 // t2 = PHI t1(BB1), f1(BB2)
32867 // t3 = PHI t2(BB1), f2(BB2)
32868 //
32869 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
32870 // renaming is to note that on the path through BB1, t2 is really just a
32871 // copy of t1, and do that renaming, properly generating:
32872 //
32873 // t2 = PHI t1(BB1), f1(BB2)
32874 // t3 = PHI t1(BB1), f2(BB2)
32875 //
32876 // Case 2:
32877 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
32878 // function - EmitLoweredCascadedSelect.
32879
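// For orientation, a hedged sketch (not verbatim output) of the shape produced
// below for a single pseudo-CMOV:
//
//   ThisMBB:
//     JCC_1 SinkMBB, cc          ; branch to the join block when cc holds
//   FalseMBB:                    ; fallthrough, provides FalseValue
//   SinkMBB:
//     %Result = PHI [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]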
32880 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
32881 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32882 MachineInstr *LastCMOV = &MI;
32883 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
32884
32885 // Check for case 1, where there are multiple CMOVs with the same condition
32886 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
32887 // number of jumps the most.
32888
32889 if (isCMOVPseudo(MI)) {
32890 // See if we have a string of CMOVS with the same condition. Skip over
32891 // intervening debug insts.
32892 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
32893 (NextMIIt->getOperand(3).getImm() == CC ||
32894 NextMIIt->getOperand(3).getImm() == OppCC)) {
32895 LastCMOV = &*NextMIIt;
32896 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
32897 }
32898 }
32899
32900 // This checks for case 2, but only if we didn't already find case 1,
32901 // as indicated by LastCMOV == &MI.
32902 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
32903 NextMIIt->getOpcode() == MI.getOpcode() &&
32904 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
32905 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
32906 NextMIIt->getOperand(1).isKill()) {
32907 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
32908 }
32909
32910 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32911 MachineFunction *F = ThisMBB->getParent();
32912 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
32913 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32914
32915 MachineFunction::iterator It = ++ThisMBB->getIterator();
32916 F->insert(It, FalseMBB);
32917 F->insert(It, SinkMBB);
32918
32919 // If the EFLAGS register isn't dead in the terminator, then claim that it's
32920 // live into the sink and copy blocks.
32921 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32922 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
32923 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
32924 FalseMBB->addLiveIn(X86::EFLAGS);
32925 SinkMBB->addLiveIn(X86::EFLAGS);
32926 }
32927
32928 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
32929 auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
32930 auto DbgIt = MachineBasicBlock::iterator(MI);
32931 while (DbgIt != DbgEnd) {
32932 auto Next = std::next(DbgIt);
32933 if (DbgIt->isDebugInstr())
32934 SinkMBB->push_back(DbgIt->removeFromParent());
32935 DbgIt = Next;
32936 }
32937
32938 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32939 SinkMBB->splice(SinkMBB->end(), ThisMBB,
32940 std::next(MachineBasicBlock::iterator(LastCMOV)),
32941 ThisMBB->end());
32942 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32943
32944 // Fallthrough block for ThisMBB.
32945 ThisMBB->addSuccessor(FalseMBB);
32946 // The true block target of the first (or only) branch is always SinkMBB.
32947 ThisMBB->addSuccessor(SinkMBB);
32948 // Fallthrough block for FalseMBB.
32949 FalseMBB->addSuccessor(SinkMBB);
32950
32951 // Create the conditional branch instruction.
32952 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
32953
32954 // SinkMBB:
32955 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
32956 // ...
32957 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
32958 MachineBasicBlock::iterator MIItEnd =
32959 std::next(MachineBasicBlock::iterator(LastCMOV));
32960 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
32961
32962 // Now remove the CMOV(s).
32963 ThisMBB->erase(MIItBegin, MIItEnd);
32964
32965 return SinkMBB;
32966}
32967
32968static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
32969 if (IsLP64) {
32970 if (isInt<8>(Imm))
32971 return X86::SUB64ri8;
32972 return X86::SUB64ri32;
32973 } else {
32974 if (isInt<8>(Imm))
32975 return X86::SUB32ri8;
32976 return X86::SUB32ri;
32977 }
32978}
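// Illustrative uses of the helper above: getSUBriOpcode(/*IsLP64=*/true, 8)
// yields X86::SUB64ri8 because 8 fits in a signed 8-bit immediate, whereas the
// default 4096-byte probe size does not, so getSUBriOpcode(true, 4096) yields
// X86::SUB64ri32 and getSUBriOpcode(false, 4096) yields X86::SUB32ri.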
32979
32980MachineBasicBlock *
32981X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
32982 MachineBasicBlock *MBB) const {
32983 MachineFunction *MF = MBB->getParent();
32984 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32985 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
32986 const DebugLoc &DL = MI.getDebugLoc();
32987 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32988
32989 const unsigned ProbeSize = getStackProbeSize(*MF);
32990
32991 MachineRegisterInfo &MRI = MF->getRegInfo();
32992 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32993 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32994 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32995
32996 MachineFunction::iterator MBBIter = ++MBB->getIterator();
32997 MF->insert(MBBIter, testMBB);
32998 MF->insert(MBBIter, blockMBB);
32999 MF->insert(MBBIter, tailMBB);
33000
33001 Register sizeVReg = MI.getOperand(1).getReg();
33002
33003 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
33004
33005 Register TmpStackPtr = MRI.createVirtualRegister(
33006 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33007 Register FinalStackPtr = MRI.createVirtualRegister(
33008 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33009
33010 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
33011 .addReg(physSPReg);
33012 {
33013 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
33014 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
33015 .addReg(TmpStackPtr)
33016 .addReg(sizeVReg);
33017 }
33018
33019 // test rsp size
33020
33021 BuildMI(testMBB, DL,
33022 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
33023 .addReg(FinalStackPtr)
33024 .addReg(physSPReg);
33025
33026 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
33027 .addMBB(tailMBB)
33028 .addImm(X86::COND_GE);
33029 testMBB->addSuccessor(blockMBB);
33030 testMBB->addSuccessor(tailMBB);
33031
33032 // Touch the block, then extend it. This is the opposite order from a
33033 // static probe, where we allocate then touch, and avoids the need to probe
33034 // the tail of the static alloca. Possible scenarios are:
33035 //
33036 // + ---- <- ------------ <- ------------- <- ------------ +
33037 // | |
33038 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
33039 // | |
33040 // + <- ----------- <- ------------ <- ----------- <- ------------ +
33041 //
33042 // The property we want to enforce is to never have more than [page alloc] between two probes.
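// Rough worked example (a sketch, assuming the default 4 KiB probe size): for
// a 10 KiB dynamic allocation, FinalStackPtr ends up 10240 bytes below the
// incoming stack pointer, so blockMBB executes three times -- each pass
// touches the current page ([RSP] ^= 0) and drops RSP by ProbeSize -- before
// the comparison in testMBB sees FinalStackPtr >= RSP and control falls
// through to tailMBB, which returns FinalStackPtr as the allocation result.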
33043
33044 const unsigned XORMIOpc =
33045 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
33046 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
33047 .addImm(0);
33048
33049 BuildMI(blockMBB, DL,
33050 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
33051 .addReg(physSPReg)
33052 .addImm(ProbeSize);
33053
33054
33055 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
33056 blockMBB->addSuccessor(testMBB);
33057
33058 // Replace the original instruction with the expected stack pointer.
33059 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
33060 .addReg(FinalStackPtr);
33061
33062 tailMBB->splice(tailMBB->end(), MBB,
33063 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33064 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
33065 MBB->addSuccessor(testMBB);
33066
33067 // Delete the original pseudo instruction.
33068 MI.eraseFromParent();
33069
33070 // And we're done.
33071 return tailMBB;
33072}
33073
33074MachineBasicBlock *
33075X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
33076 MachineBasicBlock *BB) const {
33077 MachineFunction *MF = BB->getParent();
33078 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33079 const DebugLoc &DL = MI.getDebugLoc();
33080 const BasicBlock *LLVM_BB = BB->getBasicBlock();
33081
33082 assert(MF->shouldSplitStack());
33083
33084 const bool Is64Bit = Subtarget.is64Bit();
33085 const bool IsLP64 = Subtarget.isTarget64BitLP64();
33086
33087 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
33088 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
33089
33090 // BB:
33091 // ... [Till the alloca]
33092 // If stacklet is not large enough, jump to mallocMBB
33093 //
33094 // bumpMBB:
33095 // Allocate by subtracting from RSP
33096 // Jump to continueMBB
33097 //
33098 // mallocMBB:
33099 // Allocate by call to runtime
33100 //
33101 // continueMBB:
33102 // ...
33103 // [rest of original BB]
33104 //
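// As a hedged sketch of the LP64 case built below (illustrative, not verbatim
// asm): the candidate stack pointer (RSP - size) is compared against the stack
// limit kept in thread-local storage at %fs:0x70 (TlsReg/TlsOffset); if the
// limit is greater, the current stacklet is too small and control goes to
// mallocMBB, which calls __morestack_allocate_stack_space, otherwise bumpMBB
// simply commits the new stack pointer.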
33105
33106 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33107 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33108 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33109
33110 MachineRegisterInfo &MRI = MF->getRegInfo();
33111 const TargetRegisterClass *AddrRegClass =
33112 getRegClassFor(getPointerTy(MF->getDataLayout()));
33113
33114 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33115 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33116 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
33117 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
33118 sizeVReg = MI.getOperand(1).getReg(),
33119 physSPReg =
33120 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
33121
33122 MachineFunction::iterator MBBIter = ++BB->getIterator();
33123
33124 MF->insert(MBBIter, bumpMBB);
33125 MF->insert(MBBIter, mallocMBB);
33126 MF->insert(MBBIter, continueMBB);
33127
33128 continueMBB->splice(continueMBB->begin(), BB,
33129 std::next(MachineBasicBlock::iterator(MI)), BB->end());
33130 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
33131
33132 // Add code to the main basic block to check if the stack limit has been hit,
33133 // and if so, jump to mallocMBB, otherwise to bumpMBB.
33134 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
33135 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
33136 .addReg(tmpSPVReg).addReg(sizeVReg);
33137 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
33138 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
33139 .addReg(SPLimitVReg);
33140 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
33141
33142 // bumpMBB simply decreases the stack pointer, since we know the current
33143 // stacklet has enough space.
33144 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
33145 .addReg(SPLimitVReg);
33146 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
33147 .addReg(SPLimitVReg);
33148 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33149
33150 // Calls into a routine in libgcc to allocate more space from the heap.
33151 const uint32_t *RegMask =
33152 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
33153 if (IsLP64) {
33154 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
33155 .addReg(sizeVReg);
33156 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33157 .addExternalSymbol("__morestack_allocate_stack_space")
33158 .addRegMask(RegMask)
33159 .addReg(X86::RDI, RegState::Implicit)
33160 .addReg(X86::RAX, RegState::ImplicitDefine);
33161 } else if (Is64Bit) {
33162 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
33163 .addReg(sizeVReg);
33164 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33165 .addExternalSymbol("__morestack_allocate_stack_space")
33166 .addRegMask(RegMask)
33167 .addReg(X86::EDI, RegState::Implicit)
33168 .addReg(X86::EAX, RegState::ImplicitDefine);
33169 } else {
33170 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
33171 .addImm(12);
33172 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
33173 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
33174 .addExternalSymbol("__morestack_allocate_stack_space")
33175 .addRegMask(RegMask)
33176 .addReg(X86::EAX, RegState::ImplicitDefine);
33177 }
33178
33179 if (!Is64Bit)
33180 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
33181 .addImm(16);
33182
33183 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
33184 .addReg(IsLP64 ? X86::RAX : X86::EAX);
33185 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33186
33187 // Set up the CFG correctly.
33188 BB->addSuccessor(bumpMBB);
33189 BB->addSuccessor(mallocMBB);
33190 mallocMBB->addSuccessor(continueMBB);
33191 bumpMBB->addSuccessor(continueMBB);
33192
33193 // Take care of the PHI nodes.
33194 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
33195 MI.getOperand(0).getReg())
33196 .addReg(mallocPtrVReg)
33197 .addMBB(mallocMBB)
33198 .addReg(bumpSPPtrVReg)
33199 .addMBB(bumpMBB);
33200
33201 // Delete the original pseudo instruction.
33202 MI.eraseFromParent();
33203
33204 // And we're done.
33205 return continueMBB;
33206}
33207
33208MachineBasicBlock *
33209X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
33210 MachineBasicBlock *BB) const {
33211 MachineFunction *MF = BB->getParent();
33212 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33213 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
33214 const DebugLoc &DL = MI.getDebugLoc();
33215
33216 assert(!isAsynchronousEHPersonality(
33217 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
33218 "SEH does not use catchret!");
33219
33220 // Only 32-bit EH needs to worry about manually restoring stack pointers.
33221 if (!Subtarget.is32Bit())
33222 return BB;
33223
33224 // C++ EH creates a new target block to hold the restore code, and wires up
33225 // the new block to the return destination with a normal JMP_4.
33226 MachineBasicBlock *RestoreMBB =
33227 MF->CreateMachineBasicBlock(BB->getBasicBlock());
33228 assert(BB->succ_size() == 1);
33229 MF->insert(std::next(BB->getIterator()), RestoreMBB);
33230 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
33231 BB->addSuccessor(RestoreMBB);
33232 MI.getOperand(0).setMBB(RestoreMBB);
33233
33234 // Marking this as an EH pad but not a funclet entry block causes PEI to
33235 // restore stack pointers in the block.
33236 RestoreMBB->setIsEHPad(true);
33237
33238 auto RestoreMBBI = RestoreMBB->begin();
33239 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
33240 return BB;
33241}
33242
33243MachineBasicBlock *
33244X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
33245 MachineBasicBlock *BB) const {
33246 // Here we replace TLSADDR with the sequence:
33247 // adjust_stackdown -> TLSADDR -> adjust_stackup.
33248 // We need this because TLSADDR is lowered into calls
33249 // inside MC; without the two markers, shrink-wrapping
33250 // may push the prologue/epilogue past them.
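// Schematically, for a pseudo "TLSADDR <sym>" the block ends up containing
// (a simplified sketch of the inserted markers, not exact syntax):
//
//   ADJCALLSTACKDOWN 0, 0, 0     ; CALLSEQ_START
//   TLSADDR <sym>                ; becomes a call during MC lowering
//   ADJCALLSTACKUP 0, 0          ; CALLSEQ_END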
33251 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33252 const DebugLoc &DL = MI.getDebugLoc();
33253 MachineFunction &MF = *BB->getParent();
33254
33255 // Emit CALLSEQ_START right before the instruction.
33256 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
33257 MachineInstrBuilder CallseqStart =
33258 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
33259 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
33260
33261 // Emit CALLSEQ_END right after the instruction.
33262 // We don't call erase from parent because we want to keep the
33263 // original instruction around.
33264 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
33265 MachineInstrBuilder CallseqEnd =
33266 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
33267 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
33268
33269 return BB;
33270}
33271
33272MachineBasicBlock *
33273X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
33274 MachineBasicBlock *BB) const {
33275 // This is pretty easy. We're taking the value that we received from
33276 // our load from the relocation, sticking it in either RDI (x86-64)
33277 // or EAX and doing an indirect call. The return value will then
33278 // be in the normal return register.
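// As a hedged sketch of the 64-bit case handled below (simplified, AT&T
// syntax):
//
//   movq _var@TLVP(%rip), %rdi   ; load the TLV descriptor address
//   callq *(%rdi)                ; indirect call through the descriptor
//
// with the variable's address returned in RAX.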
33279 MachineFunction *F = BB->getParent();
33280 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33281 const DebugLoc &DL = MI.getDebugLoc();
33282
33283 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
33284 assert(MI.getOperand(3).isGlobal() && "This should be a global");
33285
33286 // Get a register mask for the lowered call.
33287 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
33288 // proper register mask.
33289 const uint32_t *RegMask =
33290 Subtarget.is64Bit() ?
33291 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
33292 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
33293 if (Subtarget.is64Bit()) {
33294 MachineInstrBuilder MIB =
33295 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
33296 .addReg(X86::RIP)
33297 .addImm(0)
33298 .addReg(0)
33299 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33300 MI.getOperand(3).getTargetFlags())
33301 .addReg(0);
33302 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
33303 addDirectMem(MIB, X86::RDI);
33304 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
33305 } else if (!isPositionIndependent()) {
33306 MachineInstrBuilder MIB =
33307 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33308 .addReg(0)
33309 .addImm(0)
33310 .addReg(0)
33311 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33312 MI.getOperand(3).getTargetFlags())
33313 .addReg(0);
33314 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33315 addDirectMem(MIB, X86::EAX);
33316 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33317 } else {
33318 MachineInstrBuilder MIB =
33319 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33320 .addReg(TII->getGlobalBaseReg(F))
33321 .addImm(0)
33322 .addReg(0)
33323 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33324 MI.getOperand(3).getTargetFlags())
33325 .addReg(0);
33326 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33327 addDirectMem(MIB, X86::EAX);
33328 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33329 }
33330
33331 MI.eraseFromParent(); // The pseudo instruction is gone now.
33332 return BB;
33333}
33334
33335static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
33336 switch (RPOpc) {
33337 case X86::INDIRECT_THUNK_CALL32:
33338 return X86::CALLpcrel32;
33339 case X86::INDIRECT_THUNK_CALL64:
33340 return X86::CALL64pcrel32;
33341 case X86::INDIRECT_THUNK_TCRETURN32:
33342 return X86::TCRETURNdi;
33343 case X86::INDIRECT_THUNK_TCRETURN64:
33344 return X86::TCRETURNdi64;
33345 }
33346 llvm_unreachable("not indirect thunk opcode")__builtin_unreachable();
33347}
33348
33349static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
33350 unsigned Reg) {
33351 if (Subtarget.useRetpolineExternalThunk()) {
33352 // When using an external thunk for retpolines, we pick names that match the
33353 // names GCC happens to use as well. This helps simplify the implementation
33354 // of the thunks for kernels where they have no easy ability to create
33355 // aliases and are doing non-trivial configuration of the thunk's body. For
33356 // example, the Linux kernel will do boot-time hot patching of the thunk
33357 // bodies and cannot easily export aliases of these to loaded modules.
33358 //
33359 // Note that at any point in the future, we may need to change the semantics
33360 // of how we implement retpolines and at that time will likely change the
33361 // name of the called thunk. Essentially, there is no hard guarantee that
33362 // LLVM will generate calls to specific thunks; we merely make a best-effort
33363 // attempt to help out kernels and other systems where duplicating the
33364 // thunks is costly.
33365 switch (Reg) {
33366 case X86::EAX:
33367 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33368 return "__x86_indirect_thunk_eax";
33369 case X86::ECX:
33370 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33371 return "__x86_indirect_thunk_ecx";
33372 case X86::EDX:
33373 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33374 return "__x86_indirect_thunk_edx";
33375 case X86::EDI:
33376 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33377 return "__x86_indirect_thunk_edi";
33378 case X86::R11:
33379 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33380 return "__x86_indirect_thunk_r11";
33381 }
33382 llvm_unreachable("unexpected reg for external indirect thunk");
33383 }
33384
33385 if (Subtarget.useRetpolineIndirectCalls() ||
33386 Subtarget.useRetpolineIndirectBranches()) {
33387 // When targeting an internal COMDAT thunk use an LLVM-specific name.
33388 switch (Reg) {
33389 case X86::EAX:
33390 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33391 return "__llvm_retpoline_eax";
33392 case X86::ECX:
33393 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33394 return "__llvm_retpoline_ecx";
33395 case X86::EDX:
33396 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33397 return "__llvm_retpoline_edx";
33398 case X86::EDI:
33399 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
33400 return "__llvm_retpoline_edi";
33401 case X86::R11:
33402 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33403 return "__llvm_retpoline_r11";
33404 }
33405 llvm_unreachable("unexpected reg for retpoline");
33406 }
33407
33408 if (Subtarget.useLVIControlFlowIntegrity()) {
33409 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
33410 return "__llvm_lvi_thunk_r11";
33411 }
33412 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
33413}
33414
33415MachineBasicBlock *
33416X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
33417 MachineBasicBlock *BB) const {
33418 // Copy the virtual register into the R11 physical register and
33419 // call the retpoline thunk.
33420 const DebugLoc &DL = MI.getDebugLoc();
33421 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33422 Register CalleeVReg = MI.getOperand(0).getReg();
33423 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
33424
33425 // Find an available scratch register to hold the callee. On 64-bit, we can
33426 // just use R11, but we scan for uses anyway to ensure we don't generate
33427 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
33428 // already a register use operand to the call to hold the callee. If none
33429 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
33430 // register and ESI is the base pointer to realigned stack frames with VLAs.
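// For illustration (a sketch, not verbatim output): on a 64-bit target an
// INDIRECT_THUNK_CALL64 of a callee held in %vreg becomes roughly
//
//   R11 = COPY %vreg
//   CALL64pcrel32 __llvm_retpoline_r11   ; R11 added as an implicit killed use;
//                                        ; the symbol may instead be
//                                        ; __x86_indirect_thunk_r11 or
//                                        ; __llvm_lvi_thunk_r11, depending on
//                                        ; the enabled feature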
33431 SmallVector<unsigned, 3> AvailableRegs;
33432 if (Subtarget.is64Bit())
33433 AvailableRegs.push_back(X86::R11);
33434 else
33435 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
33436
33437 // Zero out any registers that are already used.
33438 for (const auto &MO : MI.operands()) {
33439 if (MO.isReg() && MO.isUse())
33440 for (unsigned &Reg : AvailableRegs)
33441 if (Reg == MO.getReg())
33442 Reg = 0;
33443 }
33444
33445 // Choose the first remaining non-zero available register.
33446 unsigned AvailableReg = 0;
33447 for (unsigned MaybeReg : AvailableRegs) {
33448 if (MaybeReg) {
33449 AvailableReg = MaybeReg;
33450 break;
33451 }
33452 }
33453 if (!AvailableReg)
33454 report_fatal_error("calling convention incompatible with retpoline, no "
33455 "available registers");
33456
33457 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
33458
33459 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
33460 .addReg(CalleeVReg);
33461 MI.getOperand(0).ChangeToES(Symbol);
33462 MI.setDesc(TII->get(Opc));
33463 MachineInstrBuilder(*BB->getParent(), &MI)
33464 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
33465 return BB;
33466}
33467
33468 /// SetJmp implies a future control-flow change upon calling the corresponding
33469 /// LongJmp.
33470/// Instead of using the 'return' instruction, the long jump fixes the stack and
33471/// performs an indirect branch. To do so it uses the registers that were stored
33472/// in the jump buffer (when calling SetJmp).
33473/// In case the shadow stack is enabled we need to fix it as well, because some
33474/// return addresses will be skipped.
33475/// The function will save the SSP for future fixing in the function
33476/// emitLongJmpShadowStackFix.
33477/// \sa emitLongJmpShadowStackFix
33478/// \param [in] MI The temporary Machine Instruction for the builtin.
33479/// \param [in] MBB The Machine Basic Block that will be modified.
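/// For orientation, a sketch of the buffer layout this code assumes, in units
/// of the pointer size: slot 0 holds the frame pointer, slot 1 the resume
/// address (LabelOffset), slot 2 the stack pointer (SPOffset), and the shadow
/// stack pointer saved here goes into slot 3 (SSPOffset).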
33480void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
33481 MachineBasicBlock *MBB) const {
33482 const DebugLoc &DL = MI.getDebugLoc();
33483 MachineFunction *MF = MBB->getParent();
33484 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33485 MachineRegisterInfo &MRI = MF->getRegInfo();
33486 MachineInstrBuilder MIB;
33487
33488 // Memory Reference.
33489 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33490 MI.memoperands_end());
33491
33492 // Initialize a register with zero.
33493 MVT PVT = getPointerTy(MF->getDataLayout());
33494 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33495 Register ZReg = MRI.createVirtualRegister(PtrRC);
33496 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
33497 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
33498 .addDef(ZReg)
33499 .addReg(ZReg, RegState::Undef)
33500 .addReg(ZReg, RegState::Undef);
33501
33502 // Read the current SSP Register value to the zeroed register.
33503 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33504 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33505 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33506
33507 // Write the SSP register value to offset 3 in the input memory buffer.
33508 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33509 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
33510 const int64_t SSPOffset = 3 * PVT.getStoreSize();
33511 const unsigned MemOpndSlot = 1;
33512 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33513 if (i == X86::AddrDisp)
33514 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
33515 else
33516 MIB.add(MI.getOperand(MemOpndSlot + i));
33517 }
33518 MIB.addReg(SSPCopyReg);
33519 MIB.setMemRefs(MMOs);
33520}
33521
33522MachineBasicBlock *
33523X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
33524 MachineBasicBlock *MBB) const {
33525 const DebugLoc &DL = MI.getDebugLoc();
33526 MachineFunction *MF = MBB->getParent();
33527 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33528 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33529 MachineRegisterInfo &MRI = MF->getRegInfo();
33530
33531 const BasicBlock *BB = MBB->getBasicBlock();
33532 MachineFunction::iterator I = ++MBB->getIterator();
33533
33534 // Memory Reference
33535 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33536 MI.memoperands_end());
33537
33538 unsigned DstReg;
33539 unsigned MemOpndSlot = 0;
33540
33541 unsigned CurOp = 0;
33542
33543 DstReg = MI.getOperand(CurOp++).getReg();
33544 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33545 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
33546 (void)TRI;
33547 Register mainDstReg = MRI.createVirtualRegister(RC);
33548 Register restoreDstReg = MRI.createVirtualRegister(RC);
33549
33550 MemOpndSlot = CurOp;
33551
33552 MVT PVT = getPointerTy(MF->getDataLayout());
33553 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33554 "Invalid Pointer Size!");
33555
33556 // For v = setjmp(buf), we generate
33557 //
33558 // thisMBB:
33559 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
33560 // SjLjSetup restoreMBB
33561 //
33562 // mainMBB:
33563 // v_main = 0
33564 //
33565 // sinkMBB:
33566 // v = phi(main, restore)
33567 //
33568 // restoreMBB:
33569 // if base pointer being used, load it from frame
33570 // v_restore = 1
33571
33572 MachineBasicBlock *thisMBB = MBB;
33573 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33574 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33575 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
33576 MF->insert(I, mainMBB);
33577 MF->insert(I, sinkMBB);
33578 MF->push_back(restoreMBB);
33579 restoreMBB->setHasAddressTaken();
33580
33581 MachineInstrBuilder MIB;
33582
33583 // Transfer the remainder of BB and its successor edges to sinkMBB.
33584 sinkMBB->splice(sinkMBB->begin(), MBB,
33585 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33586 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33587
33588 // thisMBB:
33589 unsigned PtrStoreOpc = 0;
33590 unsigned LabelReg = 0;
33591 const int64_t LabelOffset = 1 * PVT.getStoreSize();
33592 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33593 !isPositionIndependent();
33594
33595 // Prepare IP either in reg or imm.
33596 if (!UseImmLabel) {
33597 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33598 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33599 LabelReg = MRI.createVirtualRegister(PtrRC);
33600 if (Subtarget.is64Bit()) {
33601 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
33602 .addReg(X86::RIP)
33603 .addImm(0)
33604 .addReg(0)
33605 .addMBB(restoreMBB)
33606 .addReg(0);
33607 } else {
33608 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
33609 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
33610 .addReg(XII->getGlobalBaseReg(MF))
33611 .addImm(0)
33612 .addReg(0)
33613 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
33614 .addReg(0);
33615 }
33616 } else
33617 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33618 // Store IP
33619 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
33620 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33621 if (i == X86::AddrDisp)
33622 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
33623 else
33624 MIB.add(MI.getOperand(MemOpndSlot + i));
33625 }
33626 if (!UseImmLabel)
33627 MIB.addReg(LabelReg);
33628 else
33629 MIB.addMBB(restoreMBB);
33630 MIB.setMemRefs(MMOs);
33631
33632 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33633 emitSetJmpShadowStackFix(MI, thisMBB);
33634 }
33635
33636 // Setup
33637 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
33638 .addMBB(restoreMBB);
33639
33640 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33641 MIB.addRegMask(RegInfo->getNoPreservedMask());
33642 thisMBB->addSuccessor(mainMBB);
33643 thisMBB->addSuccessor(restoreMBB);
33644
33645 // mainMBB:
33646 // EAX = 0
33647 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
33648 mainMBB->addSuccessor(sinkMBB);
33649
33650 // sinkMBB:
33651 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
33652 TII->get(X86::PHI), DstReg)
33653 .addReg(mainDstReg).addMBB(mainMBB)
33654 .addReg(restoreDstReg).addMBB(restoreMBB);
33655
33656 // restoreMBB:
33657 if (RegInfo->hasBasePointer(*MF)) {
33658 const bool Uses64BitFramePtr =
33659 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
33660 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
33661 X86FI->setRestoreBasePointer(MF);
33662 Register FramePtr = RegInfo->getFrameRegister(*MF);
33663 Register BasePtr = RegInfo->getBaseRegister();
33664 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
33665 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
33666 FramePtr, true, X86FI->getRestoreBasePointerOffset())
33667 .setMIFlag(MachineInstr::FrameSetup);
33668 }
33669 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
33670 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33671 restoreMBB->addSuccessor(sinkMBB);
33672
33673 MI.eraseFromParent();
33674 return sinkMBB;
33675}
33676
33677/// Fix the shadow stack using the previously saved SSP pointer.
33678/// \sa emitSetJmpShadowStackFix
33679/// \param [in] MI The temporary Machine Instruction for the builtin.
33680/// \param [in] MBB The Machine Basic Block that will be modified.
33681/// \return The sink MBB that will perform the future indirect branch.
33682MachineBasicBlock *
33683X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
33684 MachineBasicBlock *MBB) const {
33685 const DebugLoc &DL = MI.getDebugLoc();
33686 MachineFunction *MF = MBB->getParent();
33687 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33688 MachineRegisterInfo &MRI = MF->getRegInfo();
33689
33690 // Memory Reference
33691 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33692 MI.memoperands_end());
33693
33694 MVT PVT = getPointerTy(MF->getDataLayout());
33695 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33696
33697 // checkSspMBB:
33698 // xor vreg1, vreg1
33699 // rdssp vreg1
33700 // test vreg1, vreg1
33701 // je sinkMBB # Jump if Shadow Stack is not supported
33702 // fallMBB:
33703 // mov buf+24/12(%rip), vreg2
33704 // sub vreg1, vreg2
33705 // jbe sinkMBB # No need to fix the Shadow Stack
33706 // fixShadowMBB:
33707 // shr 3/2, vreg2
33708 // incssp vreg2 # fix the SSP according to the lower 8 bits
33709 // shr 8, vreg2
33710 // je sinkMBB
33711 // fixShadowLoopPrepareMBB:
33712 // shl vreg2
33713 // mov 128, vreg3
33714 // fixShadowLoopMBB:
33715 // incssp vreg3
33716 // dec vreg2
33717 // jne fixShadowLoopMBB # Iterate until you finish fixing
33718 // # the Shadow Stack
33719 // sinkMBB:
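//
// Rough worked example (a sketch, 64-bit): if the saved SSP is 0x1230 bytes
// above the current SSP, shifting the delta right by 3 gives 0x246
// return-address slots. The first incssp consumes the low 8 bits (0x46 slots);
// the remaining 0x200 slots are handled by shifting right by 8 (giving 2),
// doubling that to 4 with the single shift left, and running the loop four
// times with incssp 128 (4 * 128 = 0x200).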
33720
33721 MachineFunction::iterator I = ++MBB->getIterator();
33722 const BasicBlock *BB = MBB->getBasicBlock();
33723
33724 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
33725 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33726 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
33727 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
33728 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
33729 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33730 MF->insert(I, checkSspMBB);
33731 MF->insert(I, fallMBB);
33732 MF->insert(I, fixShadowMBB);
33733 MF->insert(I, fixShadowLoopPrepareMBB);
33734 MF->insert(I, fixShadowLoopMBB);
33735 MF->insert(I, sinkMBB);
33736
33737 // Transfer the remainder of BB and its successor edges to sinkMBB.
33738 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
33739 MBB->end());
33740 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33741
33742 MBB->addSuccessor(checkSspMBB);
33743
33744 // Initialize a register with zero.
33745 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
33746 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
33747
33748 if (PVT == MVT::i64) {
33749 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
33750 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
33751 .addImm(0)
33752 .addReg(ZReg)
33753 .addImm(X86::sub_32bit);
33754 ZReg = TmpZReg;
33755 }
33756
33757 // Read the current SSP Register value to the zeroed register.
33758 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33759 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33760 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33761
33762 // Check whether the value read from the SSP register is zero and jump
33763 // directly to the sink.
33764 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
33765 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
33766 .addReg(SSPCopyReg)
33767 .addReg(SSPCopyReg);
33768 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33769 checkSspMBB->addSuccessor(sinkMBB);
33770 checkSspMBB->addSuccessor(fallMBB);
33771
33772 // Reload the previously saved SSP register value.
33773 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
33774 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33775 const int64_t SPPOffset = 3 * PVT.getStoreSize();
33776 MachineInstrBuilder MIB =
33777 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
33778 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33779 const MachineOperand &MO = MI.getOperand(i);
33780 if (i == X86::AddrDisp)
33781 MIB.addDisp(MO, SPPOffset);
33782 else if (MO.isReg()) // Don't add the whole operand, we don't want to
33783 // preserve kill flags.
33784 MIB.addReg(MO.getReg());
33785 else
33786 MIB.add(MO);
33787 }
33788 MIB.setMemRefs(MMOs);
33789
33790 // Subtract the current SSP from the previous SSP.
33791 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
33792 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
33793 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
33794 .addReg(PrevSSPReg)
33795 .addReg(SSPCopyReg);
33796
33797 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
33798 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
33799 fallMBB->addSuccessor(sinkMBB);
33800 fallMBB->addSuccessor(fixShadowMBB);
33801
33802 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
33803 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
33804 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
33805 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
33806 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
33807 .addReg(SspSubReg)
33808 .addImm(Offset);
33809
33810 // Increase SSP using only the lower 8 bits of the delta.
33811 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
33812 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
33813
33814 // Reset the lower 8 bits.
33815 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
33816 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
33817 .addReg(SspFirstShrReg)
33818 .addImm(8);
33819
33820 // Jump if the result of the shift is zero.
33821 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33822 fixShadowMBB->addSuccessor(sinkMBB);
33823 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
33824
33825 // Do a single shift left.
33826 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
33827 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
33828 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
33829 .addReg(SspSecondShrReg);
33830
33831 // Save the value 128 to a register (will be used next with incssp).
33832 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
33833 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
33834 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
33835 .addImm(128);
33836 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
33837
33838 // Since incssp only looks at the lower 8 bits, we might need to do several
33839 // iterations of incssp until we finish fixing the shadow stack.
33840 Register DecReg = MRI.createVirtualRegister(PtrRC);
33841 Register CounterReg = MRI.createVirtualRegister(PtrRC);
33842 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
33843 .addReg(SspAfterShlReg)
33844 .addMBB(fixShadowLoopPrepareMBB)
33845 .addReg(DecReg)
33846 .addMBB(fixShadowLoopMBB);
33847
33848 // Every iteration we increase the SSP by 128.
33849 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
33850
33851 // Every iteration we decrement the counter by 1.
33852 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
33853 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
33854
33855 // Jump if the counter is not zero yet.
33856 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
33857 fixShadowLoopMBB->addSuccessor(sinkMBB);
33858 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
33859
33860 return sinkMBB;
33861}
33862
33863MachineBasicBlock *
33864X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
33865 MachineBasicBlock *MBB) const {
33866 const DebugLoc &DL = MI.getDebugLoc();
33867 MachineFunction *MF = MBB->getParent();
33868 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33869 MachineRegisterInfo &MRI = MF->getRegInfo();
33870
33871 // Memory Reference
33872 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33873 MI.memoperands_end());
33874
33875 MVT PVT = getPointerTy(MF->getDataLayout());
33876 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
33877 "Invalid Pointer Size!");
33878
33879 const TargetRegisterClass *RC =
33880 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33881 Register Tmp = MRI.createVirtualRegister(RC);
33882 // Since FP is only updated here but NOT referenced, it's treated as GPR.
33883 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33884 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
33885 Register SP = RegInfo->getStackRegister();
33886
33887 MachineInstrBuilder MIB;
33888
33889 const int64_t LabelOffset = 1 * PVT.getStoreSize();
33890 const int64_t SPOffset = 2 * PVT.getStoreSize();
33891
33892 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33893 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
33894
33895 MachineBasicBlock *thisMBB = MBB;
33896
33897 // When the CET shadow stack is enabled, we need to fix the Shadow Stack.
33898 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33899 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
33900 }
33901
33902 // Reload FP
33903 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
33904 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33905 const MachineOperand &MO = MI.getOperand(i);
33906 if (MO.isReg()) // Don't add the whole operand, we don't want to
33907 // preserve kill flags.
33908 MIB.addReg(MO.getReg());
33909 else
33910 MIB.add(MO);
33911 }
33912 MIB.setMemRefs(MMOs);
33913
33914 // Reload IP
33915 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
33916 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33917 const MachineOperand &MO = MI.getOperand(i);
33918 if (i == X86::AddrDisp)
33919 MIB.addDisp(MO, LabelOffset);
33920 else if (MO.isReg()) // Don't add the whole operand, we don't want to
33921 // preserve kill flags.
33922 MIB.addReg(MO.getReg());
33923 else
33924 MIB.add(MO);
33925 }
33926 MIB.setMemRefs(MMOs);
33927
33928 // Reload SP
33929 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
33930 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33931 if (i == X86::AddrDisp)
33932 MIB.addDisp(MI.getOperand(i), SPOffset);
33933 else
33934 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
33935 // the last instruction of the expansion.
33936 }
33937 MIB.setMemRefs(MMOs);
33938
33939 // Jump
33940 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
33941
33942 MI.eraseFromParent();
33943 return thisMBB;
33944}
33945
33946void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
33947 MachineBasicBlock *MBB,
33948 MachineBasicBlock *DispatchBB,
33949 int FI) const {
33950 const DebugLoc &DL = MI.getDebugLoc();
33951 MachineFunction *MF = MBB->getParent();
33952 MachineRegisterInfo *MRI = &MF->getRegInfo();
33953 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33954
33955 MVT PVT = getPointerTy(MF->getDataLayout());
33956 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
33957
33958 unsigned Op = 0;
33959 unsigned VR = 0;
33960
33961 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33962 !isPositionIndependent();
33963
33964 if (UseImmLabel) {
33965 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33966 } else {
33967 const TargetRegisterClass *TRC =
33968 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33969 VR = MRI->createVirtualRegister(TRC);
33970 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33971
33972 if (Subtarget.is64Bit())
33973 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
33974 .addReg(X86::RIP)
33975 .addImm(1)
33976 .addReg(0)
33977 .addMBB(DispatchBB)
33978 .addReg(0);
33979 else
33980 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
33981 .addReg(0) /* TII->getGlobalBaseReg(MF) */
33982 .addImm(1)
33983 .addReg(0)
33984 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
33985 .addReg(0);
33986 }
33987
33988 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
33989 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
33990 if (UseImmLabel)
33991 MIB.addMBB(DispatchBB);
33992 else
33993 MIB.addReg(VR);
33994}
33995
33996MachineBasicBlock *
33997X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
33998 MachineBasicBlock *BB) const {
33999 const DebugLoc &DL = MI.getDebugLoc();
34000 MachineFunction *MF = BB->getParent();
34001 MachineRegisterInfo *MRI = &MF->getRegInfo();
34002 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34003 int FI = MF->getFrameInfo().getFunctionContextIndex();
34004
34005 // Get a mapping of the call site numbers to all of the landing pads they're
34006 // associated with.
34007 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
34008 unsigned MaxCSNum = 0;
34009 for (auto &MBB : *MF) {
34010 if (!MBB.isEHPad())
34011 continue;
34012
34013 MCSymbol *Sym = nullptr;
34014 for (const auto &MI : MBB) {
34015 if (MI.isDebugInstr())
34016 continue;
34017
34018 assert(MI.isEHLabel() && "expected EH_LABEL");
34019 Sym = MI.getOperand(0).getMCSymbol();
34020 break;
34021 }
34022
34023 if (!MF->hasCallSiteLandingPad(Sym))
34024 continue;
34025
34026 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
34027 CallSiteNumToLPad[CSI].push_back(&MBB);
34028 MaxCSNum = std::max(MaxCSNum, CSI);
34029 }
34030 }
34031
34032 // Get an ordered list of the machine basic blocks for the jump table.
34033 std::vector<MachineBasicBlock *> LPadList;
34034 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
34035 LPadList.reserve(CallSiteNumToLPad.size());
34036
34037 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
34038 for (auto &LP : CallSiteNumToLPad[CSI]) {
34039 LPadList.push_back(LP);
34040 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
34041 }
34042 }
34043
34044 assert(!LPadList.empty() &&
34045 "No landing pad destinations for the dispatch jump table!");
34046
34047 // Create the MBBs for the dispatch code.
34048
34049 // Shove the dispatch's address into the return slot in the function context.
34050 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
34051 DispatchBB->setIsEHPad(true);
34052
34053 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
34054 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
34055 DispatchBB->addSuccessor(TrapBB);
34056
34057 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
34058 DispatchBB->addSuccessor(DispContBB);
34059
34060 // Insert MBBs.
34061 MF->push_back(DispatchBB);
34062 MF->push_back(DispContBB);
34063 MF->push_back(TrapBB);
34064
34065 // Insert code into the entry block that creates and registers the function
34066 // context.
34067 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
34068
34069 // Create the jump table and associated information
34070 unsigned JTE = getJumpTableEncoding();
34071 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
34072 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
34073
34074 const X86RegisterInfo &RI = TII->getRegisterInfo();
34075 // Add a register mask with no preserved registers. This results in all
34076 // registers being marked as clobbered.
34077 if (RI.hasBasePointer(*MF)) {
34078 const bool FPIs64Bit =
34079 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
34080 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
34081 MFI->setRestoreBasePointer(MF);
34082
34083 Register FP = RI.getFrameRegister(*MF);
34084 Register BP = RI.getBaseRegister();
34085 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
34086 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
34087 MFI->getRestoreBasePointerOffset())
34088 .addRegMask(RI.getNoPreservedMask());
34089 } else {
34090 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
34091 .addRegMask(RI.getNoPreservedMask());
34092 }
34093
34094 // IReg is used as an index in a memory operand and therefore can't be SP
34095 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
34096 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
34097 Subtarget.is64Bit() ? 8 : 4);
34098 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
34099 .addReg(IReg)
34100 .addImm(LPadList.size());
34101 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
34102
34103 if (Subtarget.is64Bit()) {
34104 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34105 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
34106
34107 // leaq .LJTI0_0(%rip), BReg
34108 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
34109 .addReg(X86::RIP)
34110 .addImm(1)
34111 .addReg(0)
34112 .addJumpTableIndex(MJTI)
34113 .addReg(0);
34114 // movzx IReg64, IReg
34115 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
34116 .addImm(0)
34117 .addReg(IReg)
34118 .addImm(X86::sub_32bit);
34119
34120 switch (JTE) {
34121 case MachineJumpTableInfo::EK_BlockAddress:
34122 // jmpq *(BReg,IReg64,8)
34123 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
34124 .addReg(BReg)
34125 .addImm(8)
34126 .addReg(IReg64)
34127 .addImm(0)
34128 .addReg(0);
34129 break;
34130 case MachineJumpTableInfo::EK_LabelDifference32: {
34131 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
34132 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
34133 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34134
34135 // movl (BReg,IReg64,4), OReg
34136 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
34137 .addReg(BReg)
34138 .addImm(4)
34139 .addReg(IReg64)
34140 .addImm(0)
34141 .addReg(0);
34142 // movsx OReg64, OReg
34143 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
34144 // addq BReg, OReg64, TReg
34145 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
34146 .addReg(OReg64)
34147 .addReg(BReg);
34148 // jmpq *TReg
34149 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
34150 break;
34151 }
34152 default:
34153 llvm_unreachable("Unexpected jump table encoding")__builtin_unreachable();
34154 }
34155 } else {
34156 // jmpl *.LJTI0_0(,IReg,4)
34157 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
34158 .addReg(0)
34159 .addImm(4)
34160 .addReg(IReg)
34161 .addJumpTableIndex(MJTI)
34162 .addReg(0);
34163 }
34164
34165 // Add the jump table entries as successors to the MBB.
34166 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
34167 for (auto &LP : LPadList)
34168 if (SeenMBBs.insert(LP).second)
34169 DispContBB->addSuccessor(LP);
34170
34171 // N.B. the order the invoke BBs are processed in doesn't matter here.
34172 SmallVector<MachineBasicBlock *, 64> MBBLPads;
34173 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
34174 for (MachineBasicBlock *MBB : InvokeBBs) {
34175 // Remove the landing pad successor from the invoke block and replace it
34176 // with the new dispatch block.
34177 // Keep a copy of Successors since it's modified inside the loop.
34178 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
34179 MBB->succ_rend());
34180 // FIXME: Avoid quadratic complexity.
34181 for (auto MBBS : Successors) {
34182 if (MBBS->isEHPad()) {
34183 MBB->removeSuccessor(MBBS);
34184 MBBLPads.push_back(MBBS);
34185 }
34186 }
34187
34188 MBB->addSuccessor(DispatchBB);
34189
34190 // Find the invoke call and mark all of the callee-saved registers as
34191 // 'implicit defined' so that they're spilled. This prevents instructions
34192 // from being moved to before the EH block, where they would never be
34193 // executed.
34194 for (auto &II : reverse(*MBB)) {
34195 if (!II.isCall())
34196 continue;
34197
34198 DenseMap<unsigned, bool> DefRegs;
34199 for (auto &MOp : II.operands())
34200 if (MOp.isReg())
34201 DefRegs[MOp.getReg()] = true;
34202
34203 MachineInstrBuilder MIB(*MF, &II);
34204 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
34205 unsigned Reg = SavedRegs[RegIdx];
34206 if (!DefRegs[Reg])
34207 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
34208 }
34209
34210 break;
34211 }
34212 }
34213
34214 // Mark all former landing pads as non-landing pads. The dispatch is the only
34215 // landing pad now.
34216 for (auto &LP : MBBLPads)
34217 LP->setIsEHPad(false);
34218
34219 // The instruction is gone now.
34220 MI.eraseFromParent();
34221 return BB;
34222}
34223
34224MachineBasicBlock *
34225X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
34226 MachineBasicBlock *BB) const {
34227 MachineFunction *MF = BB->getParent();
34228 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34229 const DebugLoc &DL = MI.getDebugLoc();
34230
34231 auto TMMImmToTMMReg = [](unsigned Imm) {
34232 assert(Imm < 8 && "Illegal tmm index");
34233 return X86::TMM0 + Imm;
34234 };
34235 switch (MI.getOpcode()) {
34236 default: llvm_unreachable("Unexpected instr type to insert");
34237 case X86::TLS_addr32:
34238 case X86::TLS_addr64:
34239 case X86::TLS_addrX32:
34240 case X86::TLS_base_addr32:
34241 case X86::TLS_base_addr64:
34242 case X86::TLS_base_addrX32:
34243 return EmitLoweredTLSAddr(MI, BB);
34244 case X86::INDIRECT_THUNK_CALL32:
34245 case X86::INDIRECT_THUNK_CALL64:
34246 case X86::INDIRECT_THUNK_TCRETURN32:
34247 case X86::INDIRECT_THUNK_TCRETURN64:
34248 return EmitLoweredIndirectThunk(MI, BB);
34249 case X86::CATCHRET:
34250 return EmitLoweredCatchRet(MI, BB);
34251 case X86::SEG_ALLOCA_32:
34252 case X86::SEG_ALLOCA_64:
34253 return EmitLoweredSegAlloca(MI, BB);
34254 case X86::PROBED_ALLOCA_32:
34255 case X86::PROBED_ALLOCA_64:
34256 return EmitLoweredProbedAlloca(MI, BB);
34257 case X86::TLSCall_32:
34258 case X86::TLSCall_64:
34259 return EmitLoweredTLSCall(MI, BB);
34260 case X86::CMOV_FR32:
34261 case X86::CMOV_FR32X:
34262 case X86::CMOV_FR64:
34263 case X86::CMOV_FR64X:
34264 case X86::CMOV_GR8:
34265 case X86::CMOV_GR16:
34266 case X86::CMOV_GR32:
34267 case X86::CMOV_RFP32:
34268 case X86::CMOV_RFP64:
34269 case X86::CMOV_RFP80:
34270 case X86::CMOV_VR64:
34271 case X86::CMOV_VR128:
34272 case X86::CMOV_VR128X:
34273 case X86::CMOV_VR256:
34274 case X86::CMOV_VR256X:
34275 case X86::CMOV_VR512:
34276 case X86::CMOV_VK1:
34277 case X86::CMOV_VK2:
34278 case X86::CMOV_VK4:
34279 case X86::CMOV_VK8:
34280 case X86::CMOV_VK16:
34281 case X86::CMOV_VK32:
34282 case X86::CMOV_VK64:
34283 return EmitLoweredSelect(MI, BB);
34284
34285 case X86::RDFLAGS32:
34286 case X86::RDFLAGS64: {
34287 unsigned PushF =
34288 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
34289 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
34290 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
34291 // Permit reads of the EFLAGS and DF registers without them being defined.
34292 // This intrinsic exists to read external processor state in flags, such as
34293 // the trap flag, interrupt flag, and direction flag, none of which are
34294 // modeled by the backend.
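// Net effect for RDFLAGS64 (a simplified sketch): "pushfq" followed by
// "popq <dst>"; the undef markings below exist so that reading EFLAGS/DF
// here is legal even though neither register has a visible definition.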
34295 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
34296 "Unexpected register in operand!");
34297 Push->getOperand(2).setIsUndef();
34298 assert(Push->getOperand(3).getReg() == X86::DF &&
34299 "Unexpected register in operand!");
34300 Push->getOperand(3).setIsUndef();
34301 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
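// Rough sketch of the emitted sequence (assuming RDFLAGS64, with %dst standing
// in for the pseudo's destination register):
//   pushfq        ; PUSHF64 - push RFLAGS onto the stack
//   popq %dst     ; POP64r  - pop it straight into the destination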
34302
34303 MI.eraseFromParent(); // The pseudo is gone now.
34304 return BB;
34305 }
34306
34307 case X86::WRFLAGS32:
34308 case X86::WRFLAGS64: {
34309 unsigned Push =
34310 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
34311 unsigned PopF =
34312 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
34313 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
34314 BuildMI(*BB, MI, DL, TII->get(PopF));
34315
34316 MI.eraseFromParent(); // The pseudo is gone now.
34317 return BB;
34318 }
34319
34320 case X86::FP32_TO_INT16_IN_MEM:
34321 case X86::FP32_TO_INT32_IN_MEM:
34322 case X86::FP32_TO_INT64_IN_MEM:
34323 case X86::FP64_TO_INT16_IN_MEM:
34324 case X86::FP64_TO_INT32_IN_MEM:
34325 case X86::FP64_TO_INT64_IN_MEM:
34326 case X86::FP80_TO_INT16_IN_MEM:
34327 case X86::FP80_TO_INT32_IN_MEM:
34328 case X86::FP80_TO_INT64_IN_MEM: {
34329 // Change the floating point control register to use "round towards zero"
34330 // mode when truncating to an integer value.
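// Rough sketch of the sequence built below (illustrative; slot and register
// names are placeholders, assuming FP32_TO_INT32_IN_MEM):
//   fnstcw OrigCW      ; save the current control word to a stack slot
//   movzwl OrigCW, %r  ; load it
//   orl $0xC00, %r     ; set RC bits 11:10 to 0b11 (round toward zero)
//   movw %r_16, NewCW  ; spill the 16-bit result
//   fldcw NewCW        ; activate the truncating rounding mode
//   fistpl dst         ; IST_Fp32m32 - store the truncated integer
//   fldcw OrigCW       ; restore the original control word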
34331 int OrigCWFrameIdx =
34332 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34333 addFrameReference(BuildMI(*BB, MI, DL,
34334 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
34335
34336 // Load the old value of the control word...
34337 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34338 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
34339 OrigCWFrameIdx);
34340
34341 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
34342 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34343 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
34344 .addReg(OldCW, RegState::Kill).addImm(0xC00);
34345
34346 // Extract to 16 bits.
34347 Register NewCW16 =
34348 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
34349 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
34350 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
34351
34352 // Prepare memory for FLDCW.
34353 int NewCWFrameIdx =
34354 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34355 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
34356 NewCWFrameIdx)
34357 .addReg(NewCW16, RegState::Kill);
34358
34359 // Reload the modified control word now...
34360 addFrameReference(BuildMI(*BB, MI, DL,
34361 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
34362
34363 // Get the X86 opcode to use.
34364 unsigned Opc;
34365 switch (MI.getOpcode()) {
34366 default: llvm_unreachable("illegal opcode!");
34367 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
34368 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
34369 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
34370 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
34371 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
34372 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
34373 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
34374 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
34375 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
34376 }
34377
34378 X86AddressMode AM = getAddressFromInstr(&MI, 0);
34379 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
34380 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
34381
34382 // Reload the original control word now.
34383 addFrameReference(BuildMI(*BB, MI, DL,
34384 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
34385
34386 MI.eraseFromParent(); // The pseudo instruction is gone now.
34387 return BB;
34388 }
34389
34390 // xbegin
34391 case X86::XBEGIN:
34392 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
34393
34394 case X86::VAARG_64:
34395 case X86::VAARG_X32:
34396 return EmitVAARGWithCustomInserter(MI, BB);
34397
34398 case X86::EH_SjLj_SetJmp32:
34399 case X86::EH_SjLj_SetJmp64:
34400 return emitEHSjLjSetJmp(MI, BB);
34401
34402 case X86::EH_SjLj_LongJmp32:
34403 case X86::EH_SjLj_LongJmp64:
34404 return emitEHSjLjLongJmp(MI, BB);
34405
34406 case X86::Int_eh_sjlj_setup_dispatch:
34407 return EmitSjLjDispatchBlock(MI, BB);
34408
34409 case TargetOpcode::STATEPOINT:
34410 // As an implementation detail, STATEPOINT shares the STACKMAP format at
34411 // this point in the process. We diverge later.
34412 return emitPatchPoint(MI, BB);
34413
34414 case TargetOpcode::STACKMAP:
34415 case TargetOpcode::PATCHPOINT:
34416 return emitPatchPoint(MI, BB);
34417
34418 case TargetOpcode::PATCHABLE_EVENT_CALL:
34419 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
34420 return BB;
34421
34422 case X86::LCMPXCHG8B: {
34423 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34424 // In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
34425 // requires a memory operand. If the current architecture happens to be
34426 // i686 and the current function needs a base pointer
34427 // - which is ESI for i686 - the register allocator would not be able to
34428 // allocate registers for an address of the form X(%reg, %reg, Y):
34429 // there would never be enough unreserved registers during regalloc
34430 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
34431 // We give the register allocator a hand by precomputing the address in
34432 // a new vreg using LEA.
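// Illustrative transformation (register names are placeholders):
//   lock cmpxchg8b X(%reg1,%reg2,Y)
// becomes
//   leal X(%reg1,%reg2,Y), %vreg
//   lock cmpxchg8b (%vreg)
// leaving only a single (virtual) register live for the address.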
34433
34434 // If it is not i686 or there is no base pointer - nothing to do here.
34435 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
34436 return BB;
34437
34438 // Even though this code does not strictly require the base pointer to
34439 // be ESI, we check for that. The reason: if this assert fails, something
34440 // has changed in the compiler's base pointer handling, and that change
34441 // most probably has to be addressed here as well.
34442 assert(TRI->getBaseRegister() == X86::ESI &&
34443 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
34444 "base pointer in mind");
34445
34446 MachineRegisterInfo &MRI = MF->getRegInfo();
34447 MVT SPTy = getPointerTy(MF->getDataLayout());
34448 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
34449 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
34450
34451 X86AddressMode AM = getAddressFromInstr(&MI, 0);
34452 // Regalloc does not need any help when the memory operand of CMPXCHG8B
34453 // does not use index register.
34454 if (AM.IndexReg == X86::NoRegister)
34455 return BB;
34456
34457 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
34458 // four operand definitions that are E[ABCD] registers. We skip them and
34459 // then insert the LEA.
34460 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
34461 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
34462 RMBBI->definesRegister(X86::EBX) ||
34463 RMBBI->definesRegister(X86::ECX) ||
34464 RMBBI->definesRegister(X86::EDX))) {
34465 ++RMBBI;
34466 }
34467 MachineBasicBlock::iterator MBBI(RMBBI);
34468 addFullAddress(
34469 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
34470
34471 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
34472
34473 return BB;
34474 }
34475 case X86::LCMPXCHG16B_NO_RBX: {
34476 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34477 Register BasePtr = TRI->getBaseRegister();
34478 if (TRI->hasBasePointer(*MF) &&
34479 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
34480 if (!BB->isLiveIn(BasePtr))
34481 BB->addLiveIn(BasePtr);
34482 // Save RBX into a virtual register.
34483 Register SaveRBX =
34484 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34485 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34486 .addReg(X86::RBX);
34487 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34488 MachineInstrBuilder MIB =
34489 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
34490 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34491 MIB.add(MI.getOperand(Idx));
34492 MIB.add(MI.getOperand(X86::AddrNumOperands));
34493 MIB.addReg(SaveRBX);
34494 } else {
34495 // Simple case, just copy the virtual register to RBX.
34496 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
34497 .add(MI.getOperand(X86::AddrNumOperands));
34498 MachineInstrBuilder MIB =
34499 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
34500 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34501 MIB.add(MI.getOperand(Idx));
34502 }
34503 MI.eraseFromParent();
34504 return BB;
34505 }
34506 case X86::MWAITX: {
34507 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34508 Register BasePtr = TRI->getBaseRegister();
34509 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
34510 // If there is no need to save the base pointer, we generate MWAITXrrr;
34511 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
34512 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
34513 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34514 .addReg(MI.getOperand(0).getReg());
34515 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34516 .addReg(MI.getOperand(1).getReg());
34517 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
34518 .addReg(MI.getOperand(2).getReg());
34519 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
34520 MI.eraseFromParent();
34521 } else {
34522 if (!BB->isLiveIn(BasePtr)) {
34523 BB->addLiveIn(BasePtr);
34524 }
34525 // Parameters can be copied into ECX and EAX but not EBX yet.
34526 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34527 .addReg(MI.getOperand(0).getReg());
34528 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34529 .addReg(MI.getOperand(1).getReg());
34530 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
34531 // Save RBX into a virtual register.
34532 Register SaveRBX =
34533 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34534 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34535 .addReg(X86::RBX);
34536 // Generate mwaitx pseudo.
34537 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34538 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
34539 .addDef(Dst) // Destination tied in with SaveRBX.
34540 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
34541 .addUse(SaveRBX); // Save of base pointer.
34542 MI.eraseFromParent();
34543 }
34544 return BB;
34545 }
34546 case TargetOpcode::PREALLOCATED_SETUP: {
34547 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
34548 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34549 MFI->setHasPreallocatedCall(true);
34550 int64_t PreallocatedId = MI.getOperand(0).getImm();
34551 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
34552 assert(StackAdjustment != 0 && "0 stack adjustment");
34553 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
34554 << StackAdjustment << "\n");
34555 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
34556 .addReg(X86::ESP)
34557 .addImm(StackAdjustment);
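// Effectively emits "subl $StackAdjustment, %esp" to carve out the space
// reserved for the preallocated call.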
34558 MI.eraseFromParent();
34559 return BB;
34560 }
34561 case TargetOpcode::PREALLOCATED_ARG: {
34562 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
34563 int64_t PreallocatedId = MI.getOperand(1).getImm();
34564 int64_t ArgIdx = MI.getOperand(2).getImm();
34565 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34566 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
34567 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
34568 << ", arg offset " << ArgOffset << "\n");
34569 // stack pointer + offset
34570 addRegOffset(
34571 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
34572 X86::ESP, false, ArgOffset);
34573 MI.eraseFromParent();
34574 return BB;
34575 }
34576 case X86::PTDPBSSD:
34577 case X86::PTDPBSUD:
34578 case X86::PTDPBUSD:
34579 case X86::PTDPBUUD:
34580 case X86::PTDPBF16PS: {
34581 unsigned Opc;
34582 switch (MI.getOpcode()) {
34583 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
34584 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
34585 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
34586 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
34587 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
34588 }
34589
34590 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34591 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
34592 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
34593 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
34594 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
34595
34596 MI.eraseFromParent(); // The pseudo is gone now.
34597 return BB;
34598 }
34599 case X86::PTILEZERO: {
34600 unsigned Imm = MI.getOperand(0).getImm();
34601 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
34602 MI.eraseFromParent(); // The pseudo is gone now.
34603 return BB;
34604 }
34605 case X86::PTILELOADD:
34606 case X86::PTILELOADDT1:
34607 case X86::PTILESTORED: {
34608 unsigned Opc;
34609 switch (MI.getOpcode()) {
34610 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
34611 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
34612 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
34613 }
34614
34615 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34616 unsigned CurOp = 0;
34617 if (Opc != X86::TILESTORED)
34618 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34619 RegState::Define);
34620
34621 MIB.add(MI.getOperand(CurOp++)); // base
34622 MIB.add(MI.getOperand(CurOp++)); // scale
34623 MIB.add(MI.getOperand(CurOp++)); // index -- stride
34624 MIB.add(MI.getOperand(CurOp++)); // displacement
34625 MIB.add(MI.getOperand(CurOp++)); // segment
34626
34627 if (Opc == X86::TILESTORED)
34628 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34629 RegState::Undef);
34630
34631 MI.eraseFromParent(); // The pseudo is gone now.
34632 return BB;
34633 }
34634 }
34635}
34636
34637//===----------------------------------------------------------------------===//
34638// X86 Optimization Hooks
34639//===----------------------------------------------------------------------===//
34640
34641bool
34642X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
34643 const APInt &DemandedBits,
34644 const APInt &DemandedElts,
34645 TargetLoweringOpt &TLO) const {
34646 EVT VT = Op.getValueType();
34647 unsigned Opcode = Op.getOpcode();
34648 unsigned EltSize = VT.getScalarSizeInBits();
34649
34650 if (VT.isVector()) {
34651 // If the constant is all sign bits within the active bits, then we should
34652 // extend it to the entire constant to allow it to act as a boolean constant
34653 // vector.
34654 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
34655 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
34656 return false;
34657 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
34658 if (!DemandedElts[i] || V.getOperand(i).isUndef())
34659 continue;
34660 const APInt &Val = V.getConstantOperandAPInt(i);
34661 if (Val.getBitWidth() > Val.getNumSignBits() &&
34662 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
34663 return true;
34664 }
34665 return false;
34666 };
34667 // For vectors - if we have a constant, then try to sign extend.
34668 // TODO: Handle AND/ANDN cases.
34669 unsigned ActiveBits = DemandedBits.getActiveBits();
34670 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
34671 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
34672 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
34673 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
34674 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
34675 VT.getVectorNumElements());
34676 SDValue NewC =
34677 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
34678 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
34679 SDValue NewOp =
34680 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
34681 return TLO.CombineTo(Op, NewOp);
34682 }
34683 return false;
34684 }
34685
34686 // Only optimize Ands to prevent shrinking a constant that could be
34687 // matched by movzx.
34688 if (Opcode != ISD::AND)
34689 return false;
34690
34691 // Make sure the RHS really is a constant.
34692 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
34693 if (!C)
34694 return false;
34695
34696 const APInt &Mask = C->getAPIntValue();
34697
34698 // Clear all non-demanded bits initially.
34699 APInt ShrunkMask = Mask & DemandedBits;
34700
34701 // Find the width of the shrunk mask.
34702 unsigned Width = ShrunkMask.getActiveBits();
34703
34704 // If the mask is all 0s there's nothing to do here.
34705 if (Width == 0)
34706 return false;
34707
34708 // Find the next power of 2 width, rounding up to a byte.
34709 Width = PowerOf2Ceil(std::max(Width, 8U));
34710 // Truncate the width to size to handle illegal types.
34711 Width = std::min(Width, EltSize);
34712
34713 // Calculate a possible zero extend mask for this constant.
34714 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
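// Worked example (illustrative): with EltSize = 32 and a shrunk mask of
// 0x1FF (active width 9), Width rounds up to 16 and ZeroExtendMask becomes
// 0xFFFF - an AND mask that movzx can implement directly.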
34715
34716 // If we aren't changing the mask, just return true to keep it and prevent
34717 // the caller from optimizing.
34718 if (ZeroExtendMask == Mask)
34719 return true;
34720
34721 // Make sure the new mask can be represented by a combination of mask bits
34722 // and non-demanded bits.
34723 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
34724 return false;
34725
34726 // Replace the constant with the zero extend mask.
34727 SDLoc DL(Op);
34728 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
34729 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
34730 return TLO.CombineTo(Op, NewOp);
34731}
34732
34733void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
34734 KnownBits &Known,
34735 const APInt &DemandedElts,
34736 const SelectionDAG &DAG,
34737 unsigned Depth) const {
34738 unsigned BitWidth = Known.getBitWidth();
34739 unsigned NumElts = DemandedElts.getBitWidth();
34740 unsigned Opc = Op.getOpcode();
34741 EVT VT = Op.getValueType();
34742 assert((Opc >= ISD::BUILTIN_OP_END ||
34743 Opc == ISD::INTRINSIC_WO_CHAIN ||
34744 Opc == ISD::INTRINSIC_W_CHAIN ||
34745 Opc == ISD::INTRINSIC_VOID) &&
34746 "Should use MaskedValueIsZero if you don't know whether Op"
34747 " is a target node!");
34748
34749 Known.resetAll();
34750 switch (Opc) {
34751 default: break;
34752 case X86ISD::SETCC:
34753 Known.Zero.setBitsFrom(1);
34754 break;
34755 case X86ISD::MOVMSK: {
34756 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
34757 Known.Zero.setBitsFrom(NumLoBits);
34758 break;
34759 }
34760 case X86ISD::PEXTRB:
34761 case X86ISD::PEXTRW: {
34762 SDValue Src = Op.getOperand(0);
34763 EVT SrcVT = Src.getValueType();
34764 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
34765 Op.getConstantOperandVal(1));
34766 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
34767 Known = Known.anyextOrTrunc(BitWidth);
34768 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
34769 break;
34770 }
34771 case X86ISD::VSRAI:
34772 case X86ISD::VSHLI:
34773 case X86ISD::VSRLI: {
34774 unsigned ShAmt = Op.getConstantOperandVal(1);
34775 if (ShAmt >= VT.getScalarSizeInBits()) {
34776 Known.setAllZero();
34777 break;
34778 }
34779
34780 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34781 if (Opc == X86ISD::VSHLI) {
34782 Known.Zero <<= ShAmt;
34783 Known.One <<= ShAmt;
34784 // Low bits are known zero.
34785 Known.Zero.setLowBits(ShAmt);
34786 } else if (Opc == X86ISD::VSRLI) {
34787 Known.Zero.lshrInPlace(ShAmt);
34788 Known.One.lshrInPlace(ShAmt);
34789 // High bits are known zero.
34790 Known.Zero.setHighBits(ShAmt);
34791 } else {
34792 Known.Zero.ashrInPlace(ShAmt);
34793 Known.One.ashrInPlace(ShAmt);
34794 }
34795 break;
34796 }
34797 case X86ISD::PACKUS: {
34798 // PACKUS is just a truncation if the upper half is zero.
34799 APInt DemandedLHS, DemandedRHS;
34800 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
34801
34802 Known.One = APInt::getAllOnesValue(BitWidth * 2);
34803 Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
34804
34805 KnownBits Known2;
34806 if (!!DemandedLHS) {
34807 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34808 Known = KnownBits::commonBits(Known, Known2);
34809 }
34810 if (!!DemandedRHS) {
34811 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34812 Known = KnownBits::commonBits(Known, Known2);
34813 }
34814
34815 if (Known.countMinLeadingZeros() < BitWidth)
34816 Known.resetAll();
34817 Known = Known.trunc(BitWidth);
34818 break;
34819 }
34820 case X86ISD::VBROADCAST: {
34821 SDValue Src = Op.getOperand(0);
34822 if (!Src.getSimpleValueType().isVector()) {
34823 Known = DAG.computeKnownBits(Src, Depth + 1);
34824 return;
34825 }
34826 break;
34827 }
34828 case X86ISD::ANDNP: {
34829 KnownBits Known2;
34830 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34831 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34832
34833 // ANDNP = (~X & Y);
34834 Known.One &= Known2.Zero;
34835 Known.Zero |= Known2.One;
34836 break;
34837 }
34838 case X86ISD::FOR: {
34839 KnownBits Known2;
34840 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34841 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34842
34843 Known |= Known2;
34844 break;
34845 }
34846 case X86ISD::PSADBW: {
34847 assert(VT.getScalarType() == MVT::i64 &&
34848 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
34849 "Unexpected PSADBW types");
34850
34851 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
34852 Known.Zero.setBitsFrom(16);
34853 break;
34854 }
34855 case X86ISD::PMULUDQ: {
34856 KnownBits Known2;
34857 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34858 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34859
34860 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
34861 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
34862 Known = KnownBits::mul(Known, Known2);
34863 break;
34864 }
34865 case X86ISD::CMOV: {
34866 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
34867 // If we don't know any bits, early out.
34868 if (Known.isUnknown())
34869 break;
34870 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
34871
34872 // Only known if known in both the LHS and RHS.
34873 Known = KnownBits::commonBits(Known, Known2);
34874 break;
34875 }
34876 case X86ISD::BEXTR:
34877 case X86ISD::BEXTRI: {
34878 SDValue Op0 = Op.getOperand(0);
34879 SDValue Op1 = Op.getOperand(1);
34880
34881 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
34882 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
34883 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
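// Illustrative: a control value of 0x0804 encodes Shift = 4 (bits 7:0) and
// Length = 8 (bits 15:8), i.e. extract 8 bits starting at bit 4.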
34884
34885 // If the length is 0, the result is 0.
34886 if (Length == 0) {
34887 Known.setAllZero();
34888 break;
34889 }
34890
34891 if ((Shift + Length) <= BitWidth) {
34892 Known = DAG.computeKnownBits(Op0, Depth + 1);
34893 Known = Known.extractBits(Length, Shift);
34894 Known = Known.zextOrTrunc(BitWidth);
34895 }
34896 }
34897 break;
34898 }
34899 case X86ISD::PDEP: {
34900 KnownBits Known2;
34901 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34902 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34903 // Zeros are retained from the mask operand. But not ones.
34904 Known.One.clearAllBits();
34905 // The result will have at least as many trailing zeros as the non-mask
34906 // operand since bits can only map to the same or higher bit position.
34907 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
34908 break;
34909 }
34910 case X86ISD::PEXT: {
34911 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34912 // The result has as many leading zeros as the number of zeroes in the mask.
34913 unsigned Count = Known.Zero.countPopulation();
34914 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
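// Illustrative: with BitWidth = 32 and a mask known to be 0x0000FFFF, the
// mask has 16 known-zero bits, so the extracted result has at least 16
// leading zero bits.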
34915 Known.One.clearAllBits();
34916 break;
34917 }
34918 case X86ISD::VTRUNC:
34919 case X86ISD::VTRUNCS:
34920 case X86ISD::VTRUNCUS:
34921 case X86ISD::CVTSI2P:
34922 case X86ISD::CVTUI2P:
34923 case X86ISD::CVTP2SI:
34924 case X86ISD::CVTP2UI:
34925 case X86ISD::MCVTP2SI:
34926 case X86ISD::MCVTP2UI:
34927 case X86ISD::CVTTP2SI:
34928 case X86ISD::CVTTP2UI:
34929 case X86ISD::MCVTTP2SI:
34930 case X86ISD::MCVTTP2UI:
34931 case X86ISD::MCVTSI2P:
34932 case X86ISD::MCVTUI2P:
34933 case X86ISD::VFPROUND:
34934 case X86ISD::VMFPROUND:
34935 case X86ISD::CVTPS2PH:
34936 case X86ISD::MCVTPS2PH: {
34937 // Truncations/Conversions - upper elements are known zero.
34938 EVT SrcVT = Op.getOperand(0).getValueType();
34939 if (SrcVT.isVector()) {
34940 unsigned NumSrcElts = SrcVT.getVectorNumElements();
34941 if (NumElts > NumSrcElts &&
34942 DemandedElts.countTrailingZeros() >= NumSrcElts)
34943 Known.setAllZero();
34944 }
34945 break;
34946 }
34947 case X86ISD::STRICT_CVTTP2SI:
34948 case X86ISD::STRICT_CVTTP2UI:
34949 case X86ISD::STRICT_CVTSI2P:
34950 case X86ISD::STRICT_CVTUI2P:
34951 case X86ISD::STRICT_VFPROUND:
34952 case X86ISD::STRICT_CVTPS2PH: {
34953 // Strict Conversions - upper elements are known zero.
34954 EVT SrcVT = Op.getOperand(1).getValueType();
34955 if (SrcVT.isVector()) {
34956 unsigned NumSrcElts = SrcVT.getVectorNumElements();
34957 if (NumElts > NumSrcElts &&
34958 DemandedElts.countTrailingZeros() >= NumSrcElts)
34959 Known.setAllZero();
34960 }
34961 break;
34962 }
34963 case X86ISD::MOVQ2DQ: {
34964 // Move from MMX to XMM. Upper half of XMM should be 0.
34965 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
34966 Known.setAllZero();
34967 break;
34968 }
34969 }
34970
34971 // Handle target shuffles.
34972 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34973 if (isTargetShuffle(Opc)) {
34974 SmallVector<int, 64> Mask;
34975 SmallVector<SDValue, 2> Ops;
34976 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
34977 unsigned NumOps = Ops.size();
34978 unsigned NumElts = VT.getVectorNumElements();
34979 if (Mask.size() == NumElts) {
34980 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34981 Known.Zero.setAllBits(); Known.One.setAllBits();
34982 for (unsigned i = 0; i != NumElts; ++i) {
34983 if (!DemandedElts[i])
34984 continue;
34985 int M = Mask[i];
34986 if (M == SM_SentinelUndef) {
34987 // For UNDEF elements, we don't know anything about the common state
34988 // of the shuffle result.
34989 Known.resetAll();
34990 break;
34991 }
34992 if (M == SM_SentinelZero) {
34993 Known.One.clearAllBits();
34994 continue;
34995 }
34996 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
34997 "Shuffle index out of range");
34998
34999 unsigned OpIdx = (unsigned)M / NumElts;
35000 unsigned EltIdx = (unsigned)M % NumElts;
35001 if (Ops[OpIdx].getValueType() != VT) {
35002 // TODO - handle target shuffle ops with different value types.
35003 Known.resetAll();
35004 break;
35005 }
35006 DemandedOps[OpIdx].setBit(EltIdx);
35007 }
35008 // Known bits are the values that are shared by every demanded element.
35009 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
35010 if (!DemandedOps[i])
35011 continue;
35012 KnownBits Known2 =
35013 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
35014 Known = KnownBits::commonBits(Known, Known2);
35015 }
35016 }
35017 }
35018 }
35019}
35020
35021unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
35022 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
35023 unsigned Depth) const {
35024 EVT VT = Op.getValueType();
35025 unsigned VTBits = VT.getScalarSizeInBits();
35026 unsigned Opcode = Op.getOpcode();
35027 switch (Opcode) {
35028 case X86ISD::SETCC_CARRY:
35029 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
35030 return VTBits;
35031
35032 case X86ISD::VTRUNC: {
35033 SDValue Src = Op.getOperand(0);
35034 MVT SrcVT = Src.getSimpleValueType();
35035 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
35036 assert(VTBits < NumSrcBits && "Illegal truncation input type");
35037 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
35038 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
35039 if (Tmp > (NumSrcBits - VTBits))
35040 return Tmp - (NumSrcBits - VTBits);
35041 return 1;
35042 }
35043
35044 case X86ISD::PACKSS: {
35045 // PACKSS is just a truncation if the sign bits extend to the packed size.
35046 APInt DemandedLHS, DemandedRHS;
35047 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
35048 DemandedRHS);
35049
35050 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
35051 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
35052 if (!!DemandedLHS)
35053 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
35054 if (!!DemandedRHS)
35055 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
35056 unsigned Tmp = std::min(Tmp0, Tmp1);
35057 if (Tmp > (SrcBits - VTBits))
35058 return Tmp - (SrcBits - VTBits);
35059 return 1;
35060 }
35061
35062 case X86ISD::VBROADCAST: {
35063 SDValue Src = Op.getOperand(0);
35064 if (!Src.getSimpleValueType().isVector())
35065 return DAG.ComputeNumSignBits(Src, Depth + 1);
35066 break;
35067 }
35068
35069 case X86ISD::VSHLI: {
35070 SDValue Src = Op.getOperand(0);
35071 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
35072 if (ShiftVal.uge(VTBits))
35073 return VTBits; // Shifted all bits out --> zero.
35074 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35075 if (ShiftVal.uge(Tmp))
35076 return 1; // Shifted all sign bits out --> unknown.
35077 return Tmp - ShiftVal.getZExtValue();
35078 }
35079
35080 case X86ISD::VSRAI: {
35081 SDValue Src = Op.getOperand(0);
35082 APInt ShiftVal = Op.getConstantOperandAPInt(1);
35083 if (ShiftVal.uge(VTBits - 1))
35084 return VTBits; // Sign splat.
35085 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35086 ShiftVal += Tmp;
35087 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
35088 }
35089
35090 case X86ISD::FSETCC:
35091 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
35092 if (VT == MVT::f32 || VT == MVT::f64 ||
35093 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
35094 return VTBits;
35095 break;
35096
35097 case X86ISD::PCMPGT:
35098 case X86ISD::PCMPEQ:
35099 case X86ISD::CMPP:
35100 case X86ISD::VPCOM:
35101 case X86ISD::VPCOMU:
35102 // Vector compares return zero/all-bits result values.
35103 return VTBits;
35104
35105 case X86ISD::ANDNP: {
35106 unsigned Tmp0 =
35107 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
35108 if (Tmp0 == 1) return 1; // Early out.
35109 unsigned Tmp1 =
35110 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
35111 return std::min(Tmp0, Tmp1);
35112 }
35113
35114 case X86ISD::CMOV: {
35115 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
35116 if (Tmp0 == 1) return 1; // Early out.
35117 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
35118 return std::min(Tmp0, Tmp1);
35119 }
35120 }
35121
35122 // Handle target shuffles.
35123 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
35124 if (isTargetShuffle(Opcode)) {
35125 SmallVector<int, 64> Mask;
35126 SmallVector<SDValue, 2> Ops;
35127 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
35128 unsigned NumOps = Ops.size();
35129 unsigned NumElts = VT.getVectorNumElements();
35130 if (Mask.size() == NumElts) {
35131 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
35132 for (unsigned i = 0; i != NumElts; ++i) {
35133 if (!DemandedElts[i])
35134 continue;
35135 int M = Mask[i];
35136 if (M == SM_SentinelUndef) {
35137 // For UNDEF elements, we don't know anything about the common state
35138 // of the shuffle result.
35139 return 1;
35140 } else if (M == SM_SentinelZero) {
35141 // Zero = all sign bits.
35142 continue;
35143 }
35144 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
35145 "Shuffle index out of range");
35146
35147 unsigned OpIdx = (unsigned)M / NumElts;
35148 unsigned EltIdx = (unsigned)M % NumElts;
35149 if (Ops[OpIdx].getValueType() != VT) {
35150 // TODO - handle target shuffle ops with different value types.
35151 return 1;
35152 }
35153 DemandedOps[OpIdx].setBit(EltIdx);
35154 }
35155 unsigned Tmp0 = VTBits;
35156 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
35157 if (!DemandedOps[i])
35158 continue;
35159 unsigned Tmp1 =
35160 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
35161 Tmp0 = std::min(Tmp0, Tmp1);
35162 }
35163 return Tmp0;
35164 }
35165 }
35166 }
35167
35168 // Fallback case.
35169 return 1;
35170}
35171
35172SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
35173 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
35174 return N->getOperand(0);
35175 return N;
35176}
35177
35178// Helper to look for a normal load that can be narrowed into a vzload with the
35179// specified VT and memory VT. Returns SDValue() on failure.
35180static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
35181 SelectionDAG &DAG) {
35182 // Can't if the load is volatile or atomic.
35183 if (!LN->isSimple())
35184 return SDValue();
35185
35186 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35187 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
35188 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
35189 LN->getPointerInfo(), LN->getOriginalAlign(),
35190 LN->getMemOperand()->getFlags());
35191}
35192
35193// Attempt to match a combined shuffle mask against supported unary shuffle
35194// instructions.
35195// TODO: Investigate sharing more of this with shuffle lowering.
35196static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35197 bool AllowFloatDomain, bool AllowIntDomain,
35198 SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
35199 const X86Subtarget &Subtarget, unsigned &Shuffle,
35200 MVT &SrcVT, MVT &DstVT) {
35201 unsigned NumMaskElts = Mask.size();
35202 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
35203
35204 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
35205 if (MaskEltSize == 32 && Mask[0] == 0) {
35206 if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
35207 Shuffle = X86ISD::VZEXT_MOVL;
35208 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35209 return true;
35210 }
35211 if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35212 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35213 Shuffle = X86ISD::VZEXT_MOVL;
35214 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35215 return true;
35216 }
35217 }
35218
35219 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
35220 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
35221 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
35222 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
35223 unsigned MaxScale = 64 / MaskEltSize;
35224 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
35225 bool MatchAny = true;
35226 bool MatchZero = true;
35227 unsigned NumDstElts = NumMaskElts / Scale;
35228 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
35229 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
35230 MatchAny = MatchZero = false;
35231 break;
35232 }
35233 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
35234 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
35235 }
35236 if (MatchAny || MatchZero) {
35237 assert(MatchZero && "Failed to match zext but matched aext?");
35238 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
35239 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
35240 MVT::getIntegerVT(MaskEltSize);
35241 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
35242
35243 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
35244 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
35245
35246 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
35247 if (SrcVT.getVectorNumElements() != NumDstElts)
35248 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
35249
35250 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
35251 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
35252 return true;
35253 }
35254 }
35255 }
35256
35257 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
35258 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
35259 isUndefOrEqual(Mask[0], 0) &&
35260 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35261 Shuffle = X86ISD::VZEXT_MOVL;
35262 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35263 return true;
35264 }
35265
35266 // Check if we have SSE3, which will let us use MOVDDUP etc. These
35267 // instructions are no slower than UNPCKLPD but have the option to
35268 // fold the input operand into even an unaligned memory load.
35269 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
35270 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
35271 Shuffle = X86ISD::MOVDDUP;
35272 SrcVT = DstVT = MVT::v2f64;
35273 return true;
35274 }
35275 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35276 Shuffle = X86ISD::MOVSLDUP;
35277 SrcVT = DstVT = MVT::v4f32;
35278 return true;
35279 }
35280 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
35281 Shuffle = X86ISD::MOVSHDUP;
35282 SrcVT = DstVT = MVT::v4f32;
35283 return true;
35284 }
35285 }
35286
35287 if (MaskVT.is256BitVector() && AllowFloatDomain) {
35288 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
35289 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35290 Shuffle = X86ISD::MOVDDUP;
35291 SrcVT = DstVT = MVT::v4f64;
35292 return true;
35293 }
35294 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35295 Shuffle = X86ISD::MOVSLDUP;
35296 SrcVT = DstVT = MVT::v8f32;
35297 return true;
35298 }
35299 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
35300 Shuffle = X86ISD::MOVSHDUP;
35301 SrcVT = DstVT = MVT::v8f32;
35302 return true;
35303 }
35304 }
35305
35306 if (MaskVT.is512BitVector() && AllowFloatDomain) {
35307 assert(Subtarget.hasAVX512() &&
35308 "AVX512 required for 512-bit vector shuffles");
35309 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35310 Shuffle = X86ISD::MOVDDUP;
35311 SrcVT = DstVT = MVT::v8f64;
35312 return true;
35313 }
35314 if (isTargetShuffleEquivalent(
35315 MaskVT, Mask,
35316 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
35317 Shuffle = X86ISD::MOVSLDUP;
35318 SrcVT = DstVT = MVT::v16f32;
35319 return true;
35320 }
35321 if (isTargetShuffleEquivalent(
35322 MaskVT, Mask,
35323 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
35324 Shuffle = X86ISD::MOVSHDUP;
35325 SrcVT = DstVT = MVT::v16f32;
35326 return true;
35327 }
35328 }
35329
35330 return false;
35331}
35332
35333// Attempt to match a combined shuffle mask against supported unary immediate
35334// permute instructions.
35335// TODO: Investigate sharing more of this with shuffle lowering.
35336static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
35337 const APInt &Zeroable,
35338 bool AllowFloatDomain, bool AllowIntDomain,
35339 const X86Subtarget &Subtarget,
35340 unsigned &Shuffle, MVT &ShuffleVT,
35341 unsigned &PermuteImm) {
35342 unsigned NumMaskElts = Mask.size();
35343 unsigned InputSizeInBits = MaskVT.getSizeInBits();
35344 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
35345 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
35346 bool ContainsZeros = isAnyZero(Mask);
35347
35348 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
35349 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
35350 // Check for lane crossing permutes.
35351 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
35352 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
35353 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
35354 Shuffle = X86ISD::VPERMI;
35355 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
35356 PermuteImm = getV4X86ShuffleImm(Mask);
35357 return true;
35358 }
35359 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
35360 SmallVector<int, 4> RepeatedMask;
35361 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
35362 Shuffle = X86ISD::VPERMI;
35363 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
35364 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
35365 return true;
35366 }
35367 }
35368 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
35369 // VPERMILPD can permute with a non-repeating shuffle.
35370 Shuffle = X86ISD::VPERMILPI;
35371 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
35372 PermuteImm = 0;
35373 for (int i = 0, e = Mask.size(); i != e; ++i) {
35374 int M = Mask[i];
35375 if (M == SM_SentinelUndef)
35376 continue;
35377 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
35378 PermuteImm |= (M & 1) << i;
35379 }
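// Illustrative: a v4f64 mask of {1, 0, 3, 2} produces PermuteImm = 0b0101;
// each immediate bit picks the low or high double within its 128-bit lane.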
35380 return true;
35381 }
35382 }
35383
35384 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
35385 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
35386 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
35387 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
35388 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
35389 SmallVector<int, 4> RepeatedMask;
35390 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35391 // Narrow the repeated mask to create 32-bit element permutes.
35392 SmallVector<int, 4> WordMask = RepeatedMask;
35393 if (MaskScalarSizeInBits == 64)
35394 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
35395
35396 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
35397 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
35398 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
35399 PermuteImm = getV4X86ShuffleImm(WordMask);
35400 return true;
35401 }
35402 }
35403
35404 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
35405 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
35406 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35407 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35408 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35409 SmallVector<int, 4> RepeatedMask;
35410 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35411 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
35412 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
35413
35414 // PSHUFLW: permute lower 4 elements only.
35415 if (isUndefOrInRange(LoMask, 0, 4) &&
35416 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
35417 Shuffle = X86ISD::PSHUFLW;
35418 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35419 PermuteImm = getV4X86ShuffleImm(LoMask);
35420 return true;
35421 }
35422
35423 // PSHUFHW: permute upper 4 elements only.
35424 if (isUndefOrInRange(HiMask, 4, 8) &&
35425 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
35426 // Offset the HiMask so that we can create the shuffle immediate.
35427 int OffsetHiMask[4];
35428 for (int i = 0; i != 4; ++i)
35429 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
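// Illustrative: a HiMask of {5, 4, 7, 6} becomes OffsetHiMask {1, 0, 3, 2},
// which fits the 2-bit fields of the PSHUFHW immediate.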
35430
35431 Shuffle = X86ISD::PSHUFHW;
35432 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35433 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
35434 return true;
35435 }
35436 }
35437 }
35438
35439 // Attempt to match against byte/bit shifts.
35440 if (AllowIntDomain &&
35441 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35442 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35443 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35444 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
35445 Mask, 0, Zeroable, Subtarget);
35446 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
35447 32 <= ShuffleVT.getScalarSizeInBits())) {
35448 PermuteImm = (unsigned)ShiftAmt;
35449 return true;
35450 }
35451 }
35452
35453 // Attempt to match against bit rotates.
35454 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
35455 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
35456 Subtarget.hasAVX512())) {
35457 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
35458 Subtarget, Mask);
35459 if (0 < RotateAmt) {
35460 Shuffle = X86ISD::VROTLI;
35461 PermuteImm = (unsigned)RotateAmt;
35462 return true;
35463 }
35464 }
35465
35466 return false;
35467}
35468
35469// Attempt to match a combined unary shuffle mask against supported binary
35470// shuffle instructions.
35471// TODO: Investigate sharing more of this with shuffle lowering.
35472static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35473 bool AllowFloatDomain, bool AllowIntDomain,
35474 SDValue &V1, SDValue &V2, const SDLoc &DL,
35475 SelectionDAG &DAG, const X86Subtarget &Subtarget,
35476 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
35477 bool IsUnary) {
35478 unsigned NumMaskElts = Mask.size();
35479 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35480
35481 if (MaskVT.is128BitVector()) {
35482 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
35483 V2 = V1;
35484 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
35485 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
35486 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35487 return true;
35488 }
35489 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
35490 V2 = V1;
35491 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
35492 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35493 return true;
35494 }
35495 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
35496 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
35497 std::swap(V1, V2);
35498 Shuffle = X86ISD::MOVSD;
35499 SrcVT = DstVT = MVT::v2f64;
35500 return true;
35501 }
35502 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
35503 (AllowFloatDomain || !Subtarget.hasSSE41())) {
35504 Shuffle = X86ISD::MOVSS;
35505 SrcVT = DstVT = MVT::v4f32;
35506 return true;
35507 }
35508 }
35509
35510 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
35511 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
35512 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
35513 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
35514 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
35515 Subtarget)) {
35516 DstVT = MaskVT;
35517 return true;
35518 }
35519 }
35520
35521 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
35522 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
35523 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35524 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
35525 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35526 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
35527 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
35528 Subtarget)) {
35529 SrcVT = DstVT = MaskVT;
35530 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
35531 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
35532 return true;
35533 }
35534 }
35535
35536 // Attempt to match against an OR if we're performing a blend shuffle and the
35537 // non-blended source element is zero in each case.
35538 if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35539 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
35540 bool IsBlend = true;
35541 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
35542 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
35543 unsigned Scale1 = NumV1Elts / NumMaskElts;
35544 unsigned Scale2 = NumV2Elts / NumMaskElts;
35545 APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
35546 APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
35547 for (unsigned i = 0; i != NumMaskElts; ++i) {
35548 int M = Mask[i];
35549 if (M == SM_SentinelUndef)
35550 continue;
35551 if (M == SM_SentinelZero) {
35552 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35553 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35554 continue;
35555 }
35556 if (M == (int)i) {
35557 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35558 continue;
35559 }
35560 if (M == (int)(i + NumMaskElts)) {
35561 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35562 continue;
35563 }
35564 IsBlend = false;
35565 break;
35566 }
35567 if (IsBlend &&
35568 DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
35569 DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
35570 Shuffle = ISD::OR;
35571 SrcVT = DstVT = MaskVT.changeTypeToInteger();
35572 return true;
35573 }
35574 }
35575
35576 return false;
35577}
35578
35579static bool matchBinaryPermuteShuffle(
35580 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
35581 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
35582 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
35583 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
35584 unsigned NumMaskElts = Mask.size();
35585 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35586
35587 // Attempt to match against VALIGND/VALIGNQ rotate.
35588 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
35589 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
35590 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
35591 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35592 if (!isAnyZero(Mask)) {
35593 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
35594 if (0 < Rotation) {
35595 Shuffle = X86ISD::VALIGN;
35596 if (EltSizeInBits == 64)
35597 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
35598 else
35599 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
35600 PermuteImm = Rotation;
35601 return true;
35602 }
35603 }
35604 }
35605
35606 // Attempt to match against PALIGNR byte rotate.
35607 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
35608 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35609 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35610 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
35611 if (0 < ByteRotation) {
35612 Shuffle = X86ISD::PALIGNR;
35613 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
35614 PermuteImm = ByteRotation;
35615 return true;
35616 }
35617 }
35618
35619 // Attempt to combine to X86ISD::BLENDI.
35620 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
35621 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
35622 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
35623 uint64_t BlendMask = 0;
35624 bool ForceV1Zero = false, ForceV2Zero = false;
35625 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
35626 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
35627 ForceV2Zero, BlendMask)) {
35628 if (MaskVT == MVT::v16i16) {
35629 // We can only use v16i16 PBLENDW if the lanes are repeated.
35630 SmallVector<int, 8> RepeatedMask;
35631 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
35632 RepeatedMask)) {
35633 assert(RepeatedMask.size() == 8 &&
35634 "Repeated mask size doesn't match!");
35635 PermuteImm = 0;
35636 for (int i = 0; i < 8; ++i)
35637 if (RepeatedMask[i] >= 8)
35638 PermuteImm |= 1 << i;
35639 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35640 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35641 Shuffle = X86ISD::BLENDI;
35642 ShuffleVT = MaskVT;
35643 return true;
35644 }
35645 } else {
35646 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35647 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35648 PermuteImm = (unsigned)BlendMask;
35649 Shuffle = X86ISD::BLENDI;
35650 ShuffleVT = MaskVT;
35651 return true;
35652 }
35653 }
35654 }
35655
35656 // Attempt to combine to INSERTPS, but only if it has elements that need to
35657 // be set to zero.
35658 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35659 MaskVT.is128BitVector() && isAnyZero(Mask) &&
35660 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35661 Shuffle = X86ISD::INSERTPS;
35662 ShuffleVT = MVT::v4f32;
35663 return true;
35664 }
35665
35666 // Attempt to combine to SHUFPD.
35667 if (AllowFloatDomain && EltSizeInBits == 64 &&
35668 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35669 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35670 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35671 bool ForceV1Zero = false, ForceV2Zero = false;
35672 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
35673 PermuteImm, Mask, Zeroable)) {
35674 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35675 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35676 Shuffle = X86ISD::SHUFP;
35677 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
35678 return true;
35679 }
35680 }
35681
35682 // Attempt to combine to SHUFPS.
35683 if (AllowFloatDomain && EltSizeInBits == 32 &&
35684 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
35685 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35686 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35687 SmallVector<int, 4> RepeatedMask;
35688 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
35689 // Match each half of the repeated mask, to determine if it's just
35690 // referencing one of the vectors, is zeroable, or is entirely undef.
35691 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
35692 int M0 = RepeatedMask[Offset];
35693 int M1 = RepeatedMask[Offset + 1];
35694
35695 if (isUndefInRange(RepeatedMask, Offset, 2)) {
35696 return DAG.getUNDEF(MaskVT);
35697 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
35698 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
35699 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
35700 return getZeroVector(MaskVT, Subtarget, DAG, DL);
35701 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
35702 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35703 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35704 return V1;
35705 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
35706 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35707 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35708 return V2;
35709 }
35710
35711 return SDValue();
35712 };
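// SHUFPS takes its two low result elements from the first source and its two
// high result elements from the second, each selected by a 2-bit immediate
// field; MatchHalf resolves which value (V1, V2, zero or undef) feeds each
// half and records the corresponding 2-bit selectors in ShufMask below.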
35713
35714 int ShufMask[4] = {-1, -1, -1, -1};
35715 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
35716 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
35717
35718 if (Lo && Hi) {
35719 V1 = Lo;
35720 V2 = Hi;
35721 Shuffle = X86ISD::SHUFP;
35722 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
35723 PermuteImm = getV4X86ShuffleImm(ShufMask);
35724 return true;
35725 }
35726 }
35727 }
35728
35729 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
35730 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35731 MaskVT.is128BitVector() &&
35732 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35733 Shuffle = X86ISD::INSERTPS;
35734 ShuffleVT = MVT::v4f32;
35735 return true;
35736 }
35737
35738 return false;
35739}
35740
35741static SDValue combineX86ShuffleChainWithExtract(
35742 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
35743 bool HasVariableMask, bool AllowVariableCrossLaneMask,
35744 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
35745 const X86Subtarget &Subtarget);
35746
35747/// Combine an arbitrary chain of shuffles into a single instruction if
35748/// possible.
35749///
35750/// This is the leaf of the recursive combine below. When we have found some
35751/// chain of single-use x86 shuffle instructions and accumulated the combined
35752/// shuffle mask represented by them, this will try to pattern match that mask
35753/// into either a single instruction if there is a special purpose instruction
35754/// for this operation, or into a PSHUFB instruction which is a fully general
35755/// instruction but should only be used to replace chains over a certain depth.
35756static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
35757 ArrayRef<int> BaseMask, int Depth,
35758 bool HasVariableMask,
35759 bool AllowVariableCrossLaneMask,
35760 bool AllowVariablePerLaneMask,
35761 SelectionDAG &DAG,
35762 const X86Subtarget &Subtarget) {
35763 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
35764 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
35765 "Unexpected number of shuffle inputs!");
35766
35767 MVT RootVT = Root.getSimpleValueType();
35768 unsigned RootSizeInBits = RootVT.getSizeInBits();
35769 unsigned NumRootElts = RootVT.getVectorNumElements();
35770
35771 // Canonicalize shuffle input op to the requested type.
35772 // TODO: Support cases where Op is smaller than VT.
35773 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
35774 return DAG.getBitcast(VT, Op);
35775 };
35776
35777 // Find the inputs that enter the chain. Note that multiple uses are OK
35778 // here, we're not going to remove the operands we find.
35779 bool UnaryShuffle = (Inputs.size() == 1);
35780 SDValue V1 = peekThroughBitcasts(Inputs[0]);
35781 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
35782 : peekThroughBitcasts(Inputs[1]));
35783
35784 MVT VT1 = V1.getSimpleValueType();
35785 MVT VT2 = V2.getSimpleValueType();
35786 assert(VT1.getSizeInBits() == RootSizeInBits &&
35787 VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
35788
35789 SDLoc DL(Root);
35790 SDValue Res;
35791
35792 unsigned NumBaseMaskElts = BaseMask.size();
35793 if (NumBaseMaskElts == 1) {
35794 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
35795 return CanonicalizeShuffleInput(RootVT, V1);
35796 }
35797
35798 bool OptForSize = DAG.shouldOptForSize();
35799 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
35800 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
35801 (RootVT.isFloatingPoint() && Depth >= 1) ||
35802 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
35803
35804 // Don't combine if we are an AVX512/EVEX target and the mask element size
35805 // is different from the root element size - this would prevent writemasks
35806 // from being reused.
35807 bool IsMaskedShuffle = false;
35808 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
35809 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
35810 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
35811 IsMaskedShuffle = true;
35812 }
35813 }
35814
35815 // If we are shuffling a broadcast (and not introducing zeros) then
35816 // we can just use the broadcast directly. This works for smaller broadcast
35817 // elements as well, since they already repeat across each mask element.
35818 if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
35819 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35820 V1.getValueSizeInBits() >= RootSizeInBits) {
35821 return CanonicalizeShuffleInput(RootVT, V1);
35822 }
35823
35824 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
35825 // etc. can be simplified.
35826 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
35827 SmallVector<int> ScaledMask, IdentityMask;
35828 unsigned NumElts = VT1.getVectorNumElements();
35829 if (BaseMask.size() <= NumElts &&
35830 scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
35831 for (unsigned i = 0; i != NumElts; ++i)
35832 IdentityMask.push_back(i);
35833 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
35834 return CanonicalizeShuffleInput(RootVT, V1);
35835 }
35836 }
35837
35838 // Handle 128/256-bit lane shuffles of 512-bit vectors.
35839 if (RootVT.is512BitVector() &&
35840 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
35841 // If the upper subvectors are zeroable, then an extract+insert is more
35842 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
35843 // to zero the upper subvectors.
35844 if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
35845 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35846 return SDValue(); // Nothing to do!
35847 assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
35848 "Unexpected lane shuffle");
35849 Res = CanonicalizeShuffleInput(RootVT, V1);
35850 unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
35851 bool UseZero = isAnyZero(BaseMask);
35852 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
35853 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
35854 }
35855
35856 // Narrow shuffle mask to v4x128.
35857 SmallVector<int, 4> Mask;
35858 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
35859 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
35860
35861 // Try to lower to vshuf64x2/vshuf32x4.
35862 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
35863 SDValue V1, SDValue V2, SelectionDAG &DAG) {
35864 unsigned PermMask = 0;
35865 // Ensure elements came from the same Op.
35866 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
35867 for (int i = 0; i < 4; ++i) {
35868 assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
35869 if (Mask[i] < 0)
35870 continue;
35871
35872 SDValue Op = Mask[i] >= 4 ? V2 : V1;
35873 unsigned OpIndex = i / 2;
35874 if (Ops[OpIndex].isUndef())
35875 Ops[OpIndex] = Op;
35876 else if (Ops[OpIndex] != Op)
35877 return SDValue();
35878
35879 // Convert the 128-bit shuffle mask selection values into 128-bit
35880 // selection bits defined by a vshuf64x2 instruction's immediate control
35881 // byte.
35882 PermMask |= (Mask[i] % 4) << (i * 2);
35883 }
35884
35885 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
35886 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
35887 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
35888 DAG.getTargetConstant(PermMask, DL, MVT::i8));
35889 };
35890
35891 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
35892 // doesn't work because our mask is for 128 bits and we don't have an MVT
35893 // to match that.
35894 bool PreferPERMQ =
35895 UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
35896 isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
35897 isUndefOrInRange(Mask[3], 2, 4) &&
35898 (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
35899 (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
35900
35901 if (!isAnyZero(Mask) && !PreferPERMQ) {
35902 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35903 return SDValue(); // Nothing to do!
35904 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
35905 if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
35906 return DAG.getBitcast(RootVT, V);
35907 }
35908 }
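The MatchSHUF128 lambda above both checks that the two lanes written to each destination half come from a single source operand and accumulates the vshuf64x2 immediate two bits at a time. A minimal standalone sketch of that check and encoding (the function name is invented for illustration; sentinels and SDValue plumbing are omitted):

#include <cstdint>
#include <optional>

// Illustrative only: Mask has four 128-bit lane indices in [0,8), where values
// >= 4 select the second source; negative entries are undef.
std::optional<uint8_t> matchShuf128Imm(const int Mask[4]) {
  int OpForHalf[2] = {-1, -1}; // which source feeds each destination half
  uint8_t PermMask = 0;
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] < 0)
      continue;                    // undef lane, leave its selector bits as 0
    int Op = Mask[i] >= 4 ? 1 : 0; // source vector this lane reads from
    int &Slot = OpForHalf[i / 2];
    if (Slot < 0)
      Slot = Op;
    else if (Slot != Op)
      return std::nullopt;         // mixed sources within one half: no match
    PermMask |= static_cast<uint8_t>(Mask[i] % 4) << (i * 2);
  }
  return PermMask;
}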
35909
35910 // Handle 128-bit lane shuffles of 256-bit vectors.
35911 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
35912 // If the upper half is zeroable, then an extract+insert is more optimal
35913 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
35914 // zero the upper half.
35915 if (isUndefOrZero(BaseMask[1])) {
35916 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35917 return SDValue(); // Nothing to do!
35918 assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
35919 Res = CanonicalizeShuffleInput(RootVT, V1);
35920 Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
35921 return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
35922 DL, 256);
35923 }
35924
35925 // If we're splatting the low subvector, an insert-subvector 'concat'
35926 // pattern is quicker than VPERM2X128.
35927 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
35928 if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
35929 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35930 return SDValue(); // Nothing to do!
35931 Res = CanonicalizeShuffleInput(RootVT, V1);
35932 Res = extractSubVector(Res, 0, DAG, DL, 128);
35933 return concatSubVectors(Res, Res, DAG, DL);
35934 }
35935
35936 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
35937 return SDValue(); // Nothing to do!
35938
35939 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
35940 // we need to use the zeroing feature.
35941 // Prefer blends for sequential shuffles unless we are optimizing for size.
35942 if (UnaryShuffle &&
35943 !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
35944 (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
35945 unsigned PermMask = 0;
35946 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
35947 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
35948 return DAG.getNode(
35949 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
35950 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
35951 }
35952
35953 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35954 return SDValue(); // Nothing to do!
35955
35956 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
35957 if (!UnaryShuffle && !IsMaskedShuffle) {
35958 assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
35959 "Unexpected shuffle sentinel value");
35960 // Prefer blends to X86ISD::VPERM2X128.
35961 if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
35962 (BaseMask[0] == 2 && BaseMask[1] == 1))) {
35963 unsigned PermMask = 0;
35964 PermMask |= ((BaseMask[0] & 3) << 0);
35965 PermMask |= ((BaseMask[1] & 3) << 4);
35966 SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
35967 SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
35968 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
35969 CanonicalizeShuffleInput(RootVT, LHS),
35970 CanonicalizeShuffleInput(RootVT, RHS),
35971 DAG.getTargetConstant(PermMask, DL, MVT::i8));
35972 }
35973 }
35974 }
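Both VPERM2X128 paths above assemble the same kind of control byte: a 128-bit lane selector in each nibble plus a zeroing bit. A small standalone sketch of that byte, assuming the standard VPERM2F128/VPERM2I128 immediate layout (the helper name is illustrative):

#include <cstdint>

// Illustrative only: bits [1:0] pick the lane written to the low half
// (0/1 = lanes of the first source, 2/3 = lanes of the second), bits [5:4]
// do the same for the high half, and bits 3/7 zero the corresponding half.
uint8_t vperm2x128Imm(int LoLane, int HiLane, bool ZeroLo, bool ZeroHi) {
  uint8_t Imm = 0;
  Imm |= ZeroLo ? 0x08 : static_cast<uint8_t>(LoLane & 3);
  Imm |= (ZeroHi ? 0x08 : static_cast<uint8_t>(HiLane & 3)) << 4;
  return Imm;
}
// e.g. vperm2x128Imm(/*Lo*/0, /*Hi*/3, false, false) == 0x30:
// low half <- lane 0 of the first source, high half <- lane 1 of the second.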
35975
35976 // For masks that have been widened to 128-bit elements or more,
35977 // narrow back down to 64-bit elements.
35978 SmallVector<int, 64> Mask;
35979 if (BaseMaskEltSizeInBits > 64) {
35980 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
35981 int MaskScale = BaseMaskEltSizeInBits / 64;
35982 narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
35983 } else {
35984 Mask.assign(BaseMask.begin(), BaseMask.end());
35985 }
35986
35987 // For masked shuffles, we're trying to match the root width for better
35988 // writemask folding; attempt to scale the mask.
35989 // TODO - variable shuffles might need this to be widened again.
35990 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
35991 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
35992 int MaskScale = NumRootElts / Mask.size();
35993 SmallVector<int, 64> ScaledMask;
35994 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
35995 Mask = std::move(ScaledMask);
35996 }
35997
35998 unsigned NumMaskElts = Mask.size();
35999 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
36000
36001 // Determine the effective mask value type.
36002 FloatDomain &= (32 <= MaskEltSizeInBits);
36003 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
36004 : MVT::getIntegerVT(MaskEltSizeInBits);
36005 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
36006
36007 // Only allow legal mask types.
36008 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36009 return SDValue();
36010
36011 // Attempt to match the mask against known shuffle patterns.
36012 MVT ShuffleSrcVT, ShuffleVT;
36013 unsigned Shuffle, PermuteImm;
36014
36015 // Which shuffle domains are permitted?
36016 // Permit domain crossing at higher combine depths.
36017 // TODO: Should we indicate which domain is preferred if both are allowed?
36018 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
36019 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
36020 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
36021
36022 // Determine zeroable mask elements.
36023 APInt KnownUndef, KnownZero;
36024 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
36025 APInt Zeroable = KnownUndef | KnownZero;
36026
36027 if (UnaryShuffle) {
36028 // Attempt to match against broadcast-from-vector.
36029 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
36030 if ((Subtarget.hasAVX2() ||
36031 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
36032 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
36033 if (isUndefOrEqual(Mask, 0)) {
36034 if (V1.getValueType() == MaskVT &&
36035 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36036 MayFoldLoad(V1.getOperand(0))) {
36037 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36038 return SDValue(); // Nothing to do!
36039 Res = V1.getOperand(0);
36040 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36041 return DAG.getBitcast(RootVT, Res);
36042 }
36043 if (Subtarget.hasAVX2()) {
36044 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36045 return SDValue(); // Nothing to do!
36046 Res = CanonicalizeShuffleInput(MaskVT, V1);
36047 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36048 return DAG.getBitcast(RootVT, Res);
36049 }
36050 }
36051 }
36052
36053 SDValue NewV1 = V1; // Save operand in case early exit happens.
36054 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36055 DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36056 ShuffleVT) &&
36057 (!IsMaskedShuffle ||
36058 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36059 if (Depth == 0 && Root.getOpcode() == Shuffle)
36060 return SDValue(); // Nothing to do!
36061 Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36062 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
36063 return DAG.getBitcast(RootVT, Res);
36064 }
36065
36066 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36067 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
36068 PermuteImm) &&
36069 (!IsMaskedShuffle ||
36070 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36071 if (Depth == 0 && Root.getOpcode() == Shuffle)
36072 return SDValue(); // Nothing to do!
36073 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
36074 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
36075 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36076 return DAG.getBitcast(RootVT, Res);
36077 }
36078 }
36079
36080 // Attempt to combine to INSERTPS, but only if the inserted element has come
36081 // from a scalar.
36082 // TODO: Handle other insertions here as well?
36083 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
36084 Subtarget.hasSSE41() &&
36085 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
36086 if (MaskEltSizeInBits == 32) {
36087 SDValue SrcV1 = V1, SrcV2 = V2;
36088 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
36089 DAG) &&
36090 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
36091 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36092 return SDValue(); // Nothing to do!
36093 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36094 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
36095 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
36096 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36097 return DAG.getBitcast(RootVT, Res);
36098 }
36099 }
36100 if (MaskEltSizeInBits == 64 &&
36101 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
36102 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36103 V2.getScalarValueSizeInBits() <= 32) {
36104 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36105 return SDValue(); // Nothing to do!
36106 PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
36107 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36108 CanonicalizeShuffleInput(MVT::v4f32, V1),
36109 CanonicalizeShuffleInput(MVT::v4f32, V2),
36110 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36111 return DAG.getBitcast(RootVT, Res);
36112 }
36113 }
36114
36115 SDValue NewV1 = V1; // Save operands in case early exit happens.
36116 SDValue NewV2 = V2;
36117 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36118 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36119 ShuffleVT, UnaryShuffle) &&
36120 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36121 if (Depth == 0 && Root.getOpcode() == Shuffle)
36122 return SDValue(); // Nothing to do!
36123 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36124 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
36125 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
36126 return DAG.getBitcast(RootVT, Res);
36127 }
36128
36129 NewV1 = V1; // Save operands in case early exit happens.
36130 NewV2 = V2;
36131 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36132 AllowIntDomain, NewV1, NewV2, DL, DAG,
36133 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
36134 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36135 if (Depth == 0 && Root.getOpcode() == Shuffle)
36136 return SDValue(); // Nothing to do!
36137 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
36138 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
36139 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
36140 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36141 return DAG.getBitcast(RootVT, Res);
36142 }
36143
36144 // Typically from here on, we need an integer version of MaskVT.
36145 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
36146 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
36147
36148 // Annoyingly, SSE4A instructions don't map into the above match helpers.
36149 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
36150 uint64_t BitLen, BitIdx;
36151 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
36152 Zeroable)) {
36153 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
36154 return SDValue(); // Nothing to do!
36155 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36156 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
36157 DAG.getTargetConstant(BitLen, DL, MVT::i8),
36158 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36159 return DAG.getBitcast(RootVT, Res);
36160 }
36161
36162 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
36163 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
36164 return SDValue(); // Nothing to do!
36165 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36166 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
36167 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
36168 DAG.getTargetConstant(BitLen, DL, MVT::i8),
36169 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36170 return DAG.getBitcast(RootVT, Res);
36171 }
36172 }
36173
36174 // Match shuffle against TRUNCATE patterns.
36175 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
36176 // Match against a VTRUNC instruction, accounting for src/dst sizes.
36177 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
36178 Subtarget)) {
36179 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
36180 ShuffleSrcVT.getVectorNumElements();
36181 unsigned Opc =
36182 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
36183 if (Depth == 0 && Root.getOpcode() == Opc)
36184 return SDValue(); // Nothing to do!
36185 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36186 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
36187 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
36188 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
36189 return DAG.getBitcast(RootVT, Res);
36190 }
36191
36192 // Do we need a more general binary truncation pattern?
36193 if (RootSizeInBits < 512 &&
36194 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
36195 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
36196 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
36197 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
36198 if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
36199 return SDValue(); // Nothing to do!
36200 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36201 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
36202 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36203 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
36204 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36205 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
36206 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
36207 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
36208 return DAG.getBitcast(RootVT, Res);
36209 }
36210 }
36211
36212 // Don't try to re-form single instruction chains under any circumstances now
36213 // that we've done encoding canonicalization for them.
36214 if (Depth < 1)
36215 return SDValue();
36216
36217 // Depth threshold above which we can efficiently use variable mask shuffles.
36218 int VariableCrossLaneShuffleDepth =
36219 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
36220 int VariablePerLaneShuffleDepth =
36221 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
36222 AllowVariableCrossLaneMask &=
36223 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
36224 AllowVariablePerLaneMask &=
36225 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
36226 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
36227 // higher depth before combining them.
36228 bool AllowBWIVPERMV3 =
36229 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
36230
36231 bool MaskContainsZeros = isAnyZero(Mask);
36232
36233 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
36234 // If we have a single input lane-crossing shuffle then lower to VPERMV.
36235 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
36236 if (Subtarget.hasAVX2() &&
36237 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
36238 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
36239 Res = CanonicalizeShuffleInput(MaskVT, V1);
36240 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
36241 return DAG.getBitcast(RootVT, Res);
36242 }
36243 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
36244 if ((Subtarget.hasAVX512() &&
36245 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36246 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36247 (Subtarget.hasBWI() &&
36248 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36249 (Subtarget.hasVBMI() &&
36250 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
36251 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36252 V2 = DAG.getUNDEF(MaskVT);
36253 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36254 return DAG.getBitcast(RootVT, Res);
36255 }
36256 }
36257
36258 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
36259 // vector as the second source (non-VLX will pad to 512-bit shuffles).
36260 if (UnaryShuffle && AllowVariableCrossLaneMask &&
36261 ((Subtarget.hasAVX512() &&
36262 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36263 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36264 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
36265 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36266 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36267 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36268 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36269 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36270 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
36271 for (unsigned i = 0; i != NumMaskElts; ++i)
36272 if (Mask[i] == SM_SentinelZero)
36273 Mask[i] = NumMaskElts + i;
36274 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36275 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
36276 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36277 return DAG.getBitcast(RootVT, Res);
36278 }
36279
36280 // If that failed and either input is extracted then try to combine as a
36281 // shuffle with the larger type.
36282 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36283 Inputs, Root, BaseMask, Depth, HasVariableMask,
36284 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
36285 Subtarget))
36286 return WideShuffle;
36287
36288 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
36289 // (non-VLX will pad to 512-bit shuffles).
36290 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
36291 ((Subtarget.hasAVX512() &&
36292 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36293 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36294 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
36295 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
36296 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36297 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36298 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36299 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36300 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36301 V2 = CanonicalizeShuffleInput(MaskVT, V2);
36302 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36303 return DAG.getBitcast(RootVT, Res);
36304 }
36305 return SDValue();
36306 }
36307
36308 // See if we can combine a single input shuffle with zeros to a bit-mask,
36309 // which is much simpler than any shuffle.
36310 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
36311 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
36312 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
36313 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
36314 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
36315 APInt UndefElts(NumMaskElts, 0);
36316 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
36317 for (unsigned i = 0; i != NumMaskElts; ++i) {
36318 int M = Mask[i];
36319 if (M == SM_SentinelUndef) {
36320 UndefElts.setBit(i);
36321 continue;
36322 }
36323 if (M == SM_SentinelZero)
36324 continue;
36325 EltBits[i] = AllOnes;
36326 }
36327 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
36328 Res = CanonicalizeShuffleInput(MaskVT, V1);
36329 unsigned AndOpcode =
36330 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
36331 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
36332 return DAG.getBitcast(RootVT, Res);
36333 }
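As a standalone sketch of the bit-mask fallback above: an in-place keep-or-zero shuffle reduces to an AND with a constant of all-ones/all-zeros lanes. The -1/-2 sentinels mirror SM_SentinelUndef/SM_SentinelZero; treating undef lanes as all-ones is a simplification (the real code marks them undef instead):

#include <array>
#include <cstddef>
#include <cstdint>

constexpr int SentinelUndef = -1, SentinelZero = -2;

// Illustrative only: build the per-lane AND constant for a v4i32-style mask.
std::array<uint32_t, 4> shuffleAsAndMask(const std::array<int, 4> &Mask) {
  std::array<uint32_t, 4> Bits{};
  for (std::size_t i = 0; i != Mask.size(); ++i)
    Bits[i] = (Mask[i] == SentinelZero) ? 0u : ~0u;
  return Bits;
}
// A mask like {0, SentinelZero, 2, SentinelZero} yields the AND constant
// {~0u, 0, ~0u, 0}; ANDing the input vector with it produces the shuffle result.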
36334
36335 // If we have a single input shuffle with different shuffle patterns in the
36336 // 128-bit lanes, use a variable mask to lower to VPERMILPS.
36337 // TODO: Combine other mask types at higher depths.
36338 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
36339 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
36340 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
36341 SmallVector<SDValue, 16> VPermIdx;
36342 for (int M : Mask) {
36343 SDValue Idx =
36344 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
36345 VPermIdx.push_back(Idx);
36346 }
36347 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
36348 Res = CanonicalizeShuffleInput(MaskVT, V1);
36349 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
36350 return DAG.getBitcast(RootVT, Res);
36351 }
36352
36353 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
36354 // to VPERMIL2PD/VPERMIL2PS.
36355 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
36356 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
36357 MaskVT == MVT::v8f32)) {
36358 // VPERMIL2 Operation.
36359 // Bits[3] - Match Bit.
36360 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
36361 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
36362 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
36363 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
36364 SmallVector<int, 8> VPerm2Idx;
36365 unsigned M2ZImm = 0;
36366 for (int M : Mask) {
36367 if (M == SM_SentinelUndef) {
36368 VPerm2Idx.push_back(-1);
36369 continue;
36370 }
36371 if (M == SM_SentinelZero) {
36372 M2ZImm = 2;
36373 VPerm2Idx.push_back(8);
36374 continue;
36375 }
36376 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
36377 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
36378 VPerm2Idx.push_back(Index);
36379 }
36380 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36381 V2 = CanonicalizeShuffleInput(MaskVT, V2);
36382 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
36383 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
36384 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
36385 return DAG.getBitcast(RootVT, Res);
36386 }
36387
36388 // If we have 3 or more shuffle instructions or a chain involving a variable
36389 // mask, we can replace them with a single PSHUFB instruction profitably.
36390 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
36391 // instructions, but in practice PSHUFB tends to be *very* fast so we're
36392 // more aggressive.
36393 if (UnaryShuffle && AllowVariablePerLaneMask &&
36394 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
36395 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
36396 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
36397 SmallVector<SDValue, 16> PSHUFBMask;
36398 int NumBytes = RootVT.getSizeInBits() / 8;
36399 int Ratio = NumBytes / NumMaskElts;
36400 for (int i = 0; i < NumBytes; ++i) {
36401 int M = Mask[i / Ratio];
36402 if (M == SM_SentinelUndef) {
36403 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
36404 continue;
36405 }
36406 if (M == SM_SentinelZero) {
36407 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36408 continue;
36409 }
36410 M = Ratio * M + i % Ratio;
36411 assert((M / 16) == (i / 16) && "Lane crossing detected");
36412 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36413 }
36414 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
36415 Res = CanonicalizeShuffleInput(ByteVT, V1);
36416 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
36417 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
36418 return DAG.getBitcast(RootVT, Res);
36419 }
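The PSHUFB loop above expands an element-level mask into a byte-level control vector: a source element index M that covers Ratio bytes becomes the byte indices Ratio*M .. Ratio*M+Ratio-1, and a zeroed lane becomes 0x80 (PSHUFB's "write zero" encoding). A standalone sketch under those same conventions (undef is treated as zero here for brevity):

#include <cstdint>
#include <vector>

constexpr int kUndef = -1, kZero = -2; // mirror the shuffle sentinels

// Illustrative only, not the LLVM code: expand an element mask to PSHUFB bytes.
std::vector<uint8_t> pshufbBytes(const std::vector<int> &Mask, int NumBytes) {
  int Ratio = NumBytes / static_cast<int>(Mask.size());
  std::vector<uint8_t> Bytes(NumBytes);
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    if (M == kZero || M == kUndef) { // undef folded to zero for simplicity
      Bytes[i] = 0x80;
      continue;
    }
    Bytes[i] = static_cast<uint8_t>(Ratio * M + i % Ratio);
  }
  return Bytes;
}
// e.g. a 4-element mask {1, kZero, 3, 0} on a 16-byte vector expands to
// {4,5,6,7, 0x80,0x80,0x80,0x80, 12,13,14,15, 0,1,2,3}.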
36420
36421 // With XOP, if we have a 128-bit binary input shuffle we can always combine
36422 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
36423 // slower than PSHUFB on targets that support both.
36424 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
36425 Subtarget.hasXOP()) {
36426 // VPPERM Mask Operation
36427 // Bits[4:0] - Byte Index (0 - 31)
36428 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
36429 SmallVector<SDValue, 16> VPPERMMask;
36430 int NumBytes = 16;
36431 int Ratio = NumBytes / NumMaskElts;
36432 for (int i = 0; i < NumBytes; ++i) {
36433 int M = Mask[i / Ratio];
36434 if (M == SM_SentinelUndef) {
36435 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
36436 continue;
36437 }
36438 if (M == SM_SentinelZero) {
36439 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36440 continue;
36441 }
36442 M = Ratio * M + i % Ratio;
36443 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36444 }
36445 MVT ByteVT = MVT::v16i8;
36446 V1 = CanonicalizeShuffleInput(ByteVT, V1);
36447 V2 = CanonicalizeShuffleInput(ByteVT, V2);
36448 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
36449 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
36450 return DAG.getBitcast(RootVT, Res);
36451 }
36452
36453 // If that failed and either input is extracted then try to combine as a
36454 // shuffle with the larger type.
36455 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36456 Inputs, Root, BaseMask, Depth, HasVariableMask,
36457 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
36458 return WideShuffle;
36459
36460 // If we have a dual input shuffle then lower to VPERMV3,
36461 // (non-VLX will pad to 512-bit shuffles)
36462 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
36463 ((Subtarget.hasAVX512() &&
36464 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
36465 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
36466 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
36467 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
36468 MaskVT == MVT::v16i32)) ||
36469 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36470 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
36471 MaskVT == MVT::v32i16)) ||
36472 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36473 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
36474 MaskVT == MVT::v64i8)))) {
36475 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36476 V2 = CanonicalizeShuffleInput(MaskVT, V2);
36477 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36478 return DAG.getBitcast(RootVT, Res);
36479 }
36480
36481 // Failed to find any combines.
36482 return SDValue();
36483}
36484
36485// Combine an arbitrary chain of shuffles + extract_subvectors into a single
36486// instruction if possible.
36487//
36488// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
36489// type size to attempt to combine:
36490// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
36491// -->
36492// extract_subvector(shuffle(x,y,m2),0)
36493static SDValue combineX86ShuffleChainWithExtract(
36494 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
36495 bool HasVariableMask, bool AllowVariableCrossLaneMask,
36496 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36497 const X86Subtarget &Subtarget) {
36498 unsigned NumMaskElts = BaseMask.size();
36499 unsigned NumInputs = Inputs.size();
36500 if (NumInputs == 0)
36501 return SDValue();
36502
36503 EVT RootVT = Root.getValueType();
36504 unsigned RootSizeInBits = RootVT.getSizeInBits();
36505 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
36506
36507 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
36508 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
36509
36510 // Peek through subvectors.
36511 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
36512 unsigned WideSizeInBits = RootSizeInBits;
36513 for (unsigned i = 0; i != NumInputs; ++i) {
36514 SDValue &Src = WideInputs[i];
36515 unsigned &Offset = Offsets[i];
36516 Src = peekThroughBitcasts(Src);
36517 EVT BaseVT = Src.getValueType();
36518 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
36519 Offset += Src.getConstantOperandVal(1);
36520 Src = Src.getOperand(0);
36521 }
36522 WideSizeInBits = std::max(WideSizeInBits,
36523 (unsigned)Src.getValueSizeInBits());
36524 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
36525 "Unexpected subvector extraction");
36526 Offset /= BaseVT.getVectorNumElements();
36527 Offset *= NumMaskElts;
36528 }
36529
36530 // Bail if we're always extracting from the lowest subvectors;
36531 // combineX86ShuffleChain should match this for the current width.
36532 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
36533 return SDValue();
36534
36535 unsigned Scale = WideSizeInBits / RootSizeInBits;
36536 assert((WideSizeInBits % RootSizeInBits) == 0 &&
36537 "Unexpected subvector extraction");
36538
36539 // If the src vector types aren't the same, see if we can extend
36540 // them to match each other.
36541 // TODO: Support different scalar types?
36542 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
36543 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
36544 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
36545 Op.getValueType().getScalarType() != WideSVT;
36546 }))
36547 return SDValue();
36548
36549 for (SDValue &NewInput : WideInputs) {
36550 assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
36551 "Shuffle vector size mismatch");
36552 if (WideSizeInBits > NewInput.getValueSizeInBits())
36553 NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
36554 SDLoc(NewInput), WideSizeInBits);
36555 assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
36556 "Unexpected subvector extraction");
36557 }
36558
36559 // Create new mask for larger type.
36560 for (unsigned i = 1; i != NumInputs; ++i)
36561 Offsets[i] += i * Scale * NumMaskElts;
36562
36563 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
36564 for (int &M : WideMask) {
36565 if (M < 0)
36566 continue;
36567 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
36568 }
36569 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
36570
36571 // Remove unused/repeated shuffle source ops.
36572 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
36573 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
36574
36575 if (WideInputs.size() > 2)
36576 return SDValue();
36577
36578 // Increase depth for every upper subvector we've peeked through.
36579 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
36580
36581 // Attempt to combine wider chain.
36582 // TODO: Can we use a better Root?
36583 SDValue WideRoot = WideInputs[0];
36584 if (SDValue WideShuffle =
36585 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
36586 HasVariableMask, AllowVariableCrossLaneMask,
36587 AllowVariablePerLaneMask, DAG, Subtarget)) {
36588 WideShuffle =
36589 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
36590 return DAG.getBitcast(RootVT, WideShuffle);
36591 }
36592 return SDValue();
36593}
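The WideMask re-indexing inside combineX86ShuffleChainWithExtract keeps each index's position within its original narrow input and shifts it by that input's element offset into the wide (pre-extract) vector, then pads the mask with undef for the extra wide elements. A standalone sketch of just that re-indexing (function name is illustrative; Offsets and Scale are assumed to be computed as in the code above):

#include <vector>

// Illustrative only: Offsets[j] is input j's element offset into its wide
// source vector, Scale is WideSizeInBits / RootSizeInBits.
std::vector<int> widenExtractedMask(std::vector<int> Mask,
                                    const std::vector<int> &Offsets,
                                    int Scale) {
  int NumMaskElts = static_cast<int>(Mask.size());
  for (int &M : Mask) {
    if (M < 0)
      continue; // keep undef/zero sentinels as-is
    M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
  }
  // The wide shuffle has Scale times as many elements; pad with undef (-1).
  Mask.insert(Mask.end(), (Scale - 1) * NumMaskElts, -1);
  return Mask;
}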
36594
36595// Canonicalize the combined shuffle mask chain with horizontal ops.
36596// NOTE: This may update the Ops and Mask.
36597static SDValue canonicalizeShuffleMaskWithHorizOp(
36598 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
36599 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
36600 const X86Subtarget &Subtarget) {
36601 if (Mask.empty() || Ops.empty())
36602 return SDValue();
36603
36604 SmallVector<SDValue> BC;
36605 for (SDValue Op : Ops)
36606 BC.push_back(peekThroughBitcasts(Op));
36607
36608 // All ops must be the same horizop + type.
36609 SDValue BC0 = BC[0];
36610 EVT VT0 = BC0.getValueType();
36611 unsigned Opcode0 = BC0.getOpcode();
36612 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
36613 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
36614 }))
36615 return SDValue();
36616
36617 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
36618 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
36619 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
36620 if (!isHoriz && !isPack)
36621 return SDValue();
36622
36623 // Do all ops have a single use?
36624 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
36625 return Op.hasOneUse() &&
36626 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
36627 });
36628
36629 int NumElts = VT0.getVectorNumElements();
36630 int NumLanes = VT0.getSizeInBits() / 128;
36631 int NumEltsPerLane = NumElts / NumLanes;
36632 int NumHalfEltsPerLane = NumEltsPerLane / 2;
36633 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
36634 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
36635
36636 if (NumEltsPerLane >= 4 &&
36637 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
36638 SmallVector<int> LaneMask, ScaledMask;
36639 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
36640 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
36641 // See if we can remove the shuffle by re-sorting the HOP chain so that
36642 // the HOP args are pre-shuffled.
36643 // TODO: Generalize to any sized/depth chain.
36644 // TODO: Add support for PACKSS/PACKUS.
36645 if (isHoriz) {
36646 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
36647 auto GetHOpSrc = [&](int M) {
36648 if (M == SM_SentinelUndef)
36649 return DAG.getUNDEF(VT0);
36650 if (M == SM_SentinelZero)
36651 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
36652 SDValue Src0 = BC[M / 4];
36653 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
36654 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
36655 return Src1.getOperand(M % 2);
36656 return SDValue();
36657 };
36658 SDValue M0 = GetHOpSrc(ScaledMask[0]);
36659 SDValue M1 = GetHOpSrc(ScaledMask[1]);
36660 SDValue M2 = GetHOpSrc(ScaledMask[2]);
36661 SDValue M3 = GetHOpSrc(ScaledMask[3]);
36662 if (M0 && M1 && M2 && M3) {
36663 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
36664 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
36665 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36666 }
36667 }
36668 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
36669 if (Ops.size() >= 2) {
36670 SDValue LHS, RHS;
36671 auto GetHOpSrc = [&](int M, int &OutM) {
36672 // TODO: Support SM_SentinelZero
36673 if (M < 0)
36674 return M == SM_SentinelUndef;
36675 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
36676 if (!LHS || LHS == Src) {
36677 LHS = Src;
36678 OutM = (M % 2);
36679 return true;
36680 }
36681 if (!RHS || RHS == Src) {
36682 RHS = Src;
36683 OutM = (M % 2) + 2;
36684 return true;
36685 }
36686 return false;
36687 };
36688 int PostMask[4] = {-1, -1, -1, -1};
36689 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
36690 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
36691 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
36692 GetHOpSrc(ScaledMask[3], PostMask[3])) {
36693 LHS = DAG.getBitcast(SrcVT, LHS);
36694 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
36695 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36696 // Use SHUFPS for the permute so this will work on SSE3 targets,
36697 // shuffle combining and domain handling will simplify this later on.
36698 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
36699 Res = DAG.getBitcast(ShuffleVT, Res);
36700 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
36701 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
36702 }
36703 }
36704 }
36705 }
36706
36707 if (2 < Ops.size())
36708 return SDValue();
36709
36710 SDValue BC1 = BC[BC.size() - 1];
36711 if (Mask.size() == VT0.getVectorNumElements()) {
36712 // Canonicalize binary shuffles of horizontal ops that use the
36713 // same sources to a unary shuffle.
36714 // TODO: Try to perform this fold even if the shuffle remains.
36715 if (Ops.size() == 2) {
36716 auto ContainsOps = [](SDValue HOp, SDValue Op) {
36717 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
36718 };
36719 // Commute if all BC0's ops are contained in BC1.
36720 if (ContainsOps(BC1, BC0.getOperand(0)) &&
36721 ContainsOps(BC1, BC0.getOperand(1))) {
36722 ShuffleVectorSDNode::commuteMask(Mask);
36723 std::swap(Ops[0], Ops[1]);
36724 std::swap(BC0, BC1);
36725 }
36726
36727 // If BC1 can be represented by BC0, then convert to unary shuffle.
36728 if (ContainsOps(BC0, BC1.getOperand(0)) &&
36729 ContainsOps(BC0, BC1.getOperand(1))) {
36730 for (int &M : Mask) {
36731 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
36732 continue;
36733 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
36734 M -= NumElts + (SubLane * NumHalfEltsPerLane);
36735 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
36736 M += NumHalfEltsPerLane;
36737 }
36738 }
36739 }
36740
36741 // Canonicalize unary horizontal ops to only refer to lower halves.
36742 for (int i = 0; i != NumElts; ++i) {
36743 int &M = Mask[i];
36744 if (isUndefOrZero(M))
36745 continue;
36746 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
36747 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36748 M -= NumHalfEltsPerLane;
36749 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
36750 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36751 M -= NumHalfEltsPerLane;
36752 }
36753 }
36754
36755 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
36756 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
36757 // represents the LHS/RHS inputs for the lower/upper halves.
36758 SmallVector<int, 16> TargetMask128, WideMask128;
36759 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
36760 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
36761 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
36762 bool SingleOp = (Ops.size() == 1);
36763 if (isPack || OneUseOps ||
36764 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
36765 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
36766 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
36767 Lo = Lo.getOperand(WideMask128[0] & 1);
36768 Hi = Hi.getOperand(WideMask128[1] & 1);
36769 if (SingleOp) {
36770 SDValue Undef = DAG.getUNDEF(SrcVT);
36771 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
36772 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
36773 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
36774 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
36775 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
36776 }
36777 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
36778 }
36779 }
36780
36781 return SDValue();
36782}
36783
36784// Attempt to constant fold all of the constant source ops.
36785 // Returns the folded constant vector if the entire shuffle folds to constants.
36786// TODO: Extend this to merge multiple constant Ops and update the mask.
36787static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
36788 ArrayRef<int> Mask, SDValue Root,
36789 bool HasVariableMask,
36790 SelectionDAG &DAG,
36791 const X86Subtarget &Subtarget) {
36792 MVT VT = Root.getSimpleValueType();
36793
36794 unsigned SizeInBits = VT.getSizeInBits();
36795 unsigned NumMaskElts = Mask.size();
36796 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
36797 unsigned NumOps = Ops.size();
36798
36799 // Extract constant bits from each source op.
36800 bool OneUseConstantOp = false;
36801 SmallVector<APInt, 16> UndefEltsOps(NumOps);
36802 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
36803 for (unsigned i = 0; i != NumOps; ++i) {
36804 SDValue SrcOp = Ops[i];
36805 OneUseConstantOp |= SrcOp.hasOneUse();
36806 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
36807 RawBitsOps[i]))
36808 return SDValue();
36809 }
36810
36811 // Only fold if at least one of the constants is only used once or
36812 // the combined shuffle has included a variable mask shuffle; this
36813 // is to avoid constant pool bloat.
36814 if (!OneUseConstantOp && !HasVariableMask)
36815 return SDValue();
36816
36817 // Shuffle the constant bits according to the mask.
36818 SDLoc DL(Root);
36819 APInt UndefElts(NumMaskElts, 0);
36820 APInt ZeroElts(NumMaskElts, 0);
36821 APInt ConstantElts(NumMaskElts, 0);
36822 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
36823 APInt::getNullValue(MaskSizeInBits));
36824 for (unsigned i = 0; i != NumMaskElts; ++i) {
36825 int M = Mask[i];
36826 if (M == SM_SentinelUndef) {
36827 UndefElts.setBit(i);
36828 continue;
36829 } else if (M == SM_SentinelZero) {
36830 ZeroElts.setBit(i);
36831 continue;
36832 }
36833 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
36834
36835 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
36836 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
36837
36838 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
36839 if (SrcUndefElts[SrcMaskIdx]) {
36840 UndefElts.setBit(i);
36841 continue;
36842 }
36843
36844 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
36845 APInt &Bits = SrcEltBits[SrcMaskIdx];
36846 if (!Bits) {
36847 ZeroElts.setBit(i);
36848 continue;
36849 }
36850
36851 ConstantElts.setBit(i);
36852 ConstantBitData[i] = Bits;
36853 }
36854 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
36855
36856 // Attempt to create a zero vector.
36857 if ((UndefElts | ZeroElts).isAllOnesValue())
36858 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
36859
36860 // Create the constant data.
36861 MVT MaskSVT;
36862 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
36863 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
36864 else
36865 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
36866
36867 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
36868 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36869 return SDValue();
36870
36871 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
36872 return DAG.getBitcast(VT, CstOp);
36873}
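The constant-folding routine above can be pictured on plain arrays: each mask entry selects an element from the concatenation of the constant source ops, with the usual -1/-2 sentinels standing for undef and zero. A standalone sketch under those assumptions (undef lanes are folded to 0 here for simplicity, where the real code would mark them undef):

#include <cstdint>
#include <vector>

constexpr int kUndef = -1, kZero = -2;

// Illustrative only: Ops holds the element values of each constant source op.
std::vector<uint64_t>
foldConstantShuffle(const std::vector<std::vector<uint64_t>> &Ops,
                    const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  std::vector<uint64_t> Out(NumElts, 0);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == kUndef || M == kZero)
      continue; // leave the lane as 0
    Out[i] = Ops[M / NumElts][M % NumElts];
  }
  return Out;
}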
36874
36875namespace llvm {
36876 namespace X86 {
36877 enum {
36878 MaxShuffleCombineDepth = 8
36879 };
36880 }
36881} // namespace llvm
36882
36883/// Fully generic combining of x86 shuffle instructions.
36884///
36885/// This should be the last combine run over the x86 shuffle instructions. Once
36886/// they have been fully optimized, this will recursively consider all chains
36887/// of single-use shuffle instructions, build a generic model of the cumulative
36888/// shuffle operation, and check for simpler instructions which implement this
36889/// operation. We use this primarily for two purposes:
36890///
36891/// 1) Collapse generic shuffles to specialized single instructions when
36892/// equivalent. In most cases, this is just an encoding size win, but
36893/// sometimes we will collapse multiple generic shuffles into a single
36894/// special-purpose shuffle.
36895/// 2) Look for sequences of shuffle instructions with 3 or more total
36896/// instructions, and replace them with the slightly more expensive SSSE3
36897/// PSHUFB instruction if available. We do this as the last combining step
36898/// to ensure we avoid using PSHUFB if we can implement the shuffle with
36899/// a suitable short sequence of other instructions. The PSHUFB will either
36900/// use a register or have to read from memory and so is slightly (but only
36901/// slightly) more expensive than the other shuffle instructions.
36902///
36903/// Because this is inherently a quadratic operation (for each shuffle in
36904/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
36905/// This should never be an issue in practice as the shuffle lowering doesn't
36906/// produce sequences of more than 8 instructions.
36907///
36908/// FIXME: We will currently miss some cases where the redundant shuffling
36909/// would simplify under the threshold for PSHUFB formation because of
36910/// combine-ordering. To fix this, we should do the redundant instruction
36911/// combining in this recursive walk.
36912static SDValue combineX86ShufflesRecursively(
36913 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
36914 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
36915 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
36916 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36917 const X86Subtarget &Subtarget) {
36918 assert(RootMask.size() > 0 &&
36919 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
36920 "Illegal shuffle root mask");
36921 assert(Root.getSimpleValueType().isVector() &&
36922 "Shuffles operate on vector types!");
36923 unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
36924
36925 // Bound the depth of our recursive combine because this is ultimately
36926 // quadratic in nature.
36927 if (Depth >= MaxDepth)
36928 return SDValue();
36929
36930 // Directly rip through bitcasts to find the underlying operand.
36931 SDValue Op = SrcOps[SrcOpIndex];
36932 Op = peekThroughOneUseBitcasts(Op);
36933
36934 EVT VT = Op.getValueType();
36935 if (!VT.isVector() || !VT.isSimple())
36936 return SDValue(); // Bail if we hit a non-simple non-vector.
36937
36938 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
36939 "Can only combine shuffles upto size of the root op.");
36940
36941 // Extract target shuffle mask and resolve sentinels and inputs.
36942 // TODO - determine Op's demanded elts from RootMask.
36943 SmallVector<int, 64> OpMask;
36944 SmallVector<SDValue, 2> OpInputs;
36945 APInt OpUndef, OpZero;
36946 APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
36947 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
36948 if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
36949 OpZero, DAG, Depth, false))
36950 return SDValue();
36951
36952 // Shuffle inputs must not be larger than the shuffle result.
36953 // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
36954 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
36955 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
36956 }))
36957 return SDValue();
36958
36959 // If the shuffle result was smaller than the root, we need to adjust the
36960 // mask indices and pad the mask with undefs.
36961 if (RootSizeInBits > VT.getSizeInBits()) {
36962 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
36963 unsigned OpMaskSize = OpMask.size();
36964 if (OpInputs.size() > 1) {
36965 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
36966 for (int &M : OpMask) {
36967 if (M < 0)
36968 continue;
36969 int EltIdx = M % OpMaskSize;
36970 int OpIdx = M / OpMaskSize;
36971 M = (PaddedMaskSize * OpIdx) + EltIdx;
36972 }
36973 }
36974 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
36975 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
36976 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
36977 }
36978
36979 SmallVector<int, 64> Mask;
36980 SmallVector<SDValue, 16> Ops;
36981
36982 // We don't need to merge masks if the root is empty.
36983 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
36984 if (EmptyRoot) {
36985 // Only resolve zeros if it will remove an input; otherwise we might end
36986 // up in an infinite loop.
36987 bool ResolveKnownZeros = true;
36988 if (!OpZero.isNullValue()) {
36989 APInt UsedInputs = APInt::getNullValue(OpInputs.size());
36990 for (int i = 0, e = OpMask.size(); i != e; ++i) {
36991 int M = OpMask[i];
36992 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
36993 continue;
36994 UsedInputs.setBit(M / OpMask.size());
36995 if (UsedInputs.isAllOnesValue()) {
36996 ResolveKnownZeros = false;
36997 break;
36998 }
36999 }
37000 }
37001 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
37002 ResolveKnownZeros);
37003
37004 Mask = OpMask;
37005 Ops.append(OpInputs.begin(), OpInputs.end());
37006 } else {
37007 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
37008
37009 // Add the inputs to the Ops list, avoiding duplicates.
37010 Ops.append(SrcOps.begin(), SrcOps.end());
37011
37012 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
37013 // Attempt to find an existing match.
37014 SDValue InputBC = peekThroughBitcasts(Input);
37015 for (int i = 0, e = Ops.size(); i < e; ++i)
37016 if (InputBC == peekThroughBitcasts(Ops[i]))
37017 return i;
37018 // Match failed - should we replace an existing Op?
37019 if (InsertionPoint >= 0) {
37020 Ops[InsertionPoint] = Input;
37021 return InsertionPoint;
37022 }
37023 // Add to the end of the Ops list.
37024 Ops.push_back(Input);
37025 return Ops.size() - 1;
37026 };
37027
37028 SmallVector<int, 2> OpInputIdx;
37029 for (SDValue OpInput : OpInputs)
37030 OpInputIdx.push_back(
37031 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
37032
37033 assert(((RootMask.size() > OpMask.size() &&
37034 RootMask.size() % OpMask.size() == 0) ||
37035 (OpMask.size() > RootMask.size() &&
37036 OpMask.size() % RootMask.size() == 0) ||
37037 OpMask.size() == RootMask.size()) &&
37038 "The smaller number of elements must divide the larger.");
37039
37040 // This function can be performance-critical, so we rely on the power-of-2
37041 // knowledge that we have about the mask sizes to replace div/rem ops with
37042 // bit-masks and shifts.
37043 assert(isPowerOf2_32(RootMask.size()) &&
37044 "Non-power-of-2 shuffle mask sizes");
37045 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
37046 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
37047 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
37048
37049 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
37050 unsigned RootRatio =
37051 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
37052 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
37053 assert((RootRatio == 1 || OpRatio == 1) &&
37054 "Must not have a ratio for both incoming and op masks!");
37055
37056 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
37057 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
37058 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
37059 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
37060 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
37061
37062 Mask.resize(MaskWidth, SM_SentinelUndef);
37063
37064 // Merge this shuffle operation's mask into our accumulated mask. Note that
37065 // this shuffle's mask will be the first applied to the input, followed by
37066 // the root mask to get us all the way to the root value arrangement. The
37067 // reason for this order is that we are recursing up the operation chain.
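// Roughly, when both masks have the same width the merged value is
// Mask[i] = OpMask[RootMask[i]] (adjusted for which source op each index
// refers to); the ratio logic below generalizes this to mismatched widths.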
37068 for (unsigned i = 0; i < MaskWidth; ++i) {
37069 unsigned RootIdx = i >> RootRatioLog2;
37070 if (RootMask[RootIdx] < 0) {
37071 // This is a zero or undef lane, we're done.
37072 Mask[i] = RootMask[RootIdx];
37073 continue;
37074 }
37075
37076 unsigned RootMaskedIdx =
37077 RootRatio == 1
37078 ? RootMask[RootIdx]
37079 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
37080
37081 // Just insert the scaled root mask value if it references an input other
37082 // than the SrcOp we're currently inserting.
37083 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
37084 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
37085 Mask[i] = RootMaskedIdx;
37086 continue;
37087 }
37088
37089 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
37090 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
37091 if (OpMask[OpIdx] < 0) {
37092 // The incoming lanes are zero or undef; it doesn't matter which ones we
37093 // are using.
37094 Mask[i] = OpMask[OpIdx];
37095 continue;
37096 }
37097
37098 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
37099 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
37100 : (OpMask[OpIdx] << OpRatioLog2) +
37101 (RootMaskedIdx & (OpRatio - 1));
37102
37103 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
37104 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
37105 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
37106 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
37107
37108 Mask[i] = OpMaskedIdx;
37109 }
37110 }
37111
37112 // Remove unused/repeated shuffle source ops.
37113 resolveTargetShuffleInputsAndMask(Ops, Mask);
37114
37115 // Handle the all undef/zero/ones cases early.
37116 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
37117 return DAG.getUNDEF(Root.getValueType());
37118 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
37119 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
37120 SDLoc(Root));
37121 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
37122 none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
37123 return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
37124
37125 assert(!Ops.empty() && "Shuffle with no inputs detected");
37126 HasVariableMask |= IsOpVariableMask;
37127
37128 // Update the list of shuffle nodes that have been combined so far.
37129 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
37130 SrcNodes.end());
37131 CombinedNodes.push_back(Op.getNode());
37132
37133 // See if we can recurse into each shuffle source op (if it's a target
37134 // shuffle). The source op should only be generally combined if it either has
37135 // a single use (i.e. current Op) or all its users have already been combined;
37136 // if not, then we can still combine but should prevent generation of variable
37137 // shuffles to avoid constant pool bloat.
37138 // Don't recurse if we already have more source ops than we can combine in
37139 // the remaining recursion depth.
37140 if (Ops.size() < (MaxDepth - Depth)) {
37141 for (int i = 0, e = Ops.size(); i < e; ++i) {
37142 // For empty roots, we need to resolve zeroable elements before combining
37143 // them with other shuffles.
37144 SmallVector<int, 64> ResolvedMask = Mask;
37145 if (EmptyRoot)
37146 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
37147 bool AllowCrossLaneVar = false;
37148 bool AllowPerLaneVar = false;
37149 if (Ops[i].getNode()->hasOneUse() ||
37150 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
37151 AllowCrossLaneVar = AllowVariableCrossLaneMask;
37152 AllowPerLaneVar = AllowVariablePerLaneMask;
37153 }
37154 if (SDValue Res = combineX86ShufflesRecursively(
37155 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
37156 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
37157 Subtarget))
37158 return Res;
37159 }
37160 }
37161
37162 // Attempt to constant fold all of the constant source ops.
37163 if (SDValue Cst = combineX86ShufflesConstants(
37164 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
37165 return Cst;
37166
37167 // If constant fold failed and we only have constants - then we have
37168 // multiple uses by a single non-variable shuffle - just bail.
37169 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
37170 APInt UndefElts;
37171 SmallVector<APInt> RawBits;
37172 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
37173 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
37174 RawBits);
37175 })) {
37176 return SDValue();
37177 }
37178
37179 // Canonicalize the combined shuffle mask chain with horizontal ops.
37180 // NOTE: This will update the Ops and Mask.
37181 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
37182 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
37183 return DAG.getBitcast(Root.getValueType(), HOp);
37184
37185 // Widen any subvector shuffle inputs we've collected.
37186 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
37187 return Op.getValueSizeInBits() < RootSizeInBits;
37188 })) {
37189 for (SDValue &Op : Ops)
37190 if (Op.getValueSizeInBits() < RootSizeInBits)
37191 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
37192 RootSizeInBits);
37193 // Reresolve - we might have repeated subvector sources.
37194 resolveTargetShuffleInputsAndMask(Ops, Mask);
37195 }
37196
37197 // We can only combine unary and binary shuffle mask cases.
37198 if (Ops.size() <= 2) {
37199 // Minor canonicalization of the accumulated shuffle mask to make it easier
37200 // to match below. All this does is detect masks with sequential pairs of
37201 // elements, and shrink them to the half-width mask. It does this in a loop
37202 // so it will reduce the size of the mask to the minimal width mask which
37203 // performs an equivalent shuffle.
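// For example (illustrative), the 8-element mask <0,1,4,5,2,3,6,7> widens to
// the 4-element mask <0,2,1,3>, which describes the same shuffle on elements
// of twice the width.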
37204 while (Mask.size() > 1) {
37205 SmallVector<int, 64> WidenedMask;
37206 if (!canWidenShuffleElements(Mask, WidenedMask))
37207 break;
37208 Mask = std::move(WidenedMask);
37209 }
37210
37211 // Canonicalization of binary shuffle masks to improve pattern matching by
37212 // commuting the inputs.
37213 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
37214 ShuffleVectorSDNode::commuteMask(Mask);
37215 std::swap(Ops[0], Ops[1]);
37216 }
37217
37218 // Finally, try to combine into a single shuffle instruction.
37219 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
37220 AllowVariableCrossLaneMask,
37221 AllowVariablePerLaneMask, DAG, Subtarget);
37222 }
37223
37224 // If that failed and any input is extracted then try to combine as a
37225 // shuffle with the larger type.
37226 return combineX86ShuffleChainWithExtract(
37227 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
37228 AllowVariablePerLaneMask, DAG, Subtarget);
37229}
37230
37231/// Helper entry wrapper to combineX86ShufflesRecursively.
37232static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
37233 const X86Subtarget &Subtarget) {
37234 return combineX86ShufflesRecursively(
37235 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
37236 /*HasVarMask*/ false,
37237 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
37238 Subtarget);
37239}
37240
37241/// Get the PSHUF-style mask from PSHUF node.
37242///
37243 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
37244 /// PSHUF-style masks that can be reused with such instructions.
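/// For example (illustrative), a v4i32 PSHUFD with immediate 0x1B yields the
/// mask {3, 2, 1, 0}; for PSHUFHW the returned high-half word mask is rebased
/// to the 0..3 range.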
37245static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
37246 MVT VT = N.getSimpleValueType();
37247 SmallVector<int, 4> Mask;
37248 SmallVector<SDValue, 2> Ops;
37249 bool HaveMask =
37250 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
37251 (void)HaveMask;
37252 assert(HaveMask);
37253
37254 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
37255 // matter. Check that the upper masks are repeats and remove them.
37256 if (VT.getSizeInBits() > 128) {
37257 int LaneElts = 128 / VT.getScalarSizeInBits();
37258#ifndef NDEBUG
37259 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
37260 for (int j = 0; j < LaneElts; ++j)
37261 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
37262 "Mask doesn't repeat in high 128-bit lanes!");
37263#endif
37264 Mask.resize(LaneElts);
37265 }
37266
37267 switch (N.getOpcode()) {
37268 case X86ISD::PSHUFD:
37269 return Mask;
37270 case X86ISD::PSHUFLW:
37271 Mask.resize(4);
37272 return Mask;
37273 case X86ISD::PSHUFHW:
37274 Mask.erase(Mask.begin(), Mask.begin() + 4);
37275 for (int &M : Mask)
37276 M -= 4;
37277 return Mask;
37278 default:
37279 llvm_unreachable("No valid shuffle instruction found!");
37280 }
37281}
37282
37283/// Search for a combinable shuffle across a chain ending in pshufd.
37284///
37285/// We walk up the chain and look for a combinable shuffle, skipping over
37286/// shuffles that we could hoist this shuffle's transformation past without
37287/// altering anything.
37288static SDValue
37289combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
37290 SelectionDAG &DAG) {
37291 assert(N.getOpcode() == X86ISD::PSHUFD &&
37292 "Called with something other than an x86 128-bit half shuffle!");
37293 SDLoc DL(N);
37294
37295 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
37296 // of the shuffles in the chain so that we can form a fresh chain to replace
37297 // this one.
37298 SmallVector<SDValue, 8> Chain;
37299 SDValue V = N.getOperand(0);
37300 for (; V.hasOneUse(); V = V.getOperand(0)) {
37301 switch (V.getOpcode()) {
37302 default:
37303 return SDValue(); // Nothing combined!
37304
37305 case ISD::BITCAST:
37306 // Skip bitcasts as we always know the type for the target specific
37307 // instructions.
37308 continue;
37309
37310 case X86ISD::PSHUFD:
37311 // Found another dword shuffle.
37312 break;
37313
37314 case X86ISD::PSHUFLW:
37315 // Check that the low words (being shuffled) are the identity in the
37316 // dword shuffle, and the high words are self-contained.
37317 if (Mask[0] != 0 || Mask[1] != 1 ||
37318 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
37319 return SDValue();
37320
37321 Chain.push_back(V);
37322 continue;
37323
37324 case X86ISD::PSHUFHW:
37325 // Check that the high words (being shuffled) are the identity in the
37326 // dword shuffle, and the low words are self-contained.
37327 if (Mask[2] != 2 || Mask[3] != 3 ||
37328 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
37329 return SDValue();
37330
37331 Chain.push_back(V);
37332 continue;
37333
37334 case X86ISD::UNPCKL:
37335 case X86ISD::UNPCKH:
37336 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
37337 // shuffle into a preceding word shuffle.
37338 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
37339 V.getSimpleValueType().getVectorElementType() != MVT::i16)
37340 return SDValue();
37341
37342 // Search for a half-shuffle which we can combine with.
37343 unsigned CombineOp =
37344 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
37345 if (V.getOperand(0) != V.getOperand(1) ||
37346 !V->isOnlyUserOf(V.getOperand(0).getNode()))
37347 return SDValue();
37348 Chain.push_back(V);
37349 V = V.getOperand(0);
37350 do {
37351 switch (V.getOpcode()) {
37352 default:
37353 return SDValue(); // Nothing to combine.
37354
37355 case X86ISD::PSHUFLW:
37356 case X86ISD::PSHUFHW:
37357 if (V.getOpcode() == CombineOp)
37358 break;
37359
37360 Chain.push_back(V);
37361
37362 LLVM_FALLTHROUGH;
37363 case ISD::BITCAST:
37364 V = V.getOperand(0);
37365 continue;
37366 }
37367 break;
37368 } while (V.hasOneUse());
37369 break;
37370 }
37371 // Break out of the loop if we break out of the switch.
37372 break;
37373 }
37374
37375 if (!V.hasOneUse())
37376 // We fell out of the loop without finding a viable combining instruction.
37377 return SDValue();
37378
37379 // Merge this node's mask and our incoming mask.
37380 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
37381 for (int &M : Mask)
37382 M = VMask[M];
37383 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
37384 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
37385
37386 // Rebuild the chain around this new shuffle.
37387 while (!Chain.empty()) {
37388 SDValue W = Chain.pop_back_val();
37389
37390 if (V.getValueType() != W.getOperand(0).getValueType())
37391 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
37392
37393 switch (W.getOpcode()) {
37394 default:
37395 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
37396
37397 case X86ISD::UNPCKL:
37398 case X86ISD::UNPCKH:
37399 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
37400 break;
37401
37402 case X86ISD::PSHUFD:
37403 case X86ISD::PSHUFLW:
37404 case X86ISD::PSHUFHW:
37405 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
37406 break;
37407 }
37408 }
37409 if (V.getValueType() != N.getValueType())
37410 V = DAG.getBitcast(N.getValueType(), V);
37411
37412 // Return the new chain to replace N.
37413 return V;
37414}
37415
37416// Attempt to commute shufps LHS loads:
37417// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
37418static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
37419 SelectionDAG &DAG) {
37420 // TODO: Add vXf64 support.
37421 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
37422 return SDValue();
37423
37424 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
37425 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
37426 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
37427 return SDValue();
37428 SDValue N0 = V.getOperand(0);
37429 SDValue N1 = V.getOperand(1);
37430 unsigned Imm = V.getConstantOperandVal(2);
37431 if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
37432 MayFoldLoad(peekThroughOneUseBitcasts(N1)))
37433 return SDValue();
37434 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
37435 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
37436 DAG.getTargetConstant(Imm, DL, MVT::i8));
37437 };
37438
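// After commuteSHUFP, the low and high element pairs of the inner SHUFP
// result are swapped within each 128-bit lane, so the outer shuffle's 2-bit
// selectors are fixed up below by flipping their high bit: XOR 0xAA when both
// inner operands changed, XOR 0x0A / 0xA0 when only the first / second
// operand did.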
37439 switch (N.getOpcode()) {
37440 case X86ISD::VPERMILPI:
37441 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
37442 unsigned Imm = N.getConstantOperandVal(1);
37443 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
37444 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37445 }
37446 break;
37447 case X86ISD::SHUFP: {
37448 SDValue N0 = N.getOperand(0);
37449 SDValue N1 = N.getOperand(1);
37450 unsigned Imm = N.getConstantOperandVal(2);
37451 if (N0 == N1) {
37452 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
37453 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
37454 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37455 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
37456 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
37457 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
37458 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
37459 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
37460 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
37461 }
37462 break;
37463 }
37464 }
37465
37466 return SDValue();
37467}
37468
37469// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
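// For example (illustrative), PSHUFD(AND(X, Constant)) becomes
// AND(PSHUFD(X), PSHUFD(Constant)): the shuffled constant folds away, so the
// transform does not increase the shuffle count.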
37470static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
37471 const SDLoc &DL) {
37472 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37473 EVT ShuffleVT = N.getValueType();
37474
37475 auto IsMergeableWithShuffle = [](SDValue Op) {
37476 // AllZeros/AllOnes constants are freely shuffled and will peek through
37477 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
37478 // merge with target shuffles if it has one use so shuffle combining is
37479 // likely to kick in.
37480 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
37481 ISD::isBuildVectorAllZeros(Op.getNode()) ||
37482 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
37483 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
37484 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
37485 };
37486 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
37487 // Ensure we only shuffle whole vector src elements, unless it's a logical
37488 // binop where we can more aggressively move shuffles from dst to src.
37489 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
37490 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
37491 };
37492
37493 unsigned Opc = N.getOpcode();
37494 switch (Opc) {
37495 // Unary and Unary+Permute Shuffles.
37496 case X86ISD::PSHUFB: {
37497 // Don't merge PSHUFB if it contains zero'd elements.
37498 SmallVector<int> Mask;
37499 SmallVector<SDValue> Ops;
37500 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
37501 Mask))
37502 break;
37503 LLVM_FALLTHROUGH;
37504 }
37505 case X86ISD::VBROADCAST:
37506 case X86ISD::MOVDDUP:
37507 case X86ISD::PSHUFD:
37508 case X86ISD::VPERMI:
37509 case X86ISD::VPERMILPI: {
37510 if (N.getOperand(0).getValueType() == ShuffleVT &&
37511 N->isOnlyUserOf(N.getOperand(0).getNode())) {
37512 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37513 unsigned SrcOpcode = N0.getOpcode();
37514 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
37515 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37516 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37517 if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
37518 SDValue LHS, RHS;
37519 Op00 = DAG.getBitcast(ShuffleVT, Op00);
37520 Op01 = DAG.getBitcast(ShuffleVT, Op01);
37521 if (N.getNumOperands() == 2) {
37522 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
37523 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
37524 } else {
37525 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
37526 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
37527 }
37528 EVT OpVT = N0.getValueType();
37529 return DAG.getBitcast(ShuffleVT,
37530 DAG.getNode(SrcOpcode, DL, OpVT,
37531 DAG.getBitcast(OpVT, LHS),
37532 DAG.getBitcast(OpVT, RHS)));
37533 }
37534 }
37535 }
37536 break;
37537 }
37538 // Binary and Binary+Permute Shuffles.
37539 case X86ISD::INSERTPS: {
37540 // Don't merge INSERTPS if it contains zero'd elements.
37541 unsigned InsertPSMask = N.getConstantOperandVal(2);
37542 unsigned ZeroMask = InsertPSMask & 0xF;
37543 if (ZeroMask != 0)
37544 break;
37545 LLVM_FALLTHROUGH;
37546 }
37547 case X86ISD::MOVSD:
37548 case X86ISD::MOVSS:
37549 case X86ISD::BLENDI:
37550 case X86ISD::SHUFP:
37551 case X86ISD::UNPCKH:
37552 case X86ISD::UNPCKL: {
37553 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
37554 N->isOnlyUserOf(N.getOperand(1).getNode())) {
37555 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37556 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
37557 unsigned SrcOpcode = N0.getOpcode();
37558 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
37559 IsSafeToMoveShuffle(N0, SrcOpcode) &&
37560 IsSafeToMoveShuffle(N1, SrcOpcode)) {
37561 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37562 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
37563 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37564 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
37565 // Ensure the total number of shuffles doesn't increase by folding this
37566 // shuffle through to the source ops.
37567 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
37568 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
37569 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
37570 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
37571 SDValue LHS, RHS;
37572 Op00 = DAG.getBitcast(ShuffleVT, Op00);
37573 Op10 = DAG.getBitcast(ShuffleVT, Op10);
37574 Op01 = DAG.getBitcast(ShuffleVT, Op01);
37575 Op11 = DAG.getBitcast(ShuffleVT, Op11);
37576 if (N.getNumOperands() == 3) {
37577 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
37578 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
37579 } else {
37580 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
37581 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
37582 }
37583 EVT OpVT = N0.getValueType();
37584 return DAG.getBitcast(ShuffleVT,
37585 DAG.getNode(SrcOpcode, DL, OpVT,
37586 DAG.getBitcast(OpVT, LHS),
37587 DAG.getBitcast(OpVT, RHS)));
37588 }
37589 }
37590 }
37591 break;
37592 }
37593 }
37594 return SDValue();
37595}
37596
37597/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
37598static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
37599 SelectionDAG &DAG,
37600 const SDLoc &DL) {
37601 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
37602
37603 MVT VT = V.getSimpleValueType();
37604 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
37605 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
37606 unsigned SrcOpc0 = Src0.getOpcode();
37607 unsigned SrcOpc1 = Src1.getOpcode();
37608 EVT SrcVT0 = Src0.getValueType();
37609 EVT SrcVT1 = Src1.getValueType();
37610
37611 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
37612 return SDValue();
37613
37614 switch (SrcOpc0) {
37615 case X86ISD::MOVDDUP: {
37616 SDValue LHS = Src0.getOperand(0);
37617 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37618 SDValue Res =
37619 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
37620 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
37621 return DAG.getBitcast(VT, Res);
37622 }
37623 case X86ISD::VPERMILPI:
37624 // TODO: Handle v4f64 permutes with different low/high lane masks.
37625 if (SrcVT0 == MVT::v4f64) {
37626 uint64_t Mask = Src0.getConstantOperandVal(1);
37627 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
37628 break;
37629 }
37630 LLVM_FALLTHROUGH;
37631 case X86ISD::VSHLI:
37632 case X86ISD::VSRLI:
37633 case X86ISD::VSRAI:
37634 case X86ISD::PSHUFD:
37635 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
37636 SDValue LHS = Src0.getOperand(0);
37637 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37638 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
37639 V.getOperand(2));
37640 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
37641 return DAG.getBitcast(VT, Res);
37642 }
37643 break;
37644 }
37645
37646 return SDValue();
37647}
37648
37649/// Try to combine x86 target specific shuffles.
37650static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
37651 TargetLowering::DAGCombinerInfo &DCI,
37652 const X86Subtarget &Subtarget) {
37653 SDLoc DL(N);
37654 MVT VT = N.getSimpleValueType();
37655 SmallVector<int, 4> Mask;
37656 unsigned Opcode = N.getOpcode();
37657
37658 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
37659 return R;
37660
37661 if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
37662 return R;
37663
37664 // Handle specific target shuffles.
37665 switch (Opcode) {
37666 case X86ISD::MOVDDUP: {
37667 SDValue Src = N.getOperand(0);
37668 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
37669 if (VT == MVT::v2f64 && Src.hasOneUse() &&
37670 ISD::isNormalLoad(Src.getNode())) {
37671 LoadSDNode *LN = cast<LoadSDNode>(Src);
37672 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
37673 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
37674 DCI.CombineTo(N.getNode(), Movddup);
37675 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37676 DCI.recursivelyDeleteUnusedNodes(LN);
37677 return N; // Return N so it doesn't get rechecked!
37678 }
37679 }
37680
37681 return SDValue();
37682 }
37683 case X86ISD::VBROADCAST: {
37684 SDValue Src = N.getOperand(0);
37685 SDValue BC = peekThroughBitcasts(Src);
37686 EVT SrcVT = Src.getValueType();
37687 EVT BCVT = BC.getValueType();
37688
37689 // If broadcasting from another shuffle, attempt to simplify it.
37690 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
37691 if (isTargetShuffle(BC.getOpcode()) &&
37692 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
37693 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
37694 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
37695 SM_SentinelUndef);
37696 for (unsigned i = 0; i != Scale; ++i)
37697 DemandedMask[i] = i;
37698 if (SDValue Res = combineX86ShufflesRecursively(
37699 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
37700 X86::MaxShuffleCombineDepth,
37701 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
37702 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
37703 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37704 DAG.getBitcast(SrcVT, Res));
37705 }
37706
37707 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
37708 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
37709 if (Src.getOpcode() == ISD::BITCAST &&
37710 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
37711 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
37712 FixedVectorType::isValidElementType(
37713 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
37714 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
37715 VT.getVectorNumElements());
37716 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
37717 }
37718
37719 // Reduce broadcast source vector to lowest 128-bits.
37720 if (SrcVT.getSizeInBits() > 128)
37721 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37722 extract128BitVector(Src, 0, DAG, DL));
37723
37724 // broadcast(scalar_to_vector(x)) -> broadcast(x).
37725 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
37726 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
37727
37728 // Share broadcast with the longest vector and extract low subvector (free).
37729 // Ensure the same SDValue from the SDNode use is being used.
37730 for (SDNode *User : Src->uses())
37731 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
37732 Src == User->getOperand(0) &&
37733 User->getValueSizeInBits(0).getFixedSize() >
37734 VT.getFixedSizeInBits()) {
37735 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
37736 VT.getSizeInBits());
37737 }
37738
37739 // vbroadcast(scalarload X) -> vbroadcast_load X
37740 // For float loads, extract other uses of the scalar from the broadcast.
37741 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
37742 ISD::isNormalLoad(Src.getNode())) {
37743 LoadSDNode *LN = cast<LoadSDNode>(Src);
37744 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37745 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37746 SDValue BcastLd =
37747 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37748 LN->getMemoryVT(), LN->getMemOperand());
37749 // If the load value is used only by N, replace it via CombineTo N.
37750 bool NoReplaceExtract = Src.hasOneUse();
37751 DCI.CombineTo(N.getNode(), BcastLd);
37752 if (NoReplaceExtract) {
37753 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37754 DCI.recursivelyDeleteUnusedNodes(LN);
37755 } else {
37756 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
37757 DAG.getIntPtrConstant(0, DL));
37758 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
37759 }
37760 return N; // Return N so it doesn't get rechecked!
37761 }
37762
37763 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
37764 // i16. So shrink it ourselves if we can make a broadcast_load.
37765 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
37766 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
37767 assert(Subtarget.hasAVX2() && "Expected AVX2");
37768 SDValue TruncIn = Src.getOperand(0);
37769
37770 // If this is a truncate of a non-extending load, we can just narrow it to
37771 // use a broadcast_load.
37772 if (ISD::isNormalLoad(TruncIn.getNode())) {
37773 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
37774 // Unless it's volatile or atomic.
37775 if (LN->isSimple()) {
37776 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37777 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37778 SDValue BcastLd = DAG.getMemIntrinsicNode(
37779 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37780 LN->getPointerInfo(), LN->getOriginalAlign(),
37781 LN->getMemOperand()->getFlags());
37782 DCI.CombineTo(N.getNode(), BcastLd);
37783 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37784 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37785 return N; // Return N so it doesn't get rechecked!
37786 }
37787 }
37788
37789 // If this is a truncate of an i16 extload, we can directly replace it.
37790 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
37791 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
37792 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
37793 if (LN->getMemoryVT().getSizeInBits() == 16) {
37794 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37795 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37796 SDValue BcastLd =
37797 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37798 LN->getMemoryVT(), LN->getMemOperand());
37799 DCI.CombineTo(N.getNode(), BcastLd);
37800 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37801 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37802 return N; // Return N so it doesn't get rechecked!
37803 }
37804 }
37805
37806 // If this is a truncate of a load that has been shifted right, we can
37807 // offset the pointer and use a narrower load.
37808 if (TruncIn.getOpcode() == ISD::SRL &&
37809 TruncIn.getOperand(0).hasOneUse() &&
37810 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
37811 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
37812 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
37813 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
37814 // Make sure the shift amount and the load size are divisible by 16.
37815 // Don't do this if the load is volatile or atomic.
37816 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
37817 LN->isSimple()) {
37818 unsigned Offset = ShiftAmt / 8;
37819 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37820 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
37821 TypeSize::Fixed(Offset), DL);
37822 SDValue Ops[] = { LN->getChain(), Ptr };
37823 SDValue BcastLd = DAG.getMemIntrinsicNode(
37824 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37825 LN->getPointerInfo().getWithOffset(Offset),
37826 LN->getOriginalAlign(),
37827 LN->getMemOperand()->getFlags());
37828 DCI.CombineTo(N.getNode(), BcastLd);
37829 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37830 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37831 return N; // Return N so it doesn't get rechecked!
37832 }
37833 }
37834 }
37835
37836 // vbroadcast(vzload X) -> vbroadcast_load X
37837 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
37838 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
37839 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
37840 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37841 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37842 SDValue BcastLd =
37843 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37844 LN->getMemoryVT(), LN->getMemOperand());
37845 DCI.CombineTo(N.getNode(), BcastLd);
37846 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37847 DCI.recursivelyDeleteUnusedNodes(LN);
37848 return N; // Return N so it doesn't get rechecked!
37849 }
37850 }
37851
37852 // vbroadcast(vector load X) -> vbroadcast_load
37853 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
37854 SrcVT == MVT::v4i32) &&
37855 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
37856 LoadSDNode *LN = cast<LoadSDNode>(Src);
37857 // Unless the load is volatile or atomic.
37858 if (LN->isSimple()) {
37859 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37860 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37861 SDValue BcastLd = DAG.getMemIntrinsicNode(
37862 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
37863 LN->getPointerInfo(), LN->getOriginalAlign(),
37864 LN->getMemOperand()->getFlags());
37865 DCI.CombineTo(N.getNode(), BcastLd);
37866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37867 DCI.recursivelyDeleteUnusedNodes(LN);
37868 return N; // Return N so it doesn't get rechecked!
37869 }
37870 }
37871
37872 return SDValue();
37873 }
37874 case X86ISD::VZEXT_MOVL: {
37875 SDValue N0 = N.getOperand(0);
37876
37877 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
37878 // the load is volatile.
37879 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
37880 auto *LN = cast<LoadSDNode>(N0);
37881 if (SDValue VZLoad =
37882 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
37883 DCI.CombineTo(N.getNode(), VZLoad);
37884 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37885 DCI.recursivelyDeleteUnusedNodes(LN);
37886 return N;
37887 }
37888 }
37889
37890 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
37891 // and can just use a VZEXT_LOAD.
37892 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
37893 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
37894 auto *LN = cast<MemSDNode>(N0);
37895 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
37896 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37897 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37898 SDValue VZLoad =
37899 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
37900 LN->getMemoryVT(), LN->getMemOperand());
37901 DCI.CombineTo(N.getNode(), VZLoad);
37902 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37903 DCI.recursivelyDeleteUnusedNodes(LN);
37904 return N;
37905 }
37906 }
37907
37908 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
37909 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
37910 // if the upper bits of the i64 are zero.
37911 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37912 N0.getOperand(0).hasOneUse() &&
37913 N0.getOperand(0).getValueType() == MVT::i64) {
37914 SDValue In = N0.getOperand(0);
37915 APInt Mask = APInt::getHighBitsSet(64, 32);
37916 if (DAG.MaskedValueIsZero(In, Mask)) {
37917 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
37918 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
37919 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
37920 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
37921 return DAG.getBitcast(VT, Movl);
37922 }
37923 }
37924
37925 // Load a scalar integer constant directly to XMM instead of transferring an
37926 // immediate value from GPR.
37927 // vzext_movl (scalar_to_vector C) --> load [C,0...]
37928 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37929 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
37930 // Create a vector constant - scalar constant followed by zeros.
37931 EVT ScalarVT = N0.getOperand(0).getValueType();
37932 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
37933 unsigned NumElts = VT.getVectorNumElements();
37934 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
37935 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
37936 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
37937
37938 // Load the vector constant from constant pool.
37939 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
37940 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
37941 MachinePointerInfo MPI =
37942 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
37943 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
37944 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
37945 MachineMemOperand::MOLoad);
37946 }
37947 }
37948
37949 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
37950 // insert into a zero vector. This helps get VZEXT_MOVL closer to
37951 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
37952 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
37953 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
37954 SDValue V = peekThroughOneUseBitcasts(N0);
37955
37956 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
37957 isNullConstant(V.getOperand(2))) {
37958 SDValue In = V.getOperand(1);
37959 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
37960 In.getValueSizeInBits() /
37961 VT.getScalarSizeInBits());
37962 In = DAG.getBitcast(SubVT, In);
37963 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
37964 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
37965 getZeroVector(VT, Subtarget, DAG, DL), Movl,
37966 V.getOperand(2));
37967 }
37968 }
37969
37970 return SDValue();
37971 }
37972 case X86ISD::BLENDI: {
37973 SDValue N0 = N.getOperand(0);
37974 SDValue N1 = N.getOperand(1);
37975
37976 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
37977 // TODO: Handle MVT::v16i16 repeated blend mask.
37978 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
37979 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
37980 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
37981 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
37982 SrcVT.getScalarSizeInBits() >= 32) {
37983 unsigned BlendMask = N.getConstantOperandVal(2);
37984 unsigned Size = VT.getVectorNumElements();
37985 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
37986 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
37987 return DAG.getBitcast(
37988 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
37989 N1.getOperand(0),
37990 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
37991 }
37992 }
37993 return SDValue();
37994 }
37995 case X86ISD::VPERMI: {
37996 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
37997 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
37998 SDValue N0 = N.getOperand(0);
37999 SDValue N1 = N.getOperand(1);
38000 unsigned EltSizeInBits = VT.getScalarSizeInBits();
38001 if (N0.getOpcode() == ISD::BITCAST &&
38002 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
38003 SDValue Src = N0.getOperand(0);
38004 EVT SrcVT = Src.getValueType();
38005 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
38006 return DAG.getBitcast(VT, Res);
38007 }
38008 return SDValue();
38009 }
38010 case X86ISD::VPERM2X128: {
38011 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
38012 SDValue LHS = N->getOperand(0);
38013 SDValue RHS = N->getOperand(1);
38014 if (LHS.getOpcode() == ISD::BITCAST &&
38015 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
38016 EVT SrcVT = LHS.getOperand(0).getValueType();
38017 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
38018 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
38019 DAG.getBitcast(SrcVT, LHS),
38020 DAG.getBitcast(SrcVT, RHS),
38021 N->getOperand(2)));
38022 }
38023 }
38024
38025 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
38026 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
38027 return Res;
38028
38029 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
38030 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
38031 auto FindSubVector128 = [&](unsigned Idx) {
38032 if (Idx > 3)
38033 return SDValue();
38034 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
38035 SmallVector<SDValue> SubOps;
38036 if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
38037 return SubOps[Idx & 1];
38038 unsigned NumElts = Src.getValueType().getVectorNumElements();
38039 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
38040 Src.getOperand(1).getValueSizeInBits() == 128 &&
38041 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
38042 return Src.getOperand(1);
38043 }
38044 return SDValue();
38045 };
38046 unsigned Imm = N.getConstantOperandVal(2);
38047 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
38048 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
38049 MVT SubVT = VT.getHalfNumVectorElementsVT();
38050 SubLo = DAG.getBitcast(SubVT, SubLo);
38051 SubHi = DAG.getBitcast(SubVT, SubHi);
38052 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
38053 }
38054 }
38055 return SDValue();
38056 }
38057 case X86ISD::PSHUFD:
38058 case X86ISD::PSHUFLW:
38059 case X86ISD::PSHUFHW:
38060 Mask = getPSHUFShuffleMask(N);
38061 assert(Mask.size() == 4);
38062 break;
38063 case X86ISD::MOVSD:
38064 case X86ISD::MOVSS: {
38065 SDValue N0 = N.getOperand(0);
38066 SDValue N1 = N.getOperand(1);
38067
38068 // Canonicalize scalar FPOps:
38069 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
38070 // If commutable, allow OP(N1[0], N0[0]).
38071 unsigned Opcode1 = N1.getOpcode();
38072 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
38073 Opcode1 == ISD::FDIV) {
38074 SDValue N10 = N1.getOperand(0);
38075 SDValue N11 = N1.getOperand(1);
38076 if (N10 == N0 ||
38077 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
38078 if (N10 != N0)
38079 std::swap(N10, N11);
38080 MVT SVT = VT.getVectorElementType();
38081 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
38082 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
38083 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
38084 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
38085 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
38086 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
38087 }
38088 }
38089
38090 return SDValue();
38091 }
38092 case X86ISD::INSERTPS: {
38093 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
38094 SDValue Op0 = N.getOperand(0);
38095 SDValue Op1 = N.getOperand(1);
38096 unsigned InsertPSMask = N.getConstantOperandVal(2);
38097 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
38098 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
38099 unsigned ZeroMask = InsertPSMask & 0xF;
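// (INSERTPS immediate layout: bits [7:6] select the source element of Op1,
// bits [5:4] the destination element in Op0, and bits [3:0] are a zero mask.)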
38100
38101 // If we zero out all elements from Op0 then we don't need to reference it.
38102 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
38103 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
38104 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38105
38106 // If we zero out the element from Op1 then we don't need to reference it.
38107 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
38108 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38109 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38110
38111 // Attempt to merge insertps Op1 with an inner target shuffle node.
38112 SmallVector<int, 8> TargetMask1;
38113 SmallVector<SDValue, 2> Ops1;
38114 APInt KnownUndef1, KnownZero1;
38115 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
38116 KnownZero1)) {
38117 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
38118 // Zero/UNDEF insertion - zero out element and remove dependency.
38119 InsertPSMask |= (1u << DstIdx);
38120 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38121 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38122 }
38123 // Update insertps mask srcidx and reference the source input directly.
38124 int M = TargetMask1[SrcIdx];
38125 assert(0 <= M && M < 8 && "Shuffle index out of range");
38126 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
38127 Op1 = Ops1[M < 4 ? 0 : 1];
38128 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38129 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38130 }
38131
38132 // Attempt to merge insertps Op0 with an inner target shuffle node.
38133 SmallVector<int, 8> TargetMask0;
38134 SmallVector<SDValue, 2> Ops0;
38135 APInt KnownUndef0, KnownZero0;
38136 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
38137 KnownZero0)) {
38138 bool Updated = false;
38139 bool UseInput00 = false;
38140 bool UseInput01 = false;
38141 for (int i = 0; i != 4; ++i) {
38142 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
38143 // No change if element is already zero or the inserted element.
38144 continue;
38145 } else if (KnownUndef0[i] || KnownZero0[i]) {
38146 // If the target mask is undef/zero then we must zero the element.
38147 InsertPSMask |= (1u << i);
38148 Updated = true;
38149 continue;
38150 }
38151
38152 // The input vector element must be inline.
38153 int M = TargetMask0[i];
38154 if (M != i && M != (i + 4))
38155 return SDValue();
38156
38157 // Determine which inputs of the target shuffle we're using.
38158 UseInput00 |= (0 <= M && M < 4);
38159 UseInput01 |= (4 <= M);
38160 }
38161
38162 // If we're not using both inputs of the target shuffle then use the
38163 // referenced input directly.
38164 if (UseInput00 && !UseInput01) {
38165 Updated = true;
38166 Op0 = Ops0[0];
38167 } else if (!UseInput00 && UseInput01) {
38168 Updated = true;
38169 Op0 = Ops0[1];
38170 }
38171
38172 if (Updated)
38173 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38174 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38175 }
38176
38177 // If we're inserting an element from a vbroadcast load, fold the
38178 // load into the X86insertps instruction. We need to convert the scalar
38179 // load to a vector and clear the source lane of the INSERTPS control.
38180 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
38181 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
38182 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
38183 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
38184 MemIntr->getBasePtr(),
38185 MemIntr->getMemOperand());
38186 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
38187 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
38188 Load),
38189 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
38190 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
38191 return Insert;
38192 }
38193 }
38194
38195 return SDValue();
38196 }
38197 default:
38198 return SDValue();
38199 }
38200
38201 // Nuke no-op shuffles that show up after combining.
38202 if (isNoopShuffleMask(Mask))
38203 return N.getOperand(0);
38204
38205 // Look for simplifications involving one or two shuffle instructions.
38206 SDValue V = N.getOperand(0);
38207 switch (N.getOpcode()) {
38208 default:
38209 break;
38210 case X86ISD::PSHUFLW:
38211 case X86ISD::PSHUFHW:
38212 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
38213
38214 // See if this reduces to a PSHUFD which is no more expensive and can
38215 // combine with more operations. Note that it has to at least flip the
38216 // dwords as otherwise it would have been removed as a no-op.
38217 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
38218 int DMask[] = {0, 1, 2, 3};
38219 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
38220 DMask[DOffset + 0] = DOffset + 1;
38221 DMask[DOffset + 1] = DOffset + 0;
38222 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
38223 V = DAG.getBitcast(DVT, V);
38224 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
38225 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
38226 return DAG.getBitcast(VT, V);
38227 }
38228
38229 // Look for shuffle patterns which can be implemented as a single unpack.
38230 // FIXME: This doesn't handle the location of the PSHUFD generically, and
38231 // only works when we have a PSHUFD followed by two half-shuffles.
38232 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
38233 (V.getOpcode() == X86ISD::PSHUFLW ||
38234 V.getOpcode() == X86ISD::PSHUFHW) &&
38235 V.getOpcode() != N.getOpcode() &&
38236 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
38237 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
38238 if (D.getOpcode() == X86ISD::PSHUFD) {
38239 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38240 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
38241 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38242 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38243 int WordMask[8];
38244 for (int i = 0; i < 4; ++i) {
38245 WordMask[i + NOffset] = Mask[i] + NOffset;
38246 WordMask[i + VOffset] = VMask[i] + VOffset;
38247 }
38248 // Map the word mask through the DWord mask.
38249 int MappedMask[8];
38250 for (int i = 0; i < 8; ++i)
38251 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
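// (Illustrative example: with DMask = <1,0,2,3>, word index 2, the low word
// of dword 1, maps to 2 * DMask[1] + 0 = 0.)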
38252 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
38253 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
38254 // We can replace all three shuffles with an unpack.
38255 V = DAG.getBitcast(VT, D.getOperand(0));
38256 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
38257 : X86ISD::UNPCKH,
38258 DL, VT, V, V);
38259 }
38260 }
38261 }
38262
38263 break;
38264
38265 case X86ISD::PSHUFD:
38266 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
38267 return NewN;
38268
38269 break;
38270 }
38271
38272 return SDValue();
38273}
38274
38275/// Checks if the shuffle mask takes subsequent elements
38276/// alternately from two vectors.
38277/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
38278static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
38279
38280 int ParitySrc[2] = {-1, -1};
38281 unsigned Size = Mask.size();
38282 for (unsigned i = 0; i != Size; ++i) {
38283 int M = Mask[i];
38284 if (M < 0)
38285 continue;
38286
38287 // Make sure we are using the matching element from the input.
38288 if ((M % Size) != i)
38289 return false;
38290
38291 // Make sure we use the same input for all elements of the same parity.
38292 int Src = M / Size;
38293 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
38294 return false;
38295 ParitySrc[i % 2] = Src;
38296 }
38297
38298 // Make sure each input is used.
38299 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
38300 return false;
38301
38302 Op0Even = ParitySrc[0] == 0;
38303 return true;
38304}
38305
38306 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
38307 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
38308/// are written to the parameters \p Opnd0 and \p Opnd1.
38309///
38310/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
38311/// so it is easier to generically match. We also insert dummy vector shuffle
38312/// nodes for the operands which explicitly discard the lanes which are unused
38313 /// by this operation, to try to flow the fact that they're unused through
38314 /// the rest of the combiner.
38315static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
38316 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
38317 bool &IsSubAdd) {
38318
38319 EVT VT = N->getValueType(0);
38320 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38321 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
38322 !VT.getSimpleVT().isFloatingPoint())
38323 return false;
38324
38325 // We only handle target-independent shuffles.
38326 // FIXME: It would be easy and harmless to use the target shuffle mask
38327 // extraction tool to support more.
38328 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38329 return false;
38330
38331 SDValue V1 = N->getOperand(0);
38332 SDValue V2 = N->getOperand(1);
38333
38334 // Make sure we have an FADD and an FSUB.
38335 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
38336 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
38337 V1.getOpcode() == V2.getOpcode())
38338 return false;
38339
38340 // If there are other uses of these operations we can't fold them.
38341 if (!V1->hasOneUse() || !V2->hasOneUse())
38342 return false;
38343
38344 // Ensure that both operations have the same operands. Note that we can
38345 // commute the FADD operands.
38346 SDValue LHS, RHS;
38347 if (V1.getOpcode() == ISD::FSUB) {
38348 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
38349 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
38350 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
38351 return false;
38352 } else {
38353 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
38354 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
38355 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
38356 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
38357 return false;
38358 }
38359
38360 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38361 bool Op0Even;
38362 if (!isAddSubOrSubAddMask(Mask, Op0Even))
38363 return false;
38364
38365 // It's a subadd if the vector in the even parity is an FADD.
38366 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
38367 : V2->getOpcode() == ISD::FADD;
38368
38369 Opnd0 = LHS;
38370 Opnd1 = RHS;
38371 return true;
38372}
38373
38374/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
38375static SDValue combineShuffleToFMAddSub(SDNode *N,
38376 const X86Subtarget &Subtarget,
38377 SelectionDAG &DAG) {
38378 // We only handle target-independent shuffles.
38379 // FIXME: It would be easy and harmless to use the target shuffle mask
38380 // extraction tool to support more.
38381 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38382 return SDValue();
38383
38384 MVT VT = N->getSimpleValueType(0);
38385 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38386 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
38387 return SDValue();
38388
38389 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
38390 SDValue Op0 = N->getOperand(0);
38391 SDValue Op1 = N->getOperand(1);
38392 SDValue FMAdd = Op0, FMSub = Op1;
38393 if (FMSub.getOpcode() != X86ISD::FMSUB)
38394 std::swap(FMAdd, FMSub);
38395
38396 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
38397 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
38398 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
38399 FMAdd.getOperand(2) != FMSub.getOperand(2))
38400 return SDValue();
38401
38402 // Check for correct shuffle mask.
38403 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38404 bool Op0Even;
38405 if (!isAddSubOrSubAddMask(Mask, Op0Even))
38406 return SDValue();
38407
38408 // FMAddSub takes zeroth operand from FMSub node.
38409 SDLoc DL(N);
38410 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
38411 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38412 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
38413 FMAdd.getOperand(2));
38414}
38415
38416/// Try to combine a shuffle into a target-specific add-sub or
38417/// mul-add-sub node.
38418static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
38419 const X86Subtarget &Subtarget,
38420 SelectionDAG &DAG) {
38421 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
38422 return V;
38423
38424 SDValue Opnd0, Opnd1;
38425 bool IsSubAdd;
38426 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
38427 return SDValue();
38428
38429 MVT VT = N->getSimpleValueType(0);
38430 SDLoc DL(N);
38431
38432 // Try to generate X86ISD::FMADDSUB node here.
38433 SDValue Opnd2;
38434 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
38435 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38436 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
38437 }
38438
38439 if (IsSubAdd)
38440 return SDValue();
38441
38442 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
38443 // the ADDSUB idiom has been successfully recognized. There are no known
38444 // X86 targets with 512-bit ADDSUB instructions!
38445 if (VT.is512BitVector())
38446 return SDValue();
38447
38448 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
38449}
38450
38451// We are looking for a shuffle where both sources are concatenated with undef
38452// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
38453// if we can express this as a single-source shuffle, that's preferable.
38454static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
38455 const X86Subtarget &Subtarget) {
38456 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
38457 return SDValue();
38458
38459 EVT VT = N->getValueType(0);
38460
38461 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
38462 if (!VT.is128BitVector() && !VT.is256BitVector())
38463 return SDValue();
38464
38465 if (VT.getVectorElementType() != MVT::i32 &&
38466 VT.getVectorElementType() != MVT::i64 &&
38467 VT.getVectorElementType() != MVT::f32 &&
38468 VT.getVectorElementType() != MVT::f64)
38469 return SDValue();
38470
38471 SDValue N0 = N->getOperand(0);
38472 SDValue N1 = N->getOperand(1);
38473
38474 // Check that both sources are concats with undef.
38475 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
38476 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
38477 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
38478 !N1.getOperand(1).isUndef())
38479 return SDValue();
38480
38481 // Construct the new shuffle mask. Elements from the first source retain their
38482 // index, but elements from the second source no longer need to skip an undef.
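// e.g. for a v8i32 shuffle of two (concat t, undef) inputs, source element 8
// (the first element of the second concat) is remapped to 4 in the
// single-source mask built below.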
38483 SmallVector<int, 8> Mask;
38484 int NumElts = VT.getVectorNumElements();
38485
38486 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
38487 for (int Elt : SVOp->getMask())
38488 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
38489
38490 SDLoc DL(N);
38491 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
38492 N1.getOperand(0));
38493 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
38494}
38495
38496/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
38497/// low half of each source vector and does not set any high half elements in
38498/// the destination vector, narrow the shuffle to half its original size.
38499static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
38500 if (!Shuf->getValueType(0).isSimple())
38501 return SDValue();
38502 MVT VT = Shuf->getSimpleValueType(0);
38503 if (!VT.is256BitVector() && !VT.is512BitVector())
38504 return SDValue();
38505
38506 // See if we can ignore all of the high elements of the shuffle.
38507 ArrayRef<int> Mask = Shuf->getMask();
38508 if (!isUndefUpperHalf(Mask))
38509 return SDValue();
38510
38511 // Check if the shuffle mask accesses only the low half of each input vector
38512 // (half-index output is 0 or 2).
38513 int HalfIdx1, HalfIdx2;
38514 SmallVector<int, 8> HalfMask(Mask.size() / 2);
38515 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
38516 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
38517 return SDValue();
38518
38519 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
38520 // The trick is knowing that all of the insert/extract are actually free
38521 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
38522 // of narrow inputs into a narrow output, and that is always cheaper than
38523 // the wide shuffle that we started with.
38524 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
38525 Shuf->getOperand(1), HalfMask, HalfIdx1,
38526 HalfIdx2, false, DAG, /*UseConcat*/true);
38527}
38528
38529static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
38530 TargetLowering::DAGCombinerInfo &DCI,
38531 const X86Subtarget &Subtarget) {
38532 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
38533 if (SDValue V = narrowShuffle(Shuf, DAG))
38534 return V;
38535
38536 // If we have legalized the vector types, look for blends of FADD and FSUB
38537 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
38538 SDLoc dl(N);
38539 EVT VT = N->getValueType(0);
38540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38541 if (TLI.isTypeLegal(VT))
38542 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
38543 return AddSub;
38544
38545 // Attempt to combine into a vector load/broadcast.
38546 if (SDValue LD = combineToConsecutiveLoads(
38547 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
38548 return LD;
38549
38550 // For AVX2, we sometimes want to combine
38551 // (vector_shuffle <mask> (concat_vectors t1, undef)
38552 // (concat_vectors t2, undef))
38553 // Into:
38554 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
38555 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
38556 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
38557 return ShufConcat;
38558
38559 if (isTargetShuffle(N->getOpcode())) {
38560 SDValue Op(N, 0);
38561 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
38562 return Shuffle;
38563
38564 // Try recursively combining arbitrary sequences of x86 shuffle
38565 // instructions into higher-order shuffles. We do this after combining
38566 // specific PSHUF instruction sequences into their minimal form so that we
38567 // can evaluate how many specialized shuffle instructions are involved in
38568 // a particular chain.
38569 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
38570 return Res;
38571
38572 // Simplify source operands based on shuffle mask.
38573 // TODO - merge this into combineX86ShufflesRecursively.
38574 APInt KnownUndef, KnownZero;
38575 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
38576 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
38577 DCI))
38578 return SDValue(N, 0);
38579 }
38580
38581 return SDValue();
38582}
38583
38584// Simplify variable target shuffle masks based on the demanded elements.
38585// TODO: Handle DemandedBits in mask indices as well?
38586bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
38587 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
38588 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
38589 // If we're demanding all elements don't bother trying to simplify the mask.
38590 unsigned NumElts = DemandedElts.getBitWidth();
38591 if (DemandedElts.isAllOnesValue())
38592 return false;
38593
38594 SDValue Mask = Op.getOperand(MaskIndex);
38595 if (!Mask.hasOneUse())
38596 return false;
38597
38598 // Attempt to generically simplify the variable shuffle mask.
38599 APInt MaskUndef, MaskZero;
38600 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
38601 Depth + 1))
38602 return true;
38603
38604 // Attempt to extract+simplify a (constant pool load) shuffle mask.
38605 // TODO: Support other types from getTargetShuffleMaskIndices?
38606 SDValue BC = peekThroughOneUseBitcasts(Mask);
38607 EVT BCVT = BC.getValueType();
38608 auto *Load = dyn_cast<LoadSDNode>(BC);
38609 if (!Load)
38610 return false;
38611
38612 const Constant *C = getTargetConstantFromNode(Load);
38613 if (!C)
38614 return false;
38615
38616 Type *CTy = C->getType();
38617 if (!CTy->isVectorTy() ||
38618 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
38619 return false;
38620
38621 // Handle scaling for i64 elements on 32-bit targets.
38622 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
38623 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
38624 return false;
38625 unsigned Scale = NumCstElts / NumElts;
38626
38627 // Simplify mask if we have an undemanded element that is not undef.
38628 bool Simplified = false;
38629 SmallVector<Constant *, 32> ConstVecOps;
38630 for (unsigned i = 0; i != NumCstElts; ++i) {
38631 Constant *Elt = C->getAggregateElement(i);
38632 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
38633 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
38634 Simplified = true;
38635 continue;
38636 }
38637 ConstVecOps.push_back(Elt);
38638 }
38639 if (!Simplified)
38640 return false;
38641
38642 // Generate new constant pool entry + legalize immediately for the load.
38643 SDLoc DL(Op);
38644 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
38645 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
38646 SDValue NewMask = TLO.DAG.getLoad(
38647 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
38648 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
38649 Load->getAlign());
38650 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
38651}
38652
38653bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
38654 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
38655 TargetLoweringOpt &TLO, unsigned Depth) const {
38656 int NumElts = DemandedElts.getBitWidth();
38657 unsigned Opc = Op.getOpcode();
38658 EVT VT = Op.getValueType();
38659
38660 // Handle special case opcodes.
38661 switch (Opc) {
38662 case X86ISD::PMULDQ:
38663 case X86ISD::PMULUDQ: {
38664 APInt LHSUndef, LHSZero;
38665 APInt RHSUndef, RHSZero;
38666 SDValue LHS = Op.getOperand(0);
38667 SDValue RHS = Op.getOperand(1);
38668 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
38669 Depth + 1))
38670 return true;
38671 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
38672 Depth + 1))
38673 return true;
38674 // Multiply by zero.
38675 KnownZero = LHSZero | RHSZero;
38676 break;
38677 }
38678 case X86ISD::VSHL:
38679 case X86ISD::VSRL:
38680 case X86ISD::VSRA: {
38681 // We only need the bottom 64-bits of the (128-bit) shift amount.
38682 SDValue Amt = Op.getOperand(1);
38683 MVT AmtVT = Amt.getSimpleValueType();
38684 assert(AmtVT.is128BitVector() && "Unexpected value type");
38685
38686 // If every use of the shift amount is as an SSE shift amount, then we know
38687 // that only its bottom 64 bits are ever used.
38688 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
38689 unsigned UseOpc = Use->getOpcode();
38690 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
38691 UseOpc == X86ISD::VSRA) &&
38692 Use->getOperand(0) != Amt;
38693 });
38694
38695 APInt AmtUndef, AmtZero;
38696 unsigned NumAmtElts = AmtVT.getVectorNumElements();
38697 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
38698 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
38699 Depth + 1, AssumeSingleUse))
38700 return true;
38701 LLVM_FALLTHROUGH;
38702 }
38703 case X86ISD::VSHLI:
38704 case X86ISD::VSRLI:
38705 case X86ISD::VSRAI: {
38706 SDValue Src = Op.getOperand(0);
38707 APInt SrcUndef;
38708 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
38709 Depth + 1))
38710 return true;
38711
38712 // Aggressively peek through ops to get at the demanded elts.
38713 if (!DemandedElts.isAllOnesValue())
38714 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38715 Src, DemandedElts, TLO.DAG, Depth + 1))
38716 return TLO.CombineTo(
38717 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
38718 break;
38719 }
38720 case X86ISD::KSHIFTL: {
38721 SDValue Src = Op.getOperand(0);
38722 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38723 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38724 unsigned ShiftAmt = Amt->getZExtValue();
38725
38726 if (ShiftAmt == 0)
38727 return TLO.CombineTo(Op, Src);
38728
38729 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
38730 // single shift. We can do this if the bottom bits (which are shifted
38731 // out) are never demanded.
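// e.g. (kshiftl (kshiftr X, 4), 6) --> (kshiftl X, 2) when none of the low
// 6 lanes are demanded.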
38732 if (Src.getOpcode() == X86ISD::KSHIFTR) {
38733 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
38734 unsigned C1 = Src.getConstantOperandVal(1);
38735 unsigned NewOpc = X86ISD::KSHIFTL;
38736 int Diff = ShiftAmt - C1;
38737 if (Diff < 0) {
38738 Diff = -Diff;
38739 NewOpc = X86ISD::KSHIFTR;
38740 }
38741
38742 SDLoc dl(Op);
38743 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38744 return TLO.CombineTo(
38745 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38746 }
38747 }
38748
38749 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
38750 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38751 Depth + 1))
38752 return true;
38753
38754 KnownUndef <<= ShiftAmt;
38755 KnownZero <<= ShiftAmt;
38756 KnownZero.setLowBits(ShiftAmt);
38757 break;
38758 }
38759 case X86ISD::KSHIFTR: {
38760 SDValue Src = Op.getOperand(0);
38761 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38762 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38763 unsigned ShiftAmt = Amt->getZExtValue();
38764
38765 if (ShiftAmt == 0)
38766 return TLO.CombineTo(Op, Src);
38767
38768 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
38769 // single shift. We can do this if the top bits (which are shifted
38770 // out) are never demanded.
38771 if (Src.getOpcode() == X86ISD::KSHIFTL) {
38772 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
38773 unsigned C1 = Src.getConstantOperandVal(1);
38774 unsigned NewOpc = X86ISD::KSHIFTR;
38775 int Diff = ShiftAmt - C1;
38776 if (Diff < 0) {
38777 Diff = -Diff;
38778 NewOpc = X86ISD::KSHIFTL;
38779 }
38780
38781 SDLoc dl(Op);
38782 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38783 return TLO.CombineTo(
38784 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38785 }
38786 }
38787
38788 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
38789 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38790 Depth + 1))
38791 return true;
38792
38793 KnownUndef.lshrInPlace(ShiftAmt);
38794 KnownZero.lshrInPlace(ShiftAmt);
38795 KnownZero.setHighBits(ShiftAmt);
38796 break;
38797 }
38798 case X86ISD::CVTSI2P:
38799 case X86ISD::CVTUI2P: {
38800 SDValue Src = Op.getOperand(0);
38801 MVT SrcVT = Src.getSimpleValueType();
38802 APInt SrcUndef, SrcZero;
38803 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38804 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38805 Depth + 1))
38806 return true;
38807 break;
38808 }
38809 case X86ISD::PACKSS:
38810 case X86ISD::PACKUS: {
38811 SDValue N0 = Op.getOperand(0);
38812 SDValue N1 = Op.getOperand(1);
38813
38814 APInt DemandedLHS, DemandedRHS;
38815 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38816
38817 APInt LHSUndef, LHSZero;
38818 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38819 Depth + 1))
38820 return true;
38821 APInt RHSUndef, RHSZero;
38822 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38823 Depth + 1))
38824 return true;
38825
38826 // TODO - pass on known zero/undef.
38827
38828 // Aggressively peek through ops to get at the demanded elts.
38829 // TODO - we should do this for all target/faux shuffles ops.
38830 if (!DemandedElts.isAllOnesValue()) {
38831 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38832 TLO.DAG, Depth + 1);
38833 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38834 TLO.DAG, Depth + 1);
38835 if (NewN0 || NewN1) {
38836 NewN0 = NewN0 ? NewN0 : N0;
38837 NewN1 = NewN1 ? NewN1 : N1;
38838 return TLO.CombineTo(Op,
38839 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38840 }
38841 }
38842 break;
38843 }
38844 case X86ISD::HADD:
38845 case X86ISD::HSUB:
38846 case X86ISD::FHADD:
38847 case X86ISD::FHSUB: {
38848 SDValue N0 = Op.getOperand(0);
38849 SDValue N1 = Op.getOperand(1);
38850
38851 APInt DemandedLHS, DemandedRHS;
38852 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38853
38854 APInt LHSUndef, LHSZero;
38855 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38856 Depth + 1))
38857 return true;
38858 APInt RHSUndef, RHSZero;
38859 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38860 Depth + 1))
38861 return true;
38862
38863 // TODO - pass on known zero/undef.
38864
38865 // Aggressively peek through ops to get at the demanded elts.
38866 // TODO: Handle repeated operands.
38867 if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
38868 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38869 TLO.DAG, Depth + 1);
38870 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38871 TLO.DAG, Depth + 1);
38872 if (NewN0 || NewN1) {
38873 NewN0 = NewN0 ? NewN0 : N0;
38874 NewN1 = NewN1 ? NewN1 : N1;
38875 return TLO.CombineTo(Op,
38876 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38877 }
38878 }
38879 break;
38880 }
38881 case X86ISD::VTRUNC:
38882 case X86ISD::VTRUNCS:
38883 case X86ISD::VTRUNCUS: {
38884 SDValue Src = Op.getOperand(0);
38885 MVT SrcVT = Src.getSimpleValueType();
38886 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38887 APInt SrcUndef, SrcZero;
38888 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
38889 Depth + 1))
38890 return true;
38891 KnownZero = SrcZero.zextOrTrunc(NumElts);
38892 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
38893 break;
38894 }
38895 case X86ISD::BLENDV: {
38896 APInt SelUndef, SelZero;
38897 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
38898 SelZero, TLO, Depth + 1))
38899 return true;
38900
38901 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
38902 APInt LHSUndef, LHSZero;
38903 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
38904 LHSZero, TLO, Depth + 1))
38905 return true;
38906
38907 APInt RHSUndef, RHSZero;
38908 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
38909 RHSZero, TLO, Depth + 1))
38910 return true;
38911
38912 KnownZero = LHSZero & RHSZero;
38913 KnownUndef = LHSUndef & RHSUndef;
38914 break;
38915 }
38916 case X86ISD::VZEXT_MOVL: {
38917 // If upper demanded elements are already zero then we have nothing to do.
38918 SDValue Src = Op.getOperand(0);
38919 APInt DemandedUpperElts = DemandedElts;
38920 DemandedUpperElts.clearLowBits(1);
38921 if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
38922 return TLO.CombineTo(Op, Src);
38923 break;
38924 }
38925 case X86ISD::VBROADCAST: {
38926 SDValue Src = Op.getOperand(0);
38927 MVT SrcVT = Src.getSimpleValueType();
38928 if (!SrcVT.isVector())
38929 break;
38930 // Don't bother broadcasting if we just need the 0'th element.
38931 if (DemandedElts == 1) {
38932 if (Src.getValueType() != VT)
38933 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
38934 SDLoc(Op));
38935 return TLO.CombineTo(Op, Src);
38936 }
38937 APInt SrcUndef, SrcZero;
38938 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
38939 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38940 Depth + 1))
38941 return true;
38942 // Aggressively peek through src to get at the demanded elt.
38943 // TODO - we should do this for all target/faux shuffles ops.
38944 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38945 Src, SrcElts, TLO.DAG, Depth + 1))
38946 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
38947 break;
38948 }
38949 case X86ISD::VPERMV:
38950 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
38951 Depth))
38952 return true;
38953 break;
38954 case X86ISD::PSHUFB:
38955 case X86ISD::VPERMV3:
38956 case X86ISD::VPERMILPV:
38957 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
38958 Depth))
38959 return true;
38960 break;
38961 case X86ISD::VPPERM:
38962 case X86ISD::VPERMIL2:
38963 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
38964 Depth))
38965 return true;
38966 break;
38967 }
38968
38969 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
38970 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
38971 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
38972 if ((VT.is256BitVector() || VT.is512BitVector()) &&
38973 DemandedElts.lshr(NumElts / 2) == 0) {
38974 unsigned SizeInBits = VT.getSizeInBits();
38975 unsigned ExtSizeInBits = SizeInBits / 2;
38976
38977 // See if 512-bit ops only use the bottom 128-bits.
38978 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
38979 ExtSizeInBits = SizeInBits / 4;
38980
38981 switch (Opc) {
38982 // Scalar broadcast.
38983 case X86ISD::VBROADCAST: {
38984 SDLoc DL(Op);
38985 SDValue Src = Op.getOperand(0);
38986 if (Src.getValueSizeInBits() > ExtSizeInBits)
38987 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
38988 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38989 ExtSizeInBits / VT.getScalarSizeInBits());
38990 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
38991 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38992 TLO.DAG, DL, ExtSizeInBits));
38993 }
38994 case X86ISD::VBROADCAST_LOAD: {
38995 SDLoc DL(Op);
38996 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
38997 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38998 ExtSizeInBits / VT.getScalarSizeInBits());
38999 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39000 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39001 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
39002 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
39003 MemIntr->getMemOperand());
39004 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39005 Bcst.getValue(1));
39006 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39007 TLO.DAG, DL, ExtSizeInBits));
39008 }
39009 // Subvector broadcast.
39010 case X86ISD::SUBV_BROADCAST_LOAD: {
39011 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
39012 EVT MemVT = MemIntr->getMemoryVT();
39013 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
39014 SDLoc DL(Op);
39015 SDValue Ld =
39016 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
39017 MemIntr->getBasePtr(), MemIntr->getMemOperand());
39018 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39019 Ld.getValue(1));
39020 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
39021 TLO.DAG, DL, ExtSizeInBits));
39022 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
39023 SDLoc DL(Op);
39024 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39025 ExtSizeInBits / VT.getScalarSizeInBits());
39026 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39027 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39028 SDValue Bcst =
39029 TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
39030 Ops, MemVT, MemIntr->getMemOperand());
39031 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39032 Bcst.getValue(1));
39033 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39034 TLO.DAG, DL, ExtSizeInBits));
39035 }
39036 break;
39037 }
39038 // Byte shifts by immediate.
39039 case X86ISD::VSHLDQ:
39040 case X86ISD::VSRLDQ:
39041 // Shift by uniform.
39042 case X86ISD::VSHL:
39043 case X86ISD::VSRL:
39044 case X86ISD::VSRA:
39045 // Shift by immediate.
39046 case X86ISD::VSHLI:
39047 case X86ISD::VSRLI:
39048 case X86ISD::VSRAI: {
39049 SDLoc DL(Op);
39050 SDValue Ext0 =
39051 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
39052 SDValue ExtOp =
39053 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
39054 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39055 SDValue Insert =
39056 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39057 return TLO.CombineTo(Op, Insert);
39058 }
39059 case X86ISD::VPERMI: {
39060 // Simplify PERMPD/PERMQ to extract_subvector.
39061 // TODO: This should be done in shuffle combining.
39062 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
39063 SmallVector<int, 4> Mask;
39064 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
39065 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
39066 SDLoc DL(Op);
39067 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
39068 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39069 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
39070 return TLO.CombineTo(Op, Insert);
39071 }
39072 }
39073 break;
39074 }
39075 case X86ISD::VPERM2X128: {
39076 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
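// Only the low nibble of the immediate matters here: bit 3 zeroes the low
// lane, bit 1 selects the source operand and bit 0 selects which 128-bit
// half of it to extract.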
39077 SDLoc DL(Op);
39078 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
39079 if (LoMask & 0x8)
39080 return TLO.CombineTo(
39081 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
39082 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
39083 unsigned SrcIdx = (LoMask & 0x2) >> 1;
39084 SDValue ExtOp =
39085 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
39086 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39087 SDValue Insert =
39088 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39089 return TLO.CombineTo(Op, Insert);
39090 }
39091 // Zero upper elements.
39092 case X86ISD::VZEXT_MOVL:
39093 // Target unary shuffles by immediate:
39094 case X86ISD::PSHUFD:
39095 case X86ISD::PSHUFLW:
39096 case X86ISD::PSHUFHW:
39097 case X86ISD::VPERMILPI:
39098 // (Non-Lane Crossing) Target Shuffles.
39099 case X86ISD::VPERMILPV:
39100 case X86ISD::VPERMIL2:
39101 case X86ISD::PSHUFB:
39102 case X86ISD::UNPCKL:
39103 case X86ISD::UNPCKH:
39104 case X86ISD::BLENDI:
39105 // Integer ops.
39106 case X86ISD::AVG:
39107 case X86ISD::PACKSS:
39108 case X86ISD::PACKUS:
39109 // Horizontal Ops.
39110 case X86ISD::HADD:
39111 case X86ISD::HSUB:
39112 case X86ISD::FHADD:
39113 case X86ISD::FHSUB: {
39114 SDLoc DL(Op);
39115 SmallVector<SDValue, 4> Ops;
39116 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
39117 SDValue SrcOp = Op.getOperand(i);
39118 EVT SrcVT = SrcOp.getValueType();
39119 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
39120 "Unsupported vector size");
39121 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
39122 ExtSizeInBits)
39123 : SrcOp);
39124 }
39125 MVT ExtVT = VT.getSimpleVT();
39126 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
39127 ExtSizeInBits / ExtVT.getScalarSizeInBits());
39128 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
39129 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39130 SDValue Insert =
39131 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39132 return TLO.CombineTo(Op, Insert);
39133 }
39134 }
39135 }
39136
39137 // Get target/faux shuffle mask.
39138 APInt OpUndef, OpZero;
39139 SmallVector<int, 64> OpMask;
39140 SmallVector<SDValue, 2> OpInputs;
39141 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
39142 OpZero, TLO.DAG, Depth, false))
39143 return false;
39144
39145 // Shuffle inputs must be the same size as the result.
39146 if (OpMask.size() != (unsigned)NumElts ||
39147 llvm::any_of(OpInputs, [VT](SDValue V) {
39148 return VT.getSizeInBits() != V.getValueSizeInBits() ||
39149 !V.getValueType().isVector();
39150 }))
39151 return false;
39152
39153 KnownZero = OpZero;
39154 KnownUndef = OpUndef;
39155
39156 // Check if shuffle mask can be simplified to undef/zero/identity.
39157 int NumSrcs = OpInputs.size();
39158 for (int i = 0; i != NumElts; ++i)
39159 if (!DemandedElts[i])
39160 OpMask[i] = SM_SentinelUndef;
39161
39162 if (isUndefInRange(OpMask, 0, NumElts)) {
39163 KnownUndef.setAllBits();
39164 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
39165 }
39166 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
39167 KnownZero.setAllBits();
39168 return TLO.CombineTo(
39169 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
39170 }
39171 for (int Src = 0; Src != NumSrcs; ++Src)
39172 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
39173 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
39174
39175 // Attempt to simplify inputs.
39176 for (int Src = 0; Src != NumSrcs; ++Src) {
39177 // TODO: Support inputs of different types.
39178 if (OpInputs[Src].getValueType() != VT)
39179 continue;
39180
39181 int Lo = Src * NumElts;
39182 APInt SrcElts = APInt::getNullValue(NumElts);
39183 for (int i = 0; i != NumElts; ++i)
39184 if (DemandedElts[i]) {
39185 int M = OpMask[i] - Lo;
39186 if (0 <= M && M < NumElts)
39187 SrcElts.setBit(M);
39188 }
39189
39190 // TODO - Propagate input undef/zero elts.
39191 APInt SrcUndef, SrcZero;
39192 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
39193 TLO, Depth + 1))
39194 return true;
39195 }
39196
39197 // If we don't demand all elements, then attempt to combine to a simpler
39198 // shuffle.
39199 // We need to convert the depth to something combineX86ShufflesRecursively
39200 // can handle - so pretend its Depth == 0 again, and reduce the max depth
39201 // to match. This prevents combineX86ShuffleChain from returning a
39202 // combined shuffle that's the same as the original root, causing an
39203 // infinite loop.
39204 if (!DemandedElts.isAllOnesValue()) {
39205 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
39206
39207 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
39208 for (int i = 0; i != NumElts; ++i)
39209 if (DemandedElts[i])
39210 DemandedMask[i] = i;
39211
39212 SDValue NewShuffle = combineX86ShufflesRecursively(
39213 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
39214 /*HasVarMask*/ false,
39215 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
39216 Subtarget);
39217 if (NewShuffle)
39218 return TLO.CombineTo(Op, NewShuffle);
39219 }
39220
39221 return false;
39222}
39223
39224bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
39225 SDValue Op, const APInt &OriginalDemandedBits,
39226 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
39227 unsigned Depth) const {
39228 EVT VT = Op.getValueType();
39229 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
39230 unsigned Opc = Op.getOpcode();
39231 switch(Opc) {
39232 case X86ISD::VTRUNC: {
39233 KnownBits KnownOp;
39234 SDValue Src = Op.getOperand(0);
39235 MVT SrcVT = Src.getSimpleValueType();
39236
39237 // Simplify the input, using demanded bit information.
39238 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
39239 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
39240 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
39241 return true;
39242 break;
39243 }
39244 case X86ISD::PMULDQ:
39245 case X86ISD::PMULUDQ: {
39246 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
39247 KnownBits KnownOp;
39248 SDValue LHS = Op.getOperand(0);
39249 SDValue RHS = Op.getOperand(1);
39250 // FIXME: Can we bound this better?
39251 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
39252 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
39253 TLO, Depth + 1))
39254 return true;
39255 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
39256 TLO, Depth + 1))
39257 return true;
39258
39259 // Aggressively peek through ops to get at the demanded low bits.
39260 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
39261 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39262 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
39263 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39264 if (DemandedLHS || DemandedRHS) {
39265 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
39266 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
39267 return TLO.CombineTo(
39268 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
39269 }
39270 break;
39271 }
39272 case X86ISD::VSHLI: {
39273 SDValue Op0 = Op.getOperand(0);
39274
39275 unsigned ShAmt = Op.getConstantOperandVal(1);
39276 if (ShAmt >= BitWidth)
39277 break;
39278
39279 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
39280
39281 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39282 // single shift. We can do this if the bottom bits (which are shifted
39283 // out) are never demanded.
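// e.g. (vshli (vsrli X, 2), 5) --> (vshli X, 3) when the low 5 bits of the
// result are not demanded.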
39284 if (Op0.getOpcode() == X86ISD::VSRLI &&
39285 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
39286 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
39287 if (Shift2Amt < BitWidth) {
39288 int Diff = ShAmt - Shift2Amt;
39289 if (Diff == 0)
39290 return TLO.CombineTo(Op, Op0.getOperand(0));
39291
39292 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
39293 SDValue NewShift = TLO.DAG.getNode(
39294 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
39295 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
39296 return TLO.CombineTo(Op, NewShift);
39297 }
39298 }
39299
39300 // If we are only demanding sign bits then we can use the shift source directly.
39301 unsigned NumSignBits =
39302 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
39303 unsigned UpperDemandedBits =
39304 BitWidth - OriginalDemandedBits.countTrailingZeros();
39305 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39306 return TLO.CombineTo(Op, Op0);
39307
39308 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39309 TLO, Depth + 1))
39310 return true;
39311
39312 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39313 Known.Zero <<= ShAmt;
39314 Known.One <<= ShAmt;
39315
39316 // Low bits known zero.
39317 Known.Zero.setLowBits(ShAmt);
39318 return false;
39319 }
39320 case X86ISD::VSRLI: {
39321 unsigned ShAmt = Op.getConstantOperandVal(1);
39322 if (ShAmt >= BitWidth)
39323 break;
39324
39325 APInt DemandedMask = OriginalDemandedBits << ShAmt;
39326
39327 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
39328 OriginalDemandedElts, Known, TLO, Depth + 1))
39329 return true;
39330
39331 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39332 Known.Zero.lshrInPlace(ShAmt);
39333 Known.One.lshrInPlace(ShAmt);
39334
39335 // High bits known zero.
39336 Known.Zero.setHighBits(ShAmt);
39337 return false;
39338 }
39339 case X86ISD::VSRAI: {
39340 SDValue Op0 = Op.getOperand(0);
39341 SDValue Op1 = Op.getOperand(1);
39342
39343 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
39344 if (ShAmt >= BitWidth)
39345 break;
39346
39347 APInt DemandedMask = OriginalDemandedBits << ShAmt;
39348
39349 // If we just want the sign bit then we don't need to shift it.
39350 if (OriginalDemandedBits.isSignMask())
39351 return TLO.CombineTo(Op, Op0);
39352
39353 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
39354 if (Op0.getOpcode() == X86ISD::VSHLI &&
39355 Op.getOperand(1) == Op0.getOperand(1)) {
39356 SDValue Op00 = Op0.getOperand(0);
39357 unsigned NumSignBits =
39358 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
39359 if (ShAmt < NumSignBits)
39360 return TLO.CombineTo(Op, Op00);
39361 }
39362
39363 // If any of the demanded bits are produced by the sign extension, we also
39364 // demand the input sign bit.
39365 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
39366 DemandedMask.setSignBit();
39367
39368 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39369 TLO, Depth + 1))
39370 return true;
39371
39372 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39373 Known.Zero.lshrInPlace(ShAmt);
39374 Known.One.lshrInPlace(ShAmt);
39375
39376 // If the input sign bit is known to be zero, or if none of the top bits
39377 // are demanded, turn this into an unsigned shift right.
39378 if (Known.Zero[BitWidth - ShAmt - 1] ||
39379 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
39380 return TLO.CombineTo(
39381 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
39382
39383 // High bits are known one.
39384 if (Known.One[BitWidth - ShAmt - 1])
39385 Known.One.setHighBits(ShAmt);
39386 return false;
39387 }
39388 case X86ISD::PEXTRB:
39389 case X86ISD::PEXTRW: {
39390 SDValue Vec = Op.getOperand(0);
39391 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
39392 MVT VecVT = Vec.getSimpleValueType();
39393 unsigned NumVecElts = VecVT.getVectorNumElements();
39394
39395 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
39396 unsigned Idx = CIdx->getZExtValue();
39397 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
39398
39399 // If we demand no bits from the vector then we must have demanded
39400 // bits from the implicit zext - simplify to zero.
39401 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
39402 if (DemandedVecBits == 0)
39403 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39404
39405 APInt KnownUndef, KnownZero;
39406 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
39407 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
39408 KnownZero, TLO, Depth + 1))
39409 return true;
39410
39411 KnownBits KnownVec;
39412 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
39413 KnownVec, TLO, Depth + 1))
39414 return true;
39415
39416 if (SDValue V = SimplifyMultipleUseDemandedBits(
39417 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
39418 return TLO.CombineTo(
39419 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
39420
39421 Known = KnownVec.zext(BitWidth);
39422 return false;
39423 }
39424 break;
39425 }
39426 case X86ISD::PINSRB:
39427 case X86ISD::PINSRW: {
39428 SDValue Vec = Op.getOperand(0);
39429 SDValue Scl = Op.getOperand(1);
39430 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39431 MVT VecVT = Vec.getSimpleValueType();
39432
39433 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
39434 unsigned Idx = CIdx->getZExtValue();
39435 if (!OriginalDemandedElts[Idx])
39436 return TLO.CombineTo(Op, Vec);
39437
39438 KnownBits KnownVec;
39439 APInt DemandedVecElts(OriginalDemandedElts);
39440 DemandedVecElts.clearBit(Idx);
39441 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
39442 KnownVec, TLO, Depth + 1))
39443 return true;
39444
39445 KnownBits KnownScl;
39446 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
39447 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
39448 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
39449 return true;
39450
39451 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
39452 Known = KnownBits::commonBits(KnownVec, KnownScl);
39453 return false;
39454 }
39455 break;
39456 }
39457 case X86ISD::PACKSS:
39458 // PACKSS saturates to MIN/MAX integer values. So if we just want the
39459 // sign bit then we can just ask for the source operands' sign bit.
39460 // TODO - add known bits handling.
39461 if (OriginalDemandedBits.isSignMask()) {
39462 APInt DemandedLHS, DemandedRHS;
39463 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
39464
39465 KnownBits KnownLHS, KnownRHS;
39466 APInt SignMask = APInt::getSignMask(BitWidth * 2);
39467 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
39468 KnownLHS, TLO, Depth + 1))
39469 return true;
39470 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
39471 KnownRHS, TLO, Depth + 1))
39472 return true;
39473
39474 // Attempt to avoid multi-use ops if we don't need anything from them.
39475 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
39476 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
39477 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
39478 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
39479 if (DemandedOp0 || DemandedOp1) {
39480 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
39481 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
39482 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
39483 }
39484 }
39485 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
39486 break;
39487 case X86ISD::VBROADCAST: {
39488 SDValue Src = Op.getOperand(0);
39489 MVT SrcVT = Src.getSimpleValueType();
39490 APInt DemandedElts = APInt::getOneBitSet(
39491 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
39492 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
39493 TLO, Depth + 1))
39494 return true;
39495 // If we don't need the upper bits, attempt to narrow the broadcast source.
39496 // Don't attempt this on AVX512 as it might affect broadcast folding.
39497 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
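// e.g. a vXi64 broadcast of an i64 scalar whose upper 32 bits are not
// demanded becomes a v(2X)i32 broadcast of the truncated scalar, bitcast
// back to the original type.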
39498 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
39499 OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
39500 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
39501 SDValue NewSrc =
39502 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
39503 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
39504 SDValue NewBcst =
39505 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
39506 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
39507 }
39508 break;
39509 }
39510 case X86ISD::PCMPGT:
39511 // icmp sgt(0, R) == ashr(R, BitWidth-1).
39512 // iff we only need the sign bit then we can use R directly.
39513 if (OriginalDemandedBits.isSignMask() &&
39514 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39515 return TLO.CombineTo(Op, Op.getOperand(1));
39516 break;
39517 case X86ISD::MOVMSK: {
39518 SDValue Src = Op.getOperand(0);
39519 MVT SrcVT = Src.getSimpleValueType();
39520 unsigned SrcBits = SrcVT.getScalarSizeInBits();
39521 unsigned NumElts = SrcVT.getVectorNumElements();
39522
39523 // If we don't need the sign bits at all just return zero.
39524 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
39525 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39526
39527 // Only demand the vector elements of the sign bits we need.
39528 APInt KnownUndef, KnownZero;
39529 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
39530 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
39531 TLO, Depth + 1))
39532 return true;
39533
39534 Known.Zero = KnownZero.zextOrSelf(BitWidth);
39535 Known.Zero.setHighBits(BitWidth - NumElts);
39536
39537 // MOVMSK only uses the MSB from each vector element.
39538 KnownBits KnownSrc;
39539 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
39540 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
39541 Depth + 1))
39542 return true;
39543
39544 if (KnownSrc.One[SrcBits - 1])
39545 Known.One.setLowBits(NumElts);
39546 else if (KnownSrc.Zero[SrcBits - 1])
39547 Known.Zero.setLowBits(NumElts);
39548
39549 // Attempt to avoid multi-use ops if we don't need anything from them.
39550 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
39551 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
39552 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
39553 return false;
39554 }
39555 case X86ISD::BEXTR:
39556 case X86ISD::BEXTRI: {
39557 SDValue Op0 = Op.getOperand(0);
39558 SDValue Op1 = Op.getOperand(1);
39559
39560 // Only bottom 16-bits of the control bits are required.
39561 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
39562 // NOTE: SimplifyDemandedBits won't do this for constants.
39563 uint64_t Val1 = Cst1->getZExtValue();
39564 uint64_t MaskedVal1 = Val1 & 0xFFFF;
39565 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
39566 SDLoc DL(Op);
39567 return TLO.CombineTo(
39568 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
39569 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
39570 }
39571
39572 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
39573 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
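// e.g. a control value of 0x0408 encodes Shift = 8 and Length = 4, i.e.
// extract 4 bits starting at bit 8.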
39574
39575 // If the length is 0, the result is 0.
39576 if (Length == 0) {
39577 Known.setAllZero();
39578 return false;
39579 }
39580
39581 if ((Shift + Length) <= BitWidth) {
39582 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
39583 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
39584 return true;
39585
39586 Known = Known.extractBits(Length, Shift);
39587 Known = Known.zextOrTrunc(BitWidth);
39588 return false;
39589 }
39590 } else {
39591 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
39592 KnownBits Known1;
39593 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
39594 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
39595 return true;
39596
39597 // If the length is 0, replace with 0.
39598 KnownBits LengthBits = Known1.extractBits(8, 8);
39599 if (LengthBits.isZero())
39600 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39601 }
39602
39603 break;
39604 }
39605 case X86ISD::PDEP: {
39606 SDValue Op0 = Op.getOperand(0);
39607 SDValue Op1 = Op.getOperand(1);
39608
39609 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
39610 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
39611
39612 // If the demanded bits have leading zeroes, we don't demand those from the
39613 // mask.
39614 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
39615 return true;
39616
39617 // The number of possible 1s in the mask determines the number of LSBs of
39618 // operand 0 used. Undemanded bits from the mask don't matter so filter
39619 // them before counting.
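// e.g. if the mask can only have bits 2 and 3 set, PDEP can deposit at most
// the low 2 bits of operand 0, so only those are demanded below.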
39620 KnownBits Known2;
39621 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
39622 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
39623 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
39624 return true;
39625
39626 // Zeroes are retained from the mask, but not ones.
39627 Known.One.clearAllBits();
39628 // The result will have at least as many trailing zeros as the non-mask
39629 // operand since bits can only map to the same or higher bit position.
39630 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
39631 return false;
39632 }
39633 }
39634
39635 return TargetLowering::SimplifyDemandedBitsForTargetNode(
39636 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
39637}
39638
39639SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39640 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
39641 SelectionDAG &DAG, unsigned Depth) const {
39642 int NumElts = DemandedElts.getBitWidth();
39643 unsigned Opc = Op.getOpcode();
39644 EVT VT = Op.getValueType();
39645
39646 switch (Opc) {
39647 case X86ISD::PINSRB:
39648 case X86ISD::PINSRW: {
39649 // If we don't demand the inserted element, return the base vector.
39650 SDValue Vec = Op.getOperand(0);
39651 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39652 MVT VecVT = Vec.getSimpleValueType();
39653 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
39654 !DemandedElts[CIdx->getZExtValue()])
39655 return Vec;
39656 break;
39657 }
39658 case X86ISD::VSHLI: {
39659 // If we are only demanding sign bits then we can use the shift source
39660 // directly.
39661 SDValue Op0 = Op.getOperand(0);
39662 unsigned ShAmt = Op.getConstantOperandVal(1);
39663 unsigned BitWidth = DemandedBits.getBitWidth();
39664 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
39665 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
39666 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39667 return Op0;
39668 break;
39669 }
39670 case X86ISD::VSRAI:
39671 // iff we only need the sign bit then we can use the source directly.
39672 // TODO: generalize where we only demand extended signbits.
39673 if (DemandedBits.isSignMask())
39674 return Op.getOperand(0);
39675 break;
39676 case X86ISD::PCMPGT:
39677 // icmp sgt(0, R) == ashr(R, BitWidth-1).
39678 // iff we only need the sign bit then we can use R directly.
39679 if (DemandedBits.isSignMask() &&
39680 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39681 return Op.getOperand(1);
39682 break;
39683 }
39684
39685 APInt ShuffleUndef, ShuffleZero;
39686 SmallVector<int, 16> ShuffleMask;
39687 SmallVector<SDValue, 2> ShuffleOps;
39688 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
39689 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
39690 // If all the demanded elts are from one operand and are inline,
39691 // then we can use the operand directly.
39692 int NumOps = ShuffleOps.size();
39693 if (ShuffleMask.size() == (unsigned)NumElts &&
39694 llvm::all_of(ShuffleOps, [VT](SDValue V) {
39695 return VT.getSizeInBits() == V.getValueSizeInBits();
39696 })) {
39697
39698 if (DemandedElts.isSubsetOf(ShuffleUndef))
39699 return DAG.getUNDEF(VT);
39700 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
39701 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
39702
39703 // Bitmask that indicates which ops have only been accessed 'inline'.
39704 APInt IdentityOp = APInt::getAllOnesValue(NumOps);
39705 for (int i = 0; i != NumElts; ++i) {
39706 int M = ShuffleMask[i];
39707 if (!DemandedElts[i] || ShuffleUndef[i])
39708 continue;
39709 int OpIdx = M / NumElts;
39710 int EltIdx = M % NumElts;
39711 if (M < 0 || EltIdx != i) {
39712 IdentityOp.clearAllBits();
39713 break;
39714 }
39715 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
39716 if (IdentityOp == 0)
39717 break;
39718 }
39719 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
39720 "Multiple identity shuffles detected");
39721
39722 if (IdentityOp != 0)
39723 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
39724 }
39725 }
39726
39727 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39728 Op, DemandedBits, DemandedElts, DAG, Depth);
39729}
39730
39731// Helper to peek through bitops/trunc/setcc to determine size of source vector.
39732// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
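// e.g. (v8i1 setcc (v8i32 a), (v8i32 b)) reports a 256-bit source vector.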
39733static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
39734 bool AllowTruncate) {
39735 switch (Src.getOpcode()) {
39736 case ISD::TRUNCATE:
39737 if (!AllowTruncate)
39738 return false;
39739 LLVM_FALLTHROUGH;
39740 case ISD::SETCC:
39741 return Src.getOperand(0).getValueSizeInBits() == Size;
39742 case ISD::AND:
39743 case ISD::XOR:
39744 case ISD::OR:
39745 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
39746 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
39747 }
39748 return false;
39749}
39750
39751// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
39752static unsigned getAltBitOpcode(unsigned Opcode) {
39753 switch(Opcode) {
39754 case ISD::AND: return X86ISD::FAND;
39755 case ISD::OR: return X86ISD::FOR;
39756 case ISD::XOR: return X86ISD::FXOR;
39757 case X86ISD::ANDNP: return X86ISD::FANDN;
39758 }
39759 llvm_unreachable("Unknown bitwise opcode");
39760}
39761
39762// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
39763static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
39764 const SDLoc &DL) {
39765 EVT SrcVT = Src.getValueType();
39766 if (SrcVT != MVT::v4i1)
39767 return SDValue();
39768
39769 switch (Src.getOpcode()) {
39770 case ISD::SETCC:
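// A (setlt X, 0) compare only reads the sign bits, so the original v4f32
// value (or a load reinterpreted as v4f32) can feed MOVMSKPS directly.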
39771 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
39772 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
39773 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
39774 SDValue Op0 = Src.getOperand(0);
39775 if (ISD::isNormalLoad(Op0.getNode()))
39776 return DAG.getBitcast(MVT::v4f32, Op0);
39777 if (Op0.getOpcode() == ISD::BITCAST &&
39778 Op0.getOperand(0).getValueType() == MVT::v4f32)
39779 return Op0.getOperand(0);
39780 }
39781 break;
39782 case ISD::AND:
39783 case ISD::XOR:
39784 case ISD::OR: {
39785 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
39786 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
39787 if (Op0 && Op1)
39788 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
39789 Op1);
39790 break;
39791 }
39792 }
39793 return SDValue();
39794}
39795
39796// Helper to push sign extension of vXi1 SETCC result through bitops.
39797static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
39798 SDValue Src, const SDLoc &DL) {
39799 switch (Src.getOpcode()) {
39800 case ISD::SETCC:
39801 case ISD::TRUNCATE:
39802 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39803 case ISD::AND:
39804 case ISD::XOR:
39805 case ISD::OR:
39806 return DAG.getNode(
39807 Src.getOpcode(), DL, SExtVT,
39808 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
39809 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
39810 }
39811 llvm_unreachable("Unexpected node type for vXi1 sign extension");
39812}
39813
39814// Try to match patterns such as
39815// (i16 bitcast (v16i1 x))
39816// ->
39817// (i16 movmsk (16i8 sext (v16i1 x)))
39818// before the illegal vector is scalarized on subtargets that don't have legal
39819// vxi1 types.
39820static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
39821 const SDLoc &DL,
39822 const X86Subtarget &Subtarget) {
39823 EVT SrcVT = Src.getValueType();
39824 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
39825 return SDValue();
39826
39827 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
39828 // legalization destroys the v4i32 type.
39829 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
39830 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
39831 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
39832 DAG.getBitcast(MVT::v4f32, V));
39833 return DAG.getZExtOrTrunc(V, DL, VT);
39834 }
39835 }
39836
39837 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
39838 // movmskb even with avx512. This will be better than truncating to vXi1 and
39839 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
39840 // vpcmpeqb/vpcmpgtb.
39841 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
39842 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
39843 Src.getOperand(0).getValueType() == MVT::v32i8 ||
39844 Src.getOperand(0).getValueType() == MVT::v64i8);
39845
39846 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
39847 // directly with vpmovmskb/vmovmskps/vmovmskpd.
39848 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
39849 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
39850 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
39851 EVT CmpVT = Src.getOperand(0).getValueType();
39852 EVT EltVT = CmpVT.getVectorElementType();
39853 if (CmpVT.getSizeInBits() <= 256 &&
39854 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
39855 PreferMovMsk = true;
39856 }
39857
39858 // With AVX512 vxi1 types are legal and we prefer using k-regs.
39859 // MOVMSK is supported in SSE2 or later.
39860 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
39861 return SDValue();
39862
39863 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
39864 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
39865 // v8i16 and v16i16.
39866 // For these two cases, we can shuffle the upper element bytes to a
39867 // consecutive sequence at the start of the vector and treat the results as
39868 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
39869 // for v16i16 this is not the case, because the shuffle is expensive, so we
39870 // avoid sign-extending to this type entirely.
39871 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
39872 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
39873 MVT SExtVT;
39874 bool PropagateSExt = false;
39875 switch (SrcVT.getSimpleVT().SimpleTy) {
39876 default:
39877 return SDValue();
39878 case MVT::v2i1:
39879 SExtVT = MVT::v2i64;
39880 break;
39881 case MVT::v4i1:
39882 SExtVT = MVT::v4i32;
39883 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
39884 // sign-extend to a 256-bit operation to avoid truncation.
39885 if (Subtarget.hasAVX() &&
39886 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
39887 SExtVT = MVT::v4i64;
39888 PropagateSExt = true;
39889 }
39890 break;
39891 case MVT::v8i1:
39892 SExtVT = MVT::v8i16;
39893 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
39894 // sign-extend to a 256-bit operation to match the compare.
39895 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
39896 // 256-bit because the shuffle is cheaper than sign extending the result of
39897 // the compare.
39898 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
39899 checkBitcastSrcVectorSize(Src, 512, true))) {
39900 SExtVT = MVT::v8i32;
39901 PropagateSExt = true;
39902 }
39903 break;
39904 case MVT::v16i1:
39905 SExtVT = MVT::v16i8;
39906 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
39907 // it is not profitable to sign-extend to 256-bit because this will
39908 // require an extra cross-lane shuffle which is more expensive than
39909 // truncating the result of the compare to 128-bits.
39910 break;
39911 case MVT::v32i1:
39912 SExtVT = MVT::v32i8;
39913 break;
39914 case MVT::v64i1:
39915 // If we have AVX512F but not AVX512BW, and the input is a truncate from
39916 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
39917 if (Subtarget.hasAVX512()) {
39918 if (Subtarget.hasBWI())
39919 return SDValue();
39920 SExtVT = MVT::v64i8;
39921 break;
39922 }
39923 // Split if this is a <64 x i8> comparison result.
39924 if (checkBitcastSrcVectorSize(Src, 512, false)) {
39925 SExtVT = MVT::v64i8;
39926 break;
39927 }
39928 return SDValue();
39929 };
39930
39931 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
39932 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39933
39934 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
39935 V = getPMOVMSKB(DL, V, DAG, Subtarget);
39936 } else {
39937 if (SExtVT == MVT::v8i16)
39938 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
39939 DAG.getUNDEF(MVT::v8i16));
39940 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
39941 }
39942
39943 EVT IntVT =
39944 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
39945 V = DAG.getZExtOrTrunc(V, DL, IntVT);
39946 return DAG.getBitcast(VT, V);
39947}
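
For reference, a minimal standalone sketch (not part of this file; the helper name is invented) of the scalar effect combineBitcastvxi1 is chasing on plain SSE2: bitcasting the v16i1 result of a byte compare to an i16 is the same value PMOVMSKB produces from the compare mask.

#include <immintrin.h>
#include <cstdint>

// Illustrative only: (i16 bitcast (v16i1 setcc a, b)) computed with SSE2.
static inline uint16_t bitcast_v16i1_to_i16(__m128i a, __m128i b) {
  // pcmpeqb already yields the sign-extended mask (0x00/0xFF per byte);
  // pmovmskb packs its 16 sign bits into the low bits of a GPR.
  __m128i mask = _mm_cmpeq_epi8(a, b);
  return static_cast<uint16_t>(_mm_movemask_epi8(mask));
}
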
39948
39949// Convert a vXi1 constant build vector to the same width scalar integer.
39950static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
39951 EVT SrcVT = Op.getValueType();
39952 assert(SrcVT.getVectorElementType() == MVT::i1 &&
39953 "Expected a vXi1 vector");
39954 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
39955 "Expected a constant build vector");
39956
39957 APInt Imm(SrcVT.getVectorNumElements(), 0);
39958 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
39959 SDValue In = Op.getOperand(Idx);
39960 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
39961 Imm.setBit(Idx);
39962 }
39963 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
39964 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
39965}
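
A small sketch of the packing above, assuming a hypothetical plain-bool input rather than a build vector node (the helper name is invented); undef elements map to 0 just like the loop above.

#include <cstddef>
#include <cstdint>

// Illustrative only: bit Idx of the result is element Idx of the vXi1 vector.
static inline uint64_t packBoolVector(const bool *Elts, size_t NumElts) {
  uint64_t Imm = 0;
  for (size_t Idx = 0; Idx != NumElts; ++Idx)
    if (Elts[Idx])
      Imm |= uint64_t(1) << Idx;   // mirrors Imm.setBit(Idx)
  return Imm;
}
// e.g. a constant v4i1 <1,0,1,1> becomes the i4 value 0b1101.
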
39966
39967static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
39968 TargetLowering::DAGCombinerInfo &DCI,
39969 const X86Subtarget &Subtarget) {
39970 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
39971
39972 if (!DCI.isBeforeLegalizeOps())
39973 return SDValue();
39974
39975 // Only do this if we have k-registers.
39976 if (!Subtarget.hasAVX512())
39977 return SDValue();
39978
39979 EVT DstVT = N->getValueType(0);
39980 SDValue Op = N->getOperand(0);
39981 EVT SrcVT = Op.getValueType();
39982
39983 if (!Op.hasOneUse())
39984 return SDValue();
39985
39986 // Look for logic ops.
39987 if (Op.getOpcode() != ISD::AND &&
39988 Op.getOpcode() != ISD::OR &&
39989 Op.getOpcode() != ISD::XOR)
39990 return SDValue();
39991
39992 // Make sure we have a bitcast between mask registers and a scalar type.
39993 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
39994 DstVT.isScalarInteger()) &&
39995 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
39996 SrcVT.isScalarInteger()))
39997 return SDValue();
39998
39999 SDValue LHS = Op.getOperand(0);
40000 SDValue RHS = Op.getOperand(1);
40001
40002 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
40003 LHS.getOperand(0).getValueType() == DstVT)
40004 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
40005 DAG.getBitcast(DstVT, RHS));
40006
40007 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
40008 RHS.getOperand(0).getValueType() == DstVT)
40009 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40010 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
40011
40012 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
40013 // Most of these have to move a constant from the scalar domain anyway.
40014 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
40015 RHS = combinevXi1ConstantToInteger(RHS, DAG);
40016 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40017 DAG.getBitcast(DstVT, LHS), RHS);
40018 }
40019
40020 return SDValue();
40021}
40022
40023static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
40024 const X86Subtarget &Subtarget) {
40025 SDLoc DL(BV);
40026 unsigned NumElts = BV->getNumOperands();
40027 SDValue Splat = BV->getSplatValue();
40028
40029 // Build MMX element from integer GPR or SSE float values.
40030 auto CreateMMXElement = [&](SDValue V) {
40031 if (V.isUndef())
40032 return DAG.getUNDEF(MVT::x86mmx);
40033 if (V.getValueType().isFloatingPoint()) {
40034 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
40035 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
40036 V = DAG.getBitcast(MVT::v2i64, V);
40037 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
40038 }
40039 V = DAG.getBitcast(MVT::i32, V);
40040 } else {
40041 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
40042 }
40043 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
40044 };
40045
40046 // Convert build vector ops to MMX data in the bottom elements.
40047 SmallVector<SDValue, 8> Ops;
40048
40049 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40050
40051 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
40052 if (Splat) {
40053 if (Splat.isUndef())
40054 return DAG.getUNDEF(MVT::x86mmx);
40055
40056 Splat = CreateMMXElement(Splat);
40057
40058 if (Subtarget.hasSSE1()) {
40059 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
40060 if (NumElts == 8)
40061 Splat = DAG.getNode(
40062 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40063 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
40064 TLI.getPointerTy(DAG.getDataLayout())),
40065 Splat, Splat);
40066
40067 // Use PSHUFW to repeat 16-bit elements.
40068 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
40069 return DAG.getNode(
40070 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40071 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
40072 TLI.getPointerTy(DAG.getDataLayout())),
40073 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
40074 }
40075 Ops.append(NumElts, Splat);
40076 } else {
40077 for (unsigned i = 0; i != NumElts; ++i)
40078 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
40079 }
40080
40081 // Use tree of PUNPCKLs to build up general MMX vector.
40082 while (Ops.size() > 1) {
40083 unsigned NumOps = Ops.size();
40084 unsigned IntrinOp =
40085 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
40086 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
40087 : Intrinsic::x86_mmx_punpcklbw));
40088 SDValue Intrin = DAG.getTargetConstant(
40089 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
40090 for (unsigned i = 0; i != NumOps; i += 2)
40091 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
40092 Ops[i], Ops[i + 1]);
40093 Ops.resize(NumOps / 2);
40094 }
40095
40096 return Ops[0];
40097}
40098
40099// Recursive function that attempts to find if a bool vector node was originally
40100// a vector/float/double that got truncated/extended/bitcast to/from a scalar
40101// integer. If so, replace the scalar ops with bool vector equivalents back down
40102// the chain.
40103static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
40104 SelectionDAG &DAG,
40105 const X86Subtarget &Subtarget) {
40106 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40107 unsigned Opc = V.getOpcode();
40108 switch (Opc) {
40109 case ISD::BITCAST: {
40110 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
40111 SDValue Src = V.getOperand(0);
40112 EVT SrcVT = Src.getValueType();
40113 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
40114 return DAG.getBitcast(VT, Src);
40115 break;
40116 }
40117 case ISD::TRUNCATE: {
40118 // If we find a suitable source, a truncated scalar becomes a subvector.
40119 SDValue Src = V.getOperand(0);
40120 EVT NewSrcVT =
40121 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
40122 if (TLI.isTypeLegal(NewSrcVT))
40123 if (SDValue N0 =
40124 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40125 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
40126 DAG.getIntPtrConstant(0, DL));
40127 break;
40128 }
40129 case ISD::ANY_EXTEND:
40130 case ISD::ZERO_EXTEND: {
40131 // If we find a suitable source, an extended scalar becomes a subvector.
40132 SDValue Src = V.getOperand(0);
40133 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
40134 Src.getScalarValueSizeInBits());
40135 if (TLI.isTypeLegal(NewSrcVT))
40136 if (SDValue N0 =
40137 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40138 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40139 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
40140 : DAG.getConstant(0, DL, VT),
40141 N0, DAG.getIntPtrConstant(0, DL));
40142 break;
40143 }
40144 case ISD::OR: {
40145 // If we find suitable sources, we can just move an OR to the vector domain.
40146 SDValue Src0 = V.getOperand(0);
40147 SDValue Src1 = V.getOperand(1);
40148 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40149 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
40150 return DAG.getNode(Opc, DL, VT, N0, N1);
40151 break;
40152 }
40153 case ISD::SHL: {
40154 // If we find a suitable source, a SHL becomes a KSHIFTL.
40155 SDValue Src0 = V.getOperand(0);
40156 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
40157 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
40158 break;
40159
40160 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
40161 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40162 return DAG.getNode(
40163 X86ISD::KSHIFTL, DL, VT, N0,
40164 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
40165 break;
40166 }
40167 }
40168 return SDValue();
40169}
40170
40171static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
40172 TargetLowering::DAGCombinerInfo &DCI,
40173 const X86Subtarget &Subtarget) {
40174 SDValue N0 = N->getOperand(0);
40175 EVT VT = N->getValueType(0);
40176 EVT SrcVT = N0.getValueType();
40177 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40178
40179 // Try to match patterns such as
40180 // (i16 bitcast (v16i1 x))
40181 // ->
40182 // (i16 movmsk (v16i8 sext (v16i1 x)))
40183 // before the setcc result is scalarized on subtargets that don't have legal
40184 // vxi1 types.
40185 if (DCI.isBeforeLegalize()) {
40186 SDLoc dl(N);
40187 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
40188 return V;
40189
40190 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40191 // type, widen both sides to avoid a trip through memory.
40192 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
40193 Subtarget.hasAVX512()) {
40194 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
40195 N0 = DAG.getBitcast(MVT::v8i1, N0);
40196 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
40197 DAG.getIntPtrConstant(0, dl));
40198 }
40199
40200 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40201 // type, widen both sides to avoid a trip through memory.
40202 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
40203 Subtarget.hasAVX512()) {
40204 // Use zeros for the widening if we already have some zeros. This can
40205 // allow SimplifyDemandedBits to remove scalar ANDs that may be
40206 // downstream of this.
40207 // FIXME: It might make sense to detect a concat_vectors with a mix of
40208 // zeroes and undef and turn it into insert_subvector for i1 vectors as
40209 // a separate combine. What we can't do is canonicalize the operands of
40210 // such a concat or we'll get into a loop with SimplifyDemandedBits.
40211 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
40212 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
40213 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
40214 SrcVT = LastOp.getValueType();
40215 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40216 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
40217 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
40218 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40219 N0 = DAG.getBitcast(MVT::i8, N0);
40220 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40221 }
40222 }
40223
40224 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40225 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
40226 Ops[0] = N0;
40227 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40228 N0 = DAG.getBitcast(MVT::i8, N0);
40229 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40230 }
40231 } else {
40232 // If we're bitcasting from iX to vXi1, see if the integer originally
40233 // began as a vXi1 and whether we can remove the bitcast entirely.
40234 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
40235 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
40236 if (SDValue V =
40237 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
40238 return V;
40239 }
40240 }
40241
40242 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
40243 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
40244 // due to insert_subvector legalization on KNL. By promoting the copy to i16
40245 // we can help with known bits propagation from the vXi1 domain to the
40246 // scalar domain.
40247 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
40248 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40249 N0.getOperand(0).getValueType() == MVT::v16i1 &&
40250 isNullConstant(N0.getOperand(1)))
40251 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
40252 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
40253
40254 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
40255 // and the vbroadcast_load are both integer or both fp. In some cases this
40256 // will remove the bitcast entirely.
40257 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
40258 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
40259 auto *BCast = cast<MemIntrinsicSDNode>(N0);
40260 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
40261 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
40262 // Don't swap i8/i16 since we don't have fp types of that size.
40263 if (MemSize >= 32) {
40264 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
40265 : MVT::getIntegerVT(MemSize);
40266 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
40267 : MVT::getIntegerVT(SrcVTSize);
40268 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
40269
40270 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
40271 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
40272 SDValue ResNode =
40273 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
40274 MemVT, BCast->getMemOperand());
40275 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
40276 return DAG.getBitcast(VT, ResNode);
40277 }
40278 }
40279
40280 // Since MMX types are special and don't usually play with other vector types,
40281 // it's better to handle them early to be sure we emit efficient code by
40282 // avoiding store-load conversions.
40283 if (VT == MVT::x86mmx) {
40284 // Detect MMX constant vectors.
40285 APInt UndefElts;
40286 SmallVector<APInt, 1> EltBits;
40287 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
40288 SDLoc DL(N0);
40289 // Handle zero-extension of i32 with MOVD.
40290 if (EltBits[0].countLeadingZeros() >= 32)
40291 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
40292 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
40293 // Else, bitcast to a double.
40294 // TODO - investigate supporting sext 32-bit immediates on x86_64.
40295 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
40296 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
40297 }
40298
40299 // Detect bitcasts to x86mmx low word.
40300 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40301 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
40302 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
40303 bool LowUndef = true, AllUndefOrZero = true;
40304 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
40305 SDValue Op = N0.getOperand(i);
40306 LowUndef &= Op.isUndef() || (i >= e/2);
40307 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
40308 }
40309 if (AllUndefOrZero) {
40310 SDValue N00 = N0.getOperand(0);
40311 SDLoc dl(N00);
40312 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
40313 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
40314 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
40315 }
40316 }
40317
40318 // Detect bitcasts of 64-bit build vectors and convert to a
40319 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
40320 // lowest element.
40321 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40322 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
40323 SrcVT == MVT::v8i8))
40324 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
40325
40326 // Detect bitcasts between element or subvector extraction to x86mmx.
40327 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
40328 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
40329 isNullConstant(N0.getOperand(1))) {
40330 SDValue N00 = N0.getOperand(0);
40331 if (N00.getValueType().is128BitVector())
40332 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
40333 DAG.getBitcast(MVT::v2i64, N00));
40334 }
40335
40336 // Detect bitcasts from FP_TO_SINT to x86mmx.
40337 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
40338 SDLoc DL(N0);
40339 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
40340 DAG.getUNDEF(MVT::v2i32));
40341 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
40342 DAG.getBitcast(MVT::v2i64, Res));
40343 }
40344 }
40345
40346 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
40347 // most of these to scalar anyway.
40348 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
40349 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
40350 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
40351 return combinevXi1ConstantToInteger(N0, DAG);
40352 }
40353
40354 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40355 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40356 isa<ConstantSDNode>(N0)) {
40357 auto *C = cast<ConstantSDNode>(N0);
40358 if (C->isAllOnesValue())
40359 return DAG.getConstant(1, SDLoc(N0), VT);
40360 if (C->isNullValue())
40361 return DAG.getConstant(0, SDLoc(N0), VT);
40362 }
40363
40364 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
40365 // Turn it into a sign bit compare that produces a k-register. This avoids
40366 // a trip through a GPR.
40367 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40368 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40369 isPowerOf2_32(VT.getVectorNumElements())) {
40370 unsigned NumElts = VT.getVectorNumElements();
40371 SDValue Src = N0;
40372
40373 // Peek through truncate.
40374 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
40375 Src = N0.getOperand(0);
40376
40377 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
40378 SDValue MovmskIn = Src.getOperand(0);
40379 MVT MovmskVT = MovmskIn.getSimpleValueType();
40380 unsigned MovMskElts = MovmskVT.getVectorNumElements();
40381
40382 // We allow extra bits of the movmsk to be used since they are known zero.
40383 // We can't convert a VPMOVMSKB without avx512bw.
40384 if (MovMskElts <= NumElts &&
40385 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
40386 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
40387 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
40388 SDLoc dl(N);
40389 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
40390 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
40391 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
40392 if (EVT(CmpVT) == VT)
40393 return Cmp;
40394
40395 // Pad with zeroes up to original VT to replace the zeroes that were
40396 // being used from the MOVMSK.
40397 unsigned NumConcats = NumElts / MovMskElts;
40398 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
40399 Ops[0] = Cmp;
40400 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
40401 }
40402 }
40403 }
40404
40405 // Try to remove bitcasts from input and output of mask arithmetic to
40406 // remove GPR<->K-register crossings.
40407 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
40408 return V;
40409
40410 // Convert a bitcasted integer logic operation that has one bitcasted
40411 // floating-point operand into a floating-point logic operation. This may
40412 // create a load of a constant, but that is cheaper than materializing the
40413 // constant in an integer register and transferring it to an SSE register or
40414 // transferring the SSE operand to integer register and back.
40415 unsigned FPOpcode;
40416 switch (N0.getOpcode()) {
40417 case ISD::AND: FPOpcode = X86ISD::FAND; break;
40418 case ISD::OR: FPOpcode = X86ISD::FOR; break;
40419 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
40420 default: return SDValue();
40421 }
40422
40423 // Check if we have a bitcast from another integer type as well.
40424 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
40425 (Subtarget.hasSSE2() && VT == MVT::f64) ||
40426 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
40427 TLI.isTypeLegal(VT))))
40428 return SDValue();
40429
40430 SDValue LogicOp0 = N0.getOperand(0);
40431 SDValue LogicOp1 = N0.getOperand(1);
40432 SDLoc DL0(N0);
40433
40434 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
40435 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
40436 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
40437 LogicOp0.getOperand(0).getValueType() == VT &&
40438 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
40439 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
40440 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40441 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
40442 }
40443 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
40444 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
40445 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
40446 LogicOp1.getOperand(0).getValueType() == VT &&
40447 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
40448 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
40449 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40450 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
40451 }
40452
40453 return SDValue();
40454}
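
The integer-logic to FP-logic rewrite at the end of combineBitcast relies on a bitwise op giving the same bits whichever domain it runs in. A hedged sketch with SSE intrinsics (helper names are invented):

#include <immintrin.h>

// bitcast(logic(bitcast(X), Y)) ...
static inline __m128 andViaInteger(__m128 X, __m128 Y) {
  __m128i Xi = _mm_castps_si128(X);               // bitcast v4f32 -> v4i32
  __m128i Yi = _mm_castps_si128(Y);
  return _mm_castsi128_ps(_mm_and_si128(Xi, Yi)); // PAND, then bitcast back
}

// ... --> logic'(X, bitcast(Y)): same bits via ANDPS, no domain crossing.
static inline __m128 andViaFP(__m128 X, __m128 Y) {
  return _mm_and_ps(X, Y);                        // X86ISD::FAND form
}
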
40455
40456// Given an ABS node, detect the following pattern:
40457// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
40458// This is useful as it is the input into a SAD pattern.
40459static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
40460 SDValue AbsOp1 = Abs->getOperand(0);
40461 if (AbsOp1.getOpcode() != ISD::SUB)
40462 return false;
40463
40464 Op0 = AbsOp1.getOperand(0);
40465 Op1 = AbsOp1.getOperand(1);
40466
40467 // Check if the operands of the sub are zero-extended from vectors of i8.
40468 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
40469 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
40470 Op1.getOpcode() != ISD::ZERO_EXTEND ||
40471 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
40472 return false;
40473
40474 return true;
40475}
40476
40477// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
40478// to these zexts.
40479static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
40480 const SDValue &Zext1, const SDLoc &DL,
40481 const X86Subtarget &Subtarget) {
40482 // Find the appropriate width for the PSADBW.
40483 EVT InVT = Zext0.getOperand(0).getValueType();
40484 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
40485
40486 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
40487 // fill in the missing vector elements with 0.
40488 unsigned NumConcat = RegSize / InVT.getSizeInBits();
40489 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
40490 Ops[0] = Zext0.getOperand(0);
40491 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
40492 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40493 Ops[0] = Zext1.getOperand(0);
40494 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40495
40496 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
40497 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40498 ArrayRef<SDValue> Ops) {
40499 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
40500 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
40501 };
40502 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
40503 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
40504 PSADBWBuilder);
40505}
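
What the PSADBW node built here computes, as a standalone SSE2 sketch with a scalar reference (invented helper names, 16-byte inputs):

#include <immintrin.h>
#include <cstdint>
#include <cstdlib>

// PSADBW: per 8-byte group, the sum of absolute differences of the inputs.
static inline uint32_t sad16(__m128i A, __m128i B) {
  __m128i Sad = _mm_sad_epu8(A, B);                  // two partial sums, one per 64-bit half
  Sad = _mm_add_epi64(Sad, _mm_srli_si128(Sad, 8));  // fold the halves together
  return static_cast<uint32_t>(_mm_cvtsi128_si32(Sad));
}

// Scalar reference for the same 16 bytes.
static inline uint32_t sad16Ref(const uint8_t *A, const uint8_t *B) {
  uint32_t Sum = 0;
  for (int I = 0; I != 16; ++I)
    Sum += static_cast<uint32_t>(std::abs(int(A[I]) - int(B[I])));
  return Sum;
}
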
40506
40507// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
40508// PHMINPOSUW.
40509static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
40510 const X86Subtarget &Subtarget) {
40511 // Bail without SSE41.
40512 if (!Subtarget.hasSSE41())
40513 return SDValue();
40514
40515 EVT ExtractVT = Extract->getValueType(0);
40516 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
40517 return SDValue();
40518
40519 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
40520 ISD::NodeType BinOp;
40521 SDValue Src = DAG.matchBinOpReduction(
40522 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
40523 if (!Src)
40524 return SDValue();
40525
40526 EVT SrcVT = Src.getValueType();
40527 EVT SrcSVT = SrcVT.getScalarType();
40528 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
40529 return SDValue();
40530
40531 SDLoc DL(Extract);
40532 SDValue MinPos = Src;
40533
40534 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
40535 while (SrcVT.getSizeInBits() > 128) {
40536 SDValue Lo, Hi;
40537 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
40538 SrcVT = Lo.getValueType();
40539 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
40540 }
40541 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
40542 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
40543 "Unexpected value type");
40544
40545 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
40546 // to flip the value accordingly.
40547 SDValue Mask;
40548 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
40549 if (BinOp == ISD::SMAX)
40550 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
40551 else if (BinOp == ISD::SMIN)
40552 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
40553 else if (BinOp == ISD::UMAX)
40554 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
40555
40556 if (Mask)
40557 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40558
40559 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
40560 // shuffling each upper element down and inserting zeros. This means that the
40561 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
40562 // ready for the PHMINPOS.
40563 if (ExtractVT == MVT::i8) {
40564 SDValue Upper = DAG.getVectorShuffle(
40565 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
40566 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
40567 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
40568 }
40569
40570 // Perform the PHMINPOS on a v8i16 vector,
40571 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
40572 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
40573 MinPos = DAG.getBitcast(SrcVT, MinPos);
40574
40575 if (Mask)
40576 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40577
40578 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
40579 DAG.getIntPtrConstant(0, DL));
40580}
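
A sketch of the PHMINPOSUW trick in intrinsic form. PHMINPOSUW only performs an unsigned-min reduction of a v8i16 directly; the XOR mask above translates SMIN/SMAX/UMAX into that problem and back, shown here for SMAX. Requires SSE4.1; the helper names are invented.

#include <immintrin.h>
#include <cstdint>

static inline uint16_t uminReduceV8i16(__m128i V) {
  // phminposuw places the minimum u16 in element 0 (its index in element 1).
  return static_cast<uint16_t>(_mm_extract_epi16(_mm_minpos_epu16(V), 0));
}

static inline int16_t smaxReduceV8i16(__m128i V) {
  // x ^ 0x7fff reverses the signed order when compared as unsigned, so the
  // unsigned min of the flipped values is the signed max of the originals.
  const __m128i Mask = _mm_set1_epi16(0x7fff);
  __m128i Flipped = _mm_xor_si128(V, Mask);
  uint16_t Min = uminReduceV8i16(Flipped);
  return static_cast<int16_t>(Min ^ 0x7fff);
}
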
40581
40582// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
40583static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
40584 const X86Subtarget &Subtarget) {
40585 // Bail without SSE2.
40586 if (!Subtarget.hasSSE2())
40587 return SDValue();
40588
40589 EVT ExtractVT = Extract->getValueType(0);
40590 unsigned BitWidth = ExtractVT.getSizeInBits();
40591 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
40592 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
40593 return SDValue();
40594
40595 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
40596 ISD::NodeType BinOp;
40597 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
40598 if (!Match && ExtractVT == MVT::i1)
40599 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
40600 if (!Match)
40601 return SDValue();
40602
40603 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
40604 // which we can't support here for now.
40605 if (Match.getScalarValueSizeInBits() != BitWidth)
40606 return SDValue();
40607
40608 SDValue Movmsk;
40609 SDLoc DL(Extract);
40610 EVT MatchVT = Match.getValueType();
40611 unsigned NumElts = MatchVT.getVectorNumElements();
40612 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
40613 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40614
40615 if (ExtractVT == MVT::i1) {
40616 // Special case for (pre-legalization) vXi1 reductions.
40617 if (NumElts > 64 || !isPowerOf2_32(NumElts))
40618 return SDValue();
40619 if (TLI.isTypeLegal(MatchVT)) {
40620 // If this is a legal AVX512 predicate type then we can just bitcast.
40621 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40622 Movmsk = DAG.getBitcast(MovmskVT, Match);
40623 } else {
40624 // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
40625 // PCMPEQQ (SSE41+), use PCMPEQD instead.
40626 if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
40627 Match.getOpcode() == ISD::SETCC &&
40628 ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
40629 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
40630 ISD::CondCode::SETEQ) {
40631 SDValue Vec = Match.getOperand(0);
40632 if (Vec.getValueType().getScalarType() == MVT::i64 &&
40633 (2 * NumElts) <= MaxElts) {
40634 NumElts *= 2;
40635 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
40636 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
40637 Match = DAG.getSetCC(
40638 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
40639 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
40640 }
40641 }
40642
40643 // Use combineBitcastvxi1 to create the MOVMSK.
40644 while (NumElts > MaxElts) {
40645 SDValue Lo, Hi;
40646 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40647 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40648 NumElts /= 2;
40649 }
40650 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40651 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
40652 }
40653 if (!Movmsk)
40654 return SDValue();
40655 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
40656 } else {
40657 // FIXME: Better handling of k-registers or 512-bit vectors?
40658 unsigned MatchSizeInBits = Match.getValueSizeInBits();
40659 if (!(MatchSizeInBits == 128 ||
40660 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
40661 return SDValue();
40662
40663 // Make sure this isn't a vector of 1 element. The perf win from using
40664 // MOVMSK diminishes with fewer elements in the reduction, but it is
40665 // generally better to get the comparison over to the GPRs as soon as
40666 // possible to reduce the number of vector ops.
40667 if (Match.getValueType().getVectorNumElements() < 2)
40668 return SDValue();
40669
40670 // Check that we are extracting a reduction of all sign bits.
40671 if (DAG.ComputeNumSignBits(Match) != BitWidth)
40672 return SDValue();
40673
40674 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
40675 SDValue Lo, Hi;
40676 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40677 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40678 MatchSizeInBits = Match.getValueSizeInBits();
40679 }
40680
40681 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
40682 MVT MaskSrcVT;
40683 if (64 == BitWidth || 32 == BitWidth)
40684 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
40685 MatchSizeInBits / BitWidth);
40686 else
40687 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
40688
40689 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
40690 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
40691 NumElts = MaskSrcVT.getVectorNumElements();
40692 }
40693 assert((NumElts <= 32 || NumElts == 64) &&
40694 "Not expecting more than 64 elements");
40695
40696 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
40697 if (BinOp == ISD::XOR) {
40698 // parity -> (PARITY(MOVMSK X))
40699 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
40700 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
40701 }
40702
40703 SDValue CmpC;
40704 ISD::CondCode CondCode;
40705 if (BinOp == ISD::OR) {
40706 // any_of -> MOVMSK != 0
40707 CmpC = DAG.getConstant(0, DL, CmpVT);
40708 CondCode = ISD::CondCode::SETNE;
40709 } else {
40710 // all_of -> MOVMSK == ((1 << NumElts) - 1)
40711 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
40712 DL, CmpVT);
40713 CondCode = ISD::CondCode::SETEQ;
40714 }
40715
40716 // The setcc produces an i8 of 0/1, so extend that to the result width and
40717 // negate to get the final 0/-1 mask value.
40718 EVT SetccVT =
40719 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
40720 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
40721 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
40722 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
40723 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
40724}
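
The scalar tests this combine emits once the predicate vector has been turned into a MOVMSK value, sketched with SSE2 intrinsics for a 16 x i8 equality predicate (helper names are invented; parity uses the compiler builtin rather than a dedicated instruction):

#include <immintrin.h>

static inline bool anyOfBytesEq(__m128i A, __m128i B) {
  return _mm_movemask_epi8(_mm_cmpeq_epi8(A, B)) != 0;        // any_of -> MOVMSK != 0
}
static inline bool allOfBytesEq(__m128i A, __m128i B) {
  return _mm_movemask_epi8(_mm_cmpeq_epi8(A, B)) == 0xFFFF;   // all_of -> MOVMSK == (1 << NumElts) - 1
}
static inline bool parityOfBytesEq(__m128i A, __m128i B) {
  int Msk = _mm_movemask_epi8(_mm_cmpeq_epi8(A, B));
  return (__builtin_popcount(Msk) & 1) != 0;                  // parity -> PARITY(MOVMSK)
}
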
40725
40726static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
40727 const X86Subtarget &Subtarget) {
40728 // PSADBW is only supported on SSE2 and up.
40729 if (!Subtarget.hasSSE2())
40730 return SDValue();
40731
40732 EVT ExtractVT = Extract->getValueType(0);
40733 // Verify the type we're extracting is either i32 or i64.
40734 // FIXME: Could support other types, but this is what we have coverage for.
40735 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
40736 return SDValue();
40737
40738 EVT VT = Extract->getOperand(0).getValueType();
40739 if (!isPowerOf2_32(VT.getVectorNumElements()))
40740 return SDValue();
40741
40742 // Match shuffle + add pyramid.
40743 ISD::NodeType BinOp;
40744 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
40745
40746 // The operand is expected to be zero extended from i8
40747 // (verified in detectZextAbsDiff).
40748 // In order to convert to i64 and above, an additional any/zero/sign
40749 // extend is expected.
40750 // The zero extend from 32 bits has no mathematical effect on the result.
40751 // Also, the sign extend is basically a zero extend
40752 // (it extends the sign bit, which is zero).
40753 // So it is correct to skip the sign/zero extend instruction.
40754 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
40755 Root.getOpcode() == ISD::ZERO_EXTEND ||
40756 Root.getOpcode() == ISD::ANY_EXTEND))
40757 Root = Root.getOperand(0);
40758
40759 // If there was a match, we want Root to be a select that is the root of an
40760 // abs-diff pattern.
40761 if (!Root || Root.getOpcode() != ISD::ABS)
40762 return SDValue();
40763
40764 // Check whether we have an abs-diff pattern feeding into the select.
40765 SDValue Zext0, Zext1;
40766 if (!detectZextAbsDiff(Root, Zext0, Zext1))
40767 return SDValue();
40768
40769 // Create the SAD instruction.
40770 SDLoc DL(Extract);
40771 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
40772
40773 // If the original vector was wider than 8 elements, sum over the results
40774 // in the SAD vector.
40775 unsigned Stages = Log2_32(VT.getVectorNumElements());
40776 EVT SadVT = SAD.getValueType();
40777 if (Stages > 3) {
40778 unsigned SadElems = SadVT.getVectorNumElements();
40779
40780 for(unsigned i = Stages - 3; i > 0; --i) {
40781 SmallVector<int, 16> Mask(SadElems, -1);
40782 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
40783 Mask[j] = MaskEnd + j;
40784
40785 SDValue Shuffle =
40786 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
40787 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
40788 }
40789 }
40790
40791 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
40792 // Return the lowest ExtractSizeInBits bits.
40793 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
40794 SadVT.getSizeInBits() / ExtractSizeInBits);
40795 SAD = DAG.getBitcast(ResVT, SAD);
40796 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
40797 Extract->getOperand(1));
40798}
40799
40800// Attempt to peek through a target shuffle and extract the scalar from the
40801// source.
40802static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
40803 TargetLowering::DAGCombinerInfo &DCI,
40804 const X86Subtarget &Subtarget) {
40805 if (DCI.isBeforeLegalizeOps())
40806 return SDValue();
40807
40808 SDLoc dl(N);
40809 SDValue Src = N->getOperand(0);
40810 SDValue Idx = N->getOperand(1);
40811
40812 EVT VT = N->getValueType(0);
40813 EVT SrcVT = Src.getValueType();
40814 EVT SrcSVT = SrcVT.getVectorElementType();
40815 unsigned SrcEltBits = SrcSVT.getSizeInBits();
40816 unsigned NumSrcElts = SrcVT.getVectorNumElements();
40817
40818 // Don't attempt this for boolean mask vectors or unknown extraction indices.
40819 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
40820 return SDValue();
40821
40822 const APInt &IdxC = N->getConstantOperandAPInt(1);
40823 if (IdxC.uge(NumSrcElts))
40824 return SDValue();
40825
40826 SDValue SrcBC = peekThroughBitcasts(Src);
40827
40828 // Handle extract(bitcast(broadcast(scalar_value))).
40829 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
40830 SDValue SrcOp = SrcBC.getOperand(0);
40831 EVT SrcOpVT = SrcOp.getValueType();
40832 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
40833 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
40834 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
40835 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
40836 // TODO support non-zero offsets.
40837 if (Offset == 0) {
40838 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
40839 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
40840 return SrcOp;
40841 }
40842 }
40843 }
40844
40845 // If we're extracting a single element from a broadcast load and there are
40846 // no other users, just create a single load.
40847 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
40848 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
40849 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
40850 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
40851 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
40852 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
40853 MemIntr->getBasePtr(),
40854 MemIntr->getPointerInfo(),
40855 MemIntr->getOriginalAlign(),
40856 MemIntr->getMemOperand()->getFlags());
40857 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40858 return Load;
40859 }
40860 }
40861
40862 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
40863 // TODO: Move to DAGCombine?
40864 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
40865 SrcBC.getValueType().isInteger() &&
40866 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
40867 SrcBC.getScalarValueSizeInBits() ==
40868 SrcBC.getOperand(0).getValueSizeInBits()) {
40869 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
40870 if (IdxC.ult(Scale)) {
40871 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
40872 SDValue Scl = SrcBC.getOperand(0);
40873 EVT SclVT = Scl.getValueType();
40874 if (Offset) {
40875 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
40876 DAG.getShiftAmountConstant(Offset, SclVT, dl));
40877 }
40878 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
40879 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
40880 return Scl;
40881 }
40882 }
40883
40884 // Handle extract(truncate(x)) for 0'th index.
40885 // TODO: Treat this as a faux shuffle?
40886 // TODO: When can we use this for general indices?
40887 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
40888 (SrcVT.getSizeInBits() % 128) == 0) {
40889 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
40890 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
40891 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
40892 Idx);
40893 }
40894
40895 // We can only legally extract other elements from 128-bit vectors and in
40896 // certain circumstances, depending on SSE-level.
40897 // TODO: Investigate float/double extraction if it will be just stored.
40898 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
40899 unsigned Idx) {
40900 EVT VecSVT = VecVT.getScalarType();
40901 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
40902 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
40903 VecSVT == MVT::i64)) {
40904 unsigned EltSizeInBits = VecSVT.getSizeInBits();
40905 unsigned NumEltsPerLane = 128 / EltSizeInBits;
40906 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
40907 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
40908 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
40909 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
40910 Idx &= (NumEltsPerLane - 1);
40911 }
40912 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
40913 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
40914 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
40915 DAG.getBitcast(VecVT, Vec),
40916 DAG.getIntPtrConstant(Idx, dl));
40917 }
40918 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
40919 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
40920 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
40921 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
40922 DAG.getTargetConstant(Idx, dl, MVT::i8));
40923 }
40924 return SDValue();
40925 };
40926
40927 // Resolve the target shuffle inputs and mask.
40928 SmallVector<int, 16> Mask;
40929 SmallVector<SDValue, 2> Ops;
40930 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
40931 return SDValue();
40932
40933 // Shuffle inputs must be the same size as the result.
40934 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
40935 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
40936 }))
40937 return SDValue();
40938
40939 // Attempt to narrow/widen the shuffle mask to the correct size.
40940 if (Mask.size() != NumSrcElts) {
40941 if ((NumSrcElts % Mask.size()) == 0) {
40942 SmallVector<int, 16> ScaledMask;
40943 int Scale = NumSrcElts / Mask.size();
40944 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
40945 Mask = std::move(ScaledMask);
40946 } else if ((Mask.size() % NumSrcElts) == 0) {
40947 // Simplify Mask based on demanded element.
40948 int ExtractIdx = (int)IdxC.getZExtValue();
40949 int Scale = Mask.size() / NumSrcElts;
40950 int Lo = Scale * ExtractIdx;
40951 int Hi = Scale * (ExtractIdx + 1);
40952 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
40953 if (i < Lo || Hi <= i)
40954 Mask[i] = SM_SentinelUndef;
40955
40956 SmallVector<int, 16> WidenedMask;
40957 while (Mask.size() > NumSrcElts &&
40958 canWidenShuffleElements(Mask, WidenedMask))
40959 Mask = std::move(WidenedMask);
40960 }
40961 }
40962
40963 // If narrowing/widening failed, see if we can extract+zero-extend.
40964 int ExtractIdx;
40965 EVT ExtractVT;
40966 if (Mask.size() == NumSrcElts) {
40967 ExtractIdx = Mask[IdxC.getZExtValue()];
40968 ExtractVT = SrcVT;
40969 } else {
40970 unsigned Scale = Mask.size() / NumSrcElts;
40971 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
40972 return SDValue();
40973 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
40974 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
40975 return SDValue();
40976 ExtractIdx = Mask[ScaledIdx];
40977 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
40978 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
40979 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
40980 "Failed to widen vector type");
40981 }
40982
40983 // If the shuffle source element is undef/zero then we can just accept it.
40984 if (ExtractIdx == SM_SentinelUndef)
40985 return DAG.getUNDEF(VT);
40986
40987 if (ExtractIdx == SM_SentinelZero)
40988 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
40989 : DAG.getConstant(0, dl, VT);
40990
40991 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
40992 ExtractIdx = ExtractIdx % Mask.size();
40993 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
40994 return DAG.getZExtOrTrunc(V, dl, VT);
40995
40996 return SDValue();
40997}
40998
40999/// Extracting a scalar FP value from vector element 0 is free, so extract each
41000/// operand first, then perform the math as a scalar op.
41001static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
41002 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
41003 SDValue Vec = ExtElt->getOperand(0);
41004 SDValue Index = ExtElt->getOperand(1);
41005 EVT VT = ExtElt->getValueType(0);
41006 EVT VecVT = Vec.getValueType();
41007
41008 // TODO: If this is a unary/expensive/expand op, allow extraction from a
41009 // non-zero element because the shuffle+scalar op will be cheaper?
41010 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
41011 return SDValue();
41012
41013 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
41014 // extract, the condition code), so deal with those as a special-case.
41015 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
41016 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
41017 if (OpVT != MVT::f32 && OpVT != MVT::f64)
41018 return SDValue();
41019
41020 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
41021 SDLoc DL(ExtElt);
41022 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41023 Vec.getOperand(0), Index);
41024 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41025 Vec.getOperand(1), Index);
41026 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
41027 }
41028
41029 if (VT != MVT::f32 && VT != MVT::f64)
41030 return SDValue();
41031
41032 // Vector FP selects don't fit the pattern of FP math ops (because the
41033 // condition has a different type and we have to change the opcode), so deal
41034 // with those here.
41035 // FIXME: This is restricted to pre type legalization by ensuring the setcc
41036 // has i1 elements. If we loosen this we need to convert vector bool to a
41037 // scalar bool.
41038 if (Vec.getOpcode() == ISD::VSELECT &&
41039 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
41040 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
41041 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
41042 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
41043 SDLoc DL(ExtElt);
41044 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
41045 Vec.getOperand(0).getValueType().getScalarType(),
41046 Vec.getOperand(0), Index);
41047 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41048 Vec.getOperand(1), Index);
41049 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41050 Vec.getOperand(2), Index);
41051 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
41052 }
41053
41054 // TODO: This switch could include FNEG and the x86-specific FP logic ops
41055 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
41056 // missed load folding and fma+fneg combining.
41057 switch (Vec.getOpcode()) {
41058 case ISD::FMA: // Begin 3 operands
41059 case ISD::FMAD:
41060 case ISD::FADD: // Begin 2 operands
41061 case ISD::FSUB:
41062 case ISD::FMUL:
41063 case ISD::FDIV:
41064 case ISD::FREM:
41065 case ISD::FCOPYSIGN:
41066 case ISD::FMINNUM:
41067 case ISD::FMAXNUM:
41068 case ISD::FMINNUM_IEEE:
41069 case ISD::FMAXNUM_IEEE:
41070 case ISD::FMAXIMUM:
41071 case ISD::FMINIMUM:
41072 case X86ISD::FMAX:
41073 case X86ISD::FMIN:
41074 case ISD::FABS: // Begin 1 operand
41075 case ISD::FSQRT:
41076 case ISD::FRINT:
41077 case ISD::FCEIL:
41078 case ISD::FTRUNC:
41079 case ISD::FNEARBYINT:
41080 case ISD::FROUND:
41081 case ISD::FFLOOR:
41082 case X86ISD::FRCP:
41083 case X86ISD::FRSQRT: {
41084 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
41085 SDLoc DL(ExtElt);
41086 SmallVector<SDValue, 4> ExtOps;
41087 for (SDValue Op : Vec->ops())
41088 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
41089 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
41090 }
41091 default:
41092 return SDValue();
41093 }
41094 llvm_unreachable("All opcodes should return within switch");
41095}
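
The rewrite scalarizeExtEltFP performs, shown as a pair of equivalent SSE helpers (invented names): extracting lane 0 of a vector FP op gives the same value as doing the op on the lane-0 scalars, and the lane-0 extract itself costs nothing.

#include <immintrin.h>

// extract (fadd X, Y), 0
static inline float extract0OfVectorAdd(__m128 X, __m128 Y) {
  return _mm_cvtss_f32(_mm_add_ps(X, Y));
}

// --> fadd (extract X, 0), (extract Y, 0): same result, scalar math.
static inline float addOfExtract0(__m128 X, __m128 Y) {
  return _mm_cvtss_f32(X) + _mm_cvtss_f32(Y);
}
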
41096
41097/// Try to convert a vector reduction sequence composed of binops and shuffles
41098/// into horizontal ops.
41099static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
41100 const X86Subtarget &Subtarget) {
41101 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
41102
41103 // We need at least SSE2 to do anything here.
41104 if (!Subtarget.hasSSE2())
41105 return SDValue();
41106
41107 ISD::NodeType Opc;
41108 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
41109 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
41110 if (!Rdx)
41111 return SDValue();
41112
41113 SDValue Index = ExtElt->getOperand(1);
41114 assert(isNullConstant(Index) &&
41115 "Reduction doesn't end in an extract from index 0");
41116
41117 EVT VT = ExtElt->getValueType(0);
41118 EVT VecVT = Rdx.getValueType();
41119 if (VecVT.getScalarType() != VT)
41120 return SDValue();
41121
41122 SDLoc DL(ExtElt);
41123
41124 // vXi8 mul reduction - promote to vXi16 mul reduction.
41125 if (Opc == ISD::MUL) {
41126 unsigned NumElts = VecVT.getVectorNumElements();
41127 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
41128 return SDValue();
41129 if (VecVT.getSizeInBits() >= 128) {
41130 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
41131 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41132 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41133 Lo = DAG.getBitcast(WideVT, Lo);
41134 Hi = DAG.getBitcast(WideVT, Hi);
41135 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
41136 while (Rdx.getValueSizeInBits() > 128) {
41137 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41138 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
41139 }
41140 } else {
41141 if (VecVT == MVT::v4i8)
41142 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41143 DAG.getUNDEF(MVT::v4i8));
41144 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41145 DAG.getUNDEF(MVT::v8i8));
41146 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
41147 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
41148 }
41149 if (NumElts >= 8)
41150 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41151 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41152 {4, 5, 6, 7, -1, -1, -1, -1}));
41153 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41154 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41155 {2, 3, -1, -1, -1, -1, -1, -1}));
41156 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41157 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41158 {1, -1, -1, -1, -1, -1, -1, -1}));
41159 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41160 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41161 }
41162
41163 // vXi8 add reduction - sub-128-bit vector.
41164 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
41165 if (VecVT == MVT::v4i8) {
41166 // Pad with zero.
41167 if (Subtarget.hasSSE41()) {
41168 Rdx = DAG.getBitcast(MVT::i32, Rdx);
41169 Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
41170 DAG.getConstant(0, DL, MVT::v4i32), Rdx,
41171 DAG.getIntPtrConstant(0, DL));
41172 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41173 } else {
41174 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41175 DAG.getConstant(0, DL, VecVT));
41176 }
41177 }
41178 if (Rdx.getValueType() == MVT::v8i8) {
41179 // Pad with undef.
41180 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41181 DAG.getUNDEF(MVT::v8i8));
41182 }
41183 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41184 DAG.getConstant(0, DL, MVT::v16i8));
41185 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41186 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41187 }
41188
41189 // Must be a >=128-bit vector with pow2 elements.
41190 if ((VecVT.getSizeInBits() % 128) != 0 ||
41191 !isPowerOf2_32(VecVT.getVectorNumElements()))
41192 return SDValue();
41193
41194 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
41195 if (VT == MVT::i8) {
41196 while (Rdx.getValueSizeInBits() > 128) {
41197 SDValue Lo, Hi;
41198 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41199 VecVT = Lo.getValueType();
41200 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
41201 }
41202 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
41203
41204 SDValue Hi = DAG.getVectorShuffle(
41205 MVT::v16i8, DL, Rdx, Rdx,
41206 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
41207 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
41208 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41209 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
41210 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41211 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41212 }
41213
41214 // Only use (F)HADD opcodes if they aren't microcoded or if we're optimizing for codesize.
41215 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
41216 return SDValue();
41217
41218 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
41219
41220 // 256-bit horizontal instructions operate on 128-bit chunks rather than
41221 // across the whole vector, so we need an extract + hop preliminary stage.
41222 // This is the only step where the operands of the hop are not the same value.
41223 // TODO: We could extend this to handle 512-bit or even longer vectors.
41224 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
41225 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
41226 unsigned NumElts = VecVT.getVectorNumElements();
41227 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
41228 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
41229 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
41230 VecVT = Rdx.getValueType();
41231 }
41232 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
41233 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
41234 return SDValue();
41235
41236 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
41237 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
41238 for (unsigned i = 0; i != ReductionSteps; ++i)
41239 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
41240
41241 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41242}
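
The hop lowering picked at the end of combineArithReduction, sketched for a v4f32 add reduction with SSE3 intrinsics (invented helper name): log2(NumElts) HADD steps with both operands equal, then take lane 0. This shows the shape of the sequence only; the combine itself also checks that horizontal ops are actually profitable on the subtarget.

#include <immintrin.h>

// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0, applied twice.
static inline float faddReduceV4f32(__m128 V) {
  __m128 H = _mm_hadd_ps(V, V);   // {a+b, c+d, a+b, c+d}
  H = _mm_hadd_ps(H, H);          // {a+b+c+d, ...}
  return _mm_cvtss_f32(H);
}
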
41243
41244/// Detect vector gather/scatter index generation and convert it from being a
41245/// bunch of shuffles and extracts into a somewhat faster sequence.
41246/// For i686, the best sequence is apparently storing the value and loading
41247/// scalars back, while for x64 we should use 64-bit extracts and shifts.
41248static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
41249 TargetLowering::DAGCombinerInfo &DCI,
41250 const X86Subtarget &Subtarget) {
41251 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
41252 return NewOp;
41253
41254 SDValue InputVector = N->getOperand(0);
41255 SDValue EltIdx = N->getOperand(1);
41256 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
41257
41258 EVT SrcVT = InputVector.getValueType();
41259 EVT VT = N->getValueType(0);
41260 SDLoc dl(InputVector);
41261 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
41262 unsigned NumSrcElts = SrcVT.getVectorNumElements();
41263
41264 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
41265 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41266
41267 // Integer Constant Folding.
41268 if (CIdx && VT.isInteger()) {
41269 APInt UndefVecElts;
41270 SmallVector<APInt, 16> EltBits;
41271 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
41272 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
41273 EltBits, true, false)) {
41274 uint64_t Idx = CIdx->getZExtValue();
41275 if (UndefVecElts[Idx])
41276 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41277 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
41278 dl, VT);
41279 }
41280 }
41281
41282 if (IsPextr) {
41283 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41284 if (TLI.SimplifyDemandedBits(
41285 SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
41286 return SDValue(N, 0);
41287
41288 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
41289 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
41290 InputVector.getOpcode() == X86ISD::PINSRW) &&
41291 InputVector.getOperand(2) == EltIdx) {
41292 assert(SrcVT == InputVector.getOperand(0).getValueType() &&((void)0)
41293 "Vector type mismatch")((void)0);
41294 SDValue Scl = InputVector.getOperand(1);
41295 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
41296 return DAG.getZExtOrTrunc(Scl, dl, VT);
41297 }
41298
41299 // TODO - Remove this once we can handle the implicit zero-extension of
41300 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
41301 // combineBasicSADPattern.
41302 return SDValue();
41303 }
41304
41305 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
41306 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41307 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
41308 SDValue MMXSrc = InputVector.getOperand(0);
41309
41310 // The bitcast source is a direct mmx result.
41311 if (MMXSrc.getValueType() == MVT::x86mmx)
41312 return DAG.getBitcast(VT, InputVector);
41313 }
41314
41315 // Detect mmx to i32 conversion through a v2i32 elt extract.
41316 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41317 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
41318 SDValue MMXSrc = InputVector.getOperand(0);
41319
41320 // The bitcast source is a direct mmx result.
41321 if (MMXSrc.getValueType() == MVT::x86mmx)
41322 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
41323 }
41324
41325 // Check whether this extract is the root of a sum of absolute differences
41326 // pattern. This has to be done here because we really want it to happen
41327 // pre-legalization.
41328 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
41329 return SAD;
41330
41331 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
41332 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
41333 return Cmp;
41334
41335 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
41336 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
41337 return MinMax;
41338
41339 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
41340 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
41341 return V;
41342
41343 if (SDValue V = scalarizeExtEltFP(N, DAG))
41344 return V;
41345
41346 // Attempt to extract an i1 element by using MOVMSK to extract the signbits
41347 // and then testing the relevant element.
41348 //
41349 // Note that we only combine extracts on the *same* result number, i.e.
41350 // t0 = merge_values a0, a1, a2, a3
41351 // i1 = extract_vector_elt t0, Constant:i64<2>
41352 // i1 = extract_vector_elt t0, Constant:i64<3>
41353 // but not
41354 // i1 = extract_vector_elt t0:1, Constant:i64<2>
41355 // since the latter would need its own MOVMSK.
41356 if (CIdx && SrcVT.getScalarType() == MVT::i1) {
41357 SmallVector<SDNode *, 16> BoolExtracts;
41358 unsigned ResNo = InputVector.getResNo();
41359 auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
41360 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41361 isa<ConstantSDNode>(Use->getOperand(1)) &&
41362 Use->getOperand(0).getResNo() == ResNo &&
41363 Use->getValueType(0) == MVT::i1) {
41364 BoolExtracts.push_back(Use);
41365 return true;
41366 }
41367 return false;
41368 };
41369 if (all_of(InputVector->uses(), IsBoolExtract) &&
41370 BoolExtracts.size() > 1) {
41371 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
41372 if (SDValue BC =
41373 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
41374 for (SDNode *Use : BoolExtracts) {
41375 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
41376 unsigned MaskIdx = Use->getConstantOperandVal(1);
41377 APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
41378 SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
41379 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
41380 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
41381 DCI.CombineTo(Use, Res);
41382 }
41383 return SDValue(N, 0);
41384 }
41385 }
41386 }
41387
41388 return SDValue();
41389}
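
A small scalar model (invented names, no LLVM APIs) of the vXi1 extract fold above: once the boolean lanes are packed into an integer mask, as MOVMSK does with the sign bits, extracting lane i is just the ((mask & bit) == bit) test that the combine emits.

// Pack boolean lanes into a bitmask and test one bit per extracted element.
#include <cassert>
#include <cstdint>
#include <vector>

static uint32_t packBools(const std::vector<bool> &Lanes) {
  uint32_t Mask = 0;
  for (size_t i = 0; i != Lanes.size(); ++i)
    Mask |= static_cast<uint32_t>(Lanes[i]) << i;
  return Mask;
}

int main() {
  std::vector<bool> Lanes = {true, false, false, true, true, false, true, false};
  uint32_t Mask = packBools(Lanes);
  for (size_t i = 0; i != Lanes.size(); ++i) {
    uint32_t Bit = 1u << i;
    bool Extracted = (Mask & Bit) == Bit; // the (and + setcc) form used above
    assert(Extracted == Lanes[i]);
  }
  return 0;
}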
41390
41391/// If a vector select has an operand that is -1 or 0, try to simplify the
41392/// select to a bitwise logic operation.
41393/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
41394static SDValue
41395combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
41396 TargetLowering::DAGCombinerInfo &DCI,
41397 const X86Subtarget &Subtarget) {
41398 SDValue Cond = N->getOperand(0);
41399 SDValue LHS = N->getOperand(1);
41400 SDValue RHS = N->getOperand(2);
41401 EVT VT = LHS.getValueType();
41402 EVT CondVT = Cond.getValueType();
41403 SDLoc DL(N);
41404 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41405
41406 if (N->getOpcode() != ISD::VSELECT)
41407 return SDValue();
41408
41409 assert(CondVT.isVector() && "Vector select expects a vector selector!")((void)0);
41410
41411 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
41412 // TODO: Can we assert that both operands are not zeros (because that should
41413 // get simplified at node creation time)?
41414 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
41415 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
41416
41417 // If both inputs are 0/undef, create a complete zero vector.
41418 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
41419 if (TValIsAllZeros && FValIsAllZeros) {
41420 if (VT.isFloatingPoint())
41421 return DAG.getConstantFP(0.0, DL, VT);
41422 return DAG.getConstant(0, DL, VT);
41423 }
41424
41425 // To use the condition operand as a bitwise mask, it must have elements that
41426 // are the same size as the select elements. That is, the condition operand must
41427 // have already been promoted from the IR select condition type <N x i1>.
41428 // Don't check if the types themselves are equal because that excludes
41429 // vector floating-point selects.
41430 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
41431 return SDValue();
41432
41433 // Try to invert the condition if true value is not all 1s and false value is
41434 // not all 0s. Only do this if the condition has one use.
41435 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
41436 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
41437 // Check if the selector will be produced by CMPP*/PCMP*.
41438 Cond.getOpcode() == ISD::SETCC &&
41439 // Check if SETCC has already been promoted.
41440 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
41441 CondVT) {
41442 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
41443
41444 if (TValIsAllZeros || FValIsAllOnes) {
41445 SDValue CC = Cond.getOperand(2);
41446 ISD::CondCode NewCC = ISD::getSetCCInverse(
41447 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
41448 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
41449 NewCC);
41450 std::swap(LHS, RHS);
41451 TValIsAllOnes = FValIsAllOnes;
41452 FValIsAllZeros = TValIsAllZeros;
41453 }
41454 }
41455
41456 // Cond value must be 'sign splat' to be converted to a logical op.
41457 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
41458 return SDValue();
41459
41460 // vselect Cond, 111..., 000... -> Cond
41461 if (TValIsAllOnes && FValIsAllZeros)
41462 return DAG.getBitcast(VT, Cond);
41463
41464 if (!TLI.isTypeLegal(CondVT))
41465 return SDValue();
41466
41467 // vselect Cond, 111..., X -> or Cond, X
41468 if (TValIsAllOnes) {
41469 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41470 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
41471 return DAG.getBitcast(VT, Or);
41472 }
41473
41474 // vselect Cond, X, 000... -> and Cond, X
41475 if (FValIsAllZeros) {
41476 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
41477 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
41478 return DAG.getBitcast(VT, And);
41479 }
41480
41481 // vselect Cond, 000..., X -> andn Cond, X
41482 if (TValIsAllZeros) {
41483 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41484 SDValue AndN;
41485 // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used.
41486 if (CondVT.getScalarType() == MVT::i1)
41487 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
41488 CastRHS);
41489 else
41490 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
41491 return DAG.getBitcast(VT, AndN);
41492 }
41493
41494 return SDValue();
41495}
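
A per-lane sketch (illustration only, not the DAG code) of why the folds above are safe once the condition lane is a sign splat, i.e. all zeros or all ones: the generic bitwise select (C & T) | (~C & F) collapses to OR, AND, or ANDN when one arm is a constant all-ones or all-zeros lane.

// Verify the OR/AND/ANDN collapses for a 32-bit lane with a 0 / ~0 condition.
#include <cassert>
#include <cstdint>

static uint32_t bitSelect(uint32_t C, uint32_t T, uint32_t F) {
  return (C & T) | (~C & F);
}

int main() {
  const uint32_t AllOnes = 0xFFFFFFFFu, Zero = 0;
  for (uint32_t C : {Zero, AllOnes}) { // sign-splat condition lanes
    uint32_t X = 0x12345678u;
    assert(bitSelect(C, AllOnes, X) == (C | X)); // vselect C, 111..., X
    assert(bitSelect(C, X, Zero) == (C & X));    // vselect C, X, 000...
    assert(bitSelect(C, Zero, X) == (~C & X));   // vselect C, 000..., X (ANDN)
  }
  return 0;
}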
41496
41497/// If both arms of a vector select are concatenated vectors, split the select,
41498/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
41499/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
41500/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
41501static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
41502 const X86Subtarget &Subtarget) {
41503 unsigned Opcode = N->getOpcode();
41504 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
41505 return SDValue();
41506
41507 // TODO: Split 512-bit vectors too?
41508 EVT VT = N->getValueType(0);
41509 if (!VT.is256BitVector())
41510 return SDValue();
41511
41512 // TODO: Split as long as any 2 of the 3 operands are concatenated?
41513 SDValue Cond = N->getOperand(0);
41514 SDValue TVal = N->getOperand(1);
41515 SDValue FVal = N->getOperand(2);
41516 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
41517 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
41518 !collectConcatOps(TVal.getNode(), CatOpsT) ||
41519 !collectConcatOps(FVal.getNode(), CatOpsF))
41520 return SDValue();
41521
41522 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
41523 ArrayRef<SDValue> Ops) {
41524 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
41525 };
41526 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
41527 makeBlend, /*CheckBWI*/ false);
41528}
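
An elementwise model (illustrative, not the SDNode plumbing) of the narrowing above: selecting over the concatenation of two halves is the same as concatenating two half-width selects, which is why splitting a 256-bit vselect into two 128-bit ones is value-preserving.

// Lane-wise identity: sel(concat(C0,C1), concat(T0,T1), concat(F0,F1)) ==
// concat(sel(C0,T0,F0), sel(C1,T1,F1)).
#include <cassert>
#include <vector>

static std::vector<int> sel(const std::vector<bool> &C,
                            const std::vector<int> &T,
                            const std::vector<int> &F) {
  std::vector<int> R(C.size());
  for (size_t i = 0; i != C.size(); ++i)
    R[i] = C[i] ? T[i] : F[i];
  return R;
}

template <class T>
static std::vector<T> concat(std::vector<T> A, const std::vector<T> &B) {
  A.insert(A.end(), B.begin(), B.end());
  return A;
}

int main() {
  std::vector<bool> C0 = {true, false}, C1 = {false, true};
  std::vector<int> T0 = {1, 2}, T1 = {3, 4}, F0 = {5, 6}, F1 = {7, 8};
  auto Wide = sel(concat(C0, C1), concat(T0, T1), concat(F0, F1));
  auto Narrow = concat(sel(C0, T0, F0), sel(C1, T1, F1));
  assert(Wide == Narrow);
  return 0;
}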
41529
41530static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
41531 SDValue Cond = N->getOperand(0);
41532 SDValue LHS = N->getOperand(1);
41533 SDValue RHS = N->getOperand(2);
41534 SDLoc DL(N);
41535
41536 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
41537 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
41538 if (!TrueC || !FalseC)
41539 return SDValue();
41540
41541 // Don't do this for crazy integer types.
41542 EVT VT = N->getValueType(0);
41543 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
41544 return SDValue();
41545
41546 // We're going to use the condition bit in math or logic ops. We could allow
41547 // this with a wider condition value (post-legalization it becomes an i8),
41548 // but if nothing is creating selects that late, it doesn't matter.
41549 if (Cond.getValueType() != MVT::i1)
41550 return SDValue();
41551
41552 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
41553 // 3, 5, or 9 with i32/i64, so those get transformed too.
41554 // TODO: For constants that overflow or do not differ by power-of-2 or small
41555 // multiplier, convert to 'and' + 'add'.
41556 const APInt &TrueVal = TrueC->getAPIntValue();
41557 const APInt &FalseVal = FalseC->getAPIntValue();
41558 bool OV;
41559 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
41560 if (OV)
41561 return SDValue();
41562
41563 APInt AbsDiff = Diff.abs();
41564 if (AbsDiff.isPowerOf2() ||
41565 ((VT == MVT::i32 || VT == MVT::i64) &&
41566 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
41567
41568 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
41569 // of the condition can usually be folded into a compare predicate, but even
41570 // without that, the sequence should be cheaper than a CMOV alternative.
41571 if (TrueVal.slt(FalseVal)) {
41572 Cond = DAG.getNOT(DL, Cond, MVT::i1);
41573 std::swap(TrueC, FalseC);
41574 }
41575
41576 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
41577 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
41578
41579 // Multiply condition by the difference if non-one.
41580 if (!AbsDiff.isOneValue())
41581 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
41582
41583 // Add the base if non-zero.
41584 if (!FalseC->isNullValue())
41585 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
41586
41587 return R;
41588 }
41589
41590 return SDValue();
41591}
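
A scalar check (sketch only, no LLVM APIs) of the identity the constant-select fold above uses: for a 0/1 condition, select(Cond, TC, FC) == zext(Cond) * (TC - FC) + FC, with the condition inverted and the arms swapped when TC < FC so the multiplier stays positive.

// Verify the zext-multiply-add rewrite, including the swapped-arm case.
#include <cassert>
#include <cstdint>
#include <utility>

static int64_t fold(bool Cond, int64_t TC, int64_t FC) {
  if (TC < FC) { // keep a positive multiplier, as the combine does
    Cond = !Cond;
    std::swap(TC, FC);
  }
  return static_cast<int64_t>(Cond) * (TC - FC) + FC;
}

int main() {
  for (bool Cond : {false, true}) {
    assert(fold(Cond, 40, 8) == (Cond ? 40 : 8));
    assert(fold(Cond, 8, 40) == (Cond ? 8 : 40)); // swapped-arm case
    assert(fold(Cond, -3, 5) == (Cond ? -3 : 5));
  }
  return 0;
}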
41592
41593/// If this is a *dynamic* select (non-constant condition) and we can match
41594/// this node with one of the variable blend instructions, restructure the
41595/// condition so that blends can use the high (sign) bit of each element.
41596/// This function will also call SimplifyDemandedBits on already created
41597/// BLENDV to perform additional simplifications.
41598static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
41599 TargetLowering::DAGCombinerInfo &DCI,
41600 const X86Subtarget &Subtarget) {
41601 SDValue Cond = N->getOperand(0);
41602 if ((N->getOpcode() != ISD::VSELECT &&
41603 N->getOpcode() != X86ISD::BLENDV) ||
41604 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
41605 return SDValue();
41606
41607 // Don't optimize before the condition has been transformed to a legal type
41608 // and don't ever optimize vector selects that map to AVX512 mask-registers.
41609 unsigned BitWidth = Cond.getScalarValueSizeInBits();
41610 if (BitWidth < 8 || BitWidth > 64)
41611 return SDValue();
41612
41613 // We can only handle the cases where VSELECT is directly legal on the
41614 // subtarget. We custom lower VSELECT nodes with constant conditions and
41615 // this makes it hard to see whether a dynamic VSELECT will correctly
41616 // lower, so we both check the operation's status and explicitly handle the
41617 // cases where a *dynamic* blend will fail even though a constant-condition
41618 // blend could be custom lowered.
41619 // FIXME: We should find a better way to handle this class of problems.
41620 // Potentially, we should combine constant-condition vselect nodes
41621 // pre-legalization into shuffles and not mark as many types as custom
41622 // lowered.
41623 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41624 EVT VT = N->getValueType(0);
41625 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
41626 return SDValue();
41627 // FIXME: We don't support i16-element blends currently. We could and
41628 // should support them by making *all* the bits in the condition be set
41629 // rather than just the high bit and using an i8-element blend.
41630 if (VT.getVectorElementType() == MVT::i16)
41631 return SDValue();
41632 // Dynamic blending was only available from SSE4.1 onward.
41633 if (VT.is128BitVector() && !Subtarget.hasSSE41())
41634 return SDValue();
41635 // Byte blends are only available in AVX2
41636 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
41637 return SDValue();
41638 // There are no 512-bit blend instructions that use sign bits.
41639 if (VT.is512BitVector())
41640 return SDValue();
41641
41642 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
41643 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
41644 UI != UE; ++UI)
41645 if ((UI->getOpcode() != ISD::VSELECT &&
41646 UI->getOpcode() != X86ISD::BLENDV) ||
41647 UI.getOperandNo() != 0)
41648 return false;
41649
41650 return true;
41651 };
41652
41653 APInt DemandedBits(APInt::getSignMask(BitWidth));
41654
41655 if (OnlyUsedAsSelectCond(Cond)) {
41656 KnownBits Known;
41657 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
41658 !DCI.isBeforeLegalizeOps());
41659 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
41660 return SDValue();
41661
41662 // If we changed the computation somewhere in the DAG, this change will
41663 // affect all users of Cond. Update all the nodes so that we do not use
41664 // the generic VSELECT anymore. Otherwise, we may perform wrong
41665 // optimizations as we messed with the actual expectation for the vector
41666 // boolean values.
41667 for (SDNode *U : Cond->uses()) {
41668 if (U->getOpcode() == X86ISD::BLENDV)
41669 continue;
41670
41671 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
41672 Cond, U->getOperand(1), U->getOperand(2));
41673 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
41674 DCI.AddToWorklist(U);
41675 }
41676 DCI.CommitTargetLoweringOpt(TLO);
41677 return SDValue(N, 0);
41678 }
41679
41680 // Otherwise we can still at least try to simplify multiple use bits.
41681 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
41682 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
41683 N->getOperand(1), N->getOperand(2));
41684
41685 return SDValue();
41686}
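
A lane model (illustrative only) of why only the sign bit of the condition is demanded above: a BLENDV-style instruction picks the true arm when the condition byte's sign bit is set and the false arm otherwise, so any two condition values that agree on the sign bit blend identically.

// Only bit 7 of each condition byte is consulted by the blend.
#include <cassert>
#include <cstdint>

static uint8_t blendvLane(uint8_t CondLane, uint8_t T, uint8_t F) {
  return (CondLane & 0x80) ? T : F;
}

int main() {
  const uint8_t T = 0xAA, F = 0x55;
  // All condition bytes with the sign bit set select T...
  assert(blendvLane(0x80, T, F) == T && blendvLane(0xFF, T, F) == T);
  // ...and all without it select F, whatever the low bits are.
  assert(blendvLane(0x00, T, F) == F && blendvLane(0x7F, T, F) == F);
  return 0;
}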
41687
41688// Try to match:
41689// (or (and (M, (sub 0, X)), (pandn M, X)))
41690// which is a special case of:
41691// (select M, (sub 0, X), X)
41692// Per:
41693// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
41694// We know that, if fNegate is 0 or 1:
41695// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
41696//
41697// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
41698// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
41699// ( M ? -X : X) == ((X ^ M ) + (M & 1))
41700// This lets us transform our vselect to:
41701// (add (xor X, M), (and M, 1))
41702// And further to:
41703// (sub (xor X, M), M)
41704static SDValue combineLogicBlendIntoConditionalNegate(
41705 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
41706 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
41707 EVT MaskVT = Mask.getValueType();
41708 assert(MaskVT.isInteger() &&((void)0)
41709 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&((void)0)
41710 "Mask must be zero/all-bits")((void)0);
41711
41712 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
41713 return SDValue();
41714 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
41715 return SDValue();
41716
41717 auto IsNegV = [](SDNode *N, SDValue V) {
41718 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
41719 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
41720 };
41721
41722 SDValue V;
41723 if (IsNegV(Y.getNode(), X))
41724 V = X;
41725 else if (IsNegV(X.getNode(), Y))
41726 V = Y;
41727 else
41728 return SDValue();
41729
41730 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
41731 SDValue SubOp2 = Mask;
41732
41733 // If the negate was on the false side of the select, then
41734 // the operands of the SUB need to be swapped. PR 27251.
41735 // This is because the pattern being matched above is
41736 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
41737 // but if the pattern matched was
41738 // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
41739 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
41740 // pattern also needs to be a negation of the replacement pattern above.
41741 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
41742 // sub accomplishes the negation of the replacement pattern.
41743 if (V == Y)
41744 std::swap(SubOp1, SubOp2);
41745
41746 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
41747 return DAG.getBitcast(VT, Res);
41748}
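
A bit-hack check (sketch, assuming ordinary two's-complement wrap-around lanes; no LLVM APIs) of the conditional-negate fold above: with a mask M that is all zeros or all ones, (X ^ M) - M selects between X and -X, and swapping the SUB operands, as done for the PR27251 case, yields the opposite selection.

// Verify (X ^ M) - M == (M ? -X : X) and the swapped-operand form.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t AllOnes = 0xFFFFFFFFu;
  for (uint32_t X : {0u, 1u, 7u, 0x80000000u, 0xDEADBEEFu}) {
    uint32_t NegX = 0u - X;
    // M = all ones: select the negated value.
    assert(((X ^ AllOnes) - AllOnes) == NegX);
    // M = all zeros: select X unchanged.
    assert(((X ^ 0u) - 0u) == X);
    // Swapped SUB operands realize the opposite selection.
    assert((AllOnes - (X ^ AllOnes)) == X);
    assert((0u - (X ^ 0u)) == NegX);
  }
  return 0;
}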
41749
41750/// Do target-specific dag combines on SELECT and VSELECT nodes.
41751static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
41752 TargetLowering::DAGCombinerInfo &DCI,
41753 const X86Subtarget &Subtarget) {
41754 SDLoc DL(N);
41755 SDValue Cond = N->getOperand(0);
41756 SDValue LHS = N->getOperand(1);
41757 SDValue RHS = N->getOperand(2);
41758
41759 // Try simplification again because we use this function to optimize
41760 // BLENDV nodes that are not handled by the generic combiner.
41761 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
41762 return V;
41763
41764 EVT VT = LHS.getValueType();
41765 EVT CondVT = Cond.getValueType();
41766 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41767 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
41768
41769 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
41770 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
41771 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
41772 if (CondVT.isVector() && CondVT.isInteger() &&
41773 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
41774 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
41775 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
41776 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
41777 DL, DAG, Subtarget))
41778 return V;
41779
41780 // Convert vselects with constant condition into shuffles.
41781 if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
41782 SmallVector<int, 64> Mask;
41783 if (createShuffleMaskFromVSELECT(Mask, Cond))
41784 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
41785 }
41786
41787 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
41788 // by forcing the unselected elements to zero.
41789 // TODO: Can we handle more shuffles with this?
41790 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
41791 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
41792 LHS.hasOneUse() && RHS.hasOneUse()) {
41793 MVT SimpleVT = VT.getSimpleVT();
41794 SmallVector<SDValue, 1> LHSOps, RHSOps;
41795 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
41796 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
41797 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
41798 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
41799 int NumElts = VT.getVectorNumElements();
41800 for (int i = 0; i != NumElts; ++i) {
41801 if (CondMask[i] < NumElts)
41802 RHSMask[i] = 0x80;
41803 else
41804 LHSMask[i] = 0x80;
41805 }
41806 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
41807 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
41808 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
41809 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
41810 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
41811 }
41812 }
41813
41814 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
41815 // instructions match the semantics of the common C idiom x<y?x:y but not
41816 // x<=y?x:y, because of how they handle negative zero (which can be
41817 // ignored in unsafe-math mode).
41818 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
41819 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
41820 VT != MVT::f80 && VT != MVT::f128 &&
41821 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
41822 (Subtarget.hasSSE2() ||
41823 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
41824 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41825
41826 unsigned Opcode = 0;
41827 // Check for x CC y ? x : y.
41828 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
41829 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
41830 switch (CC) {
41831 default: break;
41832 case ISD::SETULT:
41833 // Converting this to a min would handle NaNs incorrectly, and swapping
41834 // the operands would cause it to handle comparisons between positive
41835 // and negative zero incorrectly.
41836 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41837 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41838 !(DAG.isKnownNeverZeroFloat(LHS) ||
41839 DAG.isKnownNeverZeroFloat(RHS)))
41840 break;
41841 std::swap(LHS, RHS);
41842 }
41843 Opcode = X86ISD::FMIN;
41844 break;
41845 case ISD::SETOLE:
41846 // Converting this to a min would handle comparisons between positive
41847 // and negative zero incorrectly.
41848 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41849 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41850 break;
41851 Opcode = X86ISD::FMIN;
41852 break;
41853 case ISD::SETULE:
41854 // Converting this to a min would handle both negative zeros and NaNs
41855 // incorrectly, but we can swap the operands to fix both.
41856 std::swap(LHS, RHS);
41857 LLVM_FALLTHROUGH[[gnu::fallthrough]];
41858 case ISD::SETOLT:
41859 case ISD::SETLT:
41860 case ISD::SETLE:
41861 Opcode = X86ISD::FMIN;
41862 break;
41863
41864 case ISD::SETOGE:
41865 // Converting this to a max would handle comparisons between positive
41866 // and negative zero incorrectly.
41867 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41868 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41869 break;
41870 Opcode = X86ISD::FMAX;
41871 break;
41872 case ISD::SETUGT:
41873 // Converting this to a max would handle NaNs incorrectly, and swapping
41874 // the operands would cause it to handle comparisons between positive
41875 // and negative zero incorrectly.
41876 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41877 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41878 !(DAG.isKnownNeverZeroFloat(LHS) ||
41879 DAG.isKnownNeverZeroFloat(RHS)))
41880 break;
41881 std::swap(LHS, RHS);
41882 }
41883 Opcode = X86ISD::FMAX;
41884 break;
41885 case ISD::SETUGE:
41886 // Converting this to a max would handle both negative zeros and NaNs
41887 // incorrectly, but we can swap the operands to fix both.
41888 std::swap(LHS, RHS);
41889 LLVM_FALLTHROUGH[[gnu::fallthrough]];
41890 case ISD::SETOGT:
41891 case ISD::SETGT:
41892 case ISD::SETGE:
41893 Opcode = X86ISD::FMAX;
41894 break;
41895 }
41896 // Check for x CC y ? y : x -- a min/max with reversed arms.
41897 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
41898 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
41899 switch (CC) {
41900 default: break;
41901 case ISD::SETOGE:
41902 // Converting this to a min would handle comparisons between positive
41903 // and negative zero incorrectly, and swapping the operands would
41904 // cause it to handle NaNs incorrectly.
41905 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41906 !(DAG.isKnownNeverZeroFloat(LHS) ||
41907 DAG.isKnownNeverZeroFloat(RHS))) {
41908 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41909 break;
41910 std::swap(LHS, RHS);
41911 }
41912 Opcode = X86ISD::FMIN;
41913 break;
41914 case ISD::SETUGT:
41915 // Converting this to a min would handle NaNs incorrectly.
41916 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41917 break;
41918 Opcode = X86ISD::FMIN;
41919 break;
41920 case ISD::SETUGE:
41921 // Converting this to a min would handle both negative zeros and NaNs
41922 // incorrectly, but we can swap the operands to fix both.
41923 std::swap(LHS, RHS);
41924 LLVM_FALLTHROUGH[[gnu::fallthrough]];
41925 case ISD::SETOGT:
41926 case ISD::SETGT:
41927 case ISD::SETGE:
41928 Opcode = X86ISD::FMIN;
41929 break;
41930
41931 case ISD::SETULT:
41932 // Converting this to a max would handle NaNs incorrectly.
41933 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41934 break;
41935 Opcode = X86ISD::FMAX;
41936 break;
41937 case ISD::SETOLE:
41938 // Converting this to a max would handle comparisons between positive
41939 // and negative zero incorrectly, and swapping the operands would
41940 // cause it to handle NaNs incorrectly.
41941 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41942 !DAG.isKnownNeverZeroFloat(LHS) &&
41943 !DAG.isKnownNeverZeroFloat(RHS)) {
41944 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41945 break;
41946 std::swap(LHS, RHS);
41947 }
41948 Opcode = X86ISD::FMAX;
41949 break;
41950 case ISD::SETULE:
41951 // Converting this to a max would handle both negative zeros and NaNs
41952 // incorrectly, but we can swap the operands to fix both.
41953 std::swap(LHS, RHS);
41954 LLVM_FALLTHROUGH[[gnu::fallthrough]];
41955 case ISD::SETOLT:
41956 case ISD::SETLT:
41957 case ISD::SETLE:
41958 Opcode = X86ISD::FMAX;
41959 break;
41960 }
41961 }
41962
41963 if (Opcode)
41964 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
41965 }
41966
41967 // Some mask scalar intrinsics rely on checking if only one bit is set
41968 // and implement it in C code like this:
41969 // A[0] = (U & 1) ? A[0] : W[0];
41970 // This creates some redundant instructions that break pattern matching.
41971 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
41972 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
41973 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
41974 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41975 SDValue AndNode = Cond.getOperand(0);
41976 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
41977 isNullConstant(Cond.getOperand(1)) &&
41978 isOneConstant(AndNode.getOperand(1))) {
41979 // LHS and RHS swapped due to
41980 // setcc outputting 1 when AND resulted in 0 and vice versa.
41981 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
41982 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
41983 }
41984 }
41985
41986 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
41987 // lowering on KNL. In this case we convert it to
41988 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
41989 // The same situation applies to all vectors of i8 and i16 without BWI.
41990 // Make sure we extend these even before type legalization gets a chance to
41991 // split wide vectors.
41992 // Since SKX, these selects have a proper lowering.
41993 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
41994 CondVT.getVectorElementType() == MVT::i1 &&
41995 (VT.getVectorElementType() == MVT::i8 ||
41996 VT.getVectorElementType() == MVT::i16)) {
41997 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
41998 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
41999 }
42000
42001 // AVX512 - Extend select with zero to merge with target shuffle.
42002 // select(mask, extract_subvector(shuffle(x)), zero) -->
42003 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
42004 // TODO - support non target shuffles as well.
42005 if (Subtarget.hasAVX512() && CondVT.isVector() &&
42006 CondVT.getVectorElementType() == MVT::i1) {
42007 auto SelectableOp = [&TLI](SDValue Op) {
42008 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42009 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
42010 isNullConstant(Op.getOperand(1)) &&
42011 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
42012 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
42013 };
42014
42015 bool SelectableLHS = SelectableOp(LHS);
42016 bool SelectableRHS = SelectableOp(RHS);
42017 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
42018 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
42019
42020 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
42021 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
42022 : RHS.getOperand(0).getValueType();
42023 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
42024 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
42025 VT.getSizeInBits());
42026 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
42027 VT.getSizeInBits());
42028 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
42029 DAG.getUNDEF(SrcCondVT), Cond,
42030 DAG.getIntPtrConstant(0, DL));
42031 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
42032 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
42033 }
42034 }
42035
42036 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
42037 return V;
42038
42039 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
42040 Cond.hasOneUse()) {
42041 EVT CondVT = Cond.getValueType();
42042 SDValue Cond0 = Cond.getOperand(0);
42043 SDValue Cond1 = Cond.getOperand(1);
42044 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42045
42046 // Canonicalize min/max:
42047 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
42048 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
42049 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
42050 // the need for an extra compare against zero. e.g.
42051 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
42052 // subl %esi, %edi
42053 // testl %edi, %edi
42054 // movl $0, %eax
42055 // cmovgl %edi, %eax
42056 // =>
42057 // xorl %eax, %eax
42058 // subl %esi, $edi
42059 // cmovsl %eax, %edi
42060 //
42061 // We can also canonicalize
42062 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
42063 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
42064 // This allows the use of a test instruction for the compare.
42065 if (LHS == Cond0 && RHS == Cond1) {
42066 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
42067 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
42068 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
42069 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42070 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42071 }
42072 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
42073 ISD::CondCode NewCC = ISD::SETUGE;
42074 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42075 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42076 }
42077 }
42078
42079 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
42080 // fold eq + gt/lt nested selects into ge/le selects
42081 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
42082 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
42083 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
42084 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
42085 // .. etc ..
42086 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
42087 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
42088 SDValue InnerSetCC = RHS.getOperand(0);
42089 ISD::CondCode InnerCC =
42090 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
42091 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
42092 Cond0 == InnerSetCC.getOperand(0) &&
42093 Cond1 == InnerSetCC.getOperand(1)) {
42094 ISD::CondCode NewCC;
42095 switch (CC == ISD::SETEQ ? InnerCC : CC) {
42096 case ISD::SETGT: NewCC = ISD::SETGE; break;
42097 case ISD::SETLT: NewCC = ISD::SETLE; break;
42098 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
42099 case ISD::SETULT: NewCC = ISD::SETULE; break;
42100 default: NewCC = ISD::SETCC_INVALID; break;
42101 }
42102 if (NewCC != ISD::SETCC_INVALID) {
42103 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
42104 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
42105 }
42106 }
42107 }
42108 }
42109
42110 // Check if the first operand is all zeros and Cond type is vXi1.
42111 // If this is an AVX512 target we can improve the use of zero masking by
42112 // swapping the operands and inverting the condition.
42113 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
42114 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
42115 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
42116 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
42117 // Invert the cond to not(cond) : xor(op,allones)=not(op)
42118 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
42119 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
42120 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
42121 }
42122
42123 // Early exit check
42124 if (!TLI.isTypeLegal(VT))
42125 return SDValue();
42126
42127 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
42128 return V;
42129
42130 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
42131 return V;
42132
42133 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
42134 return V;
42135
42136 // select(~Cond, X, Y) -> select(Cond, Y, X)
42137 if (CondVT.getScalarType() != MVT::i1) {
42138 if (SDValue CondNot = IsNOT(Cond, DAG))
42139 return DAG.getNode(N->getOpcode(), DL, VT,
42140 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
42141 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
42142 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
42143 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
42144 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
42145 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
42146 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
42147 }
42148 }
42149
42150 // Try to optimize vXi1 selects if both operands are either all constants or
42151 // bitcasts from scalar integer type. In that case we can convert the operands
42152 // to integer and use an integer select which will be converted to a CMOV.
42153 // We need to take a little bit of care to avoid creating an i64 type after
42154 // type legalization.
42155 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
42156 VT.getVectorElementType() == MVT::i1 &&
42157 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
42158 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
42159 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
42160 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
42161
42162 if ((LHSIsConst ||
42163 (LHS.getOpcode() == ISD::BITCAST &&
42164 LHS.getOperand(0).getValueType() == IntVT)) &&
42165 (RHSIsConst ||
42166 (RHS.getOpcode() == ISD::BITCAST &&
42167 RHS.getOperand(0).getValueType() == IntVT))) {
42168 if (LHSIsConst)
42169 LHS = combinevXi1ConstantToInteger(LHS, DAG);
42170 else
42171 LHS = LHS.getOperand(0);
42172
42173 if (RHSIsConst)
42174 RHS = combinevXi1ConstantToInteger(RHS, DAG);
42175 else
42176 RHS = RHS.getOperand(0);
42177
42178 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
42179 return DAG.getBitcast(VT, Select);
42180 }
42181 }
42182
42183 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
42184 // single bits, then invert the predicate and swap the select operands.
42185 // This can lower using a vector shift bit-hack rather than mask and compare.
42186 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
42187 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
42188 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
42189 Cond.getOperand(0).getOpcode() == ISD::AND &&
42190 isNullOrNullSplat(Cond.getOperand(1)) &&
42191 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
42192 Cond.getOperand(0).getValueType() == VT) {
42193 // The 'and' mask must be composed of power-of-2 constants.
42194 SDValue And = Cond.getOperand(0);
42195 auto *C = isConstOrConstSplat(And.getOperand(1));
42196 if (C && C->getAPIntValue().isPowerOf2()) {
42197 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
42198 SDValue NotCond =
42199 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
42200 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
42201 }
42202
42203 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
42204 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
42205 // 16-bit lacks a proper blendv.
42206 unsigned EltBitWidth = VT.getScalarSizeInBits();
42207 bool CanShiftBlend =
42208 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
42209 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
42210 (Subtarget.hasXOP()));
42211 if (CanShiftBlend &&
42212 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
42213 return C->getAPIntValue().isPowerOf2();
42214 })) {
42215 // Create a left-shift constant to get the mask bits over to the sign-bit.
42216 SDValue Mask = And.getOperand(1);
42217 SmallVector<int, 32> ShlVals;
42218 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
42219 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
42220 ShlVals.push_back(EltBitWidth - 1 -
42221 MaskVal->getAPIntValue().exactLogBase2());
42222 }
42223 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
42224 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
42225 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
42226 SDValue NewCond =
42227 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
42228 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
42229 }
42230 }
42231
42232 return SDValue();
42233}
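
A lane model (illustration only) of the final shift bit-hack above: for a power-of-2 mask C = 1 << K in a 32-bit lane, testing (X & C) != 0 is equivalent to shifting bit K up into the sign bit and testing "less than zero", which is what the swapped-arm vselect relies on.

// Verify the mask-test-to-sign-bit rewrite for every power-of-2 mask.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned K = 0; K != 32; ++K) {
    uint32_t C = 1u << K;
    unsigned ShlAmt = 31 - K; // EltBitWidth - 1 - log2(C)
    for (uint32_t X : {0u, C, ~C, 0xA5A5A5A5u, 0xFFFFFFFFu}) {
      bool MaskSet = (X & C) != 0;
      bool SignSet = static_cast<int32_t>(X << ShlAmt) < 0;
      assert(MaskSet == SignSet);
    }
  }
  return 0;
}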
42234
42235/// Combine:
42236/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
42237/// to:
42238/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
42239/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
42240/// Note that this is only legal for some op/cc combinations.
42241static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
42242 SelectionDAG &DAG,
42243 const X86Subtarget &Subtarget) {
42244 // This combine only operates on CMP-like nodes.
42245 if (!(Cmp.getOpcode() == X86ISD::CMP ||
42246 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42247 return SDValue();
42248
42249 // Can't replace the cmp if it has more uses than the one we're looking at.
42250 // FIXME: We would like to be able to handle this, but would need to make sure
42251 // all uses were updated.
42252 if (!Cmp.hasOneUse())
42253 return SDValue();
42254
42255 // This only applies to variations of the common case:
42256 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
42257 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
42258 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
42259 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
42260 // Using the proper condcodes (see below), overflow is checked for.
42261
42262 // FIXME: We can generalize both constraints:
42263 // - XOR/OR/AND (if they were made to survive AtomicExpand)
42264 // - LHS != 1
42265 // if the result is compared.
42266
42267 SDValue CmpLHS = Cmp.getOperand(0);
42268 SDValue CmpRHS = Cmp.getOperand(1);
42269 EVT CmpVT = CmpLHS.getValueType();
42270
42271 if (!CmpLHS.hasOneUse())
42272 return SDValue();
42273
42274 unsigned Opc = CmpLHS.getOpcode();
42275 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
42276 return SDValue();
42277
42278 SDValue OpRHS = CmpLHS.getOperand(2);
42279 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
42280 if (!OpRHSC)
42281 return SDValue();
42282
42283 APInt Addend = OpRHSC->getAPIntValue();
42284 if (Opc == ISD::ATOMIC_LOAD_SUB)
42285 Addend = -Addend;
42286
42287 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
42288 if (!CmpRHSC)
42289 return SDValue();
42290
42291 APInt Comparison = CmpRHSC->getAPIntValue();
42292 APInt NegAddend = -Addend;
42293
42294 // See if we can adjust the CC to make the comparison match the negated
42295 // addend.
42296 if (Comparison != NegAddend) {
42297 APInt IncComparison = Comparison + 1;
42298 if (IncComparison == NegAddend) {
42299 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
42300 Comparison = IncComparison;
42301 CC = X86::COND_AE;
42302 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
42303 Comparison = IncComparison;
42304 CC = X86::COND_L;
42305 }
42306 }
42307 APInt DecComparison = Comparison - 1;
42308 if (DecComparison == NegAddend) {
42309 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
42310 Comparison = DecComparison;
42311 CC = X86::COND_A;
42312 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
42313 Comparison = DecComparison;
42314 CC = X86::COND_LE;
42315 }
42316 }
42317 }
42318
42319 // If the addend is the negation of the comparison value, then we can do
42320 // a full comparison by emitting the atomic arithmetic as a locked sub.
42321 if (Comparison == NegAddend) {
42322 // The CC is fine, but we need to rewrite the LHS of the comparison as an
42323 // atomic sub.
42324 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
42325 auto AtomicSub = DAG.getAtomic(
42326 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
42327 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
42328 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
42329 AN->getMemOperand());
42330 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
42331 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42332 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42333 return LockOp;
42334 }
42335
42336 // We can handle comparisons with zero in a number of cases by manipulating
42337 // the CC used.
42338 if (!Comparison.isNullValue())
42339 return SDValue();
42340
42341 if (CC == X86::COND_S && Addend == 1)
42342 CC = X86::COND_LE;
42343 else if (CC == X86::COND_NS && Addend == 1)
42344 CC = X86::COND_G;
42345 else if (CC == X86::COND_G && Addend == -1)
42346 CC = X86::COND_GE;
42347 else if (CC == X86::COND_LE && Addend == -1)
42348 CC = X86::COND_L;
42349 else
42350 return SDValue();
42351
42352 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
42353 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42354 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42355 return LockOp;
42356}
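
A scalar check (sketch only) of the condition-code rewrites the atomic combine above performs: as long as the increment or decrement does not overflow, comparing the updated value against zero with an adjusted predicate is the same as comparing the original value, which is what allows the EFLAGS of the LOCKed ADD/SUB to stand in for a separate CMP.

// Verify the four predicate adjustments over a non-overflowing range.
#include <cassert>
#include <cstdint>

int main() {
  for (int x = -100; x <= 100; ++x) {
    assert((x < 0)  == (x + 1 <= 0)); // COND_S on x  -> COND_LE on x+1
    assert((x >= 0) == (x + 1 > 0));  // COND_NS on x -> COND_G on x+1
    assert((x > 0)  == (x - 1 >= 0)); // COND_G on x  -> COND_GE on x-1
    assert((x <= 0) == (x - 1 < 0));  // COND_LE on x -> COND_L on x-1
  }
  return 0;
}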
42357
42358// Check whether a boolean test is testing a boolean value generated by
42359// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
42360// code.
42361//
42362// Simplify the following patterns:
42363// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
42364// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
42365// to (Op EFLAGS Cond)
42366//
42367// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
42368// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
42369// to (Op EFLAGS !Cond)
42370//
42371// where Op could be BRCOND or CMOV.
42372//
42373static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
42374 // This combine only operates on CMP-like nodes.
42375 if (!(Cmp.getOpcode() == X86ISD::CMP ||
42376 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42377 return SDValue();
42378
42379 // Quit if not used as a boolean value.
42380 if (CC != X86::COND_E && CC != X86::COND_NE)
42381 return SDValue();
42382
42383 // Check CMP operands. One of them should be 0 or 1 and the other should be
42384 // a SETCC or a value extended from it.
42385 SDValue Op1 = Cmp.getOperand(0);
42386 SDValue Op2 = Cmp.getOperand(1);
42387
42388 SDValue SetCC;
42389 const ConstantSDNode* C = nullptr;
42390 bool needOppositeCond = (CC == X86::COND_E);
42391 bool checkAgainstTrue = false; // Is it a comparison against 1?
42392
42393 if ((C = dyn_cast<ConstantSDNode>(Op1)))
42394 SetCC = Op2;
42395 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
42396 SetCC = Op1;
42397 else // Quit if neither operand is a constant.
42398 return SDValue();
42399
42400 if (C->getZExtValue() == 1) {
42401 needOppositeCond = !needOppositeCond;
42402 checkAgainstTrue = true;
42403 } else if (C->getZExtValue() != 0)
42404 // Quit if the constant is neither 0 nor 1.
42405 return SDValue();
42406
42407 bool truncatedToBoolWithAnd = false;
42408 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
42409 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
42410 SetCC.getOpcode() == ISD::TRUNCATE ||
42411 SetCC.getOpcode() == ISD::AND) {
42412 if (SetCC.getOpcode() == ISD::AND) {
42413 int OpIdx = -1;
42414 if (isOneConstant(SetCC.getOperand(0)))
42415 OpIdx = 1;
42416 if (isOneConstant(SetCC.getOperand(1)))
42417 OpIdx = 0;
42418 if (OpIdx < 0)
42419 break;
42420 SetCC = SetCC.getOperand(OpIdx);
42421 truncatedToBoolWithAnd = true;
42422 } else
42423 SetCC = SetCC.getOperand(0);
42424 }
42425
42426 switch (SetCC.getOpcode()) {
42427 case X86ISD::SETCC_CARRY:
42428 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
42429 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
42430 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
42431 // truncated to i1 using 'and'.
42432 if (checkAgainstTrue && !truncatedToBoolWithAnd)
42433 break;
42434 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&((void)0)
42435 "Invalid use of SETCC_CARRY!")((void)0);
42436 LLVM_FALLTHROUGH[[gnu::fallthrough]];
42437 case X86ISD::SETCC:
42438 // Set the condition code or opposite one if necessary.
42439 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
42440 if (needOppositeCond)
42441 CC = X86::GetOppositeBranchCondition(CC);
42442 return SetCC.getOperand(1);
42443 case X86ISD::CMOV: {
42444 // Check whether false/true value has canonical one, i.e. 0 or 1.
42445 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
42446 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
42447 // Quit if true value is not a constant.
42448 if (!TVal)
42449 return SDValue();
42450 // Quit if false value is not a constant.
42451 if (!FVal) {
42452 SDValue Op = SetCC.getOperand(0);
42453 // Skip 'zext' or 'trunc' node.
42454 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
42455 Op.getOpcode() == ISD::TRUNCATE)
42456 Op = Op.getOperand(0);
42457 // A special case for rdrand/rdseed, where 0 is set if false cond is
42458 // found.
42459 if ((Op.getOpcode() != X86ISD::RDRAND &&
42460 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
42461 return SDValue();
42462 }
42463 // Quit if false value is not the constant 0 or 1.
42464 bool FValIsFalse = true;
42465 if (FVal && FVal->getZExtValue() != 0) {
42466 if (FVal->getZExtValue() != 1)
42467 return SDValue();
42468 // If FVal is 1, opposite cond is needed.
42469 needOppositeCond = !needOppositeCond;
42470 FValIsFalse = false;
42471 }
42472 // Quit if TVal is not the constant opposite of FVal.
42473 if (FValIsFalse && TVal->getZExtValue() != 1)
42474 return SDValue();
42475 if (!FValIsFalse && TVal->getZExtValue() != 0)
42476 return SDValue();
42477 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
42478 if (needOppositeCond)
42479 CC = X86::GetOppositeBranchCondition(CC);
42480 return SetCC.getOperand(3);
42481 }
42482 }
42483
42484 return SDValue();
42485}
42486
42487/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
42488/// Match:
42489/// (X86or (X86setcc) (X86setcc))
42490/// (X86cmp (and (X86setcc) (X86setcc)), 0)
42491static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
42492 X86::CondCode &CC1, SDValue &Flags,
42493 bool &isAnd) {
42494 if (Cond->getOpcode() == X86ISD::CMP) {
42495 if (!isNullConstant(Cond->getOperand(1)))
42496 return false;
42497
42498 Cond = Cond->getOperand(0);
42499 }
42500
42501 isAnd = false;
42502
42503 SDValue SetCC0, SetCC1;
42504 switch (Cond->getOpcode()) {
42505 default: return false;
42506 case ISD::AND:
42507 case X86ISD::AND:
42508 isAnd = true;
42509 LLVM_FALLTHROUGH[[gnu::fallthrough]];
42510 case ISD::OR:
42511 case X86ISD::OR:
42512 SetCC0 = Cond->getOperand(0);
42513 SetCC1 = Cond->getOperand(1);
42514 break;
42515 };
42516
42517 // Make sure we have SETCC nodes, using the same flags value.
42518 if (SetCC0.getOpcode() != X86ISD::SETCC ||
42519 SetCC1.getOpcode() != X86ISD::SETCC ||
42520 SetCC0->getOperand(1) != SetCC1->getOperand(1))
42521 return false;
42522
42523 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
42524 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
42525 Flags = SetCC0->getOperand(1);
42526 return true;
42527}
42528
42529// When legalizing carry, we create carries via add X, -1
42530// If that comes from an actual carry, via setcc, we use the
42531// carry directly.
42532static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
42533 if (EFLAGS.getOpcode() == X86ISD::ADD) {
42534 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
42535 SDValue Carry = EFLAGS.getOperand(0);
42536 while (Carry.getOpcode() == ISD::TRUNCATE ||
42537 Carry.getOpcode() == ISD::ZERO_EXTEND ||
42538 Carry.getOpcode() == ISD::SIGN_EXTEND ||
42539 Carry.getOpcode() == ISD::ANY_EXTEND ||
42540 (Carry.getOpcode() == ISD::AND &&
42541 isOneConstant(Carry.getOperand(1))))
42542 Carry = Carry.getOperand(0);
42543 if (Carry.getOpcode() == X86ISD::SETCC ||
42544 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
42545 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
42546 uint64_t CarryCC = Carry.getConstantOperandVal(0);
42547 SDValue CarryOp1 = Carry.getOperand(1);
42548 if (CarryCC == X86::COND_B)
42549 return CarryOp1;
42550 if (CarryCC == X86::COND_A) {
42551 // Try to convert COND_A into COND_B in an attempt to facilitate
42552 // materializing "setb reg".
42553 //
42554 // Do not flip "e > c", where "c" is a constant, because Cmp
42555 // instruction cannot take an immediate as its first operand.
42556 //
42557 if (CarryOp1.getOpcode() == X86ISD::SUB &&
42558 CarryOp1.getNode()->hasOneUse() &&
42559 CarryOp1.getValueType().isInteger() &&
42560 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
42561 SDValue SubCommute =
42562 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
42563 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
42564 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
42565 }
42566 }
42567 // If this is a check of the z flag of an add with 1, switch to the
42568 // C flag.
42569 if (CarryCC == X86::COND_E &&
42570 CarryOp1.getOpcode() == X86ISD::ADD &&
42571 isOneConstant(CarryOp1.getOperand(1)))
42572 return CarryOp1;
42573 }
42574 }
42575 }
42576
42577 return SDValue();
42578}
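
A small model (unsigned 32-bit sketch, no LLVM APIs) of the carry re-use above: materializing a borrow as "add X, -1" produces a carry-out exactly when the boolean X is 1, so when X already came from a setcc the flag can be consumed directly instead of regenerating it.

// Carry-out of A + B in unsigned arithmetic is detected as (A + B) < A.
#include <cassert>
#include <cstdint>

static bool carryOutOfAdd(uint32_t A, uint32_t B) {
  return static_cast<uint32_t>(A + B) < A; // wrapped => carry was produced
}

int main() {
  const uint32_t MinusOne = 0xFFFFFFFFu;
  assert(carryOutOfAdd(1u, MinusOne) == true);  // X = 1: CF set
  assert(carryOutOfAdd(0u, MinusOne) == false); // X = 0: CF clear
  return 0;
}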
42579
42580/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
42581/// to avoid the inversion.
42582static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
42583 SelectionDAG &DAG,
42584 const X86Subtarget &Subtarget) {
42585 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
42586 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
42587 EFLAGS.getOpcode() != X86ISD::TESTP)
42588 return SDValue();
42589
42590 // PTEST/TESTP sets EFLAGS as:
42591 // TESTZ: ZF = (Op0 & Op1) == 0
42592 // TESTC: CF = (~Op0 & Op1) == 0
42593 // TESTNZC: ZF == 0 && CF == 0
42594 EVT VT = EFLAGS.getValueType();
42595 SDValue Op0 = EFLAGS.getOperand(0);
42596 SDValue Op1 = EFLAGS.getOperand(1);
42597 EVT OpVT = Op0.getValueType();
42598
42599 // TEST*(~X,Y) == TEST*(X,Y)
42600 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
42601 X86::CondCode InvCC;
42602 switch (CC) {
42603 case X86::COND_B:
42604 // testc -> testz.
42605 InvCC = X86::COND_E;
42606 break;
42607 case X86::COND_AE:
42608 // !testc -> !testz.
42609 InvCC = X86::COND_NE;
42610 break;
42611 case X86::COND_E:
42612 // testz -> testc.
42613 InvCC = X86::COND_B;
42614 break;
42615 case X86::COND_NE:
42616 // !testz -> !testc.
42617 InvCC = X86::COND_AE;
42618 break;
42619 case X86::COND_A:
42620 case X86::COND_BE:
42621 // testnzc -> testnzc (no change).
42622 InvCC = CC;
42623 break;
42624 default:
42625 InvCC = X86::COND_INVALID;
42626 break;
42627 }
42628
42629 if (InvCC != X86::COND_INVALID) {
42630 CC = InvCC;
42631 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42632 DAG.getBitcast(OpVT, NotOp0), Op1);
42633 }
42634 }
42635
42636 if (CC == X86::COND_E || CC == X86::COND_NE) {
42637 // TESTZ(X,~Y) == TESTC(Y,X)
42638 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
42639 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42640 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42641 DAG.getBitcast(OpVT, NotOp1), Op0);
42642 }
42643
42644 if (Op0 == Op1) {
42645 SDValue BC = peekThroughBitcasts(Op0);
42646 EVT BCVT = BC.getValueType();
42647 assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&((void)0)
42648 "Unexpected vector type")((void)0);
42649
42650 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
42651 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
42652 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42653 DAG.getBitcast(OpVT, BC.getOperand(0)),
42654 DAG.getBitcast(OpVT, BC.getOperand(1)));
42655 }
42656
42657 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
42658 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
42659 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42660 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42661 DAG.getBitcast(OpVT, BC.getOperand(0)),
42662 DAG.getBitcast(OpVT, BC.getOperand(1)));
42663 }
42664
42665 // If every element is an all-sign value, see if we can use MOVMSK to
42666 // more efficiently extract the sign bits and compare that.
42667 // TODO: Handle TESTC with comparison inversion.
42668 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
42669 // MOVMSK combines to make sure it's never worse than PTEST?
42670 unsigned EltBits = BCVT.getScalarSizeInBits();
42671 if (DAG.ComputeNumSignBits(BC) == EltBits) {
42672 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
42673 APInt SignMask = APInt::getSignMask(EltBits);
42674 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42675 if (SDValue Res =
42676 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
42677 // For vXi16 cases we need to use pmovmskb and extract every other
42678 // sign bit.
42679 SDLoc DL(EFLAGS);
42680 if (EltBits == 16) {
42681 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
42682 Res = DAG.getBitcast(MovmskVT, Res);
42683 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42684 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
42685 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42686 } else {
42687 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42688 }
42689 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
42690 DAG.getConstant(0, DL, MVT::i32));
42691 }
42692 }
42693 }
42694
42695 // TESTZ(-1,X) == TESTZ(X,X)
42696 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
42697 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
42698
42699 // TESTZ(X,-1) == TESTZ(X,X)
42700 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
42701 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
42702 }
42703
42704 return SDValue();
42705}
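
As a standalone sketch (not part of the listing above; a single 64-bit integer stands in for a whole vector register), the identities this combine leans on — ZF = ((Op0 & Op1) == 0), CF = ((~Op0 & Op1) == 0), and the testz/testc dualities under operand inversion — can be checked directly:

#include <cassert>
#include <cstdint>

// Scalar model of the PTEST flag computation described in the comment above:
//   ZF = ((Op0 & Op1) == 0),  CF = ((~Op0 & Op1) == 0).
struct PTestFlags { bool ZF, CF; };

static PTestFlags ptest(uint64_t Op0, uint64_t Op1) {
  return { (Op0 & Op1) == 0, (~Op0 & Op1) == 0 };
}

int main() {
  const uint64_t X = 0x00ffff00f0f0f0f0ULL, Y = 0x123456789abcdef0ULL;
  // TEST*(~X,Y): testz of the inverted op is testc of the original, and vice versa.
  assert(ptest(~X, Y).ZF == ptest(X, Y).CF);
  assert(ptest(~X, Y).CF == ptest(X, Y).ZF);
  // TESTZ(X,~Y) == TESTC(Y,X), the fold applied when only Op1 is inverted.
  assert(ptest(X, ~Y).ZF == ptest(Y, X).CF);
  // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y).
  assert(ptest(X & Y, X & Y).ZF == ptest(X, Y).ZF);
  return 0;
}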
42706
42707// Attempt to simplify the MOVMSK input based on the comparison type.
42708static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
42709 SelectionDAG &DAG,
42710 const X86Subtarget &Subtarget) {
42711 // Handle eq/ne against zero (any_of).
42712 // Handle eq/ne against -1 (all_of).
42713 if (!(CC == X86::COND_E || CC == X86::COND_NE))
42714 return SDValue();
42715 if (EFLAGS.getValueType() != MVT::i32)
42716 return SDValue();
42717 unsigned CmpOpcode = EFLAGS.getOpcode();
42718 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
42719 return SDValue();
42720 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
42721 if (!CmpConstant)
42722 return SDValue();
42723 const APInt &CmpVal = CmpConstant->getAPIntValue();
42724
42725 SDValue CmpOp = EFLAGS.getOperand(0);
42726 unsigned CmpBits = CmpOp.getValueSizeInBits();
42727 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
42728
42729 // Peek through any truncate.
42730 if (CmpOp.getOpcode() == ISD::TRUNCATE)
42731 CmpOp = CmpOp.getOperand(0);
42732
42733 // Bail if we don't find a MOVMSK.
42734 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
42735 return SDValue();
42736
42737 SDValue Vec = CmpOp.getOperand(0);
42738 MVT VecVT = Vec.getSimpleValueType();
42739 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
42740 "Unexpected MOVMSK operand");
42741 unsigned NumElts = VecVT.getVectorNumElements();
42742 unsigned NumEltBits = VecVT.getScalarSizeInBits();
42743
42744 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
42745 bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
42746 CmpVal.isMask(NumElts);
42747 if (!IsAnyOf && !IsAllOf)
42748 return SDValue();
42749
42750 // See if we can peek through to a vector with a wider element type, if the
42751 // signbits extend down to all the sub-elements as well.
42752 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
42753 // potential SimplifyDemandedBits/Elts cases.
42754 if (Vec.getOpcode() == ISD::BITCAST) {
42755 SDValue BC = peekThroughBitcasts(Vec);
42756 MVT BCVT = BC.getSimpleValueType();
42757 unsigned BCNumElts = BCVT.getVectorNumElements();
42758 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
42759 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
42760 BCNumEltBits > NumEltBits &&
42761 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
42762 SDLoc DL(EFLAGS);
42763 unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
42764 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
42765 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
42766 DAG.getConstant(CmpMask, DL, MVT::i32));
42767 }
42768 }
42769
42770 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
42771 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
42772 if (IsAllOf && Subtarget.hasSSE41()) {
42773 SDValue BC = peekThroughBitcasts(Vec);
42774 if (BC.getOpcode() == X86ISD::PCMPEQ &&
42775 ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
42776 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
42777 SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
42778 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
42779 }
42780 }
42781
42782 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
42783 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
42784 // sign bits prior to the comparison with zero unless we know that
42785 // the vXi16 splats the sign bit down to the lower i8 half.
42786 // TODO: Handle all_of patterns.
42787 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
42788 SDValue VecOp0 = Vec.getOperand(0);
42789 SDValue VecOp1 = Vec.getOperand(1);
42790 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
42791 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
42792 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
42793 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
42794 SDLoc DL(EFLAGS);
42795 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
42796 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42797 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
42798 if (!SignExt0) {
42799 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
42800 DAG.getConstant(0xAAAA, DL, MVT::i16));
42801 }
42802 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42803 DAG.getConstant(0, DL, MVT::i16));
42804 }
42805 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
42806 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
42807 if (CmpBits >= 16 && Subtarget.hasInt256() &&
42808 VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42809 VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42810 VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
42811 VecOp0.getConstantOperandAPInt(1) == 0 &&
42812 VecOp1.getConstantOperandAPInt(1) == 8 &&
42813 (IsAnyOf || (SignExt0 && SignExt1))) {
42814 SDLoc DL(EFLAGS);
42815 SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
42816 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42817 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
42818 if (!SignExt0 || !SignExt1) {
42819 assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
42820 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
42821 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42822 }
42823 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42824 DAG.getConstant(CmpMask, DL, MVT::i32));
42825 }
42826 }
42827
42828 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
42829 SmallVector<int, 32> ShuffleMask;
42830 SmallVector<SDValue, 2> ShuffleInputs;
42831 if (NumElts <= CmpBits &&
42832 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
42833 ShuffleMask, DAG) &&
42834 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
42835 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
42836 unsigned NumShuffleElts = ShuffleMask.size();
42837 APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
42838 for (int M : ShuffleMask) {
42839 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
42840 DemandedElts.setBit(M);
42841 }
42842 if (DemandedElts.isAllOnesValue()) {
42843 SDLoc DL(EFLAGS);
42844 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
42845 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42846 Result =
42847 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
42848 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42849 EFLAGS.getOperand(1));
42850 }
42851 }
42852
42853 return SDValue();
42854}
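
As a standalone sketch of why the code above treats MOVMSK compared against zero as an any_of test and MOVMSK compared against an all-ones mask as an all_of test (an illustrative 4 x i32 lane model in plain C++, not DAG nodes):

#include <cassert>
#include <cstdint>

// Scalar model of MOVMSK on a 4 x i32 vector: gather the sign bit of each lane.
static unsigned movmsk(const int32_t V[4]) {
  unsigned Mask = 0;
  for (int I = 0; I < 4; ++I)
    Mask |= (V[I] < 0 ? 1u : 0u) << I;
  return Mask;
}

int main() {
  int32_t AllNeg[4]  = {-1, -7, INT32_MIN, -42};
  int32_t SomeNeg[4] = {1, -7, 3, 4};
  int32_t NoneNeg[4] = {1, 2, 3, 4};

  // any_of: MOVMSK(V) != 0  <=>  some lane has its sign bit set.
  assert(movmsk(SomeNeg) != 0 && movmsk(NoneNeg) == 0);
  // all_of: MOVMSK(V) == 0b1111  <=>  every lane has its sign bit set.
  assert(movmsk(AllNeg) == 0xF && movmsk(SomeNeg) != 0xF);
  return 0;
}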
42855
42856/// Optimize an EFLAGS definition used according to the condition code \p CC
42857/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
42858/// uses of chain values.
42859static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
42860 SelectionDAG &DAG,
42861 const X86Subtarget &Subtarget) {
42862 if (CC == X86::COND_B)
42863 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
42864 return Flags;
42865
42866 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
42867 return R;
42868
42869 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
42870 return R;
42871
42872 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
42873 return R;
42874
42875 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
42876}
42877
42878/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
42879static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
42880 TargetLowering::DAGCombinerInfo &DCI,
42881 const X86Subtarget &Subtarget) {
42882 SDLoc DL(N);
42883
42884 SDValue FalseOp = N->getOperand(0);
42885 SDValue TrueOp = N->getOperand(1);
42886 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
42887 SDValue Cond = N->getOperand(3);
42888
42889 // cmov X, X, ?, ? --> X
42890 if (TrueOp == FalseOp)
42891 return TrueOp;
42892
42893 // Try to simplify the EFLAGS and condition code operands.
42894 // We can't always do this as FCMOV only supports a subset of X86 cond.
42895 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
42896 if (!(FalseOp.getValueType() == MVT::f80 ||
42897 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
42898 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
42899 !Subtarget.hasCMov() || hasFPCMov(CC)) {
42900 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
42901 Flags};
42902 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42903 }
42904 }
42905
42906 // If this is a select between two integer constants, try to do some
42907 // optimizations. Note that the operands are ordered the opposite of SELECT
42908 // operands.
42909 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
42910 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
42911 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
42912 // larger than FalseC (the false value).
42913 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
42914 CC = X86::GetOppositeBranchCondition(CC);
42915 std::swap(TrueC, FalseC);
42916 std::swap(TrueOp, FalseOp);
42917 }
42918
42919 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
42920 // This is efficient for any integer data type (including i8/i16) and
42921 // shift amount.
42922 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
42923 Cond = getSETCC(CC, Cond, DL, DAG);
42924
42925 // Zero extend the condition if needed.
42926 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
42927
42928 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
42929 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
42930 DAG.getConstant(ShAmt, DL, MVT::i8));
42931 return Cond;
42932 }
42933
42934 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
42935 // for any integer data type, including i8/i16.
42936 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
42937 Cond = getSETCC(CC, Cond, DL, DAG);
42938
42939 // Zero extend the condition if needed.
42940 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
42941 FalseC->getValueType(0), Cond);
42942 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42943 SDValue(FalseC, 0));
42944 return Cond;
42945 }
42946
42947 // Optimize cases that will turn into an LEA instruction. This requires
42948 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
42949 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
42950 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
42951 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
42952 "Implicit constant truncation");
42953
42954 bool isFastMultiplier = false;
42955 if (Diff.ult(10)) {
42956 switch (Diff.getZExtValue()) {
42957 default: break;
42958 case 1: // result = add base, cond
42959 case 2: // result = lea base( , cond*2)
42960 case 3: // result = lea base(cond, cond*2)
42961 case 4: // result = lea base( , cond*4)
42962 case 5: // result = lea base(cond, cond*4)
42963 case 8: // result = lea base( , cond*8)
42964 case 9: // result = lea base(cond, cond*8)
42965 isFastMultiplier = true;
42966 break;
42967 }
42968 }
42969
42970 if (isFastMultiplier) {
42971 Cond = getSETCC(CC, Cond, DL ,DAG);
42972 // Zero extend the condition if needed.
42973 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
42974 Cond);
42975 // Scale the condition by the difference.
42976 if (Diff != 1)
42977 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
42978 DAG.getConstant(Diff, DL, Cond.getValueType()));
42979
42980 // Add the base if non-zero.
42981 if (FalseC->getAPIntValue() != 0)
42982 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42983 SDValue(FalseC, 0));
42984 return Cond;
42985 }
42986 }
42987 }
42988 }
42989
42990 // Handle these cases:
42991 // (select (x != c), e, c) -> (select (x != c), e, x),
42992 // (select (x == c), c, e) -> (select (x == c), x, e)
42993 // where the c is an integer constant, and the "select" is the combination
42994 // of CMOV and CMP.
42995 //
42996 // The rationale for this change is that the conditional-move from a constant
42997 // needs two instructions, however, conditional-move from a register needs
42998 // only one instruction.
42999 //
43000 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
43001 // some instruction-combining opportunities. This opt needs to be
43002 // postponed as late as possible.
43003 //
43004 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
43005 // the DCI.xxxx conditions are provided to postpone the optimization as
43006 // late as possible.
43007
43008 ConstantSDNode *CmpAgainst = nullptr;
43009 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
43010 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
43011 !isa<ConstantSDNode>(Cond.getOperand(0))) {
43012
43013 if (CC == X86::COND_NE &&
43014 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
43015 CC = X86::GetOppositeBranchCondition(CC);
43016 std::swap(TrueOp, FalseOp);
43017 }
43018
43019 if (CC == X86::COND_E &&
43020 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
43021 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
43022 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
43023 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43024 }
43025 }
43026 }
43027
43028 // Fold and/or of setcc's to double CMOV:
43029 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
43030 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
43031 //
43032 // This combine lets us generate:
43033 // cmovcc1 (jcc1 if we don't have CMOV)
43034 // cmovcc2 (same)
43035 // instead of:
43036 // setcc1
43037 // setcc2
43038 // and/or
43039 // cmovne (jne if we don't have CMOV)
43040 // When we can't use the CMOV instruction, it might increase branch
43041 // mispredicts.
43042 // When we can use CMOV, or when there is no mispredict, this improves
43043 // throughput and reduces register pressure.
43044 //
43045 if (CC == X86::COND_NE) {
43046 SDValue Flags;
43047 X86::CondCode CC0, CC1;
43048 bool isAndSetCC;
43049 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
43050 if (isAndSetCC) {
43051 std::swap(FalseOp, TrueOp);
43052 CC0 = X86::GetOppositeBranchCondition(CC0);
43053 CC1 = X86::GetOppositeBranchCondition(CC1);
43054 }
43055
43056 SDValue LOps[] = {FalseOp, TrueOp,
43057 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
43058 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
43059 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
43060 Flags};
43061 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43062 return CMOV;
43063 }
43064 }
43065
43066 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
43067 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
43068 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
43069 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
43070 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
43071 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
43072 SDValue Add = TrueOp;
43073 SDValue Const = FalseOp;
43074 // Canonicalize the condition code for easier matching and output.
43075 if (CC == X86::COND_E)
43076 std::swap(Add, Const);
43077
43078 // We might have replaced the constant in the cmov with the LHS of the
43079 // compare. If so change it to the RHS of the compare.
43080 if (Const == Cond.getOperand(0))
43081 Const = Cond.getOperand(1);
43082
43083 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
43084 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
43085 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
43086 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
43087 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
43088 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
43089 EVT VT = N->getValueType(0);
43090 // This should constant fold.
43091 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
43092 SDValue CMov =
43093 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
43094 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
43095 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
43096 }
43097 }
43098
43099 return SDValue();
43100}
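
The constant-select rewrites above (pow2/0, cst+1/cst, and the LEA "fast multiplier" case) all reduce to simple scalar identities over a zero-extended setcc value. A minimal check with illustrative constants, not DAG nodes:

#include <cassert>
#include <cstdint>

int main() {
  for (int CondBit = 0; CondBit <= 1; ++CondBit) {
    uint64_t C = CondBit; // zero-extended setcc result: 0 or 1.

    // Cond ? 8 : 0  ==  zext(setcc) << 3   (any power of two vs. zero).
    assert((CondBit ? 8u : 0u) == (C << 3));

    // Cond ? cst+1 : cst  ==  zext(setcc) + cst.
    const uint64_t Cst = 41;
    assert((CondBit ? Cst + 1 : Cst) == C + Cst);

    // Cond ? TrueC : FalseC with a "fast multiplier" difference (here 5):
    //   zext(setcc) * (TrueC - FalseC) + FalseC, an LEA-friendly form.
    const uint64_t FalseC = 100, TrueC = 105;
    assert((CondBit ? TrueC : FalseC) == C * (TrueC - FalseC) + FalseC);
  }
  return 0;
}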
43101
43102/// Different mul shrinking modes.
43103enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
43104
43105static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
43106 EVT VT = N->getOperand(0).getValueType();
43107 if (VT.getScalarSizeInBits() != 32)
43108 return false;
43109
43110 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
43111 unsigned SignBits[2] = {1, 1};
43112 bool IsPositive[2] = {false, false};
43113 for (unsigned i = 0; i < 2; i++) {
43114 SDValue Opd = N->getOperand(i);
43115
43116 SignBits[i] = DAG.ComputeNumSignBits(Opd);
43117 IsPositive[i] = DAG.SignBitIsZero(Opd);
43118 }
43119
43120 bool AllPositive = IsPositive[0] && IsPositive[1];
43121 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
43122 // When ranges are from -128 ~ 127, use MULS8 mode.
43123 if (MinSignBits >= 25)
43124 Mode = ShrinkMode::MULS8;
43125 // When ranges are from 0 ~ 255, use MULU8 mode.
43126 else if (AllPositive && MinSignBits >= 24)
43127 Mode = ShrinkMode::MULU8;
43128 // When ranges are from -32768 ~ 32767, use MULS16 mode.
43129 else if (MinSignBits >= 17)
43130 Mode = ShrinkMode::MULS16;
43131 // When ranges are from 0 ~ 65535, use MULU16 mode.
43132 else if (AllPositive && MinSignBits >= 16)
43133 Mode = ShrinkMode::MULU16;
43134 else
43135 return false;
43136 return true;
43137}
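
The thresholds above (25/24/17/16 sign bits for a 32-bit element) correspond to the i8/i16 signed and unsigned ranges named in the comments; the unsigned variants additionally require the sign bit to be known zero. A small sketch with a hypothetical countSignBits helper standing in for DAG.ComputeNumSignBits:

#include <cassert>
#include <cstdint>

// Number of identical leading bits (copies of the sign bit) in a 32-bit value,
// mirroring what ComputeNumSignBits reports for a known constant.
static unsigned countSignBits(int32_t V) {
  const uint32_t U = uint32_t(V);
  const uint32_t Sign = (U >> 31) & 1;
  unsigned N = 1;
  for (int Bit = 30; Bit >= 0; --Bit, ++N)
    if (((U >> Bit) & 1) != Sign)
      break;
  return N;
}

int main() {
  // >= 25 sign bits  <=>  the value fits in i8 (-128 .. 127), the MULS8 range.
  assert(countSignBits(127) >= 25 && countSignBits(-128) >= 25);
  assert(countSignBits(128) < 25 && countSignBits(-129) < 25);
  // >= 17 sign bits  <=>  the value fits in i16 (-32768 .. 32767), the MULS16 range.
  assert(countSignBits(32767) >= 17 && countSignBits(-32768) >= 17);
  assert(countSignBits(32768) < 17);
  return 0;
}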
43138
43139/// When the operands of vector mul are extended from smaller size values,
43140 /// like i8 and i16, the type of mul may be shrunk to generate more
43141/// efficient code. Two typical patterns are handled:
43142/// Pattern1:
43143/// %2 = sext/zext <N x i8> %1 to <N x i32>
43144/// %4 = sext/zext <N x i8> %3 to <N x i32>
43145 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43146/// %5 = mul <N x i32> %2, %4
43147///
43148/// Pattern2:
43149/// %2 = zext/sext <N x i16> %1 to <N x i32>
43150/// %4 = zext/sext <N x i16> %3 to <N x i32>
43151/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43152/// %5 = mul <N x i32> %2, %4
43153///
43154/// There are four mul shrinking modes:
43155/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
43156 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
43157/// generate pmullw+sext32 for it (MULS8 mode).
43158/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
43159/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
43160/// generate pmullw+zext32 for it (MULU8 mode).
43161/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
43162/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
43163/// generate pmullw+pmulhw for it (MULS16 mode).
43164/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
43165/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
43166/// generate pmullw+pmulhuw for it (MULU16 mode).
43167static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
43168 const X86Subtarget &Subtarget) {
43169 // Check for legality
43170 // pmullw/pmulhw on XMM registers require SSE2.
43171 if (!Subtarget.hasSSE2())
43172 return SDValue();
43173
43174 // Check for profitability
43175 // pmulld is supported since SSE4.1. It is better to use pmulld
43176 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
43177 // the expansion.
43178 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
43179 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
43180 return SDValue();
43181
43182 ShrinkMode Mode;
43183 if (!canReduceVMulWidth(N, DAG, Mode))
43184 return SDValue();
43185
43186 SDLoc DL(N);
43187 SDValue N0 = N->getOperand(0);
43188 SDValue N1 = N->getOperand(1);
43189 EVT VT = N->getOperand(0).getValueType();
43190 unsigned NumElts = VT.getVectorNumElements();
43191 if ((NumElts % 2) != 0)
43192 return SDValue();
43193
43194 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
43195
43196 // Shrink the operands of mul.
43197 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
43198 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
43199
43200 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
43201 // lower part is needed.
43202 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
43203 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
43204 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
43205 : ISD::SIGN_EXTEND,
43206 DL, VT, MulLo);
43207
43208 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
43209 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
43210 // the higher part is also needed.
43211 SDValue MulHi =
43212 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
43213 ReducedVT, NewN0, NewN1);
43214
43215 // Repack the lower part and higher part result of mul into a wider
43216 // result.
43217 // Generate shuffle functioning as punpcklwd.
43218 SmallVector<int, 16> ShuffleMask(NumElts);
43219 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43220 ShuffleMask[2 * i] = i;
43221 ShuffleMask[2 * i + 1] = i + NumElts;
43222 }
43223 SDValue ResLo =
43224 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43225 ResLo = DAG.getBitcast(ResVT, ResLo);
43226 // Generate shuffle functioning as punpckhwd.
43227 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43228 ShuffleMask[2 * i] = i + NumElts / 2;
43229 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
43230 }
43231 SDValue ResHi =
43232 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43233 ResHi = DAG.getBitcast(ResVT, ResHi);
43234 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
43235}
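
reduceVMULWidth reassembles each 32-bit product from the pmullw (low half) and pmulhw/pmulhuw (high half) results, interleaved by the punpcklwd/punpckhwd-style shuffles built above. For a single element the identity is just the following (signed MULS16 flavour shown; the values are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const int16_t A = -12345, B = 321;

  // pmullw: low 16 bits of the 32-bit product. pmulhw: high 16 bits (signed).
  int32_t Full = int32_t(A) * int32_t(B);
  uint16_t Lo = uint16_t(uint32_t(Full) & 0xFFFF);  // pmullw lane
  uint16_t Hi = uint16_t(uint32_t(Full) >> 16);     // pmulhw lane

  // Interleaving Lo (even word) with Hi (odd word) reassembles the i32 result,
  // which is what the punpcklwd/punpckhwd shuffles achieve vector-wide.
  uint32_t Rebuilt = uint32_t(Lo) | (uint32_t(Hi) << 16);
  assert(Rebuilt == uint32_t(Full));
  return 0;
}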
43236
43237static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
43238 EVT VT, const SDLoc &DL) {
43239
43240 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
43241 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43242 DAG.getConstant(Mult, DL, VT));
43243 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
43244 DAG.getConstant(Shift, DL, MVT::i8));
43245 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43246 N->getOperand(0));
43247 return Result;
43248 };
43249
43250 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
43251 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43252 DAG.getConstant(Mul1, DL, VT));
43253 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
43254 DAG.getConstant(Mul2, DL, VT));
43255 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43256 N->getOperand(0));
43257 return Result;
43258 };
43259
43260 switch (MulAmt) {
43261 default:
43262 break;
43263 case 11:
43264 // mul x, 11 => add ((shl (mul x, 5), 1), x)
43265 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
43266 case 21:
43267 // mul x, 21 => add ((shl (mul x, 5), 2), x)
43268 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
43269 case 41:
43270 // mul x, 41 => add ((shl (mul x, 5), 3), x)
43271 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
43272 case 22:
43273 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
43274 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43275 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
43276 case 19:
43277 // mul x, 19 => add ((shl (mul x, 9), 1), x)
43278 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
43279 case 37:
43280 // mul x, 37 => add ((shl (mul x, 9), 2), x)
43281 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
43282 case 73:
43283 // mul x, 73 => add ((shl (mul x, 9), 3), x)
43284 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
43285 case 13:
43286 // mul x, 13 => add ((shl (mul x, 3), 2), x)
43287 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
43288 case 23:
43289 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
43290 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
43291 case 26:
43292 // mul x, 26 => add ((mul (mul x, 5), 5), x)
43293 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
43294 case 28:
43295 // mul x, 28 => add ((mul (mul x, 9), 3), x)
43296 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
43297 case 29:
43298 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
43299 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43300 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
43301 }
43302
43303 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
43304 // by a single LEA.
43305 // First check if this is a sum of two powers of 2 because that's easy. Then
43306 // count how many trailing zeros there are up to the first set bit.
43307 // TODO: We can do this even without LEA at a cost of two shifts and an add.
43308 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
43309 unsigned ScaleShift = countTrailingZeros(MulAmt);
43310 if (ScaleShift >= 1 && ScaleShift < 4) {
43311 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
43312 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43313 DAG.getConstant(ShiftAmt, DL, MVT::i8));
43314 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43315 DAG.getConstant(ScaleShift, DL, MVT::i8));
43316 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
43317 }
43318 }
43319
43320 return SDValue();
43321}
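
Each case in combineMulSpecial encodes an algebraic identity that maps the multiply onto LEA-friendly multiplies by 3/5/9 plus shifts and adds, and the final block handles sums of two powers of two. The identities are easy to sanity-check with ordinary wrapping 64-bit arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Vals[] = {1, 7, 0x123456789ULL, ~0ULL};
  for (uint64_t X : Vals) {
    assert(X * 11 == ((X * 5) << 1) + X);       // case 11
    assert(X * 21 == ((X * 5) << 2) + X);       // case 21
    assert(X * 23 == ((X * 3) << 3) - X);       // case 23
    assert(X * 26 == ((X * 5) * 5) + X);        // case 26
    assert(X * 29 == (((X * 9) * 3) + X) + X);  // case 29
    // Power-of-2 sum trick: e.g. 20 = 16 + 4 -> (X << 4) + (X << 2).
    assert(X * 20 == (X << 4) + (X << 2));
  }
  return 0;
}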
43322
43323// If the upper 17 bits of each element are zero then we can use PMADDWD,
43324// which is always at least as quick as PMULLD, except on KNL.
43325static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
43326 const X86Subtarget &Subtarget) {
43327 if (!Subtarget.hasSSE2())
43328 return SDValue();
43329
43330 if (Subtarget.isPMADDWDSlow())
43331 return SDValue();
43332
43333 EVT VT = N->getValueType(0);
43334
43335 // Only support vXi32 vectors.
43336 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
43337 return SDValue();
43338
43339 // Make sure the type is legal or will be widened to a legal type.
43340 if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
43341 return SDValue();
43342
43343 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
43344
43345 // Without BWI, we would need to split v32i16.
43346 if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
43347 return SDValue();
43348
43349 SDValue N0 = N->getOperand(0);
43350 SDValue N1 = N->getOperand(1);
43351
43352 // If we are zero extending two steps without SSE4.1, it's better to reduce
43353 // the vmul width instead.
43354 if (!Subtarget.hasSSE41() &&
43355 (N0.getOpcode() == ISD::ZERO_EXTEND &&
43356 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
43357 (N1.getOpcode() == ISD::ZERO_EXTEND &&
43358 N1.getOperand(0).getScalarValueSizeInBits() <= 8))
43359 return SDValue();
43360
43361 APInt Mask17 = APInt::getHighBitsSet(32, 17);
43362 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
43363 !DAG.MaskedValueIsZero(N0, Mask17))
43364 return SDValue();
43365
43366 // Use SplitOpsAndApply to handle AVX splitting.
43367 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43368 ArrayRef<SDValue> Ops) {
43369 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43370 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
43371 };
43372 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
43373 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
43374 PMADDWDBuilder);
43375}
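
The Mask17 check above guarantees each 32-bit element is a non-negative value below 2^15, so the odd i16 halves fed to VPMADDWD are zero and its pairwise multiply-add collapses to the plain 32-bit product. A scalar sketch of one lane (pmaddwdLane is a model, not an LLVM API):

#include <cassert>
#include <cstdint>

// Model one 32-bit lane of VPMADDWD: treat the lane as two signed i16 halves,
// multiply pairwise and add the two products.
static int32_t pmaddwdLane(uint32_t A, uint32_t B) {
  int16_t A0 = int16_t(A & 0xFFFF), A1 = int16_t(A >> 16);
  int16_t B0 = int16_t(B & 0xFFFF), B1 = int16_t(B >> 16);
  return int32_t(A0) * B0 + int32_t(A1) * B1;
}

int main() {
  // Upper 17 bits zero => value < 2^15, so the high i16 half is 0 and the low
  // half is non-negative; the pairwise multiply-add degenerates to A0*B0.
  const uint32_t A = 0x5432, B = 0x7FFF; // both < 2^15
  assert((A >> 15) == 0 && (B >> 15) == 0);
  assert(pmaddwdLane(A, B) == int32_t(A * B));
  return 0;
}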
43376
43377static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
43378 const X86Subtarget &Subtarget) {
43379 if (!Subtarget.hasSSE2())
43380 return SDValue();
43381
43382 EVT VT = N->getValueType(0);
43383
43384 // Only support vXi64 vectors.
43385 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
43386 VT.getVectorNumElements() < 2 ||
43387 !isPowerOf2_32(VT.getVectorNumElements()))
43388 return SDValue();
43389
43390 SDValue N0 = N->getOperand(0);
43391 SDValue N1 = N->getOperand(1);
43392
43393 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
43394 // 32 bits. We can lower with this if the sign bits stretch that far.
43395 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
43396 DAG.ComputeNumSignBits(N1) > 32) {
43397 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43398 ArrayRef<SDValue> Ops) {
43399 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
43400 };
43401 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43402 PMULDQBuilder, /*CheckBWI*/false);
43403 }
43404
43405 // If the upper bits are zero we can use a single pmuludq.
43406 APInt Mask = APInt::getHighBitsSet(64, 32);
43407 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
43408 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43409 ArrayRef<SDValue> Ops) {
43410 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
43411 };
43412 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43413 PMULUDQBuilder, /*CheckBWI*/false);
43414 }
43415
43416 return SDValue();
43417}
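
PMULUDQ/PMULDQ only multiply the low 32 bits of each 64-bit lane, which is why the code above requires either at least 33 sign bits or zero upper halves. A scalar model of the unsigned case, with illustrative values:

#include <cassert>
#include <cstdint>

// Model of one PMULUDQ lane: 64-bit product of the low 32 bits of each operand.
static uint64_t pmuludqLane(uint64_t A, uint64_t B) {
  return uint64_t(uint32_t(A)) * uint64_t(uint32_t(B));
}

int main() {
  // Upper 32 bits zero => the lane-local multiply equals the full i64 multiply.
  const uint64_t A = 0x00000000DEADBEEFULL, B = 0x0000000012345678ULL;
  assert((A >> 32) == 0 && (B >> 32) == 0);
  assert(pmuludqLane(A, B) == A * B);

  // With high bits set the shortcut is no longer valid.
  const uint64_t C = 0x100000001ULL;
  assert(pmuludqLane(C, C) != C * C);
  return 0;
}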
43418
43419/// Optimize a single multiply with constant into two operations in order to
43420/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
43421static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
43422 TargetLowering::DAGCombinerInfo &DCI,
43423 const X86Subtarget &Subtarget) {
43424 EVT VT = N->getValueType(0);
43425
43426 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
43427 return V;
43428
43429 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
43430 return V;
43431
43432 if (DCI.isBeforeLegalize() && VT.isVector())
43433 return reduceVMULWidth(N, DAG, Subtarget);
43434
43435 if (!MulConstantOptimization)
43436 return SDValue();
43437 // An imul is usually smaller than the alternative sequence.
43438 if (DAG.getMachineFunction().getFunction().hasMinSize())
43439 return SDValue();
43440
43441 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
43442 return SDValue();
43443
43444 if (VT != MVT::i64 && VT != MVT::i32)
43445 return SDValue();
43446
43447 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
43448 if (!C)
43449 return SDValue();
43450 if (isPowerOf2_64(C->getZExtValue()))
43451 return SDValue();
43452
43453 int64_t SignMulAmt = C->getSExtValue();
43454 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
43455 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
43456
43457 SDLoc DL(N);
43458 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
43459 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43460 DAG.getConstant(AbsMulAmt, DL, VT));
43461 if (SignMulAmt < 0)
43462 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43463 NewMul);
43464
43465 return NewMul;
43466 }
43467
43468 uint64_t MulAmt1 = 0;
43469 uint64_t MulAmt2 = 0;
43470 if ((AbsMulAmt % 9) == 0) {
43471 MulAmt1 = 9;
43472 MulAmt2 = AbsMulAmt / 9;
43473 } else if ((AbsMulAmt % 5) == 0) {
43474 MulAmt1 = 5;
43475 MulAmt2 = AbsMulAmt / 5;
43476 } else if ((AbsMulAmt % 3) == 0) {
43477 MulAmt1 = 3;
43478 MulAmt2 = AbsMulAmt / 3;
43479 }
43480
43481 SDValue NewMul;
43482 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
43483 if (MulAmt2 &&
43484 (isPowerOf2_64(MulAmt2) ||
43485 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
43486
43487 if (isPowerOf2_64(MulAmt2) &&
43488 !(SignMulAmt >= 0 && N->hasOneUse() &&
43489 N->use_begin()->getOpcode() == ISD::ADD))
43490 // If the second multiplier is pow2, issue it first. We want the multiply by
43491 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
43492 // is an add. Only do this for positive multiply amounts since the
43493 // negate would prevent it from being used as an address mode anyway.
43494 std::swap(MulAmt1, MulAmt2);
43495
43496 if (isPowerOf2_64(MulAmt1))
43497 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43498 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
43499 else
43500 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43501 DAG.getConstant(MulAmt1, DL, VT));
43502
43503 if (isPowerOf2_64(MulAmt2))
43504 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
43505 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
43506 else
43507 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
43508 DAG.getConstant(MulAmt2, DL, VT));
43509
43510 // Negate the result.
43511 if (SignMulAmt < 0)
43512 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43513 NewMul);
43514 } else if (!Subtarget.slowLEA())
43515 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
43516
43517 if (!NewMul) {
43518 assert(C->getZExtValue() != 0 &&
43519 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
43520 "Both cases that could cause potential overflows should have "
43521 "already been handled.");
43522 if (isPowerOf2_64(AbsMulAmt - 1)) {
43523 // (mul x, 2^N + 1) => (add (shl x, N), x)
43524 NewMul = DAG.getNode(
43525 ISD::ADD, DL, VT, N->getOperand(0),
43526 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43527 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
43528 MVT::i8)));
43529 // To negate, subtract the number from zero
43530 if (SignMulAmt < 0)
43531 NewMul = DAG.getNode(ISD::SUB, DL, VT,
43532 DAG.getConstant(0, DL, VT), NewMul);
43533 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
43534 // (mul x, 2^N - 1) => (sub (shl x, N), x)
43535 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43536 DAG.getConstant(Log2_64(AbsMulAmt + 1),
43537 DL, MVT::i8));
43538 // To negate, reverse the operands of the subtract.
43539 if (SignMulAmt < 0)
43540 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
43541 else
43542 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43543 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
43544 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
43545 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43546 DAG.getConstant(Log2_64(AbsMulAmt - 2),
43547 DL, MVT::i8));
43548 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43549 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43550 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
43551 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
43552 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43553 DAG.getConstant(Log2_64(AbsMulAmt + 2),
43554 DL, MVT::i8));
43555 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43556 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43557 }
43558 }
43559
43560 return NewMul;
43561}
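
The shift/add/sub rewrites in the tail of combineMul rest on identities that hold in wrapping 64-bit arithmetic, including the subtract-from-zero negation used for negative amounts. A quick check with illustrative values:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Vals[] = {1, 3, 0xDEADBEEFULL, ~0ULL};
  for (uint64_t X : Vals) {
    // (mul x, 2^N + 1) => (add (shl x, N), x), e.g. 33 = 32 + 1.
    assert(X * 33 == (X << 5) + X);
    // (mul x, 2^N - 1) => (sub (shl x, N), x), e.g. 31 = 32 - 1.
    assert(X * 31 == (X << 5) - X);
    // (mul x, 2^N + 2) => (add (add (shl x, N), x), x), e.g. 34.
    assert(X * 34 == ((X << 5) + X) + X);
    // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x), e.g. 30.
    assert(X * 30 == ((X << 5) - X) - X);
    // Negative amounts: negate by subtracting from zero, e.g. -33.
    assert(X * uint64_t(-33) == 0 - ((X << 5) + X));
  }
  return 0;
}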
43562
43563// Try to form a MULHU or MULHS node by looking for
43564// (srl (mul ext, ext), 16)
43565// TODO: This is X86 specific because we want to be able to handle wide types
43566// before type legalization. But we can only do it if the vector will be
43567// legalized via widening/splitting. Type legalization can't handle promotion
43568// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
43569// combiner.
43570static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
43571 const X86Subtarget &Subtarget) {
43572 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
43573 "SRL or SRA node is required here!");
43574 SDLoc DL(N);
43575
43576 // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
43577 // the multiply.
43578 if (!Subtarget.hasSSE41())
43579 return SDValue();
43580
43581 // The operation feeding into the shift must be a multiply.
43582 SDValue ShiftOperand = N->getOperand(0);
43583 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
43584 return SDValue();
43585
43586 // Input type should be at least vXi32.
43587 EVT VT = N->getValueType(0);
43588 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
43589 return SDValue();
43590
43591 // Need a shift by 16.
43592 APInt ShiftAmt;
43593 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
43594 ShiftAmt != 16)
43595 return SDValue();
43596
43597 SDValue LHS = ShiftOperand.getOperand(0);
43598 SDValue RHS = ShiftOperand.getOperand(1);
43599
43600 unsigned ExtOpc = LHS.getOpcode();
43601 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
43602 RHS.getOpcode() != ExtOpc)
43603 return SDValue();
43604
43605 // Peek through the extends.
43606 LHS = LHS.getOperand(0);
43607 RHS = RHS.getOperand(0);
43608
43609 // Ensure the input types match.
43610 EVT MulVT = LHS.getValueType();
43611 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
43612 return SDValue();
43613
43614 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
43615 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
43616
43617 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
43618 return DAG.getNode(ExtOpc, DL, VT, Mulh);
43619}
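
The fold above works because shifting the widened product right by 16 is exactly the high half of a 16x16 multiply, with the extension kind (sext vs zext) selecting MULHS vs MULHU. In scalar form (mulhu16/mulhs16 are models, not LLVM helpers; values are illustrative):

#include <cassert>
#include <cstdint>

// High 16 bits of the 32-bit product, unsigned and signed flavours.
static uint16_t mulhu16(uint16_t A, uint16_t B) {
  return uint16_t((uint32_t(A) * uint32_t(B)) >> 16);
}
static int16_t mulhs16(int16_t A, int16_t B) {
  return int16_t((int32_t(A) * int32_t(B)) >> 16);
}

int main() {
  const uint16_t A = 0xBEEF, B = 0x1234;

  // (srl (mul (zext a), (zext b)), 16) is exactly MULHU on the narrow type.
  uint32_t WideZext = uint32_t(A) * uint32_t(B);
  assert(uint16_t(WideZext >> 16) == mulhu16(A, B));

  // (sra (mul (sext a), (sext b)), 16) is exactly MULHS on the narrow type.
  // For the same bit patterns the two high halves differ, which is why the
  // extension opcode above picks MULHS vs MULHU.
  int32_t WideSext = int32_t(int16_t(A)) * int32_t(int16_t(B));
  assert(int16_t(WideSext >> 16) == mulhs16(int16_t(A), int16_t(B)));
  assert(mulhu16(A, B) != uint16_t(mulhs16(int16_t(A), int16_t(B))));
  return 0;
}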
43620
43621static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
43622 SDValue N0 = N->getOperand(0);
43623 SDValue N1 = N->getOperand(1);
43624 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
43625 EVT VT = N0.getValueType();
43626
43627 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
43628 // since the result of setcc_c is all zero's or all ones.
43629 if (VT.isInteger() && !VT.isVector() &&
43630 N1C && N0.getOpcode() == ISD::AND &&
43631 N0.getOperand(1).getOpcode() == ISD::Constant) {
43632 SDValue N00 = N0.getOperand(0);
43633 APInt Mask = N0.getConstantOperandAPInt(1);
43634 Mask <<= N1C->getAPIntValue();
43635 bool MaskOK = false;
43636 // We can handle cases concerning bit-widening nodes containing setcc_c if
43637 // we carefully interrogate the mask to make sure we are semantics
43638 // preserving.
43639 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
43640 // of the underlying setcc_c operation if the setcc_c was zero extended.
43641 // Consider the following example:
43642 // zext(setcc_c) -> i32 0x0000FFFF
43643 // c1 -> i32 0x0000FFFF
43644 // c2 -> i32 0x00000001
43645 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
43646 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
43647 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
43648 MaskOK = true;
43649 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
43650 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43651 MaskOK = true;
43652 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
43653 N00.getOpcode() == ISD::ANY_EXTEND) &&
43654 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43655 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
43656 }
43657 if (MaskOK && Mask != 0) {
43658 SDLoc DL(N);
43659 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
43660 }
43661 }
43662
43663 // Hardware support for vector shifts is sparse which makes us scalarize the
43664 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
43665 // shl.
43666 // (shl V, 1) -> add V,V
43667 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
43668 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
43669 assert(N0.getValueType().isVector() && "Invalid vector shift type");
43670 // We shift all of the values by one. In many cases we do not have
43671 // hardware support for this operation. This is better expressed as an ADD
43672 // of two values.
43673 if (N1SplatC->isOne())
43674 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
43675 }
43676
43677 return SDValue();
43678}
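
The worked example in the comment above can be replayed with plain integers: when setcc_c was zero extended from 16 bits, shifting the mask left can escape that width and the fold would change the result, which is exactly what the MaskOK/isIntN guard prevents. A minimal sketch:

#include <cassert>
#include <cstdint>

int main() {
  // zext(setcc_c): a 16-bit all-ones result zero-extended to i32.
  const uint32_t SetccC = 0x0000FFFFu;
  const uint32_t C1 = 0x0000FFFFu;
  const uint32_t C2 = 1;

  // Original:      (shl (and setcc_c, c1), c2)
  uint32_t Original = (SetccC & C1) << C2;   // 0x0001FFFE
  // Proposed fold: (and setcc_c, (c1 << c2))
  uint32_t Folded = SetccC & (C1 << C2);     // 0x0000FFFE

  // The shifted mask no longer fits in the zero-extended 16-bit width, so the
  // two forms differ -- the case the MaskOK checks are guarding against.
  assert(Original == 0x0001FFFEu && Folded == 0x0000FFFEu);
  assert(Original != Folded);
  return 0;
}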
43679
43680static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
43681 const X86Subtarget &Subtarget) {
43682 SDValue N0 = N->getOperand(0);
43683 SDValue N1 = N->getOperand(1);
43684 EVT VT = N0.getValueType();
43685 unsigned Size = VT.getSizeInBits();
43686
43687 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43688 return V;
43689
43690 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
43691 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
43692 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
43693 // depending on sign of (SarConst - [56,48,32,24,16])
43694
43695 // sexts in X86 are MOVs. The MOVs have the same code size
43696 // as above SHIFTs (only SHIFT on 1 has lower code size).
43697 // However the MOVs have 2 advantages to a SHIFT:
43698 // 1. MOVs can write to a register that differs from source
43699 // 2. MOVs accept memory operands
43700
43701 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
43702 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
43703 N0.getOperand(1).getOpcode() != ISD::Constant)
43704 return SDValue();
43705
43706 SDValue N00 = N0.getOperand(0);
43707 SDValue N01 = N0.getOperand(1);
43708 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
43709 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
43710 EVT CVT = N1.getValueType();
43711
43712 if (SarConst.isNegative())
43713 return SDValue();
43714
43715 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
43716 unsigned ShiftSize = SVT.getSizeInBits();
43717 // Skip types without a corresponding sext/zext and
43718 // ShlConst values that are not one of [56,48,32,24,16].
43719 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
43720 continue;
43721 SDLoc DL(N);
43722 SDValue NN =
43723 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
43724 SarConst = SarConst - (Size - ShiftSize);
43725 if (SarConst == 0)
43726 return NN;
43727 else if (SarConst.isNegative())
43728 return DAG.getNode(ISD::SHL, DL, VT, NN,
43729 DAG.getConstant(-SarConst, DL, CVT));
43730 else
43731 return DAG.getNode(ISD::SRA, DL, VT, NN,
43732 DAG.getConstant(SarConst, DL, CVT));
43733 }
43734 return SDValue();
43735}
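
The ashr-of-shl fold above replaces the shift pair with a sign extension from the narrow type plus at most one remaining shift, depending on the sign of SarConst - ShlConst. In scalar terms, with Size = 32 and ShiftSize = 8 (so ShlConst = 24); shl32/sextInreg8 are models assuming two's-complement wrapping:

#include <cassert>
#include <cstdint>

// Two's-complement left shift and the SIGN_EXTEND_INREG from i8 used above.
static int32_t shl32(int32_t V, int N) { return int32_t(uint32_t(V) << N); }
static int32_t sextInreg8(int32_t V) { return int32_t(int8_t(V)); }

int main() {
  const int32_t A = 0x12345687; // low byte 0x87 is negative when viewed as i8.

  // SarConst == ShlConst (24): (a << 24) >> 24 is just sext_inreg(a, i8).
  assert((shl32(A, 24) >> 24) == sextInreg8(A));
  // SarConst > ShlConst (26): one arithmetic right shift remains.
  assert((shl32(A, 24) >> 26) == (sextInreg8(A) >> 2));
  // SarConst < ShlConst (21): one left shift remains.
  assert((shl32(A, 24) >> 21) == shl32(sextInreg8(A), 3));
  return 0;
}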
43736
43737static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
43738 TargetLowering::DAGCombinerInfo &DCI,
43739 const X86Subtarget &Subtarget) {
43740 SDValue N0 = N->getOperand(0);
43741 SDValue N1 = N->getOperand(1);
43742 EVT VT = N0.getValueType();
43743
43744 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43745 return V;
43746
43747 // Only do this on the last DAG combine as it can interfere with other
43748 // combines.
43749 if (!DCI.isAfterLegalizeDAG())
43750 return SDValue();
43751
43752 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
43753 // TODO: This is a generic DAG combine that became an x86-only combine to
43754 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
43755 // and-not ('andn').
43756 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
43757 return SDValue();
43758
43759 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
43760 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
43761 if (!ShiftC || !AndC)
43762 return SDValue();
43763
43764 // If we can shrink the constant mask below 8-bits or 32-bits, then this
43765 // transform should reduce code size. It may also enable secondary transforms
43766 // from improved known-bits analysis or instruction selection.
43767 APInt MaskVal = AndC->getAPIntValue();
43768
43769 // If this can be matched by a zero extend, don't optimize.
43770 if (MaskVal.isMask()) {
43771 unsigned TO = MaskVal.countTrailingOnes();
43772 if (TO >= 8 && isPowerOf2_32(TO))
43773 return SDValue();
43774 }
43775
43776 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
43777 unsigned OldMaskSize = MaskVal.getMinSignedBits();
43778 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
43779 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
43780 (OldMaskSize > 32 && NewMaskSize <= 32)) {
43781 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
43782 SDLoc DL(N);
43783 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
43784 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
43785 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
43786 }
43787 return SDValue();
43788}
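
The reordering above depends on the identity (x & C1) >> C2 == (x >> C2) & (C1 >> C2) plus the observation that the shifted mask may shrink enough to fit a sign-extended 8-bit immediate, which is what the OldMaskSize/NewMaskSize comparison measures. A scalar check with illustrative constants:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t X = 0x0123456789ABCDEFULL;
  const uint64_t C1 = 0x7F00;  // needs more than an 8-bit immediate
  const unsigned C2 = 8;

  // srl (and X, C1), C2  ==  and (srl X, C2), (C1 >> C2)
  assert(((X & C1) >> C2) == ((X >> C2) & (C1 >> C2)));

  // The new mask 0x7F fits in a sign-extended 8-bit immediate, which is the
  // code-size win the transform is after.
  assert((C1 >> C2) == 0x7F);
  return 0;
}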
43789
43790static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
43791 const X86Subtarget &Subtarget) {
43792 unsigned Opcode = N->getOpcode();
43793 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
43794
43795 SDLoc DL(N);
43796 EVT VT = N->getValueType(0);
43797 SDValue N0 = N->getOperand(0);
43798 SDValue N1 = N->getOperand(1);
43799 EVT SrcVT = N0.getValueType();
43800
43801 SDValue BC0 =
43802 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
43803 SDValue BC1 =
43804 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
43805
43806 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
43807 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))). This is mainly for
43808 // truncation trees that help us avoid lane crossing shuffles.
43809 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
43810 // TODO: We don't handle vXf64 shuffles yet.
43811 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
43812 BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43813 BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43814 BC0.getOperand(0) == BC1.getOperand(0) &&
43815 BC0.getOperand(0).getValueType().is256BitVector() &&
43816 BC0.getConstantOperandAPInt(1) == 0 &&
43817 BC1.getConstantOperandAPInt(1) ==
43818 BC0.getValueType().getVectorNumElements()) {
43819 SmallVector<SDValue> ShuffleOps;
43820 SmallVector<int> ShuffleMask, ScaledMask;
43821 SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
43822 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
43823 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
43824 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
43825 // shuffle to a v4X64 width - we can probably relax this in the future.
43826 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
43827 ShuffleOps[0].getValueType().is256BitVector() &&
43828 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
43829 SDValue Lo, Hi;
43830 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43831 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
43832 Lo = DAG.getBitcast(SrcVT, Lo);
43833 Hi = DAG.getBitcast(SrcVT, Hi);
43834 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
43835 Res = DAG.getBitcast(ShufVT, Res);
43836 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
43837 return DAG.getBitcast(VT, Res);
43838 }
43839 }
43840 }
43841
43842 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
43843 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
43844 // If either/both ops are a shuffle that can scale to v2x64,
43845 // then see if we can perform this as a v4x32 post shuffle.
43846 SmallVector<SDValue> Ops0, Ops1;
43847 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
43848 bool IsShuf0 =
43849 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43850 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43851 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43852 bool IsShuf1 =
43853 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43854 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
43855 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43856 if (IsShuf0 || IsShuf1) {
43857 if (!IsShuf0) {
43858 Ops0.assign({BC0});
43859 ScaledMask0.assign({0, 1});
43860 }
43861 if (!IsShuf1) {
43862 Ops1.assign({BC1});
43863 ScaledMask1.assign({0, 1});
43864 }
43865
43866 SDValue LHS, RHS;
43867 int PostShuffle[4] = {-1, -1, -1, -1};
43868 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
43869 if (M < 0)
43870 return true;
43871 Idx = M % 2;
43872 SDValue Src = Ops[M / 2];
43873 if (!LHS || LHS == Src) {
43874 LHS = Src;
43875 return true;
43876 }
43877 if (!RHS || RHS == Src) {
43878 Idx += 2;
43879 RHS = Src;
43880 return true;
43881 }
43882 return false;
43883 };
43884 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
43885 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
43886 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
43887 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
43888 LHS = DAG.getBitcast(SrcVT, LHS);
43889 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
43890 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43891 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
43892 Res = DAG.getBitcast(ShufVT, Res);
43893 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
43894 return DAG.getBitcast(VT, Res);
43895 }
43896 }
43897 }
43898
43899 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
43900 if (VT.is256BitVector() && Subtarget.hasInt256()) {
43901 SmallVector<int> Mask0, Mask1;
43902 SmallVector<SDValue> Ops0, Ops1;
43903 SmallVector<int, 2> ScaledMask0, ScaledMask1;
43904 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43905 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43906 !Ops0.empty() && !Ops1.empty() &&
43907 all_of(Ops0,
43908 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43909 all_of(Ops1,
43910 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43911 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43912 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
43913 SDValue Op00 = peekThroughBitcasts(Ops0.front());
43914 SDValue Op10 = peekThroughBitcasts(Ops1.front());
43915 SDValue Op01 = peekThroughBitcasts(Ops0.back());
43916 SDValue Op11 = peekThroughBitcasts(Ops1.back());
43917 if ((Op00 == Op11) && (Op01 == Op10)) {
43918 std::swap(Op10, Op11);
43919 ShuffleVectorSDNode::commuteMask(ScaledMask1);
43920 }
43921 if ((Op00 == Op10) && (Op01 == Op11)) {
43922 const int Map[4] = {0, 2, 1, 3};
43923 SmallVector<int, 4> ShuffleMask(
43924 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
43925 Map[ScaledMask1[1]]});
43926 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
43927 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
43928 DAG.getBitcast(SrcVT, Op01));
43929 Res = DAG.getBitcast(ShufVT, Res);
43930 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
43931 return DAG.getBitcast(VT, Res);
43932 }
43933 }
43934 }
43935
43936 return SDValue();
43937}
43938
43939static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
43940 TargetLowering::DAGCombinerInfo &DCI,
43941 const X86Subtarget &Subtarget) {
43942 unsigned Opcode = N->getOpcode();
43943 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
43944 "Unexpected pack opcode");
43945
43946 EVT VT = N->getValueType(0);
43947 SDValue N0 = N->getOperand(0);
43948 SDValue N1 = N->getOperand(1);
43949 unsigned NumDstElts = VT.getVectorNumElements();
43950 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
43951 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
43952 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
43953 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
43954 "Unexpected PACKSS/PACKUS input type")
43955
43956 bool IsSigned = (X86ISD::PACKSS == Opcode);
43957
43958 // Constant Folding.
43959 APInt UndefElts0, UndefElts1;
43960 SmallVector<APInt, 32> EltBits0, EltBits1;
43961 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
43962 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
43963 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
43964 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
43965 unsigned NumLanes = VT.getSizeInBits() / 128;
43966 unsigned NumSrcElts = NumDstElts / 2;
43967 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
43968 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
43969
43970 APInt Undefs(NumDstElts, 0);
43971 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
43972 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
43973 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
43974 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
43975 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
43976 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
43977
43978 if (UndefElts[SrcIdx]) {
43979 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
43980 continue;
43981 }
43982
43983 APInt &Val = EltBits[SrcIdx];
43984 if (IsSigned) {
43985 // PACKSS: Truncate signed value with signed saturation.
43986 // Source values less than dst minint are saturated to minint.
43987 // Source values greater than dst maxint are saturated to maxint.
43988 if (Val.isSignedIntN(DstBitsPerElt))
43989 Val = Val.trunc(DstBitsPerElt);
43990 else if (Val.isNegative())
43991 Val = APInt::getSignedMinValue(DstBitsPerElt);
43992 else
43993 Val = APInt::getSignedMaxValue(DstBitsPerElt);
43994 } else {
43995 // PACKUS: Truncate signed value with unsigned saturation.
43996 // Source values less than zero are saturated to zero.
43997 // Source values greater than dst maxuint are saturated to maxuint.
43998 if (Val.isIntN(DstBitsPerElt))
43999 Val = Val.trunc(DstBitsPerElt);
44000 else if (Val.isNegative())
44001 Val = APInt::getNullValue(DstBitsPerElt);
44002 else
44003 Val = APInt::getAllOnesValue(DstBitsPerElt);
44004 }
44005 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
44006 }
44007 }
44008
44009 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
44010 }
44011
44012 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
44013 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44014 return V;
44015
44016 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
44017 // truncate to create a larger truncate.
44018 if (Subtarget.hasAVX512() &&
44019 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
44020 N0.getOperand(0).getValueType() == MVT::v8i32) {
44021 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
44022 (!IsSigned &&
44023 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
44024 if (Subtarget.hasVLX())
44025 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
44026
44027 // Widen input to v16i32 so we can truncate that.
44028 SDLoc dl(N);
44029 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
44030 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
44031 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
44032 }
44033 }
44034
44035 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
44036 if (VT.is128BitVector()) {
44037 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
44038 SDValue Src0, Src1;
44039 if (N0.getOpcode() == ExtOpc &&
44040 N0.getOperand(0).getValueType().is64BitVector() &&
44041 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44042 Src0 = N0.getOperand(0);
44043 }
44044 if (N1.getOpcode() == ExtOpc &&
44045 N1.getOperand(0).getValueType().is64BitVector() &&
44046 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44047 Src1 = N1.getOperand(0);
44048 }
44049 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
44050 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)")((void)0);
44051 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
44052 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
44053 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
44054 }
44055 }
44056
44057 // Attempt to combine as shuffle.
44058 SDValue Op(N, 0);
44059 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44060 return Res;
44061
44062 return SDValue();
44063}
44064
44065static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
44066 TargetLowering::DAGCombinerInfo &DCI,
44067 const X86Subtarget &Subtarget) {
44068 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||((void)0)
44069 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&((void)0)
44070 "Unexpected horizontal add/sub opcode")((void)0);
44071
44072 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
44073 // For slow-hop targets, if we have a hop with a single op, see if we already
44074 // have another user that we can reuse and shuffle the result.
44075 MVT VT = N->getSimpleValueType(0);
44076 SDValue LHS = N->getOperand(0);
44077 SDValue RHS = N->getOperand(1);
44078 if (VT.is128BitVector() && LHS == RHS) {
44079 for (SDNode *User : LHS->uses()) {
44080 if (User != N && User->getOpcode() == N->getOpcode()) {
44081 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
44082 if (User->getOperand(0) == LHS && !User->getOperand(1).isUndef()) {
44083 return DAG.getBitcast(
44084 VT,
44085 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44086 DAG.getBitcast(ShufVT, SDValue(User, 0)),
44087 DAG.getUNDEF(ShufVT), {0, 1, 0, 1}));
44088 }
44089 if (User->getOperand(1) == LHS && !User->getOperand(0).isUndef()) {
44090 return DAG.getBitcast(
44091 VT,
44092 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44093 DAG.getBitcast(ShufVT, SDValue(User, 0)),
44094 DAG.getUNDEF(ShufVT), {2, 3, 2, 3}));
44095 }
44096 }
44097 }
44098 }
44099
 44100     // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
44101 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
44102 LHS.getOpcode() == RHS.getOpcode() &&
44103 LHS.getValueType() == RHS.getValueType()) {
44104 SDValue LHS0 = LHS.getOperand(0);
44105 SDValue RHS0 = LHS.getOperand(1);
44106 SDValue LHS1 = RHS.getOperand(0);
44107 SDValue RHS1 = RHS.getOperand(1);
44108 if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
44109 (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
44110 SDLoc DL(N);
44111 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
44112 LHS0.isUndef() ? RHS0 : LHS0,
44113 LHS1.isUndef() ? RHS1 : LHS1);
44114 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
44115 Res = DAG.getBitcast(ShufVT, Res);
44116 SDValue NewLHS =
44117 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44118 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
44119 SDValue NewRHS =
44120 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44121 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
44122 DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
44123 DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
44124 return SDValue(N, 0);
44125 }
44126 }
44127 }
44128
44129 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
44130 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44131 return V;
44132
44133 return SDValue();
44134}
44135
44136static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
44137 TargetLowering::DAGCombinerInfo &DCI,
44138 const X86Subtarget &Subtarget) {
44139 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||((void)0)
44140 X86ISD::VSRL == N->getOpcode()) &&((void)0)
44141 "Unexpected shift opcode")((void)0);
44142 EVT VT = N->getValueType(0);
44143 SDValue N0 = N->getOperand(0);
44144 SDValue N1 = N->getOperand(1);
44145
44146 // Shift zero -> zero.
44147 if (ISD::isBuildVectorAllZeros(N0.getNode()))
44148 return DAG.getConstant(0, SDLoc(N), VT);
44149
44150 // Detect constant shift amounts.
44151 APInt UndefElts;
44152 SmallVector<APInt, 32> EltBits;
44153 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
44154 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
44155 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
44156 EltBits[0].getZExtValue(), DAG);
44157 }
44158
44159 APInt KnownUndef, KnownZero;
44160 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44161 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
44162 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
44163 KnownZero, DCI))
44164 return SDValue(N, 0);
44165
44166 return SDValue();
44167}
44168
44169static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
44170 TargetLowering::DAGCombinerInfo &DCI,
44171 const X86Subtarget &Subtarget) {
44172 unsigned Opcode = N->getOpcode();
44173 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||((void)0)
44174 X86ISD::VSRLI == Opcode) &&((void)0)
44175 "Unexpected shift opcode")((void)0);
44176 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
44177 EVT VT = N->getValueType(0);
44178 SDValue N0 = N->getOperand(0);
44179 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44180 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&((void)0)
44181 "Unexpected value type")((void)0);
44182 assert(N->getOperand(1).getValueType() == MVT::i8 &&((void)0)
44183 "Unexpected shift amount type")((void)0);
44184
44185 // (shift undef, X) -> 0
44186 if (N0.isUndef())
44187 return DAG.getConstant(0, SDLoc(N), VT);
44188
44189 // Out of range logical bit shifts are guaranteed to be zero.
44190 // Out of range arithmetic bit shifts splat the sign bit.
44191 unsigned ShiftVal = N->getConstantOperandVal(1);
44192 if (ShiftVal >= NumBitsPerElt) {
44193 if (LogicalShift)
44194 return DAG.getConstant(0, SDLoc(N), VT);
44195 ShiftVal = NumBitsPerElt - 1;
44196 }
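  // Example (illustrative): for v4i32 (NumBitsPerElt == 32), VSRLI/VSHLI by 35
  // folds to an all-zeros constant, while VSRAI by 35 is clamped to VSRAI by 31,
  // which just splats each element's sign bit.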
44197
44198 // (shift X, 0) -> X
44199 if (!ShiftVal)
44200 return N0;
44201
44202 // (shift 0, C) -> 0
44203 if (ISD::isBuildVectorAllZeros(N0.getNode()))
44204 // N0 is all zeros or undef. We guarantee that the bits shifted into the
44205 // result are all zeros, not undef.
44206 return DAG.getConstant(0, SDLoc(N), VT);
44207
44208 // (VSRAI -1, C) -> -1
44209 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
44210 // N0 is all ones or undef. We guarantee that the bits shifted into the
44211 // result are all ones, not undef.
44212 return DAG.getConstant(-1, SDLoc(N), VT);
44213
44214 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
44215 if (Opcode == N0.getOpcode()) {
44216 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
44217 unsigned NewShiftVal = ShiftVal + ShiftVal2;
44218 if (NewShiftVal >= NumBitsPerElt) {
44219 // Out of range logical bit shifts are guaranteed to be zero.
44220 // Out of range arithmetic bit shifts splat the sign bit.
44221 if (LogicalShift)
44222 return DAG.getConstant(0, SDLoc(N), VT);
44223 NewShiftVal = NumBitsPerElt - 1;
44224 }
44225 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
44226 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
44227 }
44228
44229 // We can decode 'whole byte' logical bit shifts as shuffles.
44230 if (LogicalShift && (ShiftVal % 8) == 0) {
44231 SDValue Op(N, 0);
44232 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44233 return Res;
44234 }
44235
44236 // Constant Folding.
44237 APInt UndefElts;
44238 SmallVector<APInt, 32> EltBits;
44239 if (N->isOnlyUserOf(N0.getNode()) &&
44240 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
44241 assert(EltBits.size() == VT.getVectorNumElements() &&((void)0)
44242 "Unexpected shift value type")((void)0);
44243 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
 44244     // created an undef input due to no input bits being demanded, but the user
44245 // still expects 0 in other bits.
44246 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
44247 APInt &Elt = EltBits[i];
44248 if (UndefElts[i])
44249 Elt = 0;
44250 else if (X86ISD::VSHLI == Opcode)
44251 Elt <<= ShiftVal;
44252 else if (X86ISD::VSRAI == Opcode)
44253 Elt.ashrInPlace(ShiftVal);
44254 else
44255 Elt.lshrInPlace(ShiftVal);
44256 }
44257 // Reset undef elements since they were zeroed above.
44258 UndefElts = 0;
44259 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
44260 }
44261
44262 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44263 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44264 APInt::getAllOnesValue(NumBitsPerElt), DCI))
44265 return SDValue(N, 0);
44266
44267 return SDValue();
44268}
44269
44270static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
44271 TargetLowering::DAGCombinerInfo &DCI,
44272 const X86Subtarget &Subtarget) {
44273 EVT VT = N->getValueType(0);
44274 assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||((void)0)
44275 (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||((void)0)
44276 N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&((void)0)
44277 "Unexpected vector insertion")((void)0);
44278
44279 if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
44280 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44281 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44282 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44283 APInt::getAllOnesValue(NumBitsPerElt), DCI))
44284 return SDValue(N, 0);
44285 }
44286
44287 // Attempt to combine insertion patterns to a shuffle.
44288 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
44289 SDValue Op(N, 0);
44290 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44291 return Res;
44292 }
44293
44294 return SDValue();
44295}
44296
44297/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
44298/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
44299/// OR -> CMPNEQSS.
44300static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
44301 TargetLowering::DAGCombinerInfo &DCI,
44302 const X86Subtarget &Subtarget) {
44303 unsigned opcode;
44304
44305 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
44306 // we're requiring SSE2 for both.
44307 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
44308 SDValue N0 = N->getOperand(0);
44309 SDValue N1 = N->getOperand(1);
44310 SDValue CMP0 = N0.getOperand(1);
44311 SDValue CMP1 = N1.getOperand(1);
44312 SDLoc DL(N);
44313
44314 // The SETCCs should both refer to the same CMP.
44315 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
44316 return SDValue();
44317
44318 SDValue CMP00 = CMP0->getOperand(0);
44319 SDValue CMP01 = CMP0->getOperand(1);
44320 EVT VT = CMP00.getValueType();
44321
44322 if (VT == MVT::f32 || VT == MVT::f64) {
44323 bool ExpectingFlags = false;
44324 // Check for any users that want flags:
44325 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
44326 !ExpectingFlags && UI != UE; ++UI)
44327 switch (UI->getOpcode()) {
44328 default:
44329 case ISD::BR_CC:
44330 case ISD::BRCOND:
44331 case ISD::SELECT:
44332 ExpectingFlags = true;
44333 break;
44334 case ISD::CopyToReg:
44335 case ISD::SIGN_EXTEND:
44336 case ISD::ZERO_EXTEND:
44337 case ISD::ANY_EXTEND:
44338 break;
44339 }
44340
44341 if (!ExpectingFlags) {
44342 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
44343 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
44344
44345 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
44346 X86::CondCode tmp = cc0;
44347 cc0 = cc1;
44348 cc1 = tmp;
44349 }
44350
44351 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
44352 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
44353 // FIXME: need symbolic constants for these magic numbers.
44354 // See X86ATTInstPrinter.cpp:printSSECC().
44355 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
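          // Illustrative note (assumption about the SSE CMPSS/CMPSD immediate
          // encoding): 0 selects the EQ predicate and 4 selects NEQ, which is why
          // COND_E maps to 0 here; see the printSSECC() reference above.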
44356 if (Subtarget.hasAVX512()) {
44357 SDValue FSetCC =
44358 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
44359 DAG.getTargetConstant(x86cc, DL, MVT::i8));
44360 // Need to fill with zeros to ensure the bitcast will produce zeroes
44361 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
44362 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
44363 DAG.getConstant(0, DL, MVT::v16i1),
44364 FSetCC, DAG.getIntPtrConstant(0, DL));
44365 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
44366 N->getSimpleValueType(0));
44367 }
44368 SDValue OnesOrZeroesF =
44369 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
44370 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
44371
44372 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
44373 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
44374
44375 if (is64BitFP && !Subtarget.is64Bit()) {
44376 // On a 32-bit target, we cannot bitcast the 64-bit float to a
44377 // 64-bit integer, since that's not a legal type. Since
 44378           // OnesOrZeroesF is all ones or all zeroes, we don't need all the
44379 // bits, but can do this little dance to extract the lowest 32 bits
44380 // and work with those going forward.
44381 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
44382 OnesOrZeroesF);
44383 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
44384 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
44385 Vector32, DAG.getIntPtrConstant(0, DL));
44386 IntVT = MVT::i32;
44387 }
44388
44389 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
44390 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
44391 DAG.getConstant(1, DL, IntVT));
44392 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
44393 ANDed);
44394 return OneBitOfTruth;
44395 }
44396 }
44397 }
44398 }
44399 return SDValue();
44400}
44401
44402/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
44403static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
44404 assert(N->getOpcode() == ISD::AND)((void)0);
44405
44406 MVT VT = N->getSimpleValueType(0);
44407 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
44408 return SDValue();
44409
44410 SDValue X, Y;
44411 SDValue N0 = N->getOperand(0);
44412 SDValue N1 = N->getOperand(1);
44413
44414 auto GetNot = [&VT, &DAG](SDValue V) {
44415 // Basic X = NOT(Y) detection.
44416 if (SDValue Not = IsNOT(V, DAG))
44417 return Not;
44418 // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
44419 if (V.getOpcode() == X86ISD::VBROADCAST) {
44420 SDValue Src = V.getOperand(0);
44421 EVT SrcVT = Src.getValueType();
44422 if (!SrcVT.isVector())
44423 return SDValue();
44424 if (SDValue Not = IsNOT(Src, DAG))
44425 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
44426 DAG.getBitcast(SrcVT, Not));
44427 }
44428 return SDValue();
44429 };
44430
44431 if (SDValue Not = GetNot(N0)) {
44432 X = Not;
44433 Y = N1;
44434 } else if (SDValue Not = GetNot(N1)) {
44435 X = Not;
44436 Y = N0;
44437 } else
44438 return SDValue();
44439
44440 X = DAG.getBitcast(VT, X);
44441 Y = DAG.getBitcast(VT, Y);
44442 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
44443}
44444
44445// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
44446// logical operations, like in the example below.
44447// or (and (truncate x, truncate y)),
44448// (xor (truncate z, build_vector (constants)))
44449// Given a target type \p VT, we generate
44450// or (and x, y), (xor z, zext(build_vector (constants)))
 44451 // given that x, y and z are of type \p VT. We can do so if each operand is
 44452 // either a truncate from VT, a vector of constants, or can itself be
 44453 // recursively promoted.
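// Example (illustrative, with assumed concrete types v8i32 for \p VT and v8i16
// for the narrow operation):
//   or (and (trunc x), (trunc y)), (xor (trunc z), build_vector(constants))
// is rebuilt in the wide type as
//   or (and x, y), (xor z, zext(build_vector(constants)))
// so the truncates around the logic disappear.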
44454static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
44455 unsigned Depth) {
44456 // Limit recursion to avoid excessive compile times.
44457 if (Depth >= SelectionDAG::MaxRecursionDepth)
44458 return SDValue();
44459
44460 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
44461 N->getOpcode() != ISD::OR)
44462 return SDValue();
44463
44464 SDValue N0 = N->getOperand(0);
44465 SDValue N1 = N->getOperand(1);
44466 SDLoc DL(N);
44467
44468 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44469 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
44470 return SDValue();
44471
44472 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
44473 N0 = NN0;
44474 else {
44475 // The Left side has to be a trunc.
44476 if (N0.getOpcode() != ISD::TRUNCATE)
44477 return SDValue();
44478
44479 // The type of the truncated inputs.
44480 if (N0.getOperand(0).getValueType() != VT)
44481 return SDValue();
44482
44483 N0 = N0.getOperand(0);
44484 }
44485
44486 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
44487 N1 = NN1;
44488 else {
44489 // The right side has to be a 'trunc' or a constant vector.
44490 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
44491 N1.getOperand(0).getValueType() == VT;
44492 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
44493 return SDValue();
44494
44495 if (RHSTrunc)
44496 N1 = N1.getOperand(0);
44497 else
44498 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
44499 }
44500
44501 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
44502}
44503
44504// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
44505// register. In most cases we actually compare or select YMM-sized registers
44506// and mixing the two types creates horrible code. This method optimizes
44507// some of the transition sequences.
44508// Even with AVX-512 this is still useful for removing casts around logical
44509// operations on vXi1 mask types.
44510static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44511 const X86Subtarget &Subtarget) {
44512 EVT VT = N->getValueType(0);
44513 assert(VT.isVector() && "Expected vector type")((void)0);
44514
44515 SDLoc DL(N);
44516 assert((N->getOpcode() == ISD::ANY_EXTEND ||((void)0)
44517 N->getOpcode() == ISD::ZERO_EXTEND ||((void)0)
44518 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node")((void)0);
44519
44520 SDValue Narrow = N->getOperand(0);
44521 EVT NarrowVT = Narrow.getValueType();
44522
44523 // Generate the wide operation.
44524 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
44525 if (!Op)
44526 return SDValue();
44527 switch (N->getOpcode()) {
44528 default: llvm_unreachable("Unexpected opcode")__builtin_unreachable();
44529 case ISD::ANY_EXTEND:
44530 return Op;
44531 case ISD::ZERO_EXTEND:
44532 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
44533 case ISD::SIGN_EXTEND:
44534 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
44535 Op, DAG.getValueType(NarrowVT));
44536 }
44537}
44538
44539static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
44540 unsigned FPOpcode;
44541 switch (Opcode) {
44542 default: llvm_unreachable("Unexpected input node for FP logic conversion")__builtin_unreachable();
44543 case ISD::AND: FPOpcode = X86ISD::FAND; break;
44544 case ISD::OR: FPOpcode = X86ISD::FOR; break;
44545 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44546 }
44547 return FPOpcode;
44548}
44549
44550/// If both input operands of a logic op are being cast from floating point
44551/// types, try to convert this into a floating point logic node to avoid
44552/// unnecessary moves from SSE to integer registers.
44553static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
44554 const X86Subtarget &Subtarget) {
44555 EVT VT = N->getValueType(0);
44556 SDValue N0 = N->getOperand(0);
44557 SDValue N1 = N->getOperand(1);
44558 SDLoc DL(N);
44559
44560 if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
44561 return SDValue();
44562
44563 SDValue N00 = N0.getOperand(0);
44564 SDValue N10 = N1.getOperand(0);
44565 EVT N00Type = N00.getValueType();
44566 EVT N10Type = N10.getValueType();
44567
44568 // Ensure that both types are the same and are legal scalar fp types.
44569 if (N00Type != N10Type ||
44570 !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
44571 (Subtarget.hasSSE2() && N00Type == MVT::f64)))
44572 return SDValue();
44573
44574 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
44575 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
44576 return DAG.getBitcast(VT, FPLogic);
44577}
44578
44579// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
44580// to reduce XMM->GPR traffic.
44581static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
44582 unsigned Opc = N->getOpcode();
44583 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&((void)0)
44584 "Unexpected bit opcode")((void)0);
44585
44586 SDValue N0 = N->getOperand(0);
44587 SDValue N1 = N->getOperand(1);
44588
44589 // Both operands must be single use MOVMSK.
44590 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
44591 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
44592 return SDValue();
44593
44594 SDValue Vec0 = N0.getOperand(0);
44595 SDValue Vec1 = N1.getOperand(0);
44596 EVT VecVT0 = Vec0.getValueType();
44597 EVT VecVT1 = Vec1.getValueType();
44598
44599 // Both MOVMSK operands must be from vectors of the same size and same element
 44600 // size, but it's OK for an fp/int difference.
44601 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
44602 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
44603 return SDValue();
44604
44605 SDLoc DL(N);
44606 unsigned VecOpc =
44607 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
44608 SDValue Result =
44609 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
44610 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
44611}
44612
 44613 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
 44614 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
 44615 /// with a shift-right to eliminate loading the vector constant mask value.
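/// Example (illustrative): if each i32 element of Op0 is known to be 0 or -1
/// (all sign bits) and the splat mask is 0x1, then and(Op0, 1) equals
/// srl(Op0, 31) per element, so no constant-pool mask load is needed.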
44616static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
44617 const X86Subtarget &Subtarget) {
44618 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
44619 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
44620 EVT VT0 = Op0.getValueType();
44621 EVT VT1 = Op1.getValueType();
44622
44623 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
44624 return SDValue();
44625
44626 APInt SplatVal;
44627 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
44628 !SplatVal.isMask())
44629 return SDValue();
44630
44631 // Don't prevent creation of ANDN.
44632 if (isBitwiseNot(Op0))
44633 return SDValue();
44634
44635 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
44636 return SDValue();
44637
44638 unsigned EltBitWidth = VT0.getScalarSizeInBits();
44639 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
44640 return SDValue();
44641
44642 SDLoc DL(N);
44643 unsigned ShiftVal = SplatVal.countTrailingOnes();
44644 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
44645 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
44646 return DAG.getBitcast(N->getValueType(0), Shift);
44647}
44648
44649// Get the index node from the lowered DAG of a GEP IR instruction with one
44650// indexing dimension.
44651static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
44652 if (Ld->isIndexed())
44653 return SDValue();
44654
44655 SDValue Base = Ld->getBasePtr();
44656
44657 if (Base.getOpcode() != ISD::ADD)
44658 return SDValue();
44659
44660 SDValue ShiftedIndex = Base.getOperand(0);
44661
44662 if (ShiftedIndex.getOpcode() != ISD::SHL)
44663 return SDValue();
44664
44665 return ShiftedIndex.getOperand(0);
44666
44667}
44668
44669static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
44670 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
44671 switch (VT.getSizeInBits()) {
44672 default: return false;
44673 case 64: return Subtarget.is64Bit() ? true : false;
44674 case 32: return true;
44675 }
44676 }
44677 return false;
44678}
44679
 44730 // This function recognizes cases where the X86 bzhi instruction can replace an
 44731 // 'and-load' sequence.
 44732 // When an integer value is loaded from an array of constants which is defined
 44733 // as follows:
 44734 //
 44735 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
 44736 //
 44737 // and the loaded value is then bitwise-anded with another input, the sequence
 44738 // is equivalent to performing bzhi (zero high bits) on that input with the
 44739 // same index as the load.
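// Example (illustrative): with the array above and a 32-bit type, idx == 5 loads
// array[5] == 0x1F, so (array[5] & inp) keeps only the low 5 bits of inp, which
// is exactly bzhi(inp, 5). The transform below rebuilds that mask as
// srl(0xFFFFFFFF, 32 - idx) so the and+load collapses to a single bzhi.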
44690static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
44691 const X86Subtarget &Subtarget) {
44692 MVT VT = Node->getSimpleValueType(0);
44693 SDLoc dl(Node);
44694
44695 // Check if subtarget has BZHI instruction for the node's type
44696 if (!hasBZHI(Subtarget, VT))
44697 return SDValue();
44698
44699 // Try matching the pattern for both operands.
44700 for (unsigned i = 0; i < 2; i++) {
44701 SDValue N = Node->getOperand(i);
44702 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
44703
 44704     // Bail out if the operand is not a load instruction.
44705 if (!Ld)
44706 return SDValue();
44707
44708 const Value *MemOp = Ld->getMemOperand()->getValue();
44709
44710 if (!MemOp)
44711 return SDValue();
44712
44713 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
44714 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
44715 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
44716
44717 Constant *Init = GV->getInitializer();
44718 Type *Ty = Init->getType();
44719 if (!isa<ConstantDataArray>(Init) ||
44720 !Ty->getArrayElementType()->isIntegerTy() ||
44721 Ty->getArrayElementType()->getScalarSizeInBits() !=
44722 VT.getSizeInBits() ||
44723 Ty->getArrayNumElements() >
44724 Ty->getArrayElementType()->getScalarSizeInBits())
44725 continue;
44726
44727 // Check if the array's constant elements are suitable to our case.
44728 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
44729 bool ConstantsMatch = true;
44730 for (uint64_t j = 0; j < ArrayElementCount; j++) {
44731 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
44732 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
44733 ConstantsMatch = false;
44734 break;
44735 }
44736 }
44737 if (!ConstantsMatch)
44738 continue;
44739
44740 // Do the transformation (For 32-bit type):
44741 // -> (and (load arr[idx]), inp)
44742 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
44743 // that will be replaced with one bzhi instruction.
44744 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
44745 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
44746
44747 // Get the Node which indexes into the array.
44748 SDValue Index = getIndexFromUnindexedLoad(Ld);
44749 if (!Index)
44750 return SDValue();
44751 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
44752
44753 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
44754 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
44755
44756 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
44757 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
44758
44759 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
44760 }
44761 }
44762 }
44763 }
44764 return SDValue();
44765}
44766
44767// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
 44768 // where C is a mask containing the same number of bits as the setcc and
 44769 // where the setcc will freely zero the upper bits of the k-register. We can
 44770 // replace the undef in the concat with 0s and remove the AND. This mainly
 44771 // helps with v2i1/v4i1 setcc being cast to scalar.
44772static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
44773 const X86Subtarget &Subtarget) {
44774 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!")((void)0);
44775
44776 EVT VT = N->getValueType(0);
44777
44778 // Make sure this is an AND with constant. We will check the value of the
44779 // constant later.
44780 if (!isa<ConstantSDNode>(N->getOperand(1)))
44781 return SDValue();
44782
44783 // This is implied by the ConstantSDNode.
44784 assert(!VT.isVector() && "Expected scalar VT!")((void)0);
44785
44786 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
44787 !N->getOperand(0).hasOneUse() ||
44788 !N->getOperand(0).getOperand(0).hasOneUse())
44789 return SDValue();
44790
44791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44792 SDValue Src = N->getOperand(0).getOperand(0);
44793 EVT SrcVT = Src.getValueType();
44794 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
44795 !TLI.isTypeLegal(SrcVT))
44796 return SDValue();
44797
44798 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
44799 return SDValue();
44800
44801 // We only care about the first subvector of the concat, we expect the
44802 // other subvectors to be ignored due to the AND if we make the change.
44803 SDValue SubVec = Src.getOperand(0);
44804 EVT SubVecVT = SubVec.getValueType();
44805
44806 // First subvector should be a setcc with a legal result type. The RHS of the
44807 // AND should be a mask with this many bits.
44808 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
44809 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
44810 return SDValue();
44811
44812 EVT SetccVT = SubVec.getOperand(0).getValueType();
44813 if (!TLI.isTypeLegal(SetccVT) ||
44814 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
44815 return SDValue();
44816
44817 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
44818 return SDValue();
44819
44820 // We passed all the checks. Rebuild the concat_vectors with zeroes
44821 // and cast it back to VT.
44822 SDLoc dl(N);
44823 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
44824 DAG.getConstant(0, dl, SubVecVT));
44825 Ops[0] = SubVec;
44826 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
44827 Ops);
44828 return DAG.getBitcast(VT, Concat);
44829}
44830
44831static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
44832 TargetLowering::DAGCombinerInfo &DCI,
44833 const X86Subtarget &Subtarget) {
44834 EVT VT = N->getValueType(0);
44835
44836 // If this is SSE1 only convert to FAND to avoid scalarization.
44837 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
44838 return DAG.getBitcast(
44839 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
44840 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
44841 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
44842 }
44843
44844 // Use a 32-bit and+zext if upper bits known zero.
44845 if (VT == MVT::i64 && Subtarget.is64Bit() &&
44846 !isa<ConstantSDNode>(N->getOperand(1))) {
44847 APInt HiMask = APInt::getHighBitsSet(64, 32);
44848 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
44849 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
44850 SDLoc dl(N);
44851 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
44852 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
44853 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
44854 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
44855 }
44856 }
44857
44858 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
44859 // TODO: Support multiple SrcOps.
44860 if (VT == MVT::i1) {
44861 SmallVector<SDValue, 2> SrcOps;
44862 SmallVector<APInt, 2> SrcPartials;
44863 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
44864 SrcOps.size() == 1) {
44865 SDLoc dl(N);
44866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44867 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
44868 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44869 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
44870 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
44871 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
44872 if (Mask) {
44873 assert(SrcPartials[0].getBitWidth() == NumElts &&((void)0)
44874 "Unexpected partial reduction mask")((void)0);
44875 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
44876 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
44877 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
44878 }
44879 }
44880 }
44881
44882 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
44883 return V;
44884
44885 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
44886 return R;
44887
44888 if (DCI.isBeforeLegalizeOps())
44889 return SDValue();
44890
44891 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
44892 return R;
44893
44894 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
44895 return FPLogic;
44896
44897 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
44898 return R;
44899
44900 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
44901 return ShiftRight;
44902
44903 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
44904 return R;
44905
44906 // Attempt to recursively combine a bitmask AND with shuffles.
44907 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
44908 SDValue Op(N, 0);
44909 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44910 return Res;
44911 }
44912
44913 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
44914 if ((VT.getScalarSizeInBits() % 8) == 0 &&
44915 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44916 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
44917 SDValue BitMask = N->getOperand(1);
44918 SDValue SrcVec = N->getOperand(0).getOperand(0);
44919 EVT SrcVecVT = SrcVec.getValueType();
44920
44921 // Check that the constant bitmask masks whole bytes.
44922 APInt UndefElts;
44923 SmallVector<APInt, 64> EltBits;
44924 if (VT == SrcVecVT.getScalarType() &&
44925 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
44926 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
44927 llvm::all_of(EltBits, [](const APInt &M) {
44928 return M.isNullValue() || M.isAllOnesValue();
44929 })) {
44930 unsigned NumElts = SrcVecVT.getVectorNumElements();
44931 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
44932 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
44933
44934 // Create a root shuffle mask from the byte mask and the extracted index.
44935 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
44936 for (unsigned i = 0; i != Scale; ++i) {
44937 if (UndefElts[i])
44938 continue;
44939 int VecIdx = Scale * Idx + i;
44940 ShuffleMask[VecIdx] =
44941 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
44942 }
44943
44944 if (SDValue Shuffle = combineX86ShufflesRecursively(
44945 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
44946 X86::MaxShuffleCombineDepth,
44947 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
44948 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
44949 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
44950 N->getOperand(0).getOperand(1));
44951 }
44952 }
44953
44954 return SDValue();
44955}
44956
44957// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
44958static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
44959 const X86Subtarget &Subtarget) {
44960 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode")((void)0);
44961
44962 MVT VT = N->getSimpleValueType(0);
44963 if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
44964 return SDValue();
44965
44966 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
44967 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
44968 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
44969 return SDValue();
44970
44971 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
44972 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
44973 bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
44974 Subtarget.hasVLX();
44975 if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
44976 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
44977 return SDValue();
44978
44979 // Attempt to extract constant byte masks.
44980 APInt UndefElts0, UndefElts1;
44981 SmallVector<APInt, 32> EltBits0, EltBits1;
44982 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
44983 false, false))
44984 return SDValue();
44985 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
44986 false, false))
44987 return SDValue();
44988
44989 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
44990 // TODO - add UNDEF elts support.
44991 if (UndefElts0[i] || UndefElts1[i])
44992 return SDValue();
44993 if (EltBits0[i] != ~EltBits1[i])
44994 return SDValue();
44995 }
44996
44997 SDLoc DL(N);
44998
44999 if (UseVPTERNLOG) {
45000 // Emit a VPTERNLOG node directly.
45001 SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
45002 SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
45003 SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
45004 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
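    // Illustrative note: imm 0xCA is the per-bit truth table for "A ? B : C",
    // so with A as the byte mask this computes (B & A) | (C & ~A), i.e. the
    // same bit-select as OR(AND(X,C),ANDNP(C,Y)) above.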
45005 return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
45006 }
45007
45008 SDValue X = N->getOperand(0);
45009 SDValue Y =
45010 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
45011 DAG.getBitcast(VT, N1.getOperand(0)));
45012 return DAG.getNode(ISD::OR, DL, VT, X, Y);
45013}
45014
45015// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
45016static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
45017 if (N->getOpcode() != ISD::OR)
45018 return false;
45019
45020 SDValue N0 = N->getOperand(0);
45021 SDValue N1 = N->getOperand(1);
45022
45023 // Canonicalize AND to LHS.
45024 if (N1.getOpcode() == ISD::AND)
45025 std::swap(N0, N1);
45026
45027 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
45028 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
45029 return false;
45030
45031 Mask = N1.getOperand(0);
45032 X = N1.getOperand(1);
45033
45034 // Check to see if the mask appeared in both the AND and ANDNP.
45035 if (N0.getOperand(0) == Mask)
45036 Y = N0.getOperand(1);
45037 else if (N0.getOperand(1) == Mask)
45038 Y = N0.getOperand(0);
45039 else
45040 return false;
45041
45042 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
45043 // ANDNP combine allows other combines to happen that prevent matching.
45044 return true;
45045}
45046
45047// Try to fold:
45048// (or (and (m, y), (pandn m, x)))
45049// into:
45050// (vselect m, x, y)
45051// As a special case, try to fold:
45052// (or (and (m, (sub 0, x)), (pandn m, x)))
45053// into:
45054// (sub (xor X, M), M)
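// Example (illustrative): with per-element masks M that are all-ones or all-zero,
// (M & Y) | (~M & X) picks Y where M is set and X where it is clear, i.e. a
// blend/select on M. For the conditional-negate special case, when M is 0 the
// result is X ((X ^ 0) - 0), and when M is -1 it is -X ((X ^ -1) - (-1) == ~X + 1).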
45055static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
45056 const X86Subtarget &Subtarget) {
45057 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode")((void)0);
45058
45059 EVT VT = N->getValueType(0);
45060 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
45061 (VT.is256BitVector() && Subtarget.hasInt256())))
45062 return SDValue();
45063
45064 SDValue X, Y, Mask;
45065 if (!matchLogicBlend(N, X, Y, Mask))
45066 return SDValue();
45067
45068 // Validate that X, Y, and Mask are bitcasts, and see through them.
45069 Mask = peekThroughBitcasts(Mask);
45070 X = peekThroughBitcasts(X);
45071 Y = peekThroughBitcasts(Y);
45072
45073 EVT MaskVT = Mask.getValueType();
45074 unsigned EltBits = MaskVT.getScalarSizeInBits();
45075
45076 // TODO: Attempt to handle floating point cases as well?
45077 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
45078 return SDValue();
45079
45080 SDLoc DL(N);
45081
45082 // Attempt to combine to conditional negate: (sub (xor X, M), M)
45083 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
45084 DAG, Subtarget))
45085 return Res;
45086
45087 // PBLENDVB is only available on SSE 4.1.
45088 if (!Subtarget.hasSSE41())
45089 return SDValue();
45090
45091 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
45092 if (Subtarget.hasVLX())
45093 return SDValue();
45094
45095 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
45096
45097 X = DAG.getBitcast(BlendVT, X);
45098 Y = DAG.getBitcast(BlendVT, Y);
45099 Mask = DAG.getBitcast(BlendVT, Mask);
45100 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
45101 return DAG.getBitcast(VT, Mask);
45102}
45103
45104// Helper function for combineOrCmpEqZeroToCtlzSrl
45105// Transforms:
45106// seteq(cmp x, 0)
45107// into:
45108// srl(ctlz x), log2(bitsize(x))
45109// Input pattern is checked by caller.
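// Example (illustrative): for a 32-bit x, ctlz(0) == 32 and ctlz(x) <= 31 for any
// nonzero x, so (ctlz(x) >> 5) is 1 exactly when x == 0 and 0 otherwise, i.e. the
// same boolean as seteq(x, 0), but computable with lzcnt+shr.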
45110static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
45111 SelectionDAG &DAG) {
45112 SDValue Cmp = Op.getOperand(1);
45113 EVT VT = Cmp.getOperand(0).getValueType();
45114 unsigned Log2b = Log2_32(VT.getSizeInBits());
45115 SDLoc dl(Op);
45116 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
45117 // The result of the shift is true or false, and on X86, the 32-bit
45118 // encoding of shr and lzcnt is more desirable.
45119 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
45120 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
45121 DAG.getConstant(Log2b, dl, MVT::i8));
45122 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
45123}
45124
45125// Try to transform:
45126// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
45127// into:
45128// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
45129// Will also attempt to match more generic cases, eg:
45130// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
45131// Only applies if the target supports the FastLZCNT feature.
45132static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
45133 TargetLowering::DAGCombinerInfo &DCI,
45134 const X86Subtarget &Subtarget) {
45135 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
45136 return SDValue();
45137
45138 auto isORCandidate = [](SDValue N) {
45139 return (N->getOpcode() == ISD::OR && N->hasOneUse());
45140 };
45141
 45142   // Check that the zero extend is extending to 32 bits or more. The code
45143 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
45144 // instructions to clear the upper bits.
45145 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
45146 !isORCandidate(N->getOperand(0)))
45147 return SDValue();
45148
45149 // Check the node matches: setcc(eq, cmp 0)
45150 auto isSetCCCandidate = [](SDValue N) {
45151 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
45152 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
45153 N->getOperand(1).getOpcode() == X86ISD::CMP &&
45154 isNullConstant(N->getOperand(1).getOperand(1)) &&
45155 N->getOperand(1).getValueType().bitsGE(MVT::i32);
45156 };
45157
45158 SDNode *OR = N->getOperand(0).getNode();
45159 SDValue LHS = OR->getOperand(0);
45160 SDValue RHS = OR->getOperand(1);
45161
45162 // Save nodes matching or(or, setcc(eq, cmp 0)).
45163 SmallVector<SDNode *, 2> ORNodes;
45164 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
45165 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
45166 ORNodes.push_back(OR);
45167 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
45168 LHS = OR->getOperand(0);
45169 RHS = OR->getOperand(1);
45170 }
45171
45172 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
45173 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
45174 !isORCandidate(SDValue(OR, 0)))
45175 return SDValue();
45176
 45177   // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
45178 // to
45179 // or(srl(ctlz),srl(ctlz)).
45180 // The dag combiner can then fold it into:
45181 // srl(or(ctlz, ctlz)).
45182 EVT VT = OR->getValueType(0);
45183 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
45184 SDValue Ret, NewRHS;
45185 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
45186 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
45187
45188 if (!Ret)
45189 return SDValue();
45190
45191 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
45192 while (ORNodes.size() > 0) {
45193 OR = ORNodes.pop_back_val();
45194 LHS = OR->getOperand(0);
45195 RHS = OR->getOperand(1);
45196 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
45197 if (RHS->getOpcode() == ISD::OR)
45198 std::swap(LHS, RHS);
45199 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
45200 if (!NewRHS)
45201 return SDValue();
45202 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
45203 }
45204
45205 if (Ret)
45206 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
45207
45208 return Ret;
45209}
45210
45211static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
45212 TargetLowering::DAGCombinerInfo &DCI,
45213 const X86Subtarget &Subtarget) {
45214 SDValue N0 = N->getOperand(0);
45215 SDValue N1 = N->getOperand(1);
45216 EVT VT = N->getValueType(0);
45217
45218 // If this is SSE1 only convert to FOR to avoid scalarization.
45219 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
45220 return DAG.getBitcast(MVT::v4i32,
45221 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
45222 DAG.getBitcast(MVT::v4f32, N0),
45223 DAG.getBitcast(MVT::v4f32, N1)));
45224 }
45225
45226 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
45227 // TODO: Support multiple SrcOps.
45228 if (VT == MVT::i1) {
45229 SmallVector<SDValue, 2> SrcOps;
45230 SmallVector<APInt, 2> SrcPartials;
45231 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
45232 SrcOps.size() == 1) {
45233 SDLoc dl(N);
45234 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45235 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
45236 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45237 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
45238 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
45239 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
45240 if (Mask) {
45241 assert(SrcPartials[0].getBitWidth() == NumElts &&((void)0)
45242 "Unexpected partial reduction mask")((void)0);
45243 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
45244 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
45245 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
45246 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
45247 }
45248 }
45249 }
45250
45251 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
45252 return R;
45253
45254 if (DCI.isBeforeLegalizeOps())
45255 return SDValue();
45256
45257 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
45258 return R;
45259
45260 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
45261 return FPLogic;
45262
45263 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
45264 return R;
45265
45266 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
45267 return R;
45268
45269 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
45270 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
45271 // iff the upper elements of the non-shifted arg are zero.
 45272   // KUNPCK requires 16+ bool vector elements.
45273 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
45274 unsigned NumElts = VT.getVectorNumElements();
45275 unsigned HalfElts = NumElts / 2;
45276 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
45277 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
45278 N1.getConstantOperandAPInt(1) == HalfElts &&
45279 DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
45280 SDLoc dl(N);
45281 return DAG.getNode(
45282 ISD::CONCAT_VECTORS, dl, VT,
45283 extractSubVector(N0, 0, DAG, dl, HalfElts),
45284 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
45285 }
45286 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
45287 N0.getConstantOperandAPInt(1) == HalfElts &&
45288 DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
45289 SDLoc dl(N);
45290 return DAG.getNode(
45291 ISD::CONCAT_VECTORS, dl, VT,
45292 extractSubVector(N1, 0, DAG, dl, HalfElts),
45293 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
45294 }
45295 }
45296
45297 // Attempt to recursively combine an OR of shuffles.
45298 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
45299 SDValue Op(N, 0);
45300 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
45301 return Res;
45302 }
45303
45304 return SDValue();
45305}
45306
45307/// Try to turn tests against the signbit in the form of:
45308/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
45309/// into:
45310/// SETGT(X, -1)
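/// Example (illustrative): for i32, SRL(X, 31) is 1 exactly when X is negative, so
/// xoring the truncated bit with 1 yields 1 exactly when X >= 0, which is the same
/// predicate as X > -1 (SETGT X, -1).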
45311static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
45312 // This is only worth doing if the output type is i8 or i1.
45313 EVT ResultType = N->getValueType(0);
45314 if (ResultType != MVT::i8 && ResultType != MVT::i1)
45315 return SDValue();
45316
45317 SDValue N0 = N->getOperand(0);
45318 SDValue N1 = N->getOperand(1);
45319
45320 // We should be performing an xor against a truncated shift.
45321 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
45322 return SDValue();
45323
45324 // Make sure we are performing an xor against one.
45325 if (!isOneConstant(N1))
45326 return SDValue();
45327
45328 // SetCC on x86 zero extends so only act on this if it's a logical shift.
45329 SDValue Shift = N0.getOperand(0);
45330 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
45331 return SDValue();
45332
45333 // Make sure we are truncating from one of i16, i32 or i64.
45334 EVT ShiftTy = Shift.getValueType();
45335 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
45336 return SDValue();
45337
45338 // Make sure the shift amount extracts the sign bit.
45339 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
45340 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
45341 return SDValue();
45342
45343 // Create a greater-than comparison against -1.
 45344   // N.B. Using SETGE against 0 works but we want a canonical-looking
 45345   // comparison; using SETGT matches up with what TranslateX86CC does.
45346 SDLoc DL(N);
45347 SDValue ShiftOp = Shift.getOperand(0);
45348 EVT ShiftOpTy = ShiftOp.getValueType();
45349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45350 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
45351 *DAG.getContext(), ResultType);
45352 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
45353 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
45354 if (SetCCResultType != ResultType)
45355 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
45356 return Cond;
45357}
45358
45359/// Turn vector tests of the signbit in the form of:
45360/// xor (sra X, elt_size(X)-1), -1
45361/// into:
45362/// pcmpgt X, -1
45363///
45364/// This should be called before type legalization because the pattern may not
45365/// persist after that.
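/// Example (illustrative): for v4i32, sra(X, 31) smears each element's sign bit,
/// giving -1 for negative lanes and 0 otherwise; xoring with -1 flips that to -1
/// exactly for the non-negative lanes, which is what pcmpgt(X, -1) produces.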
45366static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
45367 const X86Subtarget &Subtarget) {
45368 EVT VT = N->getValueType(0);
45369 if (!VT.isSimple())
45370 return SDValue();
45371
45372 switch (VT.getSimpleVT().SimpleTy) {
45373 default: return SDValue();
45374 case MVT::v16i8:
45375 case MVT::v8i16:
45376 case MVT::v4i32:
45377 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
45378 case MVT::v32i8:
45379 case MVT::v16i16:
45380 case MVT::v8i32:
45381 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
45382 }
45383
45384 // There must be a shift right algebraic before the xor, and the xor must be a
45385 // 'not' operation.
45386 SDValue Shift = N->getOperand(0);
45387 SDValue Ones = N->getOperand(1);
45388 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
45389 !ISD::isBuildVectorAllOnes(Ones.getNode()))
45390 return SDValue();
45391
45392 // The shift should be smearing the sign bit across each vector element.
45393 auto *ShiftAmt =
45394 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
45395 if (!ShiftAmt ||
45396 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
45397 return SDValue();
45398
45399 // Create a greater-than comparison against -1. We don't use the more obvious
45400 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
45401 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
45402}
45403
45404/// Detect patterns of truncation with unsigned saturation:
45405///
45406/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
45407/// Return the source value x to be truncated or SDValue() if the pattern was
45408/// not matched.
45409///
45410/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
45411/// where C1 >= 0 and C2 is unsigned max of destination type.
45412///
45413/// (truncate (smax (smin (x, C2), C1)) to dest_type)
45414/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
45415///
45416/// These two patterns are equivalent to:
45417/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
45418/// So return the smax(x, C1) value to be truncated or SDValue() if the
45419/// pattern was not matched.
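/// Example (illustrative): truncating i32 -> i8, pattern 1 is trunc(umin(x, 255)),
/// which clamps any unsigned value above 255 to 255; pattern 2 with C1 == 0 and
/// C2 == 255 is trunc(smin(smax(x, 0), 255)), clamping e.g. 300 -> 255 and -7 -> 0.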
45420static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45421 const SDLoc &DL) {
45422 EVT InVT = In.getValueType();
45423
45424 // Saturation with truncation. We truncate from InVT to VT.
45425 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&((void)0)
45426 "Unexpected types for truncate operation")((void)0);
45427
45428 // Match min/max and return limit value as a parameter.
45429 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
45430 if (V.getOpcode() == Opcode &&
45431 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
45432 return V.getOperand(0);
45433 return SDValue();
45434 };
45435
45436 APInt C1, C2;
45437 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
 45438     // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
45439 // the element size of the destination type.
45440 if (C2.isMask(VT.getScalarSizeInBits()))
45441 return UMin;
45442
45443 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
45444 if (MatchMinMax(SMin, ISD::SMAX, C1))
45445 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
45446 return SMin;
45447
45448 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
45449 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
45450 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
45451 C2.uge(C1)) {
45452 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
45453 }
45454
45455 return SDValue();
45456}
45457
45458/// Detect patterns of truncation with signed saturation:
45459/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
45460/// signed_max_of_dest_type)) to dest_type)
45461/// or:
45462/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
45463/// signed_min_of_dest_type)) to dest_type).
45464/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
45465/// Return the source value to be truncated or SDValue() if the pattern was not
45466/// matched.
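/// Example (illustrative): truncating i16 -> i8 with signed saturation matches
/// trunc(smin(smax(x, -128), 127)) (or the smax(smin(...)) form), clamping e.g.
/// 300 -> 127 and -300 -> -128; with MatchPackUS the clamp range is [0, 255].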
45467static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
45468 unsigned NumDstBits = VT.getScalarSizeInBits();
45469 unsigned NumSrcBits = In.getScalarValueSizeInBits();
45470 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation")((void)0);
45471
45472 auto MatchMinMax = [](SDValue V, unsigned Opcode,
45473 const APInt &Limit) -> SDValue {
45474 APInt C;
45475 if (V.getOpcode() == Opcode &&
45476 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
45477 return V.getOperand(0);
45478 return SDValue();
45479 };
45480
45481 APInt SignedMax, SignedMin;
45482 if (MatchPackUS) {
45483 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
45484 SignedMin = APInt(NumSrcBits, 0);
45485 } else {
45486 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
45487 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
45488 }
45489
45490 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
45491 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
45492 return SMax;
45493
45494 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
45495 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
45496 return SMin;
45497
45498 return SDValue();
45499}
45500
45501static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
45502 SelectionDAG &DAG,
45503 const X86Subtarget &Subtarget) {
45504 if (!Subtarget.hasSSE2() || !VT.isVector())
45505 return SDValue();
45506
45507 EVT SVT = VT.getVectorElementType();
45508 EVT InVT = In.getValueType();
45509 EVT InSVT = InVT.getVectorElementType();
45510
45511 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
45512 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
45513 // and concatenate at the same time. Then we can use a final vpmovuswb to
45514 // clip to 0-255.
45515 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
45516 InVT == MVT::v16i32 && VT == MVT::v16i8) {
45517 if (auto USatVal = detectSSatPattern(In, VT, true)) {
45518 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
45519 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
45520 DL, DAG, Subtarget);
45521 assert(Mid && "Failed to pack!");
45522 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
45523 }
45524 }
45525
45526 // vXi32 truncate instructions are available with AVX512F.
45527 // vXi16 truncate instructions are only available with AVX512BW.
45528 // For 256-bit or smaller vectors, we require VLX.
45529 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
45530 // If the result type is 256 bits or larger and we have disabled 512-bit
45531 // registers, we should go ahead and use the pack instructions if possible.
45532 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
45533 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
45534 (InVT.getSizeInBits() > 128) &&
45535 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
45536 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
45537
45538 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
45539 VT.getSizeInBits() >= 64 &&
45540 (SVT == MVT::i8 || SVT == MVT::i16) &&
45541 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
45542 if (auto USatVal = detectSSatPattern(In, VT, true)) {
45543 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
45544 // Only do this when the result is at least 64 bits or we'll be leaving
45545 // dangling PACKSSDW nodes.
45546 if (SVT == MVT::i8 && InSVT == MVT::i32) {
45547 EVT MidVT = VT.changeVectorElementType(MVT::i16);
45548 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
45549 DAG, Subtarget);
45550 assert(Mid && "Failed to pack!");
45551 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
45552 Subtarget);
45553 assert(V && "Failed to pack!");
45554 return V;
45555 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
45556 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
45557 Subtarget);
45558 }
45559 if (auto SSatVal = detectSSatPattern(In, VT))
45560 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
45561 Subtarget);
45562 }
45563
45564 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45565 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
45566 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
45567 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
45568 unsigned TruncOpc = 0;
45569 SDValue SatVal;
45570 if (auto SSatVal = detectSSatPattern(In, VT)) {
45571 SatVal = SSatVal;
45572 TruncOpc = X86ISD::VTRUNCS;
45573 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
45574 SatVal = USatVal;
45575 TruncOpc = X86ISD::VTRUNCUS;
45576 }
45577 if (SatVal) {
45578 unsigned ResElts = VT.getVectorNumElements();
45579 // If the input type is less than 512 bits and we don't have VLX, we need
45580 // to widen to 512 bits.
45581 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
45582 unsigned NumConcats = 512 / InVT.getSizeInBits();
45583 ResElts *= NumConcats;
45584 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
45585 ConcatOps[0] = SatVal;
45586 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
45587 NumConcats * InVT.getVectorNumElements());
45588 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
45589 }
45590 // Widen the result if it's narrower than 128 bits.
45591 if (ResElts * SVT.getSizeInBits() < 128)
45592 ResElts = 128 / SVT.getSizeInBits();
45593 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
45594 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
45595 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45596 DAG.getIntPtrConstant(0, DL));
45597 }
45598 }
45599
45600 return SDValue();
45601}
45602
45603/// This function detects the AVG pattern between vectors of unsigned i8/i16,
45604/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
45605/// X86ISD::AVG instruction.
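/// For example, for unsigned i8 elements a = 250 and b = 3 the pattern
/// computes (250 + 3 + 1) / 2 = 127 in a wider type before truncating back,
/// i.e. an average rounded up, which is what PAVGB/PAVGW produce.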
45606static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45607 const X86Subtarget &Subtarget,
45608 const SDLoc &DL) {
45609 if (!VT.isVector())
45610 return SDValue();
45611 EVT InVT = In.getValueType();
45612 unsigned NumElems = VT.getVectorNumElements();
45613
45614 EVT ScalarVT = VT.getVectorElementType();
45615 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
45616 return SDValue();
45617
45618 // InScalarVT is the intermediate type in the AVG pattern and it should be wider
45619 // than the original input type (i8/i16).
45620 EVT InScalarVT = InVT.getVectorElementType();
45621 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
45622 return SDValue();
45623
45624 if (!Subtarget.hasSSE2())
45625 return SDValue();
45626
45627 // Detect the following pattern:
45628 //
45629 // %1 = zext <N x i8> %a to <N x i32>
45630 // %2 = zext <N x i8> %b to <N x i32>
45631 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
45632 // %4 = add nuw nsw <N x i32> %3, %2
45633 // %5 = lshr <N x i32> %4, <i32 1 x N>
45634 // %6 = trunc <N x i32> %5 to <N x i8>
45635 //
45636 // In AVX512, the last instruction can also be a trunc store.
45637 if (In.getOpcode() != ISD::SRL)
45638 return SDValue();
45639
45640 // A lambda checking that the given SDValue is a constant vector and each element
45641 // is in the range [Min, Max].
45642 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
45643 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
45644 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
45645 });
45646 };
45647
45648 // Check if each element of the vector is right-shifted by one.
45649 SDValue LHS = In.getOperand(0);
45650 SDValue RHS = In.getOperand(1);
45651 if (!IsConstVectorInRange(RHS, 1, 1))
45652 return SDValue();
45653 if (LHS.getOpcode() != ISD::ADD)
45654 return SDValue();
45655
45656 // Detect a pattern of a + b + 1 where the order doesn't matter.
45657 SDValue Operands[3];
45658 Operands[0] = LHS.getOperand(0);
45659 Operands[1] = LHS.getOperand(1);
45660
45661 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45662 ArrayRef<SDValue> Ops) {
45663 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
45664 };
45665
45666 auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
45667 // Pad to a power-of-2 vector, split+apply and extract the original vector.
45668 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
45669 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
45670 if (NumElemsPow2 != NumElems) {
45671 SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45672 SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45673 for (unsigned i = 0; i != NumElems; ++i) {
45674 SDValue Idx = DAG.getIntPtrConstant(i, DL);
45675 Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
45676 Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
45677 }
45678 Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
45679 Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
45680 }
45681 SDValue Res =
45682 SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
45683 if (NumElemsPow2 == NumElems)
45684 return Res;
45685 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45686 DAG.getIntPtrConstant(0, DL));
45687 };
45688
45689 // Take care of the case when one of the operands is a constant vector whose
45690 // element is in the range [1, 256].
45691 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
45692 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
45693 Operands[0].getOperand(0).getValueType() == VT) {
45694 // The pattern is detected. Subtract one from the constant vector, then
45695 // demote it and emit X86ISD::AVG instruction.
45696 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
45697 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
45698 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
45699 return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
45700 }
45701
45702 // Matches 'add-like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
45703 // Match the or case only if it's 'add-like' - i.e. it can be replaced by an add.
45704 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
45705 if (ISD::ADD == V.getOpcode()) {
45706 Op0 = V.getOperand(0);
45707 Op1 = V.getOperand(1);
45708 return true;
45709 }
45710 if (ISD::ZERO_EXTEND != V.getOpcode())
45711 return false;
45712 V = V.getOperand(0);
45713 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
45714 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
45715 return false;
45716 Op0 = V.getOperand(0);
45717 Op1 = V.getOperand(1);
45718 return true;
45719 };
45720
45721 SDValue Op0, Op1;
45722 if (FindAddLike(Operands[0], Op0, Op1))
45723 std::swap(Operands[0], Operands[1]);
45724 else if (!FindAddLike(Operands[1], Op0, Op1))
45725 return SDValue();
45726 Operands[2] = Op0;
45727 Operands[1] = Op1;
45728
45729 // Now we have three operands of two additions. Check that one of them is a
45730 // constant vector with ones, and the other two can be promoted from i8/i16.
45731 for (int i = 0; i < 3; ++i) {
45732 if (!IsConstVectorInRange(Operands[i], 1, 1))
45733 continue;
45734 std::swap(Operands[i], Operands[2]);
45735
45736 // Check if Operands[0] and Operands[1] are results of type promotion.
45737 for (int j = 0; j < 2; ++j)
45738 if (Operands[j].getValueType() != VT) {
45739 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
45740 Operands[j].getOperand(0).getValueType() != VT)
45741 return SDValue();
45742 Operands[j] = Operands[j].getOperand(0);
45743 }
45744
45745 // The pattern is detected, emit X86ISD::AVG instruction(s).
45746 return AVGSplitter(Operands[0], Operands[1]);
45747 }
45748
45749 return SDValue();
45750}
45751
45752static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
45753 TargetLowering::DAGCombinerInfo &DCI,
45754 const X86Subtarget &Subtarget) {
45755 LoadSDNode *Ld = cast<LoadSDNode>(N);
45756 EVT RegVT = Ld->getValueType(0);
45757 EVT MemVT = Ld->getMemoryVT();
45758 SDLoc dl(Ld);
45759 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45760
45761 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
45762 // into two 16-byte operations. Also split non-temporal aligned loads on
45763 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
45764 ISD::LoadExtType Ext = Ld->getExtensionType();
45765 bool Fast;
45766 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
45767 Ext == ISD::NON_EXTLOAD &&
45768 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
45769 Ld->getAlignment() >= 16) ||
45770 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
45771 *Ld->getMemOperand(), &Fast) &&
45772 !Fast))) {
45773 unsigned NumElems = RegVT.getVectorNumElements();
45774 if (NumElems < 2)
45775 return SDValue();
45776
45777 unsigned HalfOffset = 16;
45778 SDValue Ptr1 = Ld->getBasePtr();
45779 SDValue Ptr2 =
45780 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
45781 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
45782 NumElems / 2);
45783 SDValue Load1 =
45784 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
45785 Ld->getOriginalAlign(),
45786 Ld->getMemOperand()->getFlags());
45787 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
45788 Ld->getPointerInfo().getWithOffset(HalfOffset),
45789 Ld->getOriginalAlign(),
45790 Ld->getMemOperand()->getFlags());
45791 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
45792 Load1.getValue(1), Load2.getValue(1));
45793
45794 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
45795 return DCI.CombineTo(N, NewVec, TF, true);
45796 }
45797
45798 // Bool vector load - attempt to cast to an integer, as we have good
45799 // (vXiY *ext(vXi1 bitcast(iX))) handling.
45800 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
45801 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
45802 unsigned NumElts = RegVT.getVectorNumElements();
45803 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45804 if (TLI.isTypeLegal(IntVT)) {
45805 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
45806 Ld->getPointerInfo(),
45807 Ld->getOriginalAlign(),
45808 Ld->getMemOperand()->getFlags());
45809 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
45810 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
45811 }
45812 }
45813
45814 // If we also broadcast this as a subvector to a wider type, then just extract
45815 // the lowest subvector.
45816 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
45817 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
45818 SDValue Ptr = Ld->getBasePtr();
45819 SDValue Chain = Ld->getChain();
45820 for (SDNode *User : Ptr->uses()) {
45821 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
45822 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
45823 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
45824 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
45825 MemVT.getSizeInBits() &&
45826 !User->hasAnyUseOfValue(1) &&
45827 User->getValueSizeInBits(0).getFixedSize() >
45828 RegVT.getFixedSizeInBits()) {
45829 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
45830 RegVT.getSizeInBits());
45831 Extract = DAG.getBitcast(RegVT, Extract);
45832 return DCI.CombineTo(N, Extract, SDValue(User, 1));
45833 }
45834 }
45835 }
45836
45837 // Cast ptr32 and ptr64 pointers to the default address space before a load.
45838 unsigned AddrSpace = Ld->getAddressSpace();
45839 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
45840 AddrSpace == X86AS::PTR32_UPTR) {
45841 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
45842 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
45843 SDValue Cast =
45844 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
45845 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
45846 Ld->getOriginalAlign(),
45847 Ld->getMemOperand()->getFlags());
45848 }
45849 }
45850
45851 return SDValue();
45852}
45853
45854/// If V is a build vector of boolean constants and exactly one of those
45855/// constants is true, return the operand index of that true element.
45856/// Otherwise, return -1.
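/// For example (illustrative), a v4i1 build vector <0, 0, 1, 0> returns
/// index 2, while <0, 1, 1, 0> or an all-zeros vector returns -1.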
45857static int getOneTrueElt(SDValue V) {
45858 // This needs to be a build vector of booleans.
45859 // TODO: Checking for the i1 type matches the IR definition for the mask,
45860 // but the mask check could be loosened to i8 or other types. That might
45861 // also require checking more than 'allOnesValue'; eg, the x86 HW
45862 // instructions only require that the MSB is set for each mask element.
45863 // The ISD::MSTORE comments/definition do not specify how the mask operand
45864 // is formatted.
45865 auto *BV = dyn_cast<BuildVectorSDNode>(V);
45866 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
45867 return -1;
45868
45869 int TrueIndex = -1;
45870 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
45871 for (unsigned i = 0; i < NumElts; ++i) {
45872 const SDValue &Op = BV->getOperand(i);
45873 if (Op.isUndef())
45874 continue;
45875 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
45876 if (!ConstNode)
45877 return -1;
45878 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
45879 // If we already found a one, this is too many.
45880 if (TrueIndex >= 0)
45881 return -1;
45882 TrueIndex = i;
45883 }
45884 }
45885 return TrueIndex;
45886}
45887
45888/// Given a masked memory load/store operation, return true if it has one mask
45889/// bit set. If it has one mask bit set, then also return the memory address of
45890/// the scalar element to load/store, the vector index to insert/extract that
45891/// scalar element, and the alignment for the scalar memory access.
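/// For example (illustrative), a masked v4f32 access whose mask has only
/// element 2 set yields Index = 2, Offset = 2 * 4 = 8 bytes from the base
/// pointer, and Alignment = commonAlignment(original alignment, 4).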
45892static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
45893 SelectionDAG &DAG, SDValue &Addr,
45894 SDValue &Index, Align &Alignment,
45895 unsigned &Offset) {
45896 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
45897 if (TrueMaskElt < 0)
45898 return false;
45899
45900 // Get the address of the one scalar element that is specified by the mask
45901 // using the appropriate offset from the base pointer.
45902 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
45903 Offset = 0;
45904 Addr = MaskedOp->getBasePtr();
45905 if (TrueMaskElt != 0) {
45906 Offset = TrueMaskElt * EltVT.getStoreSize();
45907 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
45908 SDLoc(MaskedOp));
45909 }
45910
45911 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
45912 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
45913 EltVT.getStoreSize());
45914 return true;
45915}
45916
45917/// If exactly one element of the mask is set for a non-extending masked load,
45918/// reduce it to a scalar load and a vector insert.
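/// For example (illustrative), a v4f32 masked load with mask <0, 0, 1, 0>
/// becomes a scalar f32 load from base+8 followed by an INSERT_VECTOR_ELT
/// of the loaded value into the pass-through vector at index 2.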
45919/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
45920/// mask have already been optimized in IR, so we don't bother with those here.
45921static SDValue
45922reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45923 TargetLowering::DAGCombinerInfo &DCI,
45924 const X86Subtarget &Subtarget) {
45925 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45926 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
45927 // However, some target hooks may need to be added to know when the transform
45928 // is profitable. Endianness would also have to be considered.
45929
45930 SDValue Addr, VecIndex;
45931 Align Alignment;
45932 unsigned Offset;
45933 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
45934 return SDValue();
45935
45936 // Load the one scalar element that is specified by the mask using the
45937 // appropriate offset from the base pointer.
45938 SDLoc DL(ML);
45939 EVT VT = ML->getValueType(0);
45940 EVT EltVT = VT.getVectorElementType();
45941
45942 EVT CastVT = VT;
45943 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
45944 EltVT = MVT::f64;
45945 CastVT = VT.changeVectorElementType(EltVT);
45946 }
45947
45948 SDValue Load =
45949 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
45950 ML->getPointerInfo().getWithOffset(Offset),
45951 Alignment, ML->getMemOperand()->getFlags());
45952
45953 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
45954
45955 // Insert the loaded element into the appropriate place in the vector.
45956 SDValue Insert =
45957 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
45958 Insert = DAG.getBitcast(VT, Insert);
45959 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
45960}
45961
45962static SDValue
45963combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45964 TargetLowering::DAGCombinerInfo &DCI) {
45965 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45966 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
45967 return SDValue();
45968
45969 SDLoc DL(ML);
45970 EVT VT = ML->getValueType(0);
45971
45972 // If we are loading the first and last elements of a vector, it is safe and
45973 // always faster to load the whole vector. Replace the masked load with a
45974 // vector load and select.
45975 unsigned NumElts = VT.getVectorNumElements();
45976 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
45977 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
45978 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
45979 if (LoadFirstElt && LoadLastElt) {
45980 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
45981 ML->getMemOperand());
45982 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
45983 ML->getPassThru());
45984 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
45985 }
45986
45987 // Convert a masked load with a constant mask into a masked load and a select.
45988 // This allows the select operation to use a faster kind of select instruction
45989 // (for example, vblendvps -> vblendps).
45990
45991 // Don't try this if the pass-through operand is already undefined. That would
45992 // cause an infinite loop because that's what we're about to create.
45993 if (ML->getPassThru().isUndef())
45994 return SDValue();
45995
45996 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
45997 return SDValue();
45998
45999 // The new masked load has an undef pass-through operand. The select uses the
46000 // original pass-through operand.
46001 SDValue NewML = DAG.getMaskedLoad(
46002 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
46003 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
46004 ML->getAddressingMode(), ML->getExtensionType());
46005 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
46006 ML->getPassThru());
46007
46008 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
46009}
46010
46011static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
46012 TargetLowering::DAGCombinerInfo &DCI,
46013 const X86Subtarget &Subtarget) {
46014 auto *Mld = cast<MaskedLoadSDNode>(N);
46015
46016 // TODO: Expanding load with constant mask may be optimized as well.
46017 if (Mld->isExpandingLoad())
46018 return SDValue();
46019
46020 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
46021 if (SDValue ScalarLoad =
46022 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
46023 return ScalarLoad;
46024
46025 // TODO: Do some AVX512 subsets benefit from this transform?
46026 if (!Subtarget.hasAVX512())
46027 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
46028 return Blend;
46029 }
46030
46031 // If the mask value has been legalized to a non-boolean vector, try to
46032 // simplify ops leading up to it. We only demand the MSB of each lane.
46033 SDValue Mask = Mld->getMask();
46034 if (Mask.getScalarValueSizeInBits() != 1) {
46035 EVT VT = Mld->getValueType(0);
46036 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46037 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46038 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46039 if (N->getOpcode() != ISD::DELETED_NODE)
46040 DCI.AddToWorklist(N);
46041 return SDValue(N, 0);
46042 }
46043 if (SDValue NewMask =
46044 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46045 return DAG.getMaskedLoad(
46046 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
46047 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
46048 Mld->getAddressingMode(), Mld->getExtensionType());
46049 }
46050
46051 return SDValue();
46052}
46053
46054/// If exactly one element of the mask is set for a non-truncating masked store,
46055/// reduce it to a vector extract and a scalar store.
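/// For example (illustrative), a v4f32 masked store with mask <0, 0, 1, 0>
/// becomes an EXTRACT_VECTOR_ELT of element 2 followed by a scalar f32 store
/// to base+8.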
46056/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
46057/// mask have already been optimized in IR, so we don't bother with those here.
46058static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
46059 SelectionDAG &DAG,
46060 const X86Subtarget &Subtarget) {
46061 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
46062 // However, some target hooks may need to be added to know when the transform
46063 // is profitable. Endianness would also have to be considered.
46064
46065 SDValue Addr, VecIndex;
46066 Align Alignment;
46067 unsigned Offset;
46068 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
46069 return SDValue();
46070
46071 // Extract the one scalar element that is actually being stored.
46072 SDLoc DL(MS);
46073 SDValue Value = MS->getValue();
46074 EVT VT = Value.getValueType();
46075 EVT EltVT = VT.getVectorElementType();
46076 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
46077 EltVT = MVT::f64;
46078 EVT CastVT = VT.changeVectorElementType(EltVT);
46079 Value = DAG.getBitcast(CastVT, Value);
46080 }
46081 SDValue Extract =
46082 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
46083
46084 // Store that element at the appropriate offset from the base pointer.
46085 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
46086 MS->getPointerInfo().getWithOffset(Offset),
46087 Alignment, MS->getMemOperand()->getFlags());
46088}
46089
46090static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
46091 TargetLowering::DAGCombinerInfo &DCI,
46092 const X86Subtarget &Subtarget) {
46093 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
46094 if (Mst->isCompressingStore())
46095 return SDValue();
46096
46097 EVT VT = Mst->getValue().getValueType();
46098 SDLoc dl(Mst);
46099 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46100
46101 if (Mst->isTruncatingStore())
46102 return SDValue();
46103
46104 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
46105 return ScalarStore;
46106
46107 // If the mask value has been legalized to a non-boolean vector, try to
46108 // simplify ops leading up to it. We only demand the MSB of each lane.
46109 SDValue Mask = Mst->getMask();
46110 if (Mask.getScalarValueSizeInBits() != 1) {
46111 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46112 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46113 if (N->getOpcode() != ISD::DELETED_NODE)
46114 DCI.AddToWorklist(N);
46115 return SDValue(N, 0);
46116 }
46117 if (SDValue NewMask =
46118 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46119 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
46120 Mst->getBasePtr(), Mst->getOffset(), NewMask,
46121 Mst->getMemoryVT(), Mst->getMemOperand(),
46122 Mst->getAddressingMode());
46123 }
46124
46125 SDValue Value = Mst->getValue();
46126 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
46127 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
46128 Mst->getMemoryVT())) {
46129 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
46130 Mst->getBasePtr(), Mst->getOffset(), Mask,
46131 Mst->getMemoryVT(), Mst->getMemOperand(),
46132 Mst->getAddressingMode(), true);
46133 }
46134
46135 return SDValue();
46136}
46137
46138static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
46139 TargetLowering::DAGCombinerInfo &DCI,
46140 const X86Subtarget &Subtarget) {
46141 StoreSDNode *St = cast<StoreSDNode>(N);
46142 EVT StVT = St->getMemoryVT();
46143 SDLoc dl(St);
46144 SDValue StoredVal = St->getValue();
46145 EVT VT = StoredVal.getValueType();
46146 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46147
46148 // Convert a store of vXi1 into a store of iX and a bitcast.
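// For example (illustrative), a store of a v16i1 value becomes a bitcast to
// i16 followed by a plain i16 store.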
46149 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
46150 VT.getVectorElementType() == MVT::i1) {
46151
46152 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46153 StoredVal = DAG.getBitcast(NewVT, StoredVal);
46154
46155 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46156 St->getPointerInfo(), St->getOriginalAlign(),
46157 St->getMemOperand()->getFlags());
46158 }
46159
46160 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
46161 // This will avoid a copy to a k-register.
46162 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
46163 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
46164 StoredVal.getOperand(0).getValueType() == MVT::i8) {
46165 SDValue Val = StoredVal.getOperand(0);
46166 // We must store zeros to the unused bits.
46167 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
46168 return DAG.getStore(St->getChain(), dl, Val,
46169 St->getBasePtr(), St->getPointerInfo(),
46170 St->getOriginalAlign(),
46171 St->getMemOperand()->getFlags());
46172 }
46173
46174 // Widen v2i1/v4i1 stores to v8i1.
46175 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
46176 Subtarget.hasAVX512()) {
46177 unsigned NumConcats = 8 / VT.getVectorNumElements();
46178 // We must store zeros to the unused bits.
46179 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
46180 Ops[0] = StoredVal;
46181 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
46182 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46183 St->getPointerInfo(), St->getOriginalAlign(),
46184 St->getMemOperand()->getFlags());
46185 }
46186
46187 // Turn vXi1 stores of constants into a scalar store.
46188 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
46189 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
46190 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
46191 // If it's a v64i1 store without 64-bit support, we need two stores.
46192 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
46193 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
46194 StoredVal->ops().slice(0, 32));
46195 Lo = combinevXi1ConstantToInteger(Lo, DAG);
46196 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
46197 StoredVal->ops().slice(32, 32));
46198 Hi = combinevXi1ConstantToInteger(Hi, DAG);
46199
46200 SDValue Ptr0 = St->getBasePtr();
46201 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
46202
46203 SDValue Ch0 =
46204 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
46205 St->getOriginalAlign(),
46206 St->getMemOperand()->getFlags());
46207 SDValue Ch1 =
46208 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
46209 St->getPointerInfo().getWithOffset(4),
46210 St->getOriginalAlign(),
46211 St->getMemOperand()->getFlags());
46212 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
46213 }
46214
46215 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
46216 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46217 St->getPointerInfo(), St->getOriginalAlign(),
46218 St->getMemOperand()->getFlags());
46219 }
46220
46221 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
46222 // Sandy Bridge, perform two 16-byte stores.
46223 bool Fast;
46224 if (VT.is256BitVector() && StVT == VT &&
46225 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
46226 *St->getMemOperand(), &Fast) &&
46227 !Fast) {
46228 unsigned NumElems = VT.getVectorNumElements();
46229 if (NumElems < 2)
46230 return SDValue();
46231
46232 return splitVectorStore(St, DAG);
46233 }
46234
46235 // Split under-aligned vector non-temporal stores.
46236 if (St->isNonTemporal() && StVT == VT &&
46237 St->getAlignment() < VT.getStoreSize()) {
46238 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
46239 // vectors or the legalizer can scalarize it to use MOVNTI.
46240 if (VT.is256BitVector() || VT.is512BitVector()) {
46241 unsigned NumElems = VT.getVectorNumElements();
46242 if (NumElems < 2)
46243 return SDValue();
46244 return splitVectorStore(St, DAG);
46245 }
46246
46247 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
46248 // to use MOVNTI.
46249 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
46250 MVT NTVT = Subtarget.hasSSE4A()
46251 ? MVT::v2f64
46252 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
46253 return scalarizeVectorStore(St, NTVT, DAG);
46254 }
46255 }
46256
46257 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
46258 // supported but AVX512F is, by extending to v16i32 and truncating.
46259 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
46260 St->getValue().getOpcode() == ISD::TRUNCATE &&
46261 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
46262 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
46263 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
46264 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
46265 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
46266 MVT::v16i8, St->getMemOperand());
46267 }
46268
46269 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
46270 if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
46271 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
46272 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
46273 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
46274 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
46275 return EmitTruncSStore(IsSigned, St->getChain(),
46276 dl, StoredVal.getOperand(0), St->getBasePtr(),
46277 VT, St->getMemOperand(), DAG);
46278 }
46279
46280 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
46281 if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
46282 auto IsExtractedElement = [](SDValue V) {
46283 if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
46284 V = V.getOperand(0);
46285 unsigned Opc = V.getOpcode();
46286 if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
46287 if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
46288 return V.getOperand(0);
46289 }
46290 return SDValue();
46291 };
46292 if (SDValue Extract = IsExtractedElement(StoredVal)) {
46293 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
46294 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
46295 SDValue Src = Trunc.getOperand(0);
46296 MVT DstVT = Trunc.getSimpleValueType();
46297 MVT SrcVT = Src.getSimpleValueType();
46298 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46299 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
46300 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
46301 if (NumTruncBits == VT.getSizeInBits() &&
46302 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
46303 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
46304 TruncVT, St->getMemOperand());
46305 }
46306 }
46307 }
46308 }
46309
46310 // Optimize trunc store (of multiple scalars) to shuffle and store.
46311 // First, pack all of the elements in one place. Next, store to memory
46312 // in fewer chunks.
46313 if (St->isTruncatingStore() && VT.isVector()) {
46314 // Check if we can detect an AVG pattern from the truncation. If yes,
46315 // replace the trunc store by a normal store with the result of X86ISD::AVG
46316 // instruction.
46317 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
46318 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
46319 Subtarget, dl))
46320 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
46321 St->getPointerInfo(), St->getOriginalAlign(),
46322 St->getMemOperand()->getFlags());
46323
46324 if (TLI.isTruncStoreLegal(VT, StVT)) {
46325 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
46326 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
46327 dl, Val, St->getBasePtr(),
46328 St->getMemoryVT(), St->getMemOperand(), DAG);
46329 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
46330 DAG, dl))
46331 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
46332 dl, Val, St->getBasePtr(),
46333 St->getMemoryVT(), St->getMemOperand(), DAG);
46334 }
46335
46336 return SDValue();
46337 }
46338
46339 // Cast ptr32 and ptr64 pointers to the default address space before a store.
46340 unsigned AddrSpace = St->getAddressSpace();
46341 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
46342 AddrSpace == X86AS::PTR32_UPTR) {
46343 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
46344 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
46345 SDValue Cast =
46346 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
46347 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
46348 St->getPointerInfo(), St->getOriginalAlign(),
46349 St->getMemOperand()->getFlags(), St->getAAInfo());
46350 }
46351 }
46352
46353 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
46354 // the FP state in cases where an emms may be missing.
46355 // A preferable solution to the general problem is to figure out the right
46356 // places to insert EMMS. This qualifies as a quick hack.
46357
46358 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
46359 if (VT.getSizeInBits() != 64)
46360 return SDValue();
46361
46362 const Function &F = DAG.getMachineFunction().getFunction();
46363 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
46364 bool F64IsLegal =
46365 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
46366 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
46367 isa<LoadSDNode>(St->getValue()) &&
46368 cast<LoadSDNode>(St->getValue())->isSimple() &&
46369 St->getChain().hasOneUse() && St->isSimple()) {
46370 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
46371
46372 if (!ISD::isNormalLoad(Ld))
46373 return SDValue();
46374
46375 // Avoid the transformation if there are multiple uses of the loaded value.
46376 if (!Ld->hasNUsesOfValue(1, 0))
46377 return SDValue();
46378
46379 SDLoc LdDL(Ld);
46380 SDLoc StDL(N);
46381 // Lower to a single movq load/store pair.
46382 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
46383 Ld->getBasePtr(), Ld->getMemOperand());
46384
46385 // Make sure new load is placed in same chain order.
46386 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
46387 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
46388 St->getMemOperand());
46389 }
46390
46391 // This is similar to the above case, but here we handle a scalar 64-bit
46392 // integer store that is extracted from a vector on a 32-bit target.
46393 // If we have SSE2, then we can treat it like a floating-point double
46394 // to get past legalization. The execution dependencies fixup pass will
46395 // choose the optimal machine instruction for the store if this really is
46396 // an integer or v2f32 rather than an f64.
46397 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
46398 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
46399 SDValue OldExtract = St->getOperand(1);
46400 SDValue ExtOp0 = OldExtract.getOperand(0);
46401 unsigned VecSize = ExtOp0.getValueSizeInBits();
46402 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
46403 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
46404 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
46405 BitCast, OldExtract.getOperand(1));
46406 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
46407 St->getPointerInfo(), St->getOriginalAlign(),
46408 St->getMemOperand()->getFlags());
46409 }
46410
46411 return SDValue();
46412}
46413
46414static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
46415 TargetLowering::DAGCombinerInfo &DCI,
46416 const X86Subtarget &Subtarget) {
46417 auto *St = cast<MemIntrinsicSDNode>(N);
46418
46419 SDValue StoredVal = N->getOperand(1);
46420 MVT VT = StoredVal.getSimpleValueType();
46421 EVT MemVT = St->getMemoryVT();
46422
46423 // Figure out which elements we demand.
46424 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
46425 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
46426
46427 APInt KnownUndef, KnownZero;
46428 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46429 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
46430 KnownZero, DCI)) {
46431 if (N->getOpcode() != ISD::DELETED_NODE)
46432 DCI.AddToWorklist(N);
46433 return SDValue(N, 0);
46434 }
46435
46436 return SDValue();
46437}
46438
46439/// Return 'true' if this vector operation is "horizontal"
46440/// and return the operands for the horizontal operation in LHS and RHS. A
46441/// horizontal operation performs the binary operation on successive elements
46442/// of its first operand, then on successive elements of its second operand,
46443/// returning the resulting values in a vector. For example, if
46444/// A = < float a0, float a1, float a2, float a3 >
46445/// and
46446/// B = < float b0, float b1, float b2, float b3 >
46447/// then the result of doing a horizontal operation on A and B is
46448/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
46449/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
46450/// A horizontal-op B, for some already available A and B, and if so then LHS is
46451/// set to A, RHS to B, and the routine returns 'true'.
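/// For v4f32 FADD, for example, this corresponds directly to the SSE3 HADDPS
/// instruction.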
46452static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
46453 SelectionDAG &DAG, const X86Subtarget &Subtarget,
46454 bool IsCommutative,
46455 SmallVectorImpl<int> &PostShuffleMask) {
46456 // If either operand is undef, bail out. The binop should be simplified.
46457 if (LHS.isUndef() || RHS.isUndef())
46458 return false;
46459
46460 // Look for the following pattern:
46461 // A = < float a0, float a1, float a2, float a3 >
46462 // B = < float b0, float b1, float b2, float b3 >
46463 // and
46464 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
46465 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
46466 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
46467 // which is A horizontal-op B.
46468
46469 MVT VT = LHS.getSimpleValueType();
46470 assert((VT.is128BitVector() || VT.is256BitVector()) &&
46471 "Unsupported vector type for horizontal add/sub");
46472 unsigned NumElts = VT.getVectorNumElements();
46473
46474 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
46475 SmallVectorImpl<int> &ShuffleMask) {
46476 bool UseSubVector = false;
46477 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46478 Op.getOperand(0).getValueType().is256BitVector() &&
46479 llvm::isNullConstant(Op.getOperand(1))) {
46480 Op = Op.getOperand(0);
46481 UseSubVector = true;
46482 }
46483 SmallVector<SDValue, 2> SrcOps;
46484 SmallVector<int, 16> SrcMask, ScaledMask;
46485 SDValue BC = peekThroughBitcasts(Op);
46486 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
46487 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
46488 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
46489 })) {
46490 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
46491 if (!UseSubVector && SrcOps.size() <= 2 &&
46492 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
46493 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
46494 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
46495 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
46496 }
46497 if (UseSubVector && SrcOps.size() == 1 &&
46498 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
46499 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
46500 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
46501 ShuffleMask.assign(Mask.begin(), Mask.end());
46502 }
46503 }
46504 };
46505
46506 // View LHS in the form
46507 // LHS = VECTOR_SHUFFLE A, B, LMask
46508 // If LHS is not a shuffle, then pretend it is the identity shuffle:
46509 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
46510 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
46511 SDValue A, B;
46512 SmallVector<int, 16> LMask;
46513 GetShuffle(LHS, A, B, LMask);
46514
46515 // Likewise, view RHS in the form
46516 // RHS = VECTOR_SHUFFLE C, D, RMask
46517 SDValue C, D;
46518 SmallVector<int, 16> RMask;
46519 GetShuffle(RHS, C, D, RMask);
46520
46521 // At least one of the operands should be a vector shuffle.
46522 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
46523 if (NumShuffles == 0)
46524 return false;
46525
46526 if (LMask.empty()) {
46527 A = LHS;
46528 for (unsigned i = 0; i != NumElts; ++i)
46529 LMask.push_back(i);
46530 }
46531
46532 if (RMask.empty()) {
46533 C = RHS;
46534 for (unsigned i = 0; i != NumElts; ++i)
46535 RMask.push_back(i);
46536 }
46537
46538 // If we have a unary mask, ensure the other op is set to null.
46539 if (isUndefOrInRange(LMask, 0, NumElts))
46540 B = SDValue();
46541 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
46542 A = SDValue();
46543
46544 if (isUndefOrInRange(RMask, 0, NumElts))
46545 D = SDValue();
46546 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
46547 C = SDValue();
46548
46549 // If A and B occur in reverse order in RHS, then canonicalize by commuting
46550 // RHS operands and shuffle mask.
46551 if (A != C) {
46552 std::swap(C, D);
46553 ShuffleVectorSDNode::commuteMask(RMask);
46554 }
46555 // Check that the shuffles are both shuffling the same vectors.
46556 if (!(A == C && B == D))
46557 return false;
46558
46559 PostShuffleMask.clear();
46560 PostShuffleMask.append(NumElts, SM_SentinelUndef);
46561
46562 // LHS and RHS are now:
46563 // LHS = shuffle A, B, LMask
46564 // RHS = shuffle A, B, RMask
46565 // Check that the masks correspond to performing a horizontal operation.
46566 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
46567 // so we just repeat the inner loop if this is a 256-bit op.
46568 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
46569 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
46570 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
46571 assert((NumEltsPer128BitChunk % 2 == 0) &&
46572 "Vector type should have an even number of elements in each lane");
46573 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
46574 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
46575 // Ignore undefined components.
46576 int LIdx = LMask[i + j], RIdx = RMask[i + j];
46577 if (LIdx < 0 || RIdx < 0 ||
46578 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
46579 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
46580 continue;
46581
46582 // Check that successive odd/even elements are being operated on. If not,
46583 // this is not a horizontal operation.
46584 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
46585 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
46586 return false;
46587
46588 // Compute the post-shuffle mask index based on where the element
46589 // is stored in the HOP result, and where it needs to be moved to.
46590 int Base = LIdx & ~1u;
46591 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
46592 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
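// e.g. for a v4f32 HADD(A, B) = <a0+a1, a2+a3, b0+b1, b2+b3> (illustrative):
// LIdx = 2 (a2) gives Base = 2 and Index = 1, while LIdx = 4 (b0) gives
// Base = 4 and, after the B-half adjustment below, Index = 2.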
46593
46594 // The low half of the 128-bit result must choose from A.
46595 // The high half of the 128-bit result must choose from B,
46596 // unless B is undef. In that case, we are always choosing from A.
46597 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
46598 Index += NumEltsPer64BitChunk;
46599 PostShuffleMask[i + j] = Index;
46600 }
46601 }
46602
46603 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
46604 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
46605
46606 bool IsIdentityPostShuffle =
46607 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
46608 if (IsIdentityPostShuffle)
46609 PostShuffleMask.clear();
46610
46611 // Avoid 128-bit multi-lane shuffles if pre-AVX2 and FP (integer will split).
46612 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
46613 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
46614 return false;
46615
46616 // If the source nodes are already used in HorizOps then always accept this.
46617 // Shuffle folding should merge these back together.
46618 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
46619 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46620 });
46621 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
46622 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46623 });
46624 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
46625
46626 // Assume a SingleSource HOP if we only shuffle one input and don't need to
46627 // shuffle the result.
46628 if (!ForceHorizOp &&
46629 !shouldUseHorizontalOp(NewLHS == NewRHS &&
46630 (NumShuffles < 2 || !IsIdentityPostShuffle),
46631 DAG, Subtarget))
46632 return false;
46633
46634 LHS = DAG.getBitcast(VT, NewLHS);
46635 RHS = DAG.getBitcast(VT, NewRHS);
46636 return true;
46637}
46638
46639// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
46640static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
46641 const X86Subtarget &Subtarget) {
46642 EVT VT = N->getValueType(0);
46643 unsigned Opcode = N->getOpcode();
46644 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
46645 SmallVector<int, 8> PostShuffleMask;
46646
46647 switch (Opcode) {
46648 case ISD::FADD:
46649 case ISD::FSUB:
46650 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
46651 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
46652 SDValue LHS = N->getOperand(0);
46653 SDValue RHS = N->getOperand(1);
46654 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
46655 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46656 PostShuffleMask)) {
46657 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
46658 if (!PostShuffleMask.empty())
46659 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46660 DAG.getUNDEF(VT), PostShuffleMask);
46661 return HorizBinOp;
46662 }
46663 }
46664 break;
46665 case ISD::ADD:
46666 case ISD::SUB:
46667 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
46668 VT == MVT::v16i16 || VT == MVT::v8i32)) {
46669 SDValue LHS = N->getOperand(0);
46670 SDValue RHS = N->getOperand(1);
46671 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
46672 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46673 PostShuffleMask)) {
46674 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
46675 ArrayRef<SDValue> Ops) {
46676 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
46677 };
46678 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
46679 {LHS, RHS}, HOpBuilder);
46680 if (!PostShuffleMask.empty())
46681 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46682 DAG.getUNDEF(VT), PostShuffleMask);
46683 return HorizBinOp;
46684 }
46685 }
46686 break;
46687 }
46688
46689 return SDValue();
46690}
46691
46692/// Do target-specific dag combines on floating-point adds/subs.
46693static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
46694 const X86Subtarget &Subtarget) {
46695 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
46696 return HOp;
46697 return SDValue();
46698}
46699
46700/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
46701/// the codegen.
46702/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
46703/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
46704/// anything that is guaranteed to be transformed by DAGCombiner.
46705static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
46706 const X86Subtarget &Subtarget,
46707 const SDLoc &DL) {
46708 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
46709 SDValue Src = N->getOperand(0);
46710 unsigned SrcOpcode = Src.getOpcode();
46711 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46712
46713 EVT VT = N->getValueType(0);
46714 EVT SrcVT = Src.getValueType();
46715
46716 auto IsFreeTruncation = [VT](SDValue Op) {
46717 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
46718
46719 // See if this has been extended from a smaller/equal size to
46720 // the truncation size, allowing a truncation to combine with the extend.
46721 unsigned Opcode = Op.getOpcode();
46722 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
46723 Opcode == ISD::ZERO_EXTEND) &&
46724 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
46725 return true;
46726
46727 // See if this is a single use constant which can be constant folded.
46728 // NOTE: We don't peek through bitcasts here because there is currently
46729 // no support for constant folding truncate+bitcast+vector_of_constants. So
46730 // we'll just end up with a truncate on both operands which will
46731 // get turned back into (truncate (binop)) causing an infinite loop.
46732 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
46733 };
46734
46735 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
46736 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
46737 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
46738 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
46739 };
46740
46741 // Don't combine if the operation has other uses.
46742 if (!Src.hasOneUse())
46743 return SDValue();
46744
46745 // Only support vector truncation for now.
46746 // TODO: i64 scalar math would benefit as well.
46747 if (!VT.isVector())
46748 return SDValue();
46749
46750 // In most cases it's only worth pre-truncating if we're only facing the cost
46751 // of one truncation.
46752 // i.e. if one of the inputs will constant fold or the input is repeated.
46753 switch (SrcOpcode) {
46754 case ISD::MUL:
46755 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
46756 // better to truncate if we have the chance.
46757 if (SrcVT.getScalarType() == MVT::i64 &&
46758 TLI.isOperationLegal(SrcOpcode, VT) &&
46759 !TLI.isOperationLegal(SrcOpcode, SrcVT))
46760 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
46761 LLVM_FALLTHROUGH;
46762 case ISD::AND:
46763 case ISD::XOR:
46764 case ISD::OR:
46765 case ISD::ADD:
46766 case ISD::SUB: {
46767 SDValue Op0 = Src.getOperand(0);
46768 SDValue Op1 = Src.getOperand(1);
46769 if (TLI.isOperationLegal(SrcOpcode, VT) &&
46770 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
46771 return TruncateArithmetic(Op0, Op1);
46772 break;
46773 }
46774 }
46775
46776 return SDValue();
46777}
46778
46779/// Truncate using ISD::AND mask and X86ISD::PACKUS.
46780/// e.g. trunc <8 x i32> X to <8 x i16> -->
46781/// MaskX = X & 0xffff (clear high bits to prevent saturation)
46782/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
46783static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
46784 const X86Subtarget &Subtarget,
46785 SelectionDAG &DAG) {
46786 SDValue In = N->getOperand(0);
46787 EVT InVT = In.getValueType();
46788 EVT OutVT = N->getValueType(0);
46789
46790 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
46791 OutVT.getScalarSizeInBits());
46792 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
46793 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
46794}
46795
46796/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
46797static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
46798 const X86Subtarget &Subtarget,
46799 SelectionDAG &DAG) {
46800 SDValue In = N->getOperand(0);
46801 EVT InVT = In.getValueType();
46802 EVT OutVT = N->getValueType(0);
46803 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
46804 DAG.getValueType(OutVT));
46805 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
46806}
46807
46808/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
46809/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
46810/// legalization the truncation will be translated into a BUILD_VECTOR with each
46811/// element extracted from a vector and then truncated, and it is
46812/// difficult to do this optimization on that form.
46813static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
46814 const X86Subtarget &Subtarget) {
46815 EVT OutVT = N->getValueType(0);
46816 if (!OutVT.isVector())
46817 return SDValue();
46818
46819 SDValue In = N->getOperand(0);
46820 if (!In.getValueType().isSimple())
46821 return SDValue();
46822
46823 EVT InVT = In.getValueType();
46824 unsigned NumElems = OutVT.getVectorNumElements();
46825
46826 // AVX512 provides fast truncate ops.
46827 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46828 return SDValue();
46829
46830 EVT OutSVT = OutVT.getVectorElementType();
46831 EVT InSVT = InVT.getVectorElementType();
46832 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
46833 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
46834 NumElems >= 8))
46835 return SDValue();
46836
46837 // SSSE3's pshufb results in fewer instructions in the cases below.
46838 if (Subtarget.hasSSSE3() && NumElems == 8 && InSVT != MVT::i64)
46839 return SDValue();
46840
46841 SDLoc DL(N);
46842 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
46843 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
46844 // truncate 2 x v4i32 to v8i16.
46845 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
46846 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
46847 if (InSVT == MVT::i32)
46848 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
46849
46850 return SDValue();
46851}
46852
46853 /// This function transforms vector truncation of 'extended sign-bits' or
46854 /// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
46855 /// into X86ISD::PACKSS/PACKUS operations.
46856static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
46857 SelectionDAG &DAG,
46858 const X86Subtarget &Subtarget) {
46859 // Requires SSE2.
46860 if (!Subtarget.hasSSE2())
46861 return SDValue();
46862
46863 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
46864 return SDValue();
46865
46866 SDValue In = N->getOperand(0);
46867 if (!In.getValueType().isSimple())
46868 return SDValue();
46869
46870 MVT VT = N->getValueType(0).getSimpleVT();
46871 MVT SVT = VT.getScalarType();
46872
46873 MVT InVT = In.getValueType().getSimpleVT();
46874 MVT InSVT = InVT.getScalarType();
46875
46876 // Check we have a truncation suited for PACKSS/PACKUS.
46877 if (!isPowerOf2_32(VT.getVectorNumElements()))
46878 return SDValue();
46879 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
46880 return SDValue();
46881 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
46882 return SDValue();
46883
46884 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
46885 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
46886 return SDValue();
46887
46888 // AVX512 has fast truncate, but if the input is already going to be split,
46889 // there's no harm in trying pack.
46890 if (Subtarget.hasAVX512() &&
46891 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
46892 InVT.is512BitVector())) {
46893 // PACK should still be worth it for 128-bit vectors if the sources were
46894 // originally concatenated from subvectors.
46895 SmallVector<SDValue> ConcatOps;
46896 if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
46897 return SDValue();
46898 }
46899
46900 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
46901 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
46902
46903 // Use PACKUS if the input has zero-bits that extend all the way to the
46904 // packed/truncated value. e.g. masks, zext_in_reg, etc.
46905 KnownBits Known = DAG.computeKnownBits(In);
46906 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
46907 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
46908 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
46909
46910 // Use PACKSS if the input has sign-bits that extend all the way to the
46911 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
46912 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
46913
46914 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
46915 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
46916 // on and combines/simplifications can't then use it.
46917 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
46918 return SDValue();
46919
46920 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
46921 if (NumSignBits > MinSignBits)
46922 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
46923
46924 // If we have a srl that only generates signbits that we will discard in
46925 // the truncation then we can use PACKSS by converting the srl to a sra.
46926 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
46927 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
46928 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
46929 In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
46930 if (*ShAmt == MinSignBits) {
46931 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
46932 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
46933 Subtarget);
46934 }
46935 }
46936
46937 return SDValue();
46938}
46939
46940// Try to form a MULHU or MULHS node by looking for
46941// (trunc (srl (mul ext, ext), 16))
46942// TODO: This is X86 specific because we want to be able to handle wide types
46943// before type legalization. But we can only do it if the vector will be
46944// legalized via widening/splitting. Type legalization can't handle promotion
46945// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
46946// combiner.
46947static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
46948 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46949 // First instruction should be a right shift of a multiply.
46950 if (Src.getOpcode() != ISD::SRL ||
46951 Src.getOperand(0).getOpcode() != ISD::MUL)
46952 return SDValue();
46953
46954 if (!Subtarget.hasSSE2())
46955 return SDValue();
46956
46957 // Only handle vXi16 types that are at least 128-bits unless they will be
46958 // widened.
46959 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
46960 return SDValue();
46961
46962 // Input type should be at least vXi32.
46963 EVT InVT = Src.getValueType();
46964 if (InVT.getVectorElementType().getSizeInBits() < 32)
46965 return SDValue();
46966
46967 // Need a shift by 16.
46968 APInt ShiftAmt;
46969 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
46970 ShiftAmt != 16)
46971 return SDValue();
46972
46973 SDValue LHS = Src.getOperand(0).getOperand(0);
46974 SDValue RHS = Src.getOperand(0).getOperand(1);
46975
46976 unsigned ExtOpc = LHS.getOpcode();
46977 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
46978 RHS.getOpcode() != ExtOpc)
46979 return SDValue();
46980
46981 // Peek through the extends.
46982 LHS = LHS.getOperand(0);
46983 RHS = RHS.getOperand(0);
46984
46985 // Ensure the input types match.
46986 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
46987 return SDValue();
46988
46989 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
46990 return DAG.getNode(Opc, DL, VT, LHS, RHS);
46991}
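// A standalone per-lane sketch (plain C++, not DAG code) of the pattern being
// matched above: (trunc (srl (mul (ext a), (ext b)), 16)) is exactly the high
// half of the widened 16x16->32 multiply, which is what PMULHUW computes per
// lane (PMULHW for the sign-extended variant).
#include <cstdint>

constexpr uint16_t mulhu16(uint16_t a, uint16_t b) {        // ISD::MULHU lane
  return static_cast<uint16_t>((uint32_t(a) * uint32_t(b)) >> 16);
}
constexpr int16_t mulhs16(int16_t a, int16_t b) {           // ISD::MULHS lane
  // Assumes the usual arithmetic right shift for negative products.
  return static_cast<int16_t>((int32_t(a) * int32_t(b)) >> 16);
}

static_assert(mulhu16(0xFFFF, 0xFFFF) == 0xFFFE, "65535*65535 == 0xFFFE0001");
static_assert(mulhs16(-32768, -2) == 1, "65536 >> 16 == 1");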
46992
46993// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
46994// from one vector with signed bytes from another vector, adds together
46995// adjacent pairs of 16-bit products, and saturates the result before
46996// truncating to 16-bits.
46997//
46998// Which looks something like this:
46999// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
47000// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
47001static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
47002 const X86Subtarget &Subtarget,
47003 const SDLoc &DL) {
47004 if (!VT.isVector() || !Subtarget.hasSSSE3())
47005 return SDValue();
47006
47007 unsigned NumElems = VT.getVectorNumElements();
47008 EVT ScalarVT = VT.getVectorElementType();
47009 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
47010 return SDValue();
47011
47012 SDValue SSatVal = detectSSatPattern(In, VT);
47013 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
47014 return SDValue();
47015
47016 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
47017 // of multiplies from even/odd elements.
47018 SDValue N0 = SSatVal.getOperand(0);
47019 SDValue N1 = SSatVal.getOperand(1);
47020
47021 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
47022 return SDValue();
47023
47024 SDValue N00 = N0.getOperand(0);
47025 SDValue N01 = N0.getOperand(1);
47026 SDValue N10 = N1.getOperand(0);
47027 SDValue N11 = N1.getOperand(1);
47028
47029 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
47030 // Canonicalize zero_extend to LHS.
47031 if (N01.getOpcode() == ISD::ZERO_EXTEND)
47032 std::swap(N00, N01);
47033 if (N11.getOpcode() == ISD::ZERO_EXTEND)
47034 std::swap(N10, N11);
47035
47036 // Ensure we have a zero_extend and a sign_extend.
47037 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
47038 N01.getOpcode() != ISD::SIGN_EXTEND ||
47039 N10.getOpcode() != ISD::ZERO_EXTEND ||
47040 N11.getOpcode() != ISD::SIGN_EXTEND)
47041 return SDValue();
47042
47043 // Peek through the extends.
47044 N00 = N00.getOperand(0);
47045 N01 = N01.getOperand(0);
47046 N10 = N10.getOperand(0);
47047 N11 = N11.getOperand(0);
47048
47049 // Ensure the extend is from vXi8.
47050 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
47051 N01.getValueType().getVectorElementType() != MVT::i8 ||
47052 N10.getValueType().getVectorElementType() != MVT::i8 ||
47053 N11.getValueType().getVectorElementType() != MVT::i8)
47054 return SDValue();
47055
47056 // All inputs should be build_vectors.
47057 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
47058 N01.getOpcode() != ISD::BUILD_VECTOR ||
47059 N10.getOpcode() != ISD::BUILD_VECTOR ||
47060 N11.getOpcode() != ISD::BUILD_VECTOR)
47061 return SDValue();
47062
47063 // N00/N10 are zero extended. N01/N11 are sign extended.
47064
47065 // For each element, we need to ensure we have an odd element from one vector
47066 // multiplied by the odd element of another vector and the even element from
47067 // one of the same vectors being multiplied by the even element from the
47068 // other vector. So we need to make sure for each element i, this operator
47069 // is being performed:
47070 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
47071 SDValue ZExtIn, SExtIn;
47072 for (unsigned i = 0; i != NumElems; ++i) {
47073 SDValue N00Elt = N00.getOperand(i);
47074 SDValue N01Elt = N01.getOperand(i);
47075 SDValue N10Elt = N10.getOperand(i);
47076 SDValue N11Elt = N11.getOperand(i);
47077 // TODO: Be more tolerant to undefs.
47078 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47079 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47080 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47081 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
47082 return SDValue();
47083 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
47084 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
47085 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
47086 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
47087 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
47088 return SDValue();
47089 unsigned IdxN00 = ConstN00Elt->getZExtValue();
47090 unsigned IdxN01 = ConstN01Elt->getZExtValue();
47091 unsigned IdxN10 = ConstN10Elt->getZExtValue();
47092 unsigned IdxN11 = ConstN11Elt->getZExtValue();
47093 // Add is commutative so indices can be reordered.
47094 if (IdxN00 > IdxN10) {
47095 std::swap(IdxN00, IdxN10);
47096 std::swap(IdxN01, IdxN11);
47097 }
47098 // N0 indices must be the even elements. N1 indices must be the next odd elements.
47099 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
47100 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
47101 return SDValue();
47102 SDValue N00In = N00Elt.getOperand(0);
47103 SDValue N01In = N01Elt.getOperand(0);
47104 SDValue N10In = N10Elt.getOperand(0);
47105 SDValue N11In = N11Elt.getOperand(0);
47106 // The first time we find an input, capture it.
47107 if (!ZExtIn) {
47108 ZExtIn = N00In;
47109 SExtIn = N01In;
47110 }
47111 if (ZExtIn != N00In || SExtIn != N01In ||
47112 ZExtIn != N10In || SExtIn != N11In)
47113 return SDValue();
47114 }
47115
47116 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47117 ArrayRef<SDValue> Ops) {
47118 // Shrink by adding truncate nodes and let DAGCombine fold with the
47119 // sources.
47120 EVT InVT = Ops[0].getValueType();
47121 assert(InVT.getScalarType() == MVT::i8 &&
47122 "Unexpected scalar element type");
47123 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
47124 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
47125 InVT.getVectorNumElements() / 2);
47126 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
47127 };
47128 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
47129 PMADDBuilder);
47130}
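// A standalone per-lane sketch (plain C++, not DAG code) of what a single
// 16-bit VPMADDUBSW lane computes, matching the pattern described above:
// unsigned bytes from A times signed bytes from B, adjacent products added,
// and the sum signed-saturated to 16 bits.
#include <cstdint>

constexpr int16_t ssat16(int32_t v) {
  return v > 32767 ? int16_t(32767) : v < -32768 ? int16_t(-32768) : int16_t(v);
}
constexpr int16_t pmaddubswLane(uint8_t a0, uint8_t a1, int8_t b0, int8_t b1) {
  return ssat16(int32_t(a0) * b0 + int32_t(a1) * b1);       // A[2i]*B[2i] + A[2i+1]*B[2i+1]
}

static_assert(pmaddubswLane(255, 255, 127, 127) == 32767, "sum saturates");
static_assert(pmaddubswLane(10, 20, -3, 4) == 50, "-30 + 80");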
47131
47132static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
47133 const X86Subtarget &Subtarget) {
47134 EVT VT = N->getValueType(0);
47135 SDValue Src = N->getOperand(0);
47136 SDLoc DL(N);
47137
47138 // Attempt to pre-truncate inputs to arithmetic ops instead.
47139 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
47140 return V;
47141
47142 // Try to detect AVG pattern first.
47143 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
47144 return Avg;
47145
47146 // Try to detect PMADD
47147 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
47148 return PMAdd;
47149
47150 // Try to combine truncation with signed/unsigned saturation.
47151 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
47152 return Val;
47153
47154 // Try to combine PMULHUW/PMULHW for vXi16.
47155 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
47156 return V;
47157
47158 // The bitcast source is a direct mmx result.
47160 // Detect bitcasts between i32 and x86mmx.
47160 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
47161 SDValue BCSrc = Src.getOperand(0);
47162 if (BCSrc.getValueType() == MVT::x86mmx)
47163 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
47164 }
47165
47166 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
47167 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
47168 return V;
47169
47170 return combineVectorTruncation(N, DAG, Subtarget);
47171}
47172
47173static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
47174 TargetLowering::DAGCombinerInfo &DCI) {
47175 EVT VT = N->getValueType(0);
47176 SDValue In = N->getOperand(0);
47177 SDLoc DL(N);
47178
47179 if (auto SSatVal = detectSSatPattern(In, VT))
47180 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
47181 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
47182 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
47183
47184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47185 APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
47186 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47187 return SDValue(N, 0);
47188
47189 return SDValue();
47190}
47191
47192/// Returns the negated value if the node \p N flips sign of FP value.
47193///
47194/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
47195/// or FSUB(0, x)
47196/// AVX512F does not have FXOR, so FNEG is lowered as
47197/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
47198 /// In this case we go through all bitcasts.
47199/// This also recognizes splat of a negated value and returns the splat of that
47200/// value.
47201static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
47202 if (N->getOpcode() == ISD::FNEG)
47203 return N->getOperand(0);
47204
47205 // Don't recurse exponentially.
47206 if (Depth > SelectionDAG::MaxRecursionDepth)
47207 return SDValue();
47208
47209 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
47210
47211 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
47212 EVT VT = Op->getValueType(0);
47213
47214 // Make sure the element size doesn't change.
47215 if (VT.getScalarSizeInBits() != ScalarSize)
47216 return SDValue();
47217
47218 unsigned Opc = Op.getOpcode();
47219 switch (Opc) {
47220 case ISD::VECTOR_SHUFFLE: {
47221 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
47222 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
47223 if (!Op.getOperand(1).isUndef())
47224 return SDValue();
47225 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
47226 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
47227 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
47228 cast<ShuffleVectorSDNode>(Op)->getMask());
47229 break;
47230 }
47231 case ISD::INSERT_VECTOR_ELT: {
47232 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
47233 // -V, INDEX).
47234 SDValue InsVector = Op.getOperand(0);
47235 SDValue InsVal = Op.getOperand(1);
47236 if (!InsVector.isUndef())
47237 return SDValue();
47238 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
47239 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
47240 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
47241 NegInsVal, Op.getOperand(2));
47242 break;
47243 }
47244 case ISD::FSUB:
47245 case ISD::XOR:
47246 case X86ISD::FXOR: {
47247 SDValue Op1 = Op.getOperand(1);
47248 SDValue Op0 = Op.getOperand(0);
47249
47250 // For XOR and FXOR, we want to check if constant
47251 // bits of Op1 are sign bit masks. For FSUB, we
47252 // have to check if constant bits of Op0 are sign
47253 // bit masks and hence we swap the operands.
47254 if (Opc == ISD::FSUB)
47255 std::swap(Op0, Op1);
47256
47257 APInt UndefElts;
47258 SmallVector<APInt, 16> EltBits;
47259 // Extract constant bits and see if they are all
47260 // sign bit masks. Ignore the undef elements.
47261 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
47262 /* AllowWholeUndefs */ true,
47263 /* AllowPartialUndefs */ false)) {
47264 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
47265 if (!UndefElts[I] && !EltBits[I].isSignMask())
47266 return SDValue();
47267
47268 return peekThroughBitcasts(Op0);
47269 }
47270 }
47271 }
47272
47273 return SDValue();
47274}
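// A standalone sketch (plain C++, assumes C++20 for std::bit_cast) of why the
// FXOR / integer-xor forms above are negations: flipping only the IEEE-754
// sign bit negates the value, including the sign of zero.
#include <bit>
#include <cstdint>

constexpr float fnegViaXor(float x) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) ^ 0x80000000u);
}

static_assert(fnegViaXor(1.5f) == -1.5f, "sign-bit xor negates");
static_assert(std::bit_cast<uint32_t>(fnegViaXor(-0.0f)) == 0u,
              "also flips the sign of zero");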
47275
47276static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
47277 bool NegRes) {
47278 if (NegMul) {
47279 switch (Opcode) {
47280 default: llvm_unreachable("Unexpected opcode");
47281 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
47282 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
47283 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
47284 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
47285 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
47286 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
47287 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
47288 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
47289 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
47290 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
47291 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
47292 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
47293 }
47294 }
47295
47296 if (NegAcc) {
47297 switch (Opcode) {
47298 default: llvm_unreachable("Unexpected opcode");
47299 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
47300 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
47301 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
47302 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
47303 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
47304 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
47305 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
47306 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
47307 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
47308 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
47309 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
47310 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
47311 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
47312 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
47313 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
47314 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
47315 }
47316 }
47317
47318 if (NegRes) {
47319 switch (Opcode) {
47320 // For accuracy reasons, we never combine fneg and fma under strict FP.
47321 default: llvm_unreachable("Unexpected opcode");
47322 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
47323 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
47324 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
47325 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
47326 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
47327 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
47328 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
47329 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
47330 }
47331 }
47332
47333 return Opcode;
47334}
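// A standalone numeric sketch (plain C++, not DAG code) of the sign identities
// the opcode table above encodes: FMADD = a*b + c, FMSUB = a*b - c,
// FNMADD = -(a*b) + c, FNMSUB = -(a*b) - c, so negating the result swaps
// FMADD<->FNMSUB and FMSUB<->FNMADD, negating the accumulator swaps the
// ADD/SUB forms, and negating the product swaps the N/non-N forms.
constexpr double kA = 2.0, kB = 3.0, kC = 5.0;
constexpr double kFMAdd  = kA * kB + kC;     // ISD::FMA        = 11
constexpr double kFMSub  = kA * kB - kC;     // X86ISD::FMSUB   = 1
constexpr double kFNMAdd = -(kA * kB) + kC;  // X86ISD::FNMADD  = -1
constexpr double kFNMSub = -(kA * kB) - kC;  // X86ISD::FNMSUB  = -11

static_assert(-kFMAdd == kFNMSub, "NegRes: FMA   <-> FNMSUB");
static_assert(-kFMSub == kFNMAdd, "NegRes: FMSUB <-> FNMADD");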
47335
47336/// Do target-specific dag combines on floating point negations.
47337static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
47338 TargetLowering::DAGCombinerInfo &DCI,
47339 const X86Subtarget &Subtarget) {
47340 EVT OrigVT = N->getValueType(0);
47341 SDValue Arg = isFNEG(DAG, N);
47342 if (!Arg)
47343 return SDValue();
47344
47345 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47346 EVT VT = Arg.getValueType();
47347 EVT SVT = VT.getScalarType();
47348 SDLoc DL(N);
47349
47350 // Let legalize expand this if it isn't a legal type yet.
47351 if (!TLI.isTypeLegal(VT))
47352 return SDValue();
47353
47354 // If we're negating a FMUL node on a target with FMA, then we can avoid the
47355 // use of a constant by performing (-0 - A*B) instead.
47356 // FIXME: Check rounding control flags as well once it becomes available.
47357 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
47358 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
47359 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
47360 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
47361 Arg.getOperand(1), Zero);
47362 return DAG.getBitcast(OrigVT, NewNode);
47363 }
47364
47365 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
47366 bool LegalOperations = !DCI.isBeforeLegalizeOps();
47367 if (SDValue NegArg =
47368 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
47369 return DAG.getBitcast(OrigVT, NegArg);
47370
47371 return SDValue();
47372}
47373
47374SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
47375 bool LegalOperations,
47376 bool ForCodeSize,
47377 NegatibleCost &Cost,
47378 unsigned Depth) const {
47379 // fneg patterns are removable even if they have multiple uses.
47380 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
47381 Cost = NegatibleCost::Cheaper;
47382 return DAG.getBitcast(Op.getValueType(), Arg);
47383 }
47384
47385 EVT VT = Op.getValueType();
47386 EVT SVT = VT.getScalarType();
47387 unsigned Opc = Op.getOpcode();
47388 SDNodeFlags Flags = Op.getNode()->getFlags();
47389 switch (Opc) {
47390 case ISD::FMA:
47391 case X86ISD::FMSUB:
47392 case X86ISD::FNMADD:
47393 case X86ISD::FNMSUB:
47394 case X86ISD::FMADD_RND:
47395 case X86ISD::FMSUB_RND:
47396 case X86ISD::FNMADD_RND:
47397 case X86ISD::FNMSUB_RND: {
47398 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
47399 !(SVT == MVT::f32 || SVT == MVT::f64) ||
47400 !isOperationLegal(ISD::FMA, VT))
47401 break;
47402
47403 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
47404 // if it may have signed zeros.
47405 if (!Flags.hasNoSignedZeros())
47406 break;
47407
47408 // This is always negatible for free but we might be able to remove some
47409 // extra operand negations as well.
47410 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
47411 for (int i = 0; i != 3; ++i)
47412 NewOps[i] = getCheaperNegatedExpression(
47413 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
47414
47415 bool NegA = !!NewOps[0];
47416 bool NegB = !!NewOps[1];
47417 bool NegC = !!NewOps[2];
47418 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
47419
47420 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
47421 : NegatibleCost::Neutral;
47422
47423 // Fill in the non-negated ops with the original values.
47424 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
47425 if (!NewOps[i])
47426 NewOps[i] = Op.getOperand(i);
47427 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
47428 }
47429 case X86ISD::FRCP:
47430 if (SDValue NegOp0 =
47431 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
47432 ForCodeSize, Cost, Depth + 1))
47433 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
47434 break;
47435 }
47436
47437 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
47438 ForCodeSize, Cost, Depth);
47439}
47440
47441static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
47442 const X86Subtarget &Subtarget) {
47443 MVT VT = N->getSimpleValueType(0);
47444 // If we have integer vector types available, use the integer opcodes.
47445 if (!VT.isVector() || !Subtarget.hasSSE2())
47446 return SDValue();
47447
47448 SDLoc dl(N);
47449
47450 unsigned IntBits = VT.getScalarSizeInBits();
47451 MVT IntSVT = MVT::getIntegerVT(IntBits);
47452 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
47453
47454 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
47455 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
47456 unsigned IntOpcode;
47457 switch (N->getOpcode()) {
47458 default: llvm_unreachable("Unexpected FP logic op");
47459 case X86ISD::FOR: IntOpcode = ISD::OR; break;
47460 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
47461 case X86ISD::FAND: IntOpcode = ISD::AND; break;
47462 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
47463 }
47464 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
47465 return DAG.getBitcast(VT, IntOp);
47466}
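// A standalone per-lane sketch (plain C++, assumes C++20 for std::bit_cast) of
// the rewrite above: the x86 FP logic ops are pure bit operations, so doing
// the logic on the bitcast integer lanes and bitcasting back preserves every
// bit. Shown here for FOR on one f32 lane, OR-ing in the sign-bit mask.
#include <bit>
#include <cstdint>

constexpr float forLane(float a, uint32_t maskBits) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(a) | maskBits);
}

static_assert(forLane(3.25f, 0x80000000u) == -3.25f,
              "OR with the sign mask yields the negated value");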
47467
47468
47469/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
47470static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
47471 if (N->getOpcode() != ISD::XOR)
47472 return SDValue();
47473
47474 SDValue LHS = N->getOperand(0);
47475 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
47476 return SDValue();
47477
47478 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
47479 X86::CondCode(LHS->getConstantOperandVal(0)));
47480 SDLoc DL(N);
47481 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
47482}
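// A standalone sketch (plain C++, not DAG code) of the fold above: an
// X86ISD::SETCC result is the value 0 or 1, so xor'ing it with 1 is the same
// as evaluating the opposite condition code.
#include <cstdint>

constexpr uint8_t setccE(int a, int b) { return a == b ? 1 : 0; }   // CC = E
constexpr uint8_t setccNE(int a, int b) { return a != b ? 1 : 0; }  // inverted CC

static_assert((setccE(3, 3) ^ 1) == setccNE(3, 3), "equal inputs");
static_assert((setccE(3, 4) ^ 1) == setccNE(3, 4), "unequal inputs");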
47483
47484static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
47485 TargetLowering::DAGCombinerInfo &DCI,
47486 const X86Subtarget &Subtarget) {
47487 SDValue N0 = N->getOperand(0);
47488 SDValue N1 = N->getOperand(1);
47489 EVT VT = N->getValueType(0);
47490
47491 // If this is SSE1 only convert to FXOR to avoid scalarization.
47492 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47493 return DAG.getBitcast(MVT::v4i32,
47494 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
47495 DAG.getBitcast(MVT::v4f32, N0),
47496 DAG.getBitcast(MVT::v4f32, N1)));
47497 }
47498
47499 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
47500 return Cmp;
47501
47502 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47503 return R;
47504
47505 if (DCI.isBeforeLegalizeOps())
47506 return SDValue();
47507
47508 if (SDValue SetCC = foldXor1SetCC(N, DAG))
47509 return SetCC;
47510
47511 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
47512 return RV;
47513
47514 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
47515 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47516 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
47517 N0.getOperand(0).getValueType().isVector() &&
47518 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
47519 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
47520 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
47521 N0.getOperand(0).getValueType()));
47522 }
47523
47524 // Handle AVX512 mask widening.
47525 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
47526 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
47527 VT.getVectorElementType() == MVT::i1 &&
47528 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
47529 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
47530 return DAG.getNode(
47531 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
47532 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
47533 N0.getOperand(2));
47534 }
47535
47536 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
47537 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
47538 // TODO: Under what circumstances could this be performed in DAGCombine?
47539 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
47540 N0.getOperand(0).getOpcode() == N->getOpcode()) {
47541 SDValue TruncExtSrc = N0.getOperand(0);
47542 auto *N1C = dyn_cast<ConstantSDNode>(N1);
47543 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
47544 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
47545 SDLoc DL(N);
47546 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
47547 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
47548 return DAG.getNode(ISD::XOR, DL, VT, LHS,
47549 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
47550 }
47551 }
47552
47553 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
47554 return FPLogic;
47555
47556 return combineFneg(N, DAG, DCI, Subtarget);
47557}
47558
47559static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
47560 TargetLowering::DAGCombinerInfo &DCI,
47561 const X86Subtarget &Subtarget) {
47562 EVT VT = N->getValueType(0);
47563 unsigned NumBits = VT.getSizeInBits();
47564
47565 // TODO - Constant Folding.
47566
47567 // Simplify the inputs.
47568 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47569 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
47570 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47571 return SDValue(N, 0);
47572
47573 return SDValue();
47574}
47575
47576static bool isNullFPScalarOrVectorConst(SDValue V) {
47577 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
47578}
47579
47580/// If a value is a scalar FP zero or a vector FP zero (potentially including
47581/// undefined elements), return a zero constant that may be used to fold away
47582/// that value. In the case of a vector, the returned constant will not contain
47583/// undefined elements even if the input parameter does. This makes it suitable
47584/// to be used as a replacement operand with operations (eg, bitwise-and) where
47585/// an undef should not propagate.
47586static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
47587 const X86Subtarget &Subtarget) {
47588 if (!isNullFPScalarOrVectorConst(V))
47589 return SDValue();
47590
47591 if (V.getValueType().isVector())
47592 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
47593
47594 return V;
47595}
47596
47597static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
47598 const X86Subtarget &Subtarget) {
47599 SDValue N0 = N->getOperand(0);
47600 SDValue N1 = N->getOperand(1);
47601 EVT VT = N->getValueType(0);
47602 SDLoc DL(N);
47603
47604 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
47605 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
47606 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
47607 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
47608 return SDValue();
47609
47610 auto isAllOnesConstantFP = [](SDValue V) {
47611 if (V.getSimpleValueType().isVector())
47612 return ISD::isBuildVectorAllOnes(V.getNode());
47613 auto *C = dyn_cast<ConstantFPSDNode>(V);
47614 return C && C->getConstantFPValue()->isAllOnesValue();
47615 };
47616
47617 // fand (fxor X, -1), Y --> fandn X, Y
47618 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
47619 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
47620
47621 // fand X, (fxor Y, -1) --> fandn Y, X
47622 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
47623 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
47624
47625 return SDValue();
47626}
47627
47628/// Do target-specific dag combines on X86ISD::FAND nodes.
47629static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
47630 const X86Subtarget &Subtarget) {
47631 // FAND(0.0, x) -> 0.0
47632 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
47633 return V;
47634
47635 // FAND(x, 0.0) -> 0.0
47636 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47637 return V;
47638
47639 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
47640 return V;
47641
47642 return lowerX86FPLogicOp(N, DAG, Subtarget);
47643}
47644
47645/// Do target-specific dag combines on X86ISD::FANDN nodes.
47646static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
47647 const X86Subtarget &Subtarget) {
47648 // FANDN(0.0, x) -> x
47649 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47650 return N->getOperand(1);
47651
47652 // FANDN(x, 0.0) -> 0.0
47653 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47654 return V;
47655
47656 return lowerX86FPLogicOp(N, DAG, Subtarget);
47657}
47658
47659/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
47660static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
47661 TargetLowering::DAGCombinerInfo &DCI,
47662 const X86Subtarget &Subtarget) {
47663 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
47664
47665 // F[X]OR(0.0, x) -> x
47666 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47667 return N->getOperand(1);
47668
47669 // F[X]OR(x, 0.0) -> x
47670 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
47671 return N->getOperand(0);
47672
47673 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
47674 return NewVal;
47675
47676 return lowerX86FPLogicOp(N, DAG, Subtarget);
47677}
47678
47679/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
47680static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
47681 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
47682
47683 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
47684 if (!DAG.getTarget().Options.NoNaNsFPMath ||
47685 !DAG.getTarget().Options.NoSignedZerosFPMath)
47686 return SDValue();
47687
47688 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
47689 // into FMINC and FMAXC, which are Commutative operations.
47690 unsigned NewOp = 0;
47691 switch (N->getOpcode()) {
47692 default: llvm_unreachable("unknown opcode");
47693 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
47694 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
47695 }
47696
47697 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
47698 N->getOperand(0), N->getOperand(1));
47699}
47700
47701static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
47702 const X86Subtarget &Subtarget) {
47703 if (Subtarget.useSoftFloat())
47704 return SDValue();
47705
47706 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47707
47708 EVT VT = N->getValueType(0);
47709 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
47710 (Subtarget.hasSSE2() && VT == MVT::f64) ||
47711 (VT.isVector() && TLI.isTypeLegal(VT))))
47712 return SDValue();
47713
47714 SDValue Op0 = N->getOperand(0);
47715 SDValue Op1 = N->getOperand(1);
47716 SDLoc DL(N);
47717 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
47718
47719 // If we don't have to respect NaN inputs, this is a direct translation to x86
47720 // min/max instructions.
47721 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
47722 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47723
47724 // If one of the operands is known non-NaN use the native min/max instructions
47725 // with the non-NaN input as second operand.
47726 if (DAG.isKnownNeverNaN(Op1))
47727 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47728 if (DAG.isKnownNeverNaN(Op0))
47729 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
47730
47731 // If we have to respect NaN inputs, this takes at least 3 instructions.
47732 // Favor a library call when operating on a scalar and minimizing code size.
47733 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
47734 return SDValue();
47735
47736 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
47737 VT);
47738
47739 // There are 4 possibilities involving NaN inputs, and these are the required
47740 // outputs:
47741 // Op1
47742 // Num NaN
47743 // ----------------
47744 // Num | Max | Op0 |
47745 // Op0 ----------------
47746 // NaN | Op1 | NaN |
47747 // ----------------
47748 //
47749 // The SSE FP max/min instructions were not designed for this case, but rather
47750 // to implement:
47751 // Min = Op1 < Op0 ? Op1 : Op0
47752 // Max = Op1 > Op0 ? Op1 : Op0
47753 //
47754 // So they always return Op0 if either input is a NaN. However, we can still
47755 // use those instructions for fmaxnum by selecting away a NaN input.
47756
47757 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
47758 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
47759 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
47760
47761 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
47762 // are NaN, the NaN value of Op1 is the result.
47763 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
47764}
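// A standalone sketch (plain C++, not DAG code) of the emulation built above:
// the SSE max/min returns its second source whenever either input is NaN, so
// selecting Op1 whenever Op0 is NaN yields fmaxnum/fminnum semantics.
#include <limits>

constexpr double sseMax(double op1, double op0) {
  // Mirrors MAXSD: op1 > op0 ? op1 : op0, so any NaN input returns op0.
  return op1 > op0 ? op1 : op0;
}
constexpr double fmaxnumEmul(double op0, double op1) {
  // select(setcc(op0, op0, SETUO), op1, max(op1, op0)) as built above.
  return (op0 != op0) ? op1 : sseMax(op1, op0);
}

constexpr double kQNaN = std::numeric_limits<double>::quiet_NaN();
static_assert(fmaxnumEmul(1.0, 2.0) == 2.0, "ordinary max");
static_assert(fmaxnumEmul(kQNaN, 2.0) == 2.0, "NaN in Op0 is ignored");
static_assert(fmaxnumEmul(1.0, kQNaN) == 1.0, "NaN in Op1 is ignored");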
47765
47766static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
47767 TargetLowering::DAGCombinerInfo &DCI) {
47768 EVT VT = N->getValueType(0);
47769 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47770
47771 APInt KnownUndef, KnownZero;
47772 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
47773 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
47774 KnownZero, DCI))
47775 return SDValue(N, 0);
47776
47777 // Convert a full vector load into vzload when not all bits are needed.
47778 SDValue In = N->getOperand(0);
47779 MVT InVT = In.getSimpleValueType();
47780 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47781 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47782 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47783 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
47784 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47785 MVT MemVT = MVT::getIntegerVT(NumBits);
47786 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47787 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47788 SDLoc dl(N);
47789 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
47790 DAG.getBitcast(InVT, VZLoad));
47791 DCI.CombineTo(N, Convert);
47792 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47793 DCI.recursivelyDeleteUnusedNodes(LN);
47794 return SDValue(N, 0);
47795 }
47796 }
47797
47798 return SDValue();
47799}
47800
47801static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
47802 TargetLowering::DAGCombinerInfo &DCI) {
47803 bool IsStrict = N->isTargetStrictFPOpcode();
47804 EVT VT = N->getValueType(0);
47805
47806 // Convert a full vector load into vzload when not all bits are needed.
47807 SDValue In = N->getOperand(IsStrict ? 1 : 0);
47808 MVT InVT = In.getSimpleValueType();
47809 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47810 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47811 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47812 LoadSDNode *LN = cast<LoadSDNode>(In);
47813 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47814 MVT MemVT = MVT::getFloatingPointVT(NumBits);
47815 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47816 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47817 SDLoc dl(N);
47818 if (IsStrict) {
47819 SDValue Convert =
47820 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
47821 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
47822 DCI.CombineTo(N, Convert, Convert.getValue(1));
47823 } else {
47824 SDValue Convert =
47825 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
47826 DCI.CombineTo(N, Convert);
47827 }
47828 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47829 DCI.recursivelyDeleteUnusedNodes(LN);
47830 return SDValue(N, 0);
47831 }
47832 }
47833
47834 return SDValue();
47835}
47836
47837/// Do target-specific dag combines on X86ISD::ANDNP nodes.
47838static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
47839 TargetLowering::DAGCombinerInfo &DCI,
47840 const X86Subtarget &Subtarget) {
47841 MVT VT = N->getSimpleValueType(0);
47842
47843 // ANDNP(0, x) -> x
47844 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
47845 return N->getOperand(1);
47846
47847 // ANDNP(x, 0) -> 0
47848 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
47849 return DAG.getConstant(0, SDLoc(N), VT);
47850
47851 // Turn ANDNP back to AND if input is inverted.
47852 if (SDValue Not = IsNOT(N->getOperand(0), DAG))
47853 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
47854 N->getOperand(1));
47855
47856 // Attempt to recursively combine a bitmask ANDNP with shuffles.
47857 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47858 SDValue Op(N, 0);
47859 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47860 return Res;
47861 }
47862
47863 return SDValue();
47864}
47865
47866static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
47867 TargetLowering::DAGCombinerInfo &DCI) {
47868 SDValue N1 = N->getOperand(1);
47869
47870 // BT ignores high bits in the bit index operand.
47871 unsigned BitWidth = N1.getValueSizeInBits();
47872 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
47873 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
47874 if (N->getOpcode() != ISD::DELETED_NODE)
47875 DCI.AddToWorklist(N);
47876 return SDValue(N, 0);
47877 }
47878
47879 return SDValue();
47880}
47881
47882static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
47883 TargetLowering::DAGCombinerInfo &DCI) {
47884 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
47885 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
47886
47887 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
47888 APInt KnownUndef, KnownZero;
47889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47890 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
47891 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
47892 DCI)) {
47893 if (N->getOpcode() != ISD::DELETED_NODE)
47894 DCI.AddToWorklist(N);
47895 return SDValue(N, 0);
47896 }
47897
47898 // Convert a full vector load into vzload when not all bits are needed.
47899 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
47900 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
47901 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
47902 SDLoc dl(N);
47903 if (IsStrict) {
47904 SDValue Convert = DAG.getNode(
47905 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
47906 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
47907 DCI.CombineTo(N, Convert, Convert.getValue(1));
47908 } else {
47909 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
47910 DAG.getBitcast(MVT::v8i16, VZLoad));
47911 DCI.CombineTo(N, Convert);
47912 }
47913
47914 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47915 DCI.recursivelyDeleteUnusedNodes(LN);
47916 return SDValue(N, 0);
47917 }
47918 }
47919 }
47920
47921 return SDValue();
47922}
47923
47924// Try to combine sext_in_reg of a cmov of constants by extending the constants.
47925static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
47926 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47927
47928 EVT DstVT = N->getValueType(0);
47929
47930 SDValue N0 = N->getOperand(0);
47931 SDValue N1 = N->getOperand(1);
47932 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47933
47934 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
47935 return SDValue();
47936
47937 // Look through single use any_extends / truncs.
47938 SDValue IntermediateBitwidthOp;
47939 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
47940 N0.hasOneUse()) {
47941 IntermediateBitwidthOp = N0;
47942 N0 = N0.getOperand(0);
47943 }
47944
47945 // See if we have a single use cmov.
47946 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
47947 return SDValue();
47948
47949 SDValue CMovOp0 = N0.getOperand(0);
47950 SDValue CMovOp1 = N0.getOperand(1);
47951
47952 // Make sure both operands are constants.
47953 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
47954 !isa<ConstantSDNode>(CMovOp1.getNode()))
47955 return SDValue();
47956
47957 SDLoc DL(N);
47958
47959 // If we looked through an any_extend/trunc above, apply the same op to the constants.
47960 if (IntermediateBitwidthOp) {
47961 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
47962 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
47963 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
47964 }
47965
47966 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
47967 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
47968
47969 EVT CMovVT = DstVT;
47970 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
47971 if (DstVT == MVT::i16) {
47972 CMovVT = MVT::i32;
47973 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
47974 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
47975 }
47976
47977 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
47978 N0.getOperand(2), N0.getOperand(3));
47979
47980 if (CMovVT != DstVT)
47981 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
47982
47983 return CMov;
47984}
47985
47986static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
47987 const X86Subtarget &Subtarget) {
47988 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47989
47990 if (SDValue V = combineSextInRegCmov(N, DAG))
47991 return V;
47992
47993 EVT VT = N->getValueType(0);
47994 SDValue N0 = N->getOperand(0);
47995 SDValue N1 = N->getOperand(1);
47996 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47997 SDLoc dl(N);
47998
47999 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
48000 // both SSE and AVX2 since there is no sign-extended shift right
48001 // operation on a vector with 64-bit elements.
48002 // (sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
48003 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
48004 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
48005 N0.getOpcode() == ISD::SIGN_EXTEND)) {
48006 SDValue N00 = N0.getOperand(0);
48007
48008 // EXTLOAD has a better solution on AVX2,
48009 // it may be replaced with X86ISD::VSEXT node.
48010 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
48011 if (!ISD::isNormalLoad(N00.getNode()))
48012 return SDValue();
48013
48014 // Attempt to promote any comparison mask ops before moving the
48015 // SIGN_EXTEND_INREG in the way.
48016 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
48017 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
48018
48019 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
48020 SDValue Tmp =
48021 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
48022 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
48023 }
48024 }
48025 return SDValue();
48026}
48027
48028/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
48029/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
48030/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
48031/// opportunities to combine math ops, use an LEA, or use a complex addressing
48032/// mode. This can eliminate extend, add, and shift instructions.
48033static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
48034 const X86Subtarget &Subtarget) {
48035 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
48036 Ext->getOpcode() != ISD::ZERO_EXTEND)
48037 return SDValue();
48038
48039 // TODO: This should be valid for other integer types.
48040 EVT VT = Ext->getValueType(0);
48041 if (VT != MVT::i64)
48042 return SDValue();
48043
48044 SDValue Add = Ext->getOperand(0);
48045 if (Add.getOpcode() != ISD::ADD)
48046 return SDValue();
48047
48048 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
48049 bool NSW = Add->getFlags().hasNoSignedWrap();
48050 bool NUW = Add->getFlags().hasNoUnsignedWrap();
48051
48052 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
48053 // into the 'zext'
48054 if ((Sext && !NSW) || (!Sext && !NUW))
48055 return SDValue();
48056
48057 // Having a constant operand to the 'add' ensures that we are not increasing
48058 // the instruction count because the constant is extended for free below.
48059 // A constant operand can also become the displacement field of an LEA.
48060 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
48061 if (!AddOp1)
48062 return SDValue();
48063
48064 // Don't make the 'add' bigger if there's no hope of combining it with some
48065 // other 'add' or 'shl' instruction.
48066 // TODO: It may be profitable to generate simpler LEA instructions in place
48067 // of single 'add' instructions, but the cost model for selecting an LEA
48068 // currently has a high threshold.
48069 bool HasLEAPotential = false;
48070 for (auto *User : Ext->uses()) {
48071 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
48072 HasLEAPotential = true;
48073 break;
48074 }
48075 }
48076 if (!HasLEAPotential)
48077 return SDValue();
48078
48079 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
48080 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
48081 SDValue AddOp0 = Add.getOperand(0);
48082 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
48083 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
48084
48085 // The wider add is guaranteed to not wrap because both operands are
48086 // sign-extended.
48087 SDNodeFlags Flags;
48088 Flags.setNoSignedWrap(NSW);
48089 Flags.setNoUnsignedWrap(NUW);
48090 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
48091}
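// A standalone sketch (plain C++, not DAG code) of the identity used above:
// when the narrow add cannot wrap, extending the sum equals adding the
// extended operands, so the {s|z}ext can be hoisted above the add.
#include <cstdint>

constexpr int64_t extAfterAdd(int32_t x, int32_t c) {
  return int64_t(x + c);                     // sext(add_nsw(x, C))
}
constexpr int64_t extBeforeAdd(int32_t x, int32_t c) {
  return int64_t(x) + int64_t(c);            // add(sext(x), C_sext)
}

static_assert(extAfterAdd(1000, 5) == extBeforeAdd(1000, 5),
              "holds whenever the 32-bit add does not overflow");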
48092
48093// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
48094// operands and the result of CMOV is not used anywhere else - promote CMOV
48095// itself instead of promoting its result. This could be beneficial, because:
48096// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
48097// (or more) pseudo-CMOVs only when they go one-after-another and
48098// getting rid of result extension code after CMOV will help that.
48099// 2) Promotion of constant CMOV arguments is free, hence the
48100// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
48101 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
48102 // promotion is also good in terms of code-size.
48103 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
48104// promotion).
48105static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
48106 SDValue CMovN = Extend->getOperand(0);
48107 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
48108 return SDValue();
48109
48110 EVT TargetVT = Extend->getValueType(0);
48111 unsigned ExtendOpcode = Extend->getOpcode();
48112 SDLoc DL(Extend);
48113
48114 EVT VT = CMovN.getValueType();
48115 SDValue CMovOp0 = CMovN.getOperand(0);
48116 SDValue CMovOp1 = CMovN.getOperand(1);
48117
48118 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
48119 !isa<ConstantSDNode>(CMovOp1.getNode()))
48120 return SDValue();
48121
48122 // Only extend to i32 or i64.
48123 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
48124 return SDValue();
48125
48126 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
48127 // are free.
48128 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
48129 return SDValue();
48130
48131 // If this is a zero extend to i64, we should only extend to i32 and use a free
48132 // zero extend to finish.
48133 EVT ExtendVT = TargetVT;
48134 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
48135 ExtendVT = MVT::i32;
48136
48137 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
48138 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
48139
48140 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
48141 CMovN.getOperand(2), CMovN.getOperand(3));
48142
48143 // Finish extending if needed.
48144 if (ExtendVT != TargetVT)
48145 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
48146
48147 return Res;
48148}
48149
48150// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
48151// This is more or less the reverse of combineBitcastvxi1.
48152static SDValue
48153combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
48154 TargetLowering::DAGCombinerInfo &DCI,
48155 const X86Subtarget &Subtarget) {
48156 unsigned Opcode = N->getOpcode();
48157 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
48158 Opcode != ISD::ANY_EXTEND)
48159 return SDValue();
48160 if (!DCI.isBeforeLegalizeOps())
48161 return SDValue();
48162 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
48163 return SDValue();
48164
48165 SDValue N0 = N->getOperand(0);
48166 EVT VT = N->getValueType(0);
48167 EVT SVT = VT.getScalarType();
48168 EVT InSVT = N0.getValueType().getScalarType();
48169 unsigned EltSizeInBits = SVT.getSizeInBits();
48170
48171 // Input type must be extending a bool vector (bit-casted from a scalar
48172 // integer) to legal integer types.
48173 if (!VT.isVector())
48174 return SDValue();
48175 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
48176 return SDValue();
48177 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
48178 return SDValue();
48179
48180 SDValue N00 = N0.getOperand(0);
48181 EVT SclVT = N0.getOperand(0).getValueType();
48182 if (!SclVT.isScalarInteger())
48183 return SDValue();
48184
48185 SDLoc DL(N);
48186 SDValue Vec;
48187 SmallVector<int, 32> ShuffleMask;
48188 unsigned NumElts = VT.getVectorNumElements();
48189 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
48190
48191 // Broadcast the scalar integer to the vector elements.
48192 if (NumElts > EltSizeInBits) {
48193 // If the scalar integer is greater than the vector element size, then we
48194 // must split it down into sub-sections for broadcasting. For example:
48195 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
48196 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
48197 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
48198 unsigned Scale = NumElts / EltSizeInBits;
48199 EVT BroadcastVT =
48200 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
48201 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48202 Vec = DAG.getBitcast(VT, Vec);
48203
48204 for (unsigned i = 0; i != Scale; ++i)
48205 ShuffleMask.append(EltSizeInBits, i);
48206 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48207 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
48208 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
48209 // If we have register broadcast instructions, use the scalar size as the
48210 // element type for the shuffle. Then cast to the wider element type. The
48211 // widened bits won't be used, and this might allow the use of a broadcast
48212 // load.
48213 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale")((void)0);
48214 unsigned Scale = EltSizeInBits / NumElts;
48215 EVT BroadcastVT =
48216 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
48217 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48218 ShuffleMask.append(NumElts * Scale, 0);
48219 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
48220 Vec = DAG.getBitcast(VT, Vec);
48221 } else {
48222 // For a smaller scalar integer, we can simply any-extend it to the vector
48223 // element size (we don't care about the upper bits) and broadcast it to all
48224 // elements.
48225 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
48226 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
48227 ShuffleMask.append(NumElts, 0);
48228 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48229 }
48230
48231 // Now, mask the relevant bit in each element.
48232 SmallVector<SDValue, 32> Bits;
48233 for (unsigned i = 0; i != NumElts; ++i) {
48234 int BitIdx = (i % EltSizeInBits);
48235 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
48236 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
48237 }
48238 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
48239 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
48240
48241 // Compare against the bitmask and extend the result.
48242 EVT CCVT = VT.changeVectorElementType(MVT::i1);
48243 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
48244 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
48245
48246 // For SEXT, this is now done; otherwise shift the result down for
48247 // zero-extension.
48248 if (Opcode == ISD::SIGN_EXTEND)
48249 return Vec;
48250 return DAG.getNode(ISD::SRL, DL, VT, Vec,
48251 DAG.getConstant(EltSizeInBits - 1, DL, VT));
48252}
48253
48254// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
48255// result type.
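// For example, (v8i32 sext (v8i1 setcc (v8f32 a, b, cc))) can become a setcc
// that directly produces v8i32, assuming the legality checks below pass
// (illustrative example).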
48256static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
48257 const X86Subtarget &Subtarget) {
48258 SDValue N0 = N->getOperand(0);
48259 EVT VT = N->getValueType(0);
48260 SDLoc dl(N);
48261
48262 // Only do this combine with AVX512 for vector extends.
48263 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
48264 return SDValue();
48265
48266 // Only combine legal element types.
48267 EVT SVT = VT.getVectorElementType();
48268 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
48269 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
48270 return SDValue();
48271
48272 // We can only do this if the vector size is 256 bits or less.
48273 unsigned Size = VT.getSizeInBits();
48274 if (Size > 256 && Subtarget.useAVX512Regs())
48275 return SDValue();
48276
48277 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
48278 // those are the only integer compares we have.
48279 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48280 if (ISD::isUnsignedIntSetCC(CC))
48281 return SDValue();
48282
48283 // Only do this combine if the extension will be fully consumed by the setcc.
48284 EVT N00VT = N0.getOperand(0).getValueType();
48285 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
48286 if (Size != MatchingVecType.getSizeInBits())
48287 return SDValue();
48288
48289 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
48290
48291 if (N->getOpcode() == ISD::ZERO_EXTEND)
48292 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
48293
48294 return Res;
48295}
48296
48297static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
48298 TargetLowering::DAGCombinerInfo &DCI,
48299 const X86Subtarget &Subtarget) {
48300 SDValue N0 = N->getOperand(0);
48301 EVT VT = N->getValueType(0);
48302 SDLoc DL(N);
48303
48304 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48305 if (!DCI.isBeforeLegalizeOps() &&
48306 N0.getOpcode() == X86ISD::SETCC_CARRY) {
48307 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
48308 N0->getOperand(1));
48309 bool ReplaceOtherUses = !N0.hasOneUse();
48310 DCI.CombineTo(N, Setcc);
48311 // Replace other uses with a truncate of the widened setcc_carry.
48312 if (ReplaceOtherUses) {
48313 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48314 N0.getValueType(), Setcc);
48315 DCI.CombineTo(N0.getNode(), Trunc);
48316 }
48317
48318 return SDValue(N, 0);
48319 }
48320
48321 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48322 return NewCMov;
48323
48324 if (!DCI.isBeforeLegalizeOps())
48325 return SDValue();
48326
48327 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48328 return V;
48329
48330 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48331 return V;
48332
48333 if (VT.isVector()) {
48334 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48335 return R;
48336
48337 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
48338 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
48339 }
48340
48341 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48342 return NewAdd;
48343
48344 return SDValue();
48345}
48346
48347static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
48348 TargetLowering::DAGCombinerInfo &DCI,
48349 const X86Subtarget &Subtarget) {
48350 SDLoc dl(N);
48351 EVT VT = N->getValueType(0);
48352 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
48353
48354 // Let legalize expand this if it isn't a legal type yet.
48355 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48356 if (!TLI.isTypeLegal(VT))
48357 return SDValue();
48358
48359 SDValue A = N->getOperand(IsStrict ? 1 : 0);
48360 SDValue B = N->getOperand(IsStrict ? 2 : 1);
48361 SDValue C = N->getOperand(IsStrict ? 3 : 2);
48362
48363 // If the operation allows fast-math and the target does not support FMA,
48364 // split this into mul+add to avoid libcall(s).
48365 SDNodeFlags Flags = N->getFlags();
48366 if (!IsStrict && Flags.hasAllowReassociation() &&
48367 TLI.isOperationExpand(ISD::FMA, VT)) {
48368 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
48369 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
48370 }
48371
48372 EVT ScalarVT = VT.getScalarType();
48373 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
48374 return SDValue();
48375
48376 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
48377 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48378 bool LegalOperations = !DCI.isBeforeLegalizeOps();
48379 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
48380 CodeSize)) {
48381 V = NegV;
48382 return true;
48383 }
48384 // Look through extract_vector_elts. If it comes from an FNEG, create a
48385 // new extract from the FNEG input.
48386 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
48387 isNullConstant(V.getOperand(1))) {
48388 SDValue Vec = V.getOperand(0);
48389 if (SDValue NegV = TLI.getCheaperNegatedExpression(
48390 Vec, DAG, LegalOperations, CodeSize)) {
48391 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
48392 NegV, V.getOperand(1));
48393 return true;
48394 }
48395 }
48396
48397 return false;
48398 };
48399
48400 // Do not convert the passthru input of scalar intrinsics.
48401 // FIXME: We could allow negations of the lower element only.
48402 bool NegA = invertIfNegative(A);
48403 bool NegB = invertIfNegative(B);
48404 bool NegC = invertIfNegative(C);
48405
48406 if (!NegA && !NegB && !NegC)
48407 return SDValue();
48408
48409 unsigned NewOpcode =
48410 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
48411
48412 // Propagate fast-math-flags to new FMA node.
48413 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
48414 if (IsStrict) {
48415 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4")((void)0);
48416 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
48417 {N->getOperand(0), A, B, C});
48418 } else {
48419 if (N->getNumOperands() == 4)
48420 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
48421 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
48422 }
48423}
48424
48425// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
48426// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
48427static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
48428 TargetLowering::DAGCombinerInfo &DCI) {
48429 SDLoc dl(N);
48430 EVT VT = N->getValueType(0);
48431 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48432 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48433 bool LegalOperations = !DCI.isBeforeLegalizeOps();
48434
48435 SDValue N2 = N->getOperand(2);
48436
48437 SDValue NegN2 =
48438 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
48439 if (!NegN2)
48440 return SDValue();
48441 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
48442
48443 if (N->getNumOperands() == 4)
48444 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48445 NegN2, N->getOperand(3));
48446 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48447 NegN2);
48448}
48449
48450static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
48451 TargetLowering::DAGCombinerInfo &DCI,
48452 const X86Subtarget &Subtarget) {
48453 SDLoc dl(N);
48454 SDValue N0 = N->getOperand(0);
48455 EVT VT = N->getValueType(0);
48456
48457 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48458 // FIXME: Is this needed? We don't seem to have any tests for it.
48459 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
48460 N0.getOpcode() == X86ISD::SETCC_CARRY) {
48461 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
48462 N0->getOperand(1));
48463 bool ReplaceOtherUses = !N0.hasOneUse();
48464 DCI.CombineTo(N, Setcc);
48465 // Replace other uses with a truncate of the widened setcc_carry.
48466 if (ReplaceOtherUses) {
48467 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48468 N0.getValueType(), Setcc);
48469 DCI.CombineTo(N0.getNode(), Trunc);
48470 }
48471
48472 return SDValue(N, 0);
48473 }
48474
48475 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48476 return NewCMov;
48477
48478 if (DCI.isBeforeLegalizeOps())
48479 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48480 return V;
48481
48482 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48483 return V;
48484
48485 if (VT.isVector())
48486 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48487 return R;
48488
48489 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48490 return NewAdd;
48491
48492 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
48493 return R;
48494
48495 // TODO: Combine with any target/faux shuffle.
48496 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
48497 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
48498 SDValue N00 = N0.getOperand(0);
48499 SDValue N01 = N0.getOperand(1);
48500 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
48501 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
48502 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
48503 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
48504 return concatSubVectors(N00, N01, DAG, dl);
48505 }
48506 }
48507
48508 return SDValue();
48509}
48510
48511/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
48512/// recognizable memcmp expansion.
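/// A typical accepted shape is (or (xor A, B), (xor C, D)), as produced by the
/// memcmp expansion of a two-chunk oversized integer compare against zero;
/// deeper or-trees of xors are accepted recursively as well.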
48513static bool isOrXorXorTree(SDValue X, bool Root = true) {
48514 if (X.getOpcode() == ISD::OR)
48515 return isOrXorXorTree(X.getOperand(0), false) &&
48516 isOrXorXorTree(X.getOperand(1), false);
48517 if (Root)
48518 return false;
48519 return X.getOpcode() == ISD::XOR;
48520}
48521
48522/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
48523/// expansion.
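/// Each XOR leaf becomes either a vector SETNE (mask-register case), a plain
/// vector XOR (when PTEST is available), or a vector SETEQ; OR nodes then
/// combine the per-pair results (with AND in the SETEQ case). SToV maps the
/// scalar operands into vector registers.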
48524template<typename F>
48525static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
48526 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
48527 SDValue Op0 = X.getOperand(0);
48528 SDValue Op1 = X.getOperand(1);
48529 if (X.getOpcode() == ISD::OR) {
48530 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48531 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48532 if (VecVT != CmpVT)
48533 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
48534 if (HasPT)
48535 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
48536 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
48537 } else if (X.getOpcode() == ISD::XOR) {
48538 SDValue A = SToV(Op0);
48539 SDValue B = SToV(Op1);
48540 if (VecVT != CmpVT)
48541 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
48542 if (HasPT)
48543 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
48544 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
48545 }
48546 llvm_unreachable("Impossible")__builtin_unreachable();
48547}
48548
48549/// Try to map a 128-bit or larger integer comparison to vector instructions
48550/// before type legalization splits it up into chunks.
48551static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
48552 const X86Subtarget &Subtarget) {
48553 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
48554 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate")((void)0);
48555
48556 // We're looking for an oversized integer equality comparison.
48557 SDValue X = SetCC->getOperand(0);
48558 SDValue Y = SetCC->getOperand(1);
48559 EVT OpVT = X.getValueType();
48560 unsigned OpSize = OpVT.getSizeInBits();
48561 if (!OpVT.isScalarInteger() || OpSize < 128)
48562 return SDValue();
48563
48564 // Ignore a comparison with zero because that gets special treatment in
48565 // EmitTest(). But make an exception for the special case of a pair of
48566 // logically-combined vector-sized operands compared to zero. This pattern may
48567 // be generated by the memcmp expansion pass with oversized integer compares
48568 // (see PR33325).
48569 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
48570 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
48571 return SDValue();
48572
48573 // Don't perform this combine if constructing the vector will be expensive.
48574 auto IsVectorBitCastCheap = [](SDValue X) {
48575 X = peekThroughBitcasts(X);
48576 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
48577 X.getOpcode() == ISD::LOAD;
48578 };
48579 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
48580 !IsOrXorXorTreeCCZero)
48581 return SDValue();
48582
48583 EVT VT = SetCC->getValueType(0);
48584 SDLoc DL(SetCC);
48585
48586 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
48587 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
48588 // Otherwise use PCMPEQ (plus AND) and mask testing.
48589 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
48590 (OpSize == 256 && Subtarget.hasAVX()) ||
48591 (OpSize == 512 && Subtarget.useAVX512Regs())) {
48592 bool HasPT = Subtarget.hasSSE41();
48593
48594 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
48595 // vector registers are essentially free. (Technically, widening registers
48596 // prevents load folding, but the tradeoff is worth it.)
48597 bool PreferKOT = Subtarget.preferMaskRegisters();
48598 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
48599
48600 EVT VecVT = MVT::v16i8;
48601 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
48602 if (OpSize == 256) {
48603 VecVT = MVT::v32i8;
48604 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
48605 }
48606 EVT CastVT = VecVT;
48607 bool NeedsAVX512FCast = false;
48608 if (OpSize == 512 || NeedZExt) {
48609 if (Subtarget.hasBWI()) {
48610 VecVT = MVT::v64i8;
48611 CmpVT = MVT::v64i1;
48612 if (OpSize == 512)
48613 CastVT = VecVT;
48614 } else {
48615 VecVT = MVT::v16i32;
48616 CmpVT = MVT::v16i1;
48617 CastVT = OpSize == 512 ? VecVT :
48618 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
48619 NeedsAVX512FCast = true;
48620 }
48621 }
48622
48623 auto ScalarToVector = [&](SDValue X) -> SDValue {
48624 bool TmpZext = false;
48625 EVT TmpCastVT = CastVT;
48626 if (X.getOpcode() == ISD::ZERO_EXTEND) {
48627 SDValue OrigX = X.getOperand(0);
48628 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
48629 if (OrigSize < OpSize) {
48630 if (OrigSize == 128) {
48631 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
48632 X = OrigX;
48633 TmpZext = true;
48634 } else if (OrigSize == 256) {
48635 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
48636 X = OrigX;
48637 TmpZext = true;
48638 }
48639 }
48640 }
48641 X = DAG.getBitcast(TmpCastVT, X);
48642 if (!NeedZExt && !TmpZext)
48643 return X;
48644 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
48645 DAG.getConstant(0, DL, VecVT), X,
48646 DAG.getVectorIdxConstant(0, DL));
48647 };
48648
48649 SDValue Cmp;
48650 if (IsOrXorXorTreeCCZero) {
48651 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
48652 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
48653 // Use 2 vector equality compares and 'and' the results before doing a
48654 // MOVMSK.
48655 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
48656 } else {
48657 SDValue VecX = ScalarToVector(X);
48658 SDValue VecY = ScalarToVector(Y);
48659 if (VecVT != CmpVT) {
48660 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
48661 } else if (HasPT) {
48662 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
48663 } else {
48664 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
48665 }
48666 }
48667 // AVX512 should emit a setcc that will lower to kortest.
48668 if (VecVT != CmpVT) {
48669 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
48670 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
48671 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
48672 DAG.getConstant(0, DL, KRegVT), CC);
48673 }
48674 if (HasPT) {
48675 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
48676 Cmp);
48677 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
48678 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
48679 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
48680 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
48681 }
48682 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
48683 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
48684 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
48685 assert(Cmp.getValueType() == MVT::v16i8 &&
48686 "Non 128-bit vector on pre-SSE41 target")((void)0);
48687 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
48688 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
48689 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
48690 }
48691
48692 return SDValue();
48693}
48694
48695static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
48696 TargetLowering::DAGCombinerInfo &DCI,
48697 const X86Subtarget &Subtarget) {
48698 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
48699 const SDValue LHS = N->getOperand(0);
48700 const SDValue RHS = N->getOperand(1);
48701 EVT VT = N->getValueType(0);
48702 EVT OpVT = LHS.getValueType();
48703 SDLoc DL(N);
48704
48705 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
48706 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
48707 return V;
48708
48709 if (VT == MVT::i1 && isNullConstant(RHS)) {
48710 SDValue X86CC;
48711 if (SDValue V =
48712 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
48713 return DAG.getNode(ISD::TRUNCATE, DL, VT,
48714 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
48715 }
48716
48717 if (OpVT.isScalarInteger()) {
48718 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
48719 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
48720 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
48721 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
48722 if (N0.getOperand(0) == N1)
48723 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48724 N0.getOperand(1));
48725 if (N0.getOperand(1) == N1)
48726 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48727 N0.getOperand(0));
48728 }
48729 return SDValue();
48730 };
48731 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
48732 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48733 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
48734 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48735
48736 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
48737 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
48738 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
48739 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
48740 if (N0.getOperand(0) == N1)
48741 return DAG.getNode(ISD::AND, DL, OpVT, N1,
48742 DAG.getNOT(DL, N0.getOperand(1), OpVT));
48743 if (N0.getOperand(1) == N1)
48744 return DAG.getNode(ISD::AND, DL, OpVT, N1,
48745 DAG.getNOT(DL, N0.getOperand(0), OpVT));
48746 }
48747 return SDValue();
48748 };
48749 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
48750 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48751 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
48752 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48753
48754 // cmpeq(trunc(x),0) --> cmpeq(x,0)
48755 // cmpne(trunc(x),0) --> cmpne(x,0)
48756 // iff x upper bits are zero.
48757 // TODO: Add support for RHS to be truncate as well?
48758 if (LHS.getOpcode() == ISD::TRUNCATE &&
48759 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
48760 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
48761 EVT SrcVT = LHS.getOperand(0).getValueType();
48762 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
48763 OpVT.getScalarSizeInBits());
48764 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48765 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
48766 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
48767 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
48768 DAG.getConstant(0, DL, SrcVT), CC);
48769 }
48770 }
48771 }
48772
48773 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
48774 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
48775 // Using temporaries to avoid messing up operand ordering for later
48776 // transformations if this doesn't work.
48777 SDValue Op0 = LHS;
48778 SDValue Op1 = RHS;
48779 ISD::CondCode TmpCC = CC;
48780 // Put build_vector on the right.
48781 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
48782 std::swap(Op0, Op1);
48783 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
48784 }
48785
48786 bool IsSEXT0 =
48787 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
48788 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
48789 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
48790
48791 if (IsSEXT0 && IsVZero1) {
48792 assert(VT == Op0.getOperand(0).getValueType() &&
48793 "Unexpected operand type")((void)0);
48794 if (TmpCC == ISD::SETGT)
48795 return DAG.getConstant(0, DL, VT);
48796 if (TmpCC == ISD::SETLE)
48797 return DAG.getConstant(1, DL, VT);
48798 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
48799 return DAG.getNOT(DL, Op0.getOperand(0), VT);
48800
48801 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
48802 "Unexpected condition code!")((void)0);
48803 return Op0.getOperand(0);
48804 }
48805 }
48806
48807 // If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
48808 // pre-promote its result type since vXi1 vectors don't get promoted
48809 // during type legalization.
48810 // NOTE: The element count check is to ignore operand types that need to
48811 // go through type promotion to a 128-bit vector.
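  // For instance, (v16i1 setcc (v16i8 a, b, cc)) is emitted as
  // (trunc (v16i8 setcc (a, b, cc))) so the compare keeps its vXi8 type
  // through legalization (illustrative).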
48812 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
48813 VT.getVectorElementType() == MVT::i1 &&
48814 (OpVT.getVectorElementType() == MVT::i8 ||
48815 OpVT.getVectorElementType() == MVT::i16)) {
48816 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
48817 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
48818 }
48819
48820 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
48821 // to avoid scalarization via legalization because v4i32 is not a legal type.
48822 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
48823 LHS.getValueType() == MVT::v4f32)
48824 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
48825
48826 return SDValue();
48827}
48828
48829static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
48830 TargetLowering::DAGCombinerInfo &DCI,
48831 const X86Subtarget &Subtarget) {
48832 SDValue Src = N->getOperand(0);
48833 MVT SrcVT = Src.getSimpleValueType();
48834 MVT VT = N->getSimpleValueType(0);
48835 unsigned NumBits = VT.getScalarSizeInBits();
48836 unsigned NumElts = SrcVT.getVectorNumElements();
48837
48838 // Perform constant folding.
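  // Each constant source element contributes its sign bit to bit Idx of the
  // scalar result, mirroring what MOVMSK computes at run time.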
48839 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
48840 assert(VT == MVT::i32 && "Unexpected result type")((void)0);
48841 APInt Imm(32, 0);
48842 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
48843 if (!Src.getOperand(Idx).isUndef() &&
48844 Src.getConstantOperandAPInt(Idx).isNegative())
48845 Imm.setBit(Idx);
48846 }
48847 return DAG.getConstant(Imm, SDLoc(N), VT);
48848 }
48849
48850 // Look through int->fp bitcasts that don't change the element width.
48851 unsigned EltWidth = SrcVT.getScalarSizeInBits();
48852 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
48853 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
48854 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
48855
48856 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
48857 // with scalar comparisons.
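  // Concretely, not(movmsk(x)) is realized as movmsk(x) XORed with a mask of
  // the low NumElts bits, since only those bits of the result are defined.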
48858 if (SDValue NotSrc = IsNOT(Src, DAG)) {
48859 SDLoc DL(N);
48860 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48861 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
48862 return DAG.getNode(ISD::XOR, DL, VT,
48863 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
48864 DAG.getConstant(NotMask, DL, VT));
48865 }
48866
48867 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
48868 // results with scalar comparisons.
48869 if (Src.getOpcode() == X86ISD::PCMPGT &&
48870 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
48871 SDLoc DL(N);
48872 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48873 return DAG.getNode(ISD::XOR, DL, VT,
48874 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
48875 DAG.getConstant(NotMask, DL, VT));
48876 }
48877
48878 // Simplify the inputs.
48879 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48880 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
48881 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
48882 return SDValue(N, 0);
48883
48884 return SDValue();
48885}
48886
48887static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
48888 TargetLowering::DAGCombinerInfo &DCI) {
48889 // With vector masks we only demand the upper bit of the mask.
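  // Only the sign bit of each mask element is consumed, so the remaining bits
  // may be simplified to whatever is cheapest (hence getSignMask below).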
48890 SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
48891 if (Mask.getScalarValueSizeInBits() != 1) {
48892 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48893 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48894 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48895 if (N->getOpcode() != ISD::DELETED_NODE)
48896 DCI.AddToWorklist(N);
48897 return SDValue(N, 0);
48898 }
48899 }
48900
48901 return SDValue();
48902}
48903
48904static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
48905 SDValue Index, SDValue Base, SDValue Scale,
48906 SelectionDAG &DAG) {
48907 SDLoc DL(GorS);
48908
48909 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
48910 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
48911 Gather->getMask(), Base, Index, Scale } ;
48912 return DAG.getMaskedGather(Gather->getVTList(),
48913 Gather->getMemoryVT(), DL, Ops,
48914 Gather->getMemOperand(),
48915 Gather->getIndexType(),
48916 Gather->getExtensionType());
48917 }
48918 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
48919 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
48920 Scatter->getMask(), Base, Index, Scale };
48921 return DAG.getMaskedScatter(Scatter->getVTList(),
48922 Scatter->getMemoryVT(), DL,
48923 Ops, Scatter->getMemOperand(),
48924 Scatter->getIndexType(),
48925 Scatter->isTruncatingStore());
48926}
48927
48928static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
48929 TargetLowering::DAGCombinerInfo &DCI) {
48930 SDLoc DL(N);
48931 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
48932 SDValue Index = GorS->getIndex();
48933 SDValue Base = GorS->getBasePtr();
48934 SDValue Scale = GorS->getScale();
48935
48936 if (DCI.isBeforeLegalize()) {
48937 unsigned IndexWidth = Index.getScalarValueSizeInBits();
48938
48939 // Shrink constant indices if they are larger than 32-bits.
48940 // Only do this before legalize types since v2i64 could become v2i32.
48941 // FIXME: We could check that the type is legal if we're after legalize
48942 // types, but then we would need to construct test cases where that happens.
48943 // FIXME: We could support more than just constant vectors, but we need to
48944 // be careful with costing. A truncate that can be optimized out would be fine.
48945 // Otherwise we might only want to create a truncate if it avoids a split.
48946 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
48947 if (BV->isConstant() && IndexWidth > 32 &&
48948 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48949 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48950 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48951 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48952 }
48953 }
48954
48955 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
48956 // there are sufficient sign bits. Only do this before legalize types to
48957 // avoid creating illegal types in truncate.
48958 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
48959 Index.getOpcode() == ISD::ZERO_EXTEND) &&
48960 IndexWidth > 32 &&
48961 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
48962 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48963 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48964 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48965 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48966 }
48967 }
48968
48969 if (DCI.isBeforeLegalizeOps()) {
48970 unsigned IndexWidth = Index.getScalarValueSizeInBits();
48971
48972 // Make sure the index is either i32 or i64
48973 if (IndexWidth != 32 && IndexWidth != 64) {
48974 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
48975 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
48976 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
48977 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48978 }
48979 }
48980
48981 // With vector masks we only demand the upper bit of the mask.
48982 SDValue Mask = GorS->getMask();
48983 if (Mask.getScalarValueSizeInBits() != 1) {
48984 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48985 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48986 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48987 if (N->getOpcode() != ISD::DELETED_NODE)
48988 DCI.AddToWorklist(N);
48989 return SDValue(N, 0);
48990 }
48991 }
48992
48993 return SDValue();
48994}
48995
48996// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
48997static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
48998 const X86Subtarget &Subtarget) {
48999 SDLoc DL(N);
49000 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
49001 SDValue EFLAGS = N->getOperand(1);
49002
49003 // Try to simplify the EFLAGS and condition code operands.
49004 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
49005 return getSETCC(CC, Flags, DL, DAG);
49006
49007 return SDValue();
49008}
49009
49010/// Optimize branch condition evaluation.
49011static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
49012 const X86Subtarget &Subtarget) {
49013 SDLoc DL(N);
49014 SDValue EFLAGS = N->getOperand(3);
49015 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
49016
49017 // Try to simplify the EFLAGS and condition code operands.
49018 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
49019 // RAUW them under us.
49020 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
49021 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
49022 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
49023 N->getOperand(1), Cond, Flags);
49024 }
49025
49026 return SDValue();
49027}
49028
49029// TODO: Could we move this to DAGCombine?
49030static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
49031 SelectionDAG &DAG) {
49032 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
49033 // to optimize away the operation when it's fed by a constant.
49034 //
49035 // The general transformation is:
49036 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
49037 // AND(VECTOR_CMP(x,y), constant2)
49038 // constant2 = UNARYOP(constant)
49039
49040 // Early exit if this isn't a vector operation, the operand of the
49041 // unary operation isn't a bitwise AND, or if the sizes of the operations
49042 // aren't the same.
49043 EVT VT = N->getValueType(0);
49044 bool IsStrict = N->isStrictFPOpcode();
49045 unsigned NumEltBits = VT.getScalarSizeInBits();
49046 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49047 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
49048 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
49049 VT.getSizeInBits() != Op0.getValueSizeInBits())
49050 return SDValue();
49051
49052 // Now check that the other operand of the AND is a constant. We could
49053 // make the transformation for non-constant splats as well, but it's unclear
49054 // that would be a benefit as it would not eliminate any operations, just
49055 // perform one more step in scalar code before moving to the vector unit.
49056 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
49057 // Bail out if the vector isn't a constant.
49058 if (!BV->isConstant())
49059 return SDValue();
49060
49061 // Everything checks out. Build up the new and improved node.
49062 SDLoc DL(N);
49063 EVT IntVT = BV->getValueType(0);
49064 // Create a new constant of the appropriate type for the transformed
49065 // DAG.
49066 SDValue SourceConst;
49067 if (IsStrict)
49068 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
49069 {N->getOperand(0), SDValue(BV, 0)});
49070 else
49071 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
49072 // The AND node needs bitcasts to/from an integer vector type around it.
49073 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
49074 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
49075 MaskConst);
49076 SDValue Res = DAG.getBitcast(VT, NewAnd);
49077 if (IsStrict)
49078 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
49079 return Res;
49080 }
49081
49082 return SDValue();
49083}
49084
49085/// If we are converting a value to floating-point, try to replace scalar
49086/// truncate of an extracted vector element with a bitcast. This tries to keep
49087/// the sequence on XMM registers rather than moving between vector and GPRs.
49088static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
49089 // TODO: This is currently only used by combineSIntToFP, but it is generalized
49090 // to allow being called by any similar cast opcode.
49091 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
49092 SDValue Trunc = N->getOperand(0);
49093 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
49094 return SDValue();
49095
49096 SDValue ExtElt = Trunc.getOperand(0);
49097 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49098 !isNullConstant(ExtElt.getOperand(1)))
49099 return SDValue();
49100
49101 EVT TruncVT = Trunc.getValueType();
49102 EVT SrcVT = ExtElt.getValueType();
49103 unsigned DestWidth = TruncVT.getSizeInBits();
49104 unsigned SrcWidth = SrcVT.getSizeInBits();
49105 if (SrcWidth % DestWidth != 0)
49106 return SDValue();
49107
49108 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
49109 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
49110 unsigned VecWidth = SrcVecVT.getSizeInBits();
49111 unsigned NumElts = VecWidth / DestWidth;
49112 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
49113 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
49114 SDLoc DL(N);
49115 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
49116 BitcastVec, ExtElt.getOperand(1));
49117 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
49118}
49119
49120static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
49121 const X86Subtarget &Subtarget) {
49122 bool IsStrict = N->isStrictFPOpcode();
49123 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49124 EVT VT = N->getValueType(0);
49125 EVT InVT = Op0.getValueType();
49126
49127 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
49128 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
49129 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
49130 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
49131 SDLoc dl(N);
49132 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
49133 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
49134
49135 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
49136 if (IsStrict)
49137 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49138 {N->getOperand(0), P});
49139 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49140 }
49141
49142 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
49143 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
49144 // the optimization here.
49145 if (DAG.SignBitIsZero(Op0)) {
49146 if (IsStrict)
49147 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
49148 {N->getOperand(0), Op0});
49149 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
49150 }
49151
49152 return SDValue();
49153}
49154
49155static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
49156 TargetLowering::DAGCombinerInfo &DCI,
49157 const X86Subtarget &Subtarget) {
49158 // First try to optimize away the conversion entirely when it's
49159 // conditionally from a constant. Vectors only.
49160 bool IsStrict = N->isStrictFPOpcode();
49161 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
49162 return Res;
49163
49164 // Now move on to more general possibilities.
49165 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49166 EVT VT = N->getValueType(0);
49167 EVT InVT = Op0.getValueType();
49168
49169 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
49170 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
49171 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
49172 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
49173 SDLoc dl(N);
49174 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
49175 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
49176 if (IsStrict)
49177 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49178 {N->getOperand(0), P});
49179 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49180 }
49181
49182 // Without AVX512DQ we only support i64 to float scalar conversion. For both
49183 // vectors and scalars, see if we know that the upper bits are all the sign
49184 // bit, in which case we can truncate the input to i32 and convert from that.
49185 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
49186 unsigned BitWidth = InVT.getScalarSizeInBits();
49187 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
49188 if (NumSignBits >= (BitWidth - 31)) {
49189 EVT TruncVT = MVT::i32;
49190 if (InVT.isVector())
49191 TruncVT = InVT.changeVectorElementType(TruncVT);
49192 SDLoc dl(N);
49193 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
49194 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
49195 if (IsStrict)
49196 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49197 {N->getOperand(0), Trunc});
49198 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
49199 }
49200 // If we're after legalize and the type is v2i32 we need to shuffle and
49201 // use CVTSI2P.
49202 assert(InVT == MVT::v2i64 && "Unexpected VT!")((void)0);
49203 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
49204 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
49205 { 0, 2, -1, -1 });
49206 if (IsStrict)
49207 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
49208 {N->getOperand(0), Shuf});
49209 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
49210 }
49211 }
49212
49213 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
49214 // a 32-bit target where SSE doesn't support i64->FP operations.
49215 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
49216 Op0.getOpcode() == ISD::LOAD) {
49217 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
49218
49219 // This transformation is not supported if the result type is f16 or f128.
49220 if (VT == MVT::f16 || VT == MVT::f128)
49221 return SDValue();
49222
49223 // If we have AVX512DQ we can use packed conversion instructions unless
49224 // the VT is f80.
49225 if (Subtarget.hasDQI() && VT != MVT::f80)
49226 return SDValue();
49227
49228 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
49229 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
49230 std::pair<SDValue, SDValue> Tmp =
49231 Subtarget.getTargetLowering()->BuildFILD(
49232 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
49233 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
49234 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
49235 return Tmp.first;
49236 }
49237 }
49238
49239 if (IsStrict)
49240 return SDValue();
49241
49242 if (SDValue V = combineToFPTruncExtElt(N, DAG))
49243 return V;
49244
49245 return SDValue();
49246}
49247
49248static bool needCarryOrOverflowFlag(SDValue Flags) {
49249 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!")((void)0);
49250
49251 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
49252 UI != UE; ++UI) {
49253 SDNode *User = *UI;
49254
49255 X86::CondCode CC;
49256 switch (User->getOpcode()) {
49257 default:
49258 // Be conservative.
49259 return true;
49260 case X86ISD::SETCC:
49261 case X86ISD::SETCC_CARRY:
49262 CC = (X86::CondCode)User->getConstantOperandVal(0);
49263 break;
49264 case X86ISD::BRCOND:
49265 CC = (X86::CondCode)User->getConstantOperandVal(2);
49266 break;
49267 case X86ISD::CMOV:
49268 CC = (X86::CondCode)User->getConstantOperandVal(2);
49269 break;
49270 }
49271
49272 switch (CC) {
49273 default: break;
49274 case X86::COND_A: case X86::COND_AE:
49275 case X86::COND_B: case X86::COND_BE:
49276 case X86::COND_O: case X86::COND_NO:
49277 case X86::COND_G: case X86::COND_GE:
49278 case X86::COND_L: case X86::COND_LE:
49279 return true;
49280 }
49281 }
49282
49283 return false;
49284}
49285
49286static bool onlyZeroFlagUsed(SDValue Flags) {
49287 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!")((void)0);
49288
49289 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
49290 UI != UE; ++UI) {
49291 SDNode *User = *UI;
49292
49293 unsigned CCOpNo;
49294 switch (User->getOpcode()) {
49295 default:
49296 // Be conservative.
49297 return false;
49298 case X86ISD::SETCC: CCOpNo = 0; break;
49299 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
49300 case X86ISD::BRCOND: CCOpNo = 2; break;
49301 case X86ISD::CMOV: CCOpNo = 2; break;
49302 }
49303
49304 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
49305 if (CC != X86::COND_E && CC != X86::COND_NE)
49306 return false;
49307 }
49308
49309 return true;
49310}
49311
49312static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
49313 // Only handle test patterns.
49314 if (!isNullConstant(N->getOperand(1)))
49315 return SDValue();
49316
49317 // If we have a CMP of a truncated binop, see if we can make a smaller binop
49318 // and use its flags directly.
49319 // TODO: Maybe we should try promoting compares that only use the zero flag
49320 // first if we can prove the upper bits with computeKnownBits?
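  // For example, (cmp (trunc (i64 or A, B)), 0) can be narrowed to an i32
  // X86ISD::OR of the truncated operands, whose EFLAGS result is then used
  // directly (a sketch of the narrowing performed at the end of this function).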
49321 SDLoc dl(N);
49322 SDValue Op = N->getOperand(0);
49323 EVT VT = Op.getValueType();
49324
49325 // If we have a constant logical shift that's only used in a comparison
49326 // against zero turn it into an equivalent AND. This allows turning it into
49327 // a TEST instruction later.
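  // e.g. for i32, (cmp (srl X, 8), 0) becomes (cmp (and X, 0xFFFFFF00), 0),
  // which isel can then select as a TEST (illustrative).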
49328 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
49329 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
49330 onlyZeroFlagUsed(SDValue(N, 0))) {
49331 unsigned BitWidth = VT.getSizeInBits();
49332 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
49333 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
49334 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
49335 APInt Mask = Op.getOpcode() == ISD::SRL
49336 ? APInt::getHighBitsSet(BitWidth, MaskBits)
49337 : APInt::getLowBitsSet(BitWidth, MaskBits);
49338 if (Mask.isSignedIntN(32)) {
49339 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
49340 DAG.getConstant(Mask, dl, VT));
49341 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49342 DAG.getConstant(0, dl, VT));
49343 }
49344 }
49345 }
49346
49347 // Look for a truncate.
49348 if (Op.getOpcode() != ISD::TRUNCATE)
49349 return SDValue();
49350
49351 SDValue Trunc = Op;
49352 Op = Op.getOperand(0);
49353
49354 // See if we can compare with zero against the truncation source,
49355 // which should help using the Z flag from many ops. Only do this for
49356 // i32 truncated op to prevent partial-reg compares of promoted ops.
49357 EVT OpVT = Op.getValueType();
49358 APInt UpperBits =
49359 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
49360 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
49361 onlyZeroFlagUsed(SDValue(N, 0))) {
49362 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49363 DAG.getConstant(0, dl, OpVT));
49364 }
49365
49366 // After this the truncate and arithmetic op must have a single use.
49367 if (!Trunc.hasOneUse() || !Op.hasOneUse())
49368 return SDValue();
49369
49370 unsigned NewOpc;
49371 switch (Op.getOpcode()) {
49372 default: return SDValue();
49373 case ISD::AND:
49374 // Skip AND with a constant. We have special handling for AND with immediate
49375 // during isel to generate test instructions.
49376 if (isa<ConstantSDNode>(Op.getOperand(1)))
49377 return SDValue();
49378 NewOpc = X86ISD::AND;
49379 break;
49380 case ISD::OR: NewOpc = X86ISD::OR; break;
49381 case ISD::XOR: NewOpc = X86ISD::XOR; break;
49382 case ISD::ADD:
49383 // If the carry or overflow flag is used, we can't truncate.
49384 if (needCarryOrOverflowFlag(SDValue(N, 0)))
49385 return SDValue();
49386 NewOpc = X86ISD::ADD;
49387 break;
49388 case ISD::SUB:
49389 // If the carry or overflow flag is used, we can't truncate.
49390 if (needCarryOrOverflowFlag(SDValue(N, 0)))
49391 return SDValue();
49392 NewOpc = X86ISD::SUB;
49393 break;
49394 }
49395
49396 // We found an op we can narrow. Truncate its inputs.
49397 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
49398 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
49399
49400 // Use a X86 specific opcode to avoid DAG combine messing with it.
49401 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49402 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
49403
49404 // For AND, keep a CMP so that we can match the test pattern.
49405 if (NewOpc == X86ISD::AND)
49406 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49407 DAG.getConstant(0, dl, VT));
49408
49409 // Return the flags.
49410 return Op.getValue(1);
49411}
49412
49413static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
49414 TargetLowering::DAGCombinerInfo &DCI) {
49415 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
49416 "Expected X86ISD::ADD or X86ISD::SUB")((void)0);
49417
49418 SDLoc DL(N);
49419 SDValue LHS = N->getOperand(0);
49420 SDValue RHS = N->getOperand(1);
49421 MVT VT = LHS.getSimpleValueType();
49422 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
49423
49424 // If we don't use the flag result, simplify back to a generic ADD/SUB.
49425 if (!N->hasAnyUseOfValue(1)) {
49426 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
49427 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
49428 }
49429
49430 // Fold any similar generic ADD/SUB opcodes to reuse this node.
49431 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
49432 SDValue Ops[] = {N0, N1};
49433 SDVTList VTs = DAG.getVTList(N->getValueType(0));
49434 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
49435 SDValue Op(N, 0);
49436 if (Negate)
49437 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
49438 DCI.CombineTo(GenericAddSub, Op);
49439 }
49440 };
49441 MatchGeneric(LHS, RHS, false);
49442 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
49443
49444 return SDValue();
49445}
49446
49447static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
49448 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49449 MVT VT = N->getSimpleValueType(0);
49450 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49451 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
49452 N->getOperand(0), N->getOperand(1),
49453 Flags);
49454 }
49455
49456 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
49457 // iff the flag result is dead.
49458 SDValue Op0 = N->getOperand(0);
49459 SDValue Op1 = N->getOperand(1);
49460 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
49461 !N->hasAnyUseOfValue(1))
49462 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
49463 Op0.getOperand(1), N->getOperand(2));
49464
49465 return SDValue();
49466}
49467
49468// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
49469static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
49470 TargetLowering::DAGCombinerInfo &DCI) {
49471 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
49472 // the result is either zero or one (depending on the input carry bit).
49473 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
49474 if (X86::isZeroNode(N->getOperand(0)) &&
49475 X86::isZeroNode(N->getOperand(1)) &&
49476 // We don't have a good way to replace an EFLAGS use, so only do this when
49477 // dead right now.
49478 SDValue(N, 1).use_empty()) {
49479 SDLoc DL(N);
49480 EVT VT = N->getValueType(0);
49481 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
49482 SDValue Res1 =
49483 DAG.getNode(ISD::AND, DL, VT,
49484 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49485 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49486 N->getOperand(2)),
49487 DAG.getConstant(1, DL, VT));
49488 return DCI.CombineTo(N, Res1, CarryOut);
49489 }
49490
49491 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49492 MVT VT = N->getSimpleValueType(0);
49493 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49494 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
49495 N->getOperand(0), N->getOperand(1),
49496 Flags);
49497 }
49498
49499 return SDValue();
49500}
49501
49502/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49503/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49504/// with CMP+{ADC, SBB}.
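/// For example, X + (SETB Z) becomes (adc X, 0) and X - (SETB Z) becomes
/// (sbb X, 0), both reusing Z's carry flag; the cases below enumerate the
/// supported condition codes and the constant-X special cases.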
49505static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49506 bool IsSub = N->getOpcode() == ISD::SUB;
49507 SDValue X = N->getOperand(0);
49508 SDValue Y = N->getOperand(1);
49509
49510 // If this is an add, canonicalize a zext operand to the RHS.
49511 // TODO: Incomplete? What if both sides are zexts?
49512 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
49513 Y.getOpcode() != ISD::ZERO_EXTEND)
49514 std::swap(X, Y);
49515
49516 // Look through a one-use zext.
49517 bool PeekedThroughZext = false;
49518 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
49519 Y = Y.getOperand(0);
49520 PeekedThroughZext = true;
49521 }
49522
49523 // If this is an add, canonicalize a setcc operand to the RHS.
49524 // TODO: Incomplete? What if both sides are setcc?
49525 // TODO: Should we allow peeking through a zext of the other operand?
49526 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
49527 Y.getOpcode() != X86ISD::SETCC)
49528 std::swap(X, Y);
49529
49530 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
49531 return SDValue();
49532
49533 SDLoc DL(N);
49534 EVT VT = N->getValueType(0);
49535 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
49536
49537 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49538 // the general case below.
49539 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49540 if (ConstantX) {
49541 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
49542 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
49543 // This is a complicated way to get -1 or 0 from the carry flag:
49544 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49545 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49546 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49547 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49548 Y.getOperand(1));
49549 }
49550
49551 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
49552 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
49553 SDValue EFLAGS = Y->getOperand(1);
49554 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49555 EFLAGS.getValueType().isInteger() &&
49556 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49557 // Swap the operands of a SUB, and we have the same pattern as above.
49558 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49559 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
49560 SDValue NewSub = DAG.getNode(
49561 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49562 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49563 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49564 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49565 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49566 NewEFLAGS);
49567 }
49568 }
49569 }
49570
49571 if (CC == X86::COND_B) {
49572 // X + SETB Z --> adc X, 0
49573 // X - SETB Z --> sbb X, 0
49574 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49575 DAG.getVTList(VT, MVT::i32), X,
49576 DAG.getConstant(0, DL, VT), Y.getOperand(1));
49577 }
49578
49579 if (CC == X86::COND_A) {
49580 SDValue EFLAGS = Y.getOperand(1);
49581 // Try to convert COND_A into COND_B in an attempt to facilitate
49582 // materializing "setb reg".
49583 //
49584 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
49585 // cannot take an immediate as its first operand.
49586 //
49587 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49588 EFLAGS.getValueType().isInteger() &&
49589 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49590 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
49591 EFLAGS.getNode()->getVTList(),
49592 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49593 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49594 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49595 DAG.getVTList(VT, MVT::i32), X,
49596 DAG.getConstant(0, DL, VT), NewEFLAGS);
49597 }
49598 }
49599
49600 if (CC == X86::COND_AE) {
49601 // X + SETAE --> sbb X, -1
49602 // X - SETAE --> adc X, -1
49603 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49604 DAG.getVTList(VT, MVT::i32), X,
49605 DAG.getConstant(-1, DL, VT), Y.getOperand(1));
49606 }
49607
49608 if (CC == X86::COND_BE) {
49609 // X + SETBE --> sbb X, -1
49610 // X - SETBE --> adc X, -1
49611 SDValue EFLAGS = Y.getOperand(1);
49612 // Try to convert COND_BE into COND_AE in an attempt to facilitate
49613 // materializing "setae reg".
49614 //
49615 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
49616 // cannot take an immediate as its first operand.
49617 //
49618 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49619 EFLAGS.getValueType().isInteger() &&
49620 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49621 SDValue NewSub = DAG.getNode(
49622 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49623 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49624 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49625 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49626 DAG.getVTList(VT, MVT::i32), X,
49627 DAG.getConstant(-1, DL, VT), NewEFLAGS);
49628 }
49629 }
49630
49631 if (CC != X86::COND_E && CC != X86::COND_NE)
49632 return SDValue();
49633
49634 SDValue Cmp = Y.getOperand(1);
49635 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
49636 !X86::isZeroNode(Cmp.getOperand(1)) ||
49637 !Cmp.getOperand(0).getValueType().isInteger())
49638 return SDValue();
49639
49640 SDValue Z = Cmp.getOperand(0);
49641 EVT ZVT = Z.getValueType();
49642
49643 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49644 // the general case below.
49645 if (ConstantX) {
49646 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49647 // fake operands:
49648 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49649 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49650 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
49651 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
49652 SDValue Zero = DAG.getConstant(0, DL, ZVT);
49653 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49654 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49655 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49656 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49657 SDValue(Neg.getNode(), 1));
49658 }
49659
49660 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49661 // with fake operands:
49662 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49663 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49664 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
49665 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
49666 SDValue One = DAG.getConstant(1, DL, ZVT);
49667 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49668 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49669 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49670 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49671 Cmp1.getValue(1));
49672 }
49673 }
49674
49675 // (cmp Z, 1) sets the carry flag if Z is 0.
49676 SDValue One = DAG.getConstant(1, DL, ZVT);
49677 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49678 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49679
49680 // Add the flags type for ADC/SBB nodes.
49681 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49682
49683 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49684 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49685 if (CC == X86::COND_NE)
49686 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49687 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49688
49689 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
49690 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
49691 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49692 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49693}
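
The ADC/SBB folds above reduce to small carry-flag identities: with CF taken from (cmp Z, 1), which sets carry exactly when Z == 0, the adc/sbb forms reproduce X plus or minus the i1 compare result. A minimal standalone C++ sketch, separate from X86ISelLowering.cpp, that checks those identities on a few values (the adc/sbb helpers are illustrative models, not LLVM APIs):

// Standalone illustration: scalar model of the ADC/SBB folds above.
#include <cassert>
#include <cstdint>

// cmp Z, 1 sets CF iff Z < 1 (unsigned), i.e. iff Z == 0.
static bool carryFromCmpZ1(uint64_t Z) { return Z == 0; }
// adc dst, src: dst + src + CF ; sbb dst, src: dst - src - CF.
static uint64_t adc(uint64_t A, uint64_t B, bool CF) { return A + B + CF; }
static uint64_t sbb(uint64_t A, uint64_t B, bool CF) { return A - B - CF; }

int main() {
  const uint64_t Xs[] = {0, 1, 7, ~0ull};
  const uint64_t Zs[] = {0, 1, 42};
  for (uint64_t X : Xs)
    for (uint64_t Z : Zs) {
      bool CF = carryFromCmpZ1(Z);
      assert(X + (Z != 0) == sbb(X, ~0ull, CF)); // X + (Z != 0) --> sbb X, -1
      assert(X - (Z != 0) == adc(X, ~0ull, CF)); // X - (Z != 0) --> adc X, -1
      assert(X + (Z == 0) == adc(X, 0, CF));     // X + (Z == 0) --> adc X, 0
      assert(X - (Z == 0) == sbb(X, 0, CF));     // X - (Z == 0) --> sbb X, 0
    }
  return 0;
}
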
49694
49695static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
49696 const SDLoc &DL, EVT VT,
49697 const X86Subtarget &Subtarget) {
49698 // Example of pattern we try to detect:
49699 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
49700 // (add (build_vector (extract_elt t, 0),
49701 // (extract_elt t, 2),
49702 // (extract_elt t, 4),
49703 // (extract_elt t, 6)),
49704 // (build_vector (extract_elt t, 1),
49705 // (extract_elt t, 3),
49706 // (extract_elt t, 5),
49707 // (extract_elt t, 7)))
49708
49709 if (!Subtarget.hasSSE2())
49710 return SDValue();
49711
49712 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
49713 Op1.getOpcode() != ISD::BUILD_VECTOR)
49714 return SDValue();
49715
49716 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49717 VT.getVectorNumElements() < 4 ||
49718 !isPowerOf2_32(VT.getVectorNumElements()))
49719 return SDValue();
49720
49721 // Check if one of Op0,Op1 is of the form:
49722 // (build_vector (extract_elt Mul, 0),
49723 // (extract_elt Mul, 2),
49724 // (extract_elt Mul, 4),
49725 // ...
49726 // the other is of the form:
49727 // (build_vector (extract_elt Mul, 1),
49728 // (extract_elt Mul, 3),
49729 // (extract_elt Mul, 5),
49730 // ...
49731 // and identify Mul.
49732 SDValue Mul;
49733 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
49734 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
49735 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
49736 // TODO: Be more tolerant to undefs.
49737 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49738 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49739 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49740 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49741 return SDValue();
49742 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
49743 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
49744 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
49745 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
49746 if (!Const0L || !Const1L || !Const0H || !Const1H)
49747 return SDValue();
49748 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
49749 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
49750 // Commutativity of mul allows factors of a product to reorder.
49751 if (Idx0L > Idx1L)
49752 std::swap(Idx0L, Idx1L);
49753 if (Idx0H > Idx1H)
49754 std::swap(Idx0H, Idx1H);
49755 // Commutativity of add allows pairs of factors to reorder.
49756 if (Idx0L > Idx0H) {
49757 std::swap(Idx0L, Idx0H);
49758 std::swap(Idx1L, Idx1H);
49759 }
49760 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
49761 Idx1H != 2 * i + 3)
49762 return SDValue();
49763 if (!Mul) {
49764 // First time an extract_elt's source vector is visited. Must be a MUL
49765 // with twice as many vector elements as the BUILD_VECTOR.
49766 // Both extracts must be from the same MUL.
49767 Mul = Op0L->getOperand(0);
49768 if (Mul->getOpcode() != ISD::MUL ||
49769 Mul.getValueType().getVectorNumElements() != 2 * e)
49770 return SDValue();
49771 }
49772 // Check that the extract is from the same MUL previously seen.
49773 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
49774 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
49775 return SDValue();
49776 }
49777
49778 // Check if the Mul source can be safely shrunk.
49779 ShrinkMode Mode;
49780 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
49781 Mode == ShrinkMode::MULU16)
49782 return SDValue();
49783
49784 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49785 VT.getVectorNumElements() * 2);
49786 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
49787 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
49788
49789 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49790 ArrayRef<SDValue> Ops) {
49791 EVT InVT = Ops[0].getValueType();
49792 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
49793 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49794 InVT.getVectorNumElements() / 2);
49795 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49796 };
49797 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
49798}
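
For reference, the scalar semantics VPMADDWD provides is res[i] = sext(a[2i]) * sext(b[2i]) + sext(a[2i+1]) * sext(b[2i+1]), which is exactly the even/odd extract-and-add pattern matched above once the multiply operands are known to fit in 16 bits. A minimal standalone C++ sketch, separate from X86ISelLowering.cpp, comparing the two formulations (pmaddwd here is an illustrative scalar reference, not an intrinsic):

// Standalone illustration: scalar reference for the PMADDWD pattern.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// res[i] = sext(a[2i])*sext(b[2i]) + sext(a[2i+1])*sext(b[2i+1])
static std::vector<int32_t> pmaddwd(const std::vector<int16_t> &A,
                                    const std::vector<int16_t> &B) {
  std::vector<int32_t> R(A.size() / 2);
  for (size_t i = 0; i < R.size(); ++i)
    R[i] = int32_t(A[2 * i]) * B[2 * i] + int32_t(A[2 * i + 1]) * B[2 * i + 1];
  return R;
}

int main() {
  std::vector<int16_t> A = {1, -2, 3, 4, -5, 6, 7, 8};
  std::vector<int16_t> B = {9, 10, -11, 12, 13, 14, 15, -16};
  // The matched pattern: sign-extend, multiply, then add each even-indexed
  // product of the wide result to the following odd-indexed product.
  std::vector<int32_t> Wide(A.size()), Expected(A.size() / 2);
  for (size_t i = 0; i < A.size(); ++i)
    Wide[i] = int32_t(A[i]) * int32_t(B[i]);
  for (size_t i = 0; i < Expected.size(); ++i)
    Expected[i] = Wide[2 * i] + Wide[2 * i + 1];
  assert(pmaddwd(A, B) == Expected);
  return 0;
}
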
49799
49800// Attempt to turn this pattern into PMADDWD.
49801// (add (mul (sext (build_vector)), (sext (build_vector))),
49802// (mul (sext (build_vector)), (sext (build_vector)))
49803static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
49804 const SDLoc &DL, EVT VT,
49805 const X86Subtarget &Subtarget) {
49806 if (!Subtarget.hasSSE2())
49807 return SDValue();
49808
49809 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
49810 return SDValue();
49811
49812 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49813 VT.getVectorNumElements() < 4 ||
49814 !isPowerOf2_32(VT.getVectorNumElements()))
49815 return SDValue();
49816
49817 SDValue N00 = N0.getOperand(0);
49818 SDValue N01 = N0.getOperand(1);
49819 SDValue N10 = N1.getOperand(0);
49820 SDValue N11 = N1.getOperand(1);
49821
49822 // All inputs need to be sign extends.
49823 // TODO: Support ZERO_EXTEND from known positive?
49824 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
49825 N01.getOpcode() != ISD::SIGN_EXTEND ||
49826 N10.getOpcode() != ISD::SIGN_EXTEND ||
49827 N11.getOpcode() != ISD::SIGN_EXTEND)
49828 return SDValue();
49829
49830 // Peek through the extends.
49831 N00 = N00.getOperand(0);
49832 N01 = N01.getOperand(0);
49833 N10 = N10.getOperand(0);
49834 N11 = N11.getOperand(0);
49835
49836 // Must be extending from vXi16.
49837 EVT InVT = N00.getValueType();
49838 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
49839 N10.getValueType() != InVT || N11.getValueType() != InVT)
49840 return SDValue();
49841
49842 // All inputs should be build_vectors.
49843 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
49844 N01.getOpcode() != ISD::BUILD_VECTOR ||
49845 N10.getOpcode() != ISD::BUILD_VECTOR ||
49846 N11.getOpcode() != ISD::BUILD_VECTOR)
49847 return SDValue();
49848
49849 // For each result element, we need an odd element from one vector
49850 // multiplied by the odd element of another vector, and the even element
49851 // from one of the same vectors multiplied by the even element from the
49852 // other vector. So we need to make sure that for each element i, the
49853 // following is computed:
49854 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
49855 SDValue In0, In1;
49856 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
49857 SDValue N00Elt = N00.getOperand(i);
49858 SDValue N01Elt = N01.getOperand(i);
49859 SDValue N10Elt = N10.getOperand(i);
49860 SDValue N11Elt = N11.getOperand(i);
49861 // TODO: Be more tolerant to undefs.
49862 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49863 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49864 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49865 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49866 return SDValue();
49867 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
49868 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
49869 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
49870 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
49871 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
49872 return SDValue();
49873 unsigned IdxN00 = ConstN00Elt->getZExtValue();
49874 unsigned IdxN01 = ConstN01Elt->getZExtValue();
49875 unsigned IdxN10 = ConstN10Elt->getZExtValue();
49876 unsigned IdxN11 = ConstN11Elt->getZExtValue();
49877 // Add is commutative so indices can be reordered.
49878 if (IdxN00 > IdxN10) {
49879 std::swap(IdxN00, IdxN10);
49880 std::swap(IdxN01, IdxN11);
49881 }
49882 // N0 indices must be the even element. N1 indices must be the next odd element.
49883 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
49884 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
49885 return SDValue();
49886 SDValue N00In = N00Elt.getOperand(0);
49887 SDValue N01In = N01Elt.getOperand(0);
49888 SDValue N10In = N10Elt.getOperand(0);
49889 SDValue N11In = N11Elt.getOperand(0);
49890
49891 // First time we find an input capture it.
49892 if (!In0) {
49893 In0 = N00In;
49894 In1 = N01In;
49895
49896 // The input vectors must be at least as wide as the output.
49897 // If they are larger than the output, we extract subvector below.
49898 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
49899 In1.getValueSizeInBits() < VT.getSizeInBits())
49900 return SDValue();
49901 }
49902 // Mul is commutative so the input vectors can be in any order.
49903 // Canonicalize to make the compares easier.
49904 if (In0 != N00In)
49905 std::swap(N00In, N01In);
49906 if (In0 != N10In)
49907 std::swap(N10In, N11In);
49908 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
49909 return SDValue();
49910 }
49911
49912 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49913 ArrayRef<SDValue> Ops) {
49914 EVT OpVT = Ops[0].getValueType();
49915 assert(OpVT.getScalarType() == MVT::i16 &&
49916 "Unexpected scalar element type");
49917 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
49918 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49919 OpVT.getVectorNumElements() / 2);
49920 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49921 };
49922
49923 // If the output is narrower than an input, extract the low part of the input
49924 // vector.
49925 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49926 VT.getVectorNumElements() * 2);
49927 if (OutVT16.bitsLT(In0.getValueType())) {
49928 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
49929 DAG.getIntPtrConstant(0, DL));
49930 }
49931 if (OutVT16.bitsLT(In1.getValueType())) {
49932 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
49933 DAG.getIntPtrConstant(0, DL));
49934 }
49935 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
49936 PMADDBuilder);
49937}
49938
49939/// CMOV of constants requires materializing constant operands in registers.
49940/// Try to fold those constants into an 'add' instruction to reduce instruction
49941 /// count. We do this with CMOV rather than the generic 'select' because there are
49942/// earlier folds that may be used to turn select-of-constants into logic hacks.
49943static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
49944 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
49945 // better because we eliminate 1-2 instructions. This transform is still
49946 // an improvement without zero operands because we trade 2 move constants and
49947 // 1 add for 2 adds (LEA) as long as the constants can be represented as
49948 // immediate asm operands (fit in 32-bits).
49949 auto isSuitableCmov = [](SDValue V) {
49950 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
49951 return false;
49952 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
49953 !isa<ConstantSDNode>(V.getOperand(1)))
49954 return false;
49955 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
49956 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
49957 V.getConstantOperandAPInt(1).isSignedIntN(32));
49958 };
49959
49960 // Match an appropriate CMOV as the first operand of the add.
49961 SDValue Cmov = N->getOperand(0);
49962 SDValue OtherOp = N->getOperand(1);
49963 if (!isSuitableCmov(Cmov))
49964 std::swap(Cmov, OtherOp);
49965 if (!isSuitableCmov(Cmov))
49966 return SDValue();
49967
49968 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
49969 EVT VT = N->getValueType(0);
49970 SDLoc DL(N);
49971 SDValue FalseOp = Cmov.getOperand(0);
49972 SDValue TrueOp = Cmov.getOperand(1);
49973 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
49974 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
49975 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
49976 Cmov.getOperand(3));
49977}
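
The transform above simply distributes the add over the two constant arms of the CMOV. A tiny standalone C++ sketch of the scalar identity, separate from X86ISelLowering.cpp (constants chosen arbitrarily):

// Standalone illustration: add distributes over a select of constants.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t C1 = 0, C2 = 42; // a zero arm is the best case for the fold
  const int64_t Other = 1000;
  const bool Conds[] = {false, true};
  for (bool Cond : Conds) {
    int64_t Before = (Cond ? C1 : C2) + Other;          // cmov C1, C2 ; add
    int64_t After = Cond ? (Other + C1) : (Other + C2); // cmov of two adds
    assert(Before == After);
  }
  return 0;
}
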
49978
49979static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
49980 TargetLowering::DAGCombinerInfo &DCI,
49981 const X86Subtarget &Subtarget) {
49982 EVT VT = N->getValueType(0);
49983 SDValue Op0 = N->getOperand(0);
49984 SDValue Op1 = N->getOperand(1);
49985
49986 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
49987 return Select;
49988
49989 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49990 return MAdd;
49991 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49992 return MAdd;
49993
49994 // Try to synthesize horizontal adds from adds of shuffles.
49995 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
49996 return V;
49997
49998 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
49999 // (sub Y, (sext (vXi1 X))).
50000 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
50001 // generic DAG combine without a legal type check, but adding this there
50002 // caused regressions.
50003 if (VT.isVector()) {
50004 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50005 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
50006 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50007 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
50008 SDLoc DL(N);
50009 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
50010 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
50011 }
50012
50013 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
50014 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50015 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
50016 SDLoc DL(N);
50017 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
50018 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
50019 }
50020 }
50021
50022 return combineAddOrSubToADCOrSBB(N, DAG);
50023}
50024
50025static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
50026 TargetLowering::DAGCombinerInfo &DCI,
50027 const X86Subtarget &Subtarget) {
50028 SDValue Op0 = N->getOperand(0);
50029 SDValue Op1 = N->getOperand(1);
50030
50031 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
50032 auto IsNonOpaqueConstant = [&](SDValue Op) {
50033 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
50034 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
50035 return !Cst->isOpaque();
50036 return true;
50037 }
50038 return false;
50039 };
50040
50041 // X86 can't encode an immediate LHS of a sub. See if we can push the
50042 // negation into a preceding instruction. If the RHS of the sub is a XOR with
50043 // one use and a constant, invert the immediate, saving one register.
50044 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
50045 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
50046 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
50047 SDLoc DL(N);
50048 EVT VT = Op0.getValueType();
50049 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
50050 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
50051 SDValue NewAdd =
50052 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
50053 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
50054 }
50055
50056 // Try to synthesize horizontal subs from subs of shuffles.
50057 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
50058 return V;
50059
50060 return combineAddOrSubToADCOrSBB(N, DAG);
50061}
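
The sub-of-xor fold relies on the two's-complement identities C1 - V == ~V + C1 + 1 and ~(X ^ C2) == X ^ ~C2, giving C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1) under wraparound arithmetic. A minimal standalone C++ check, separate from X86ISelLowering.cpp (constants chosen arbitrarily):

// Standalone illustration: sub(C1, xor(X, C2)) == add(xor(X, ~C2), C1 + 1)
// in modular (unsigned wraparound) arithmetic.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t C1 = 0x1234, C2 = 0xff00ff00ff00ff00ull;
  const uint64_t Xs[] = {0, 1, 0xdeadbeefull, ~0ull};
  for (uint64_t X : Xs)
    assert(C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1));
  return 0;
}
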
50062
50063static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
50064 const X86Subtarget &Subtarget) {
50065 MVT VT = N->getSimpleValueType(0);
50066 SDLoc DL(N);
50067
50068 if (N->getOperand(0) == N->getOperand(1)) {
50069 if (N->getOpcode() == X86ISD::PCMPEQ)
50070 return DAG.getConstant(-1, DL, VT);
50071 if (N->getOpcode() == X86ISD::PCMPGT)
50072 return DAG.getConstant(0, DL, VT);
50073 }
50074
50075 return SDValue();
50076}
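
The fold above holds because x86 packed compares produce an all-ones lane for true and an all-zeros lane for false, and a lane always equals itself and is never greater than itself. A tiny standalone C++ sketch of the per-lane behavior, separate from X86ISelLowering.cpp (the lane helpers are illustrative):

// Standalone illustration: per-lane behavior behind PCMPEQ(x,x)/PCMPGT(x,x).
#include <cassert>
#include <cstdint>

static int32_t laneCmpEq(int32_t A, int32_t B) { return A == B ? -1 : 0; }
static int32_t laneCmpGt(int32_t A, int32_t B) { return A > B ? -1 : 0; }

int main() {
  const int32_t Lanes[] = {0, 1, -7, INT32_MIN, INT32_MAX};
  for (int32_t X : Lanes) {
    assert(laneCmpEq(X, X) == -1); // PCMPEQ(x,x) -> all-ones vector
    assert(laneCmpGt(X, X) == 0);  // PCMPGT(x,x) -> zero vector
  }
  return 0;
}
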
50077
50078/// Helper that combines an array of subvector ops as if they were the operands
50079/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
50080/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
50081static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
50082 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
50083 TargetLowering::DAGCombinerInfo &DCI,
50084 const X86Subtarget &Subtarget) {
50085 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
50086 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50087
50088 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
50089 return DAG.getUNDEF(VT);
50090
50091 if (llvm::all_of(Ops, [](SDValue Op) {
50092 return ISD::isBuildVectorAllZeros(Op.getNode());
50093 }))
50094 return getZeroVector(VT, Subtarget, DAG, DL);
50095
50096 SDValue Op0 = Ops[0];
50097 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
50098
50099 // Repeated subvectors.
50100 if (IsSplat &&
50101 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
50102 // If this broadcast is inserted into both halves, use a larger broadcast.
50103 if (Op0.getOpcode() == X86ISD::VBROADCAST)
50104 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
50105
50106 // If this scalar/subvector broadcast_load is inserted into both halves, use
50107 // a larger broadcast_load. Update other uses to use an extracted subvector.
50108 if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50109 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
50110 auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
50111 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50112 SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
50113 SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
50114 MemIntr->getMemoryVT(),
50115 MemIntr->getMemOperand());
50116 DAG.ReplaceAllUsesOfValueWith(
50117 Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50118 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50119 return BcastLd;
50120 }
50121
50122 // If this is a simple subvector load repeated across multiple lanes, then
50123 // broadcast the load. Update other uses to use an extracted subvector.
50124 if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
50125 if (Ld->isSimple() && !Ld->isNonTemporal() &&
50126 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
50127 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50128 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
50129 SDValue BcastLd =
50130 DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
50131 Ld->getMemoryVT(), Ld->getMemOperand());
50132 DAG.ReplaceAllUsesOfValueWith(
50133 Op0,
50134 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50135 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
50136 return BcastLd;
50137 }
50138 }
50139
50140 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
50141 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
50142 (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
50143 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
50144 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
50145 Op0.getOperand(0),
50146 DAG.getIntPtrConstant(0, DL)));
50147
50148 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
50149 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50150 (Subtarget.hasAVX2() ||
50151 (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
50152 Op0.getOperand(0).getValueType() == VT.getScalarType())
50153 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
50154
50155 // concat_vectors(extract_subvector(broadcast(x)),
50156 // extract_subvector(broadcast(x))) -> broadcast(x)
50157 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50158 Op0.getOperand(0).getValueType() == VT) {
50159 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
50160 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
50161 return Op0.getOperand(0);
50162 }
50163 }
50164
50165 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
50166 // Only concat of subvector high halves which vperm2x128 is best at.
50167 // TODO: This should go in combineX86ShufflesRecursively eventually.
50168 if (VT.is256BitVector() && Ops.size() == 2) {
50169 SDValue Src0 = peekThroughBitcasts(Ops[0]);
50170 SDValue Src1 = peekThroughBitcasts(Ops[1]);
50171 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50172 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
50173 EVT SrcVT0 = Src0.getOperand(0).getValueType();
50174 EVT SrcVT1 = Src1.getOperand(0).getValueType();
50175 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
50176 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
50177 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
50178 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
50179 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
50180 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
50181 DAG.getBitcast(VT, Src0.getOperand(0)),
50182 DAG.getBitcast(VT, Src1.getOperand(0)),
50183 DAG.getTargetConstant(0x31, DL, MVT::i8));
50184 }
50185 }
50186 }
50187
50188 // Repeated opcode.
50189 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
50190 // but it currently struggles with different vector widths.
50191 if (llvm::all_of(Ops, [Op0](SDValue Op) {
50192 return Op.getOpcode() == Op0.getOpcode();
50193 })) {
50194 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
50195 SmallVector<SDValue> Subs;
50196 for (SDValue SubOp : SubOps)
50197 Subs.push_back(SubOp.getOperand(I));
50198 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
50199 };
50200
50201 unsigned NumOps = Ops.size();
50202 switch (Op0.getOpcode()) {
50203 case X86ISD::SHUFP: {
50204 // Add SHUFPD support if/when necessary.
50205 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
50206 llvm::all_of(Ops, [Op0](SDValue Op) {
50207 return Op.getOperand(2) == Op0.getOperand(2);
50208 })) {
50209 return DAG.getNode(Op0.getOpcode(), DL, VT,
50210 ConcatSubOperand(VT, Ops, 0),
50211 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
50212 }
50213 break;
50214 }
50215 case X86ISD::PSHUFHW:
50216 case X86ISD::PSHUFLW:
50217 case X86ISD::PSHUFD:
50218 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
50219 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50220 return DAG.getNode(Op0.getOpcode(), DL, VT,
50221 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50222 }
50223 LLVM_FALLTHROUGH;
50224 case X86ISD::VPERMILPI:
50225 // TODO - add support for vXf64/vXi64 shuffles.
50226 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
50227 Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50228 SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
50229 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
50230 Op0.getOperand(1));
50231 return DAG.getBitcast(VT, Res);
50232 }
50233 break;
50234 case X86ISD::VPERMV3:
50235 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
50236 MVT OpVT = Op0.getSimpleValueType();
50237 int NumSrcElts = OpVT.getVectorNumElements();
50238 SmallVector<int, 64> ConcatMask;
50239 for (unsigned i = 0; i != NumOps; ++i) {
50240 SmallVector<int, 64> SubMask;
50241 SmallVector<SDValue, 2> SubOps;
50242 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
50243 SubMask))
50244 break;
50245 for (int M : SubMask) {
50246 if (0 <= M) {
50247 M += M < NumSrcElts ? 0 : NumSrcElts;
50248 M += i * NumSrcElts;
50249 }
50250 ConcatMask.push_back(M);
50251 }
50252 }
50253 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
50254 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
50255 Ops[1].getOperand(0), DAG, DL);
50256 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
50257 Ops[1].getOperand(2), DAG, DL);
50258 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
50259 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
50260 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
50261 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
50262 }
50263 }
50264 break;
50265 case X86ISD::VSHLI:
50266 case X86ISD::VSRLI:
50267 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
50268 // TODO: Move this to LowerScalarImmediateShift?
50269 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
50270 llvm::all_of(Ops, [](SDValue Op) {
50271 return Op.getConstantOperandAPInt(1) == 32;
50272 })) {
50273 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
50274 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
50275 if (Op0.getOpcode() == X86ISD::VSHLI) {
50276 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
50277 {8, 0, 8, 2, 8, 4, 8, 6});
50278 } else {
50279 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
50280 {1, 8, 3, 8, 5, 8, 7, 8});
50281 }
50282 return DAG.getBitcast(VT, Res);
50283 }
50284 LLVM_FALLTHROUGH;
50285 case X86ISD::VSRAI:
50286 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
50287 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
50288 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
50289 llvm::all_of(Ops, [Op0](SDValue Op) {
50290 return Op0.getOperand(1) == Op.getOperand(1);
50291 })) {
50292 return DAG.getNode(Op0.getOpcode(), DL, VT,
50293 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50294 }
50295 break;
50296 case X86ISD::VPERMI:
50297 case X86ISD::VROTLI:
50298 case X86ISD::VROTRI:
50299 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
50300 llvm::all_of(Ops, [Op0](SDValue Op) {
50301 return Op0.getOperand(1) == Op.getOperand(1);
50302 })) {
50303 return DAG.getNode(Op0.getOpcode(), DL, VT,
50304 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50305 }
50306 break;
50307 case ISD::AND:
50308 case ISD::OR:
50309 case ISD::XOR:
50310 case X86ISD::ANDNP:
50311 // TODO: Add 256-bit support.
50312 if (!IsSplat && VT.is512BitVector()) {
50313 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
50314 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
50315 NumOps * SrcVT.getVectorNumElements());
50316 return DAG.getNode(Op0.getOpcode(), DL, VT,
50317 ConcatSubOperand(SrcVT, Ops, 0),
50318 ConcatSubOperand(SrcVT, Ops, 1));
50319 }
50320 break;
50321 case X86ISD::HADD:
50322 case X86ISD::HSUB:
50323 case X86ISD::FHADD:
50324 case X86ISD::FHSUB:
50325 case X86ISD::PACKSS:
50326 case X86ISD::PACKUS:
50327 if (!IsSplat && VT.is256BitVector() &&
50328 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
50329 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
50330 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
50331 NumOps * SrcVT.getVectorNumElements());
50332 return DAG.getNode(Op0.getOpcode(), DL, VT,
50333 ConcatSubOperand(SrcVT, Ops, 0),
50334 ConcatSubOperand(SrcVT, Ops, 1));
50335 }
50336 break;
50337 case X86ISD::PALIGNR:
50338 if (!IsSplat &&
50339 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
50340 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
50341 llvm::all_of(Ops, [Op0](SDValue Op) {
50342 return Op0.getOperand(2) == Op.getOperand(2);
50343 })) {
50344 return DAG.getNode(Op0.getOpcode(), DL, VT,
50345 ConcatSubOperand(VT, Ops, 0),
50346 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
50347 }
50348 break;
50349 }
50350 }
50351
50352 // Fold subvector loads into one.
50353 // If needed, look through bitcasts to get to the load.
50354 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
50355 bool Fast;
50356 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
50357 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50358 *FirstLd->getMemOperand(), &Fast) &&
50359 Fast) {
50360 if (SDValue Ld =
50361 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
50362 return Ld;
50363 }
50364 }
50365
50366 return SDValue();
50367}
50368
50369static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
50370 TargetLowering::DAGCombinerInfo &DCI,
50371 const X86Subtarget &Subtarget) {
50372 EVT VT = N->getValueType(0);
50373 EVT SrcVT = N->getOperand(0).getValueType();
50374 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50375
50376 // Don't do anything for i1 vectors.
50377 if (VT.getVectorElementType() == MVT::i1)
50378 return SDValue();
50379
50380 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
50381 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
50382 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
50383 DCI, Subtarget))
50384 return R;
50385 }
50386
50387 return SDValue();
50388}
50389
50390static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
50391 TargetLowering::DAGCombinerInfo &DCI,
50392 const X86Subtarget &Subtarget) {
50393 if (DCI.isBeforeLegalizeOps())
50394 return SDValue();
50395
50396 MVT OpVT = N->getSimpleValueType(0);
50397
50398 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
50399
50400 SDLoc dl(N);
50401 SDValue Vec = N->getOperand(0);
50402 SDValue SubVec = N->getOperand(1);
50403
50404 uint64_t IdxVal = N->getConstantOperandVal(2);
50405 MVT SubVecVT = SubVec.getSimpleValueType();
50406
50407 if (Vec.isUndef() && SubVec.isUndef())
50408 return DAG.getUNDEF(OpVT);
50409
50410 // Inserting undefs/zeros into zeros/undefs is a zero vector.
50411 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
50412 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
50413 return getZeroVector(OpVT, Subtarget, DAG, dl);
50414
50415 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
50416 // If we're inserting into a zero vector and then into a larger zero vector,
50417 // just insert into the larger zero vector directly.
50418 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
50419 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
50420 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
50421 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50422 getZeroVector(OpVT, Subtarget, DAG, dl),
50423 SubVec.getOperand(1),
50424 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
50425 }
50426
50427 // If we're inserting into a zero vector and our input was extracted from an
50428 // insert into a zero vector of the same type, and the extraction was at
50429 // least as large as the original insertion, just insert the original
50430 // subvector into a zero vector.
50431 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
50432 isNullConstant(SubVec.getOperand(1)) &&
50433 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
50434 SDValue Ins = SubVec.getOperand(0);
50435 if (isNullConstant(Ins.getOperand(2)) &&
50436 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
50437 Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
50438 SubVecVT.getFixedSizeInBits())
50439 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50440 getZeroVector(OpVT, Subtarget, DAG, dl),
50441 Ins.getOperand(1), N->getOperand(2));
50442 }
50443 }
50444
50445 // Stop here if this is an i1 vector.
50446 if (IsI1Vector)
50447 return SDValue();
50448
50449 // If this is an insert of an extract, combine to a shuffle. Don't do this
50450 // if the insert or extract can be represented with a subregister operation.
50451 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50452 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
50453 (IdxVal != 0 ||
50454 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
50455 int ExtIdxVal = SubVec.getConstantOperandVal(1);
50456 if (ExtIdxVal != 0) {
50457 int VecNumElts = OpVT.getVectorNumElements();
50458 int SubVecNumElts = SubVecVT.getVectorNumElements();
50459 SmallVector<int, 64> Mask(VecNumElts);
50460 // First create an identity shuffle mask.
50461 for (int i = 0; i != VecNumElts; ++i)
50462 Mask[i] = i;
50463 // Now insert the extracted portion.
50464 for (int i = 0; i != SubVecNumElts; ++i)
50465 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
50466
50467 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
50468 }
50469 }
50470
50471 // Match concat_vector style patterns.
50472 SmallVector<SDValue, 2> SubVectorOps;
50473 if (collectConcatOps(N, SubVectorOps)) {
50474 if (SDValue Fold =
50475 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
50476 return Fold;
50477
50478 // If we're inserting all zeros into the upper half, change this to
50479 // a concat with zero. We will match this to a move
50480 // with implicit upper bit zeroing during isel.
50481 // We do this here because we don't want combineConcatVectorOps to
50482 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
50483 if (SubVectorOps.size() == 2 &&
50484 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
50485 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50486 getZeroVector(OpVT, Subtarget, DAG, dl),
50487 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
50488 }
50489
50490 // If this is a broadcast insert into an upper undef, use a larger broadcast.
50491 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
50492 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
50493
50494 // If this is a broadcast load inserted into an upper undef, use a larger
50495 // broadcast load.
50496 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
50497 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
50498 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
50499 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
50500 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
50501 SDValue BcastLd =
50502 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
50503 MemIntr->getMemoryVT(),
50504 MemIntr->getMemOperand());
50505 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50506 return BcastLd;
50507 }
50508
50509 // If we're splatting the lower half subvector of a full vector load into the
50510 // upper half, attempt to create a subvector broadcast.
50511 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
50512 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
50513 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
50514 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
50515 if (VecLd && SubLd &&
50516 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
50517 SubVec.getValueSizeInBits() / 8, 0))
50518 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
50519 SubLd, 0, DAG);
50520 }
50521
50522 return SDValue();
50523}
50524
50525/// If we are extracting a subvector of a vector select and the select condition
50526/// is composed of concatenated vectors, try to narrow the select width. This
50527/// is a common pattern for AVX1 integer code because 256-bit selects may be
50528/// legal, but there is almost no integer math/logic available for 256-bit.
50529/// This function should only be called with legal types (otherwise, the calls
50530/// to get simple value types will assert).
50531static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
50532 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
50533 SmallVector<SDValue, 4> CatOps;
50534 if (Sel.getOpcode() != ISD::VSELECT ||
50535 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
50536 return SDValue();
50537
50538 // Note: We assume simple value types because this should only be called with
50539 // legal operations/types.
50540 // TODO: This can be extended to handle extraction to 256-bits.
50541 MVT VT = Ext->getSimpleValueType(0);
50542 if (!VT.is128BitVector())
50543 return SDValue();
50544
50545 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
50546 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
50547 return SDValue();
50548
50549 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
50550 MVT SelVT = Sel.getSimpleValueType();
50551 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
50552 "Unexpected vector type with legal operations");
50553
50554 unsigned SelElts = SelVT.getVectorNumElements();
50555 unsigned CastedElts = WideVT.getVectorNumElements();
50556 unsigned ExtIdx = Ext->getConstantOperandVal(1);
50557 if (SelElts % CastedElts == 0) {
50558 // The select has the same or more (narrower) elements than the extract
50559 // operand. The extraction index gets scaled by that factor.
50560 ExtIdx *= (SelElts / CastedElts);
50561 } else if (CastedElts % SelElts == 0) {
50562 // The select has less (wider) elements than the extract operand. Make sure
50563 // that the extraction index can be divided evenly.
50564 unsigned IndexDivisor = CastedElts / SelElts;
50565 if (ExtIdx % IndexDivisor != 0)
50566 return SDValue();
50567 ExtIdx /= IndexDivisor;
50568 } else {
50569 llvm_unreachable("Element count of simple vector types are not divisible?");
50570 }
50571
50572 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
50573 unsigned NarrowElts = SelElts / NarrowingFactor;
50574 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
50575 SDLoc DL(Ext);
50576 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
50577 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
50578 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
50579 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
50580 return DAG.getBitcast(VT, NarrowSel);
50581}
50582
50583static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
50584 TargetLowering::DAGCombinerInfo &DCI,
50585 const X86Subtarget &Subtarget) {
50586 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
50587 // eventually get combined/lowered into ANDNP) with a concatenated operand,
50588 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
50589 // We let generic combining take over from there to simplify the
50590 // insert/extract and 'not'.
50591 // This pattern emerges during AVX1 legalization. We handle it before lowering
50592 // to avoid complications like splitting constant vector loads.
50593
50594 // Capture the original wide type in the likely case that we need to bitcast
50595 // back to this type.
50596 if (!N->getValueType(0).isSimple())
50597 return SDValue();
50598
50599 MVT VT = N->getSimpleValueType(0);
50600 SDValue InVec = N->getOperand(0);
50601 unsigned IdxVal = N->getConstantOperandVal(1);
50602 SDValue InVecBC = peekThroughBitcasts(InVec);
50603 EVT InVecVT = InVec.getValueType();
50604 unsigned SizeInBits = VT.getSizeInBits();
50605 unsigned InSizeInBits = InVecVT.getSizeInBits();
50606 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50607
50608 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
50609 TLI.isTypeLegal(InVecVT) &&
50610 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
50611 auto isConcatenatedNot = [](SDValue V) {
50612 V = peekThroughBitcasts(V);
50613 if (!isBitwiseNot(V))
50614 return false;
50615 SDValue NotOp = V->getOperand(0);
50616 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
50617 };
50618 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
50619 isConcatenatedNot(InVecBC.getOperand(1))) {
50620 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
50621 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
50622 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
50623 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
50624 }
50625 }
50626
50627 if (DCI.isBeforeLegalizeOps())
50628 return SDValue();
50629
50630 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
50631 return V;
50632
50633 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
50634 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50635
50636 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
50637 if (VT.getScalarType() == MVT::i1)
50638 return DAG.getConstant(1, SDLoc(N), VT);
50639 return getOnesVector(VT, DAG, SDLoc(N));
50640 }
50641
50642 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
50643 return DAG.getBuildVector(
50644 VT, SDLoc(N),
50645 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
50646
50647 // If we are extracting from an insert into a zero vector, replace with a
50648 // smaller insert into zero if we don't access less than the original
50649 // subvector. Don't do this for i1 vectors.
50650 if (VT.getVectorElementType() != MVT::i1 &&
50651 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
50652 InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
50653 ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
50654 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
50655 SDLoc DL(N);
50656 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
50657 getZeroVector(VT, Subtarget, DAG, DL),
50658 InVec.getOperand(1), InVec.getOperand(2));
50659 }
50660
50661 // If we're extracting an upper subvector from a broadcast, we should just
50662 // extract the lowest subvector instead, which should allow
50663 // SimplifyDemandedVectorElts to do more simplifications.
50664 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
50665 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50666 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
50667 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50668
50669 // If we're extracting a broadcasted subvector, just use the lowest subvector.
50670 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50671 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
50672 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50673
50674 // Attempt to extract from the source of a shuffle vector.
50675 if ((InSizeInBits % SizeInBits) == 0 &&
50676 (IdxVal % VT.getVectorNumElements()) == 0) {
50677 SmallVector<int, 32> ShuffleMask;
50678 SmallVector<int, 32> ScaledMask;
50679 SmallVector<SDValue, 2> ShuffleInputs;
50680 unsigned NumSubVecs = InSizeInBits / SizeInBits;
50681 // Decode the shuffle mask and scale it so it's shuffling subvectors.
50682 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
50683 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
50684 unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
50685 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
50686 return DAG.getUNDEF(VT);
50687 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
50688 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50689 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
50690 if (Src.getValueSizeInBits() == InSizeInBits) {
50691 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
50692 unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
50693 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
50694 SDLoc(N), SizeInBits);
50695 }
50696 }
50697 }
50698
50699 // If we're extracting the lowest subvector and we're the only user,
50700 // we may be able to perform this with a smaller vector width.
50701 unsigned InOpcode = InVec.getOpcode();
50702 if (IdxVal == 0 && InVec.hasOneUse()) {
50703 if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
50704 // v2f64 CVTDQ2PD(v4i32).
50705 if (InOpcode == ISD::SINT_TO_FP &&
50706 InVec.getOperand(0).getValueType() == MVT::v4i32) {
50707 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
50708 }
50709 // v2f64 CVTUDQ2PD(v4i32).
50710 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
50711 InVec.getOperand(0).getValueType() == MVT::v4i32) {
50712 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
50713 }
50714 // v2f64 CVTPS2PD(v4f32).
50715 if (InOpcode == ISD::FP_EXTEND &&
50716 InVec.getOperand(0).getValueType() == MVT::v4f32) {
50717 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
50718 }
50719 }
50720 if ((InOpcode == ISD::ANY_EXTEND ||
50721 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50722 InOpcode == ISD::ZERO_EXTEND ||
50723 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
50724 InOpcode == ISD::SIGN_EXTEND ||
50725 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50726 (SizeInBits == 128 || SizeInBits == 256) &&
50727 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
50728 SDLoc DL(N);
50729 SDValue Ext = InVec.getOperand(0);
50730 if (Ext.getValueSizeInBits() > SizeInBits)
50731 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
50732 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
50733 return DAG.getNode(ExtOp, DL, VT, Ext);
50734 }
50735 if (InOpcode == ISD::VSELECT &&
50736 InVec.getOperand(0).getValueType().is256BitVector() &&
50737 InVec.getOperand(1).getValueType().is256BitVector() &&
50738 InVec.getOperand(2).getValueType().is256BitVector()) {
50739 SDLoc DL(N);
50740 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
50741 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
50742 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
50743 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
50744 }
50745 if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
50746 (VT.is128BitVector() || VT.is256BitVector())) {
50747 SDLoc DL(N);
50748 SDValue InVecSrc = InVec.getOperand(0);
50749 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
50750 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
50751 return DAG.getNode(InOpcode, DL, VT, Ext);
50752 }
50753 }
50754
50755 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
50756 // as this is very likely to fold into a shuffle/truncation.
50757 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
50758 InVecVT.getScalarSizeInBits() == 64 &&
50759 InVec.getConstantOperandAPInt(1) == 32) {
50760 SDLoc DL(N);
50761 SDValue Ext =
50762 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
50763 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
50764 }
50765
50766 return SDValue();
50767}
50768
50769static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
50770 EVT VT = N->getValueType(0);
50771 SDValue Src = N->getOperand(0);
50772 SDLoc DL(N);
50773
50774 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
50775 // This occurs frequently in our masked scalar intrinsic code and our
50776 // floating point select lowering with AVX512.
50777 // TODO: SimplifyDemandedBits instead?
50778 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
50779 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50780 if (C->getAPIntValue().isOneValue())
50781 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
50782 Src.getOperand(0));
50783
50784 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
50785 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50786 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
50787 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
50788 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50789 if (C->isNullValue())
50790 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
50791 Src.getOperand(1));
50792
50793 // Reduce v2i64 to v4i32 if we don't need the upper bits.
50794 // TODO: Move to DAGCombine/SimplifyDemandedBits?
50795 if (VT == MVT::v2i64 || VT == MVT::v2f64) {
50796 auto IsAnyExt64 = [](SDValue Op) {
50797 if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
50798 return SDValue();
50799 if (Op.getOpcode() == ISD::ANY_EXTEND &&
50800 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
50801 return Op.getOperand(0);
50802 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
50803 if (Ld->getExtensionType() == ISD::EXTLOAD &&
50804 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
50805 return Op;
50806 return SDValue();
50807 };
50808 if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
50809 return DAG.getBitcast(
50810 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
50811 DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
50812 }
50813
50814 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
50815 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
50816 Src.getOperand(0).getValueType() == MVT::x86mmx)
50817 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
50818
50819 // See if we're broadcasting the scalar value, in which case just reuse that.
50820 // Ensure the same SDValue from the SDNode use is being used.
50821 if (VT.getScalarType() == Src.getValueType())
50822 for (SDNode *User : Src->uses())
50823 if (User->getOpcode() == X86ISD::VBROADCAST &&
50824 Src == User->getOperand(0)) {
50825 unsigned SizeInBits = VT.getFixedSizeInBits();
50826 unsigned BroadcastSizeInBits =
50827 User->getValueSizeInBits(0).getFixedSize();
50828 if (BroadcastSizeInBits == SizeInBits)
50829 return SDValue(User, 0);
50830 if (BroadcastSizeInBits > SizeInBits)
50831 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
50832 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
50833 // coverage.
50834 }
50835
50836 return SDValue();
50837}
50838
50839// Simplify PMULDQ and PMULUDQ operations.
50840static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
50841 TargetLowering::DAGCombinerInfo &DCI,
50842 const X86Subtarget &Subtarget) {
50843 SDValue LHS = N->getOperand(0);
50844 SDValue RHS = N->getOperand(1);
50845
50846 // Canonicalize constant to RHS.
50847 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
50848 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
50849 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
50850
50851 // Multiply by zero.
50852 // Don't return RHS as it may contain UNDEFs.
50853 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
50854 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
50855
50856 // PMULDQ/PMULUDQ only use the lower 32 bits from each vector element.
50857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50858 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
50859 return SDValue(N, 0);
50860
50861 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
50862 // convert it to any_extend_invec, due to the LegalOperations check, do the
50863 // conversion directly to a vector shuffle manually. This exposes combine
50864 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
50865 // combineX86ShufflesRecursively on SSE4.1 targets.
50866 // FIXME: This is basically a hack around several other issues related to
50867 // ANY_EXTEND_VECTOR_INREG.
50868 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
50869 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50870 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50871 LHS.getOperand(0).getValueType() == MVT::v4i32) {
50872 SDLoc dl(N);
50873 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
50874 LHS.getOperand(0), { 0, -1, 1, -1 });
50875 LHS = DAG.getBitcast(MVT::v2i64, LHS);
50876 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50877 }
50878 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
50879 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50880 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50881 RHS.getOperand(0).getValueType() == MVT::v4i32) {
50882 SDLoc dl(N);
50883 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
50884 RHS.getOperand(0), { 0, -1, 1, -1 });
50885 RHS = DAG.getBitcast(MVT::v2i64, RHS);
50886 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50887 }
50888
50889 return SDValue();
50890}
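
As the comment in the function above notes, PMULDQ/PMULUDQ read only the low 32 bits of each 64-bit source lane, which is what allows the demanded-bits simplification and the shuffle rewrite of the extends. A minimal standalone C++ sketch of the per-lane semantics, separate from X86ISelLowering.cpp (the helpers are illustrative models of the instruction behavior):

// Standalone illustration: per-lane semantics of PMULUDQ/PMULDQ -- only the
// low 32 bits of each 64-bit source lane participate in the multiply.
#include <cassert>
#include <cstdint>

static uint64_t pmuludqLane(uint64_t A, uint64_t B) {
  return uint64_t(uint32_t(A)) * uint64_t(uint32_t(B));
}
static int64_t pmuldqLane(uint64_t A, uint64_t B) {
  return int64_t(int32_t(uint32_t(A))) * int64_t(int32_t(uint32_t(B)));
}

int main() {
  const uint64_t A = 0xdeadbeef00000007ull, B = 0xcafef00d00000003ull;
  assert(pmuludqLane(A, B) == 21);
  assert(pmuldqLane(A, B) == 21);
  // Garbage in the upper 32 bits of a lane does not change the result.
  assert(pmuludqLane(A | 0xffffffff00000000ull, B) == pmuludqLane(A, B));
  return 0;
}
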
50891
50892static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
50893 TargetLowering::DAGCombinerInfo &DCI,
50894 const X86Subtarget &Subtarget) {
50895 EVT VT = N->getValueType(0);
50896 SDValue In = N->getOperand(0);
50897 unsigned Opcode = N->getOpcode();
50898 unsigned InOpcode = In.getOpcode();
50899 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50900
50901 // Try to merge vector loads and extend_inreg to an extload.
50902 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
50903 In.hasOneUse()) {
50904 auto *Ld = cast<LoadSDNode>(In);
50905 if (Ld->isSimple()) {
50906 MVT SVT = In.getSimpleValueType().getVectorElementType();
50907 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
50908 ? ISD::SEXTLOAD
50909 : ISD::ZEXTLOAD;
50910 EVT MemVT = VT.changeVectorElementType(SVT);
50911 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
50912 SDValue Load =
50913 DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
50914 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50915 Ld->getMemOperand()->getFlags());
50916 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
50917 return Load;
50918 }
50919 }
50920 }
50921
50922 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
50923 if (Opcode == InOpcode)
50924 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
50925
50926 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
50927 // -> EXTEND_VECTOR_INREG(X).
50928 // TODO: Handle non-zero subvector indices.
50929 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
50930 In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
50931 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
50932 In.getValueSizeInBits())
50933 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
50934
50935 // Attempt to combine as a shuffle.
50936 // TODO: General ZERO_EXTEND_VECTOR_INREG support.
50937 if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50938 (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
50939 SDValue Op(N, 0);
50940 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
50941 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50942 return Res;
50943 }
50944
50945 return SDValue();
50946}
50947
50948static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
50949 TargetLowering::DAGCombinerInfo &DCI) {
50950 EVT VT = N->getValueType(0);
50951
50952 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
50953 return DAG.getConstant(0, SDLoc(N), VT);
50954
50955 APInt KnownUndef, KnownZero;
50956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50957 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
50958 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
50959 KnownZero, DCI))
50960 return SDValue(N, 0);
50961
50962 return SDValue();
50963}
50964
50965// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
50966 // Done as a combine because lowering fp16_to_fp and fp_to_fp16 separately
50967 // produces extra instructions between the conversions by going to scalar and back.
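// Roughly: (f32 (fp16_to_fp (fp_to_fp16 X))) ->
//   (extract_elt (cvtph2ps (cvtps2ph (scalar_to_vector X), 4)), 0)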
50968static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
50969 const X86Subtarget &Subtarget) {
50970 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
50971 return SDValue();
50972
50973 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
50974 return SDValue();
50975
50976 if (N->getValueType(0) != MVT::f32 ||
50977 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
50978 return SDValue();
50979
50980 SDLoc dl(N);
50981 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
50982 N->getOperand(0).getOperand(0));
50983 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
50984 DAG.getTargetConstant(4, dl, MVT::i32));
50985 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
50986 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
50987 DAG.getIntPtrConstant(0, dl));
50988}
50989
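// Lower vector f16 -> f32/f64 fp_extends on F16C targets via (V)CVTPH2PS,
// widening the input to at least 8 x i16 and narrowing the result back down
// to the original type as needed.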
50990static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
50991 const X86Subtarget &Subtarget) {
50992 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
50993 return SDValue();
50994
50995 bool IsStrict = N->isStrictFPOpcode();
50996 EVT VT = N->getValueType(0);
50997 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50998 EVT SrcVT = Src.getValueType();
50999
51000 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
51001 return SDValue();
51002
51003 if (VT.getVectorElementType() != MVT::f32 &&
51004 VT.getVectorElementType() != MVT::f64)
51005 return SDValue();
51006
51007 unsigned NumElts = VT.getVectorNumElements();
51008 if (NumElts == 1 || !isPowerOf2_32(NumElts))
51009 return SDValue();
51010
51011 SDLoc dl(N);
51012
51013 // Convert the input to vXi16.
51014 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
51015 Src = DAG.getBitcast(IntVT, Src);
51016
51017 // Widen to at least 8 input elements.
51018 if (NumElts < 8) {
51019 unsigned NumConcats = 8 / NumElts;
51020 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
51021 : DAG.getConstant(0, dl, IntVT);
51022 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
51023 Ops[0] = Src;
51024 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
51025 }
51026
51027 // Destination is vXf32 with at least 4 elements.
51028 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
51029 std::max(4U, NumElts));
51030 SDValue Cvt, Chain;
51031 if (IsStrict) {
51032 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
51033 {N->getOperand(0), Src});
51034 Chain = Cvt.getValue(1);
51035 } else {
51036 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
51037 }
51038
51039 if (NumElts < 4) {
51040 assert(NumElts == 2 && "Unexpected size");
51041 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
51042 DAG.getIntPtrConstant(0, dl));
51043 }
51044
51045 if (IsStrict) {
51046 // Extend to the original VT if necessary.
51047 if (Cvt.getValueType() != VT) {
51048 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
51049 {Chain, Cvt});
51050 Chain = Cvt.getValue(1);
51051 }
51052 return DAG.getMergeValues({Cvt, Chain}, dl);
51053 }
51054
51055 // Extend to the original VT if necessary.
51056 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
51057}
51058
51059// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
51060// from. Limit this to cases where the loads have the same input chain and the
51061// output chains are unused. This avoids any memory ordering issues.
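// For example, a 32-bit broadcast to v4f32 can instead extract the low 128
// bits of an existing 32-bit broadcast to v8f32 from the same pointer.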
51062static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
51063 TargetLowering::DAGCombinerInfo &DCI) {
51064 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
51065 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
51066 "Unknown broadcast load type");
51067
51068 // Only do this if the chain result is unused.
51069 if (N->hasAnyUseOfValue(1))
51070 return SDValue();
51071
51072 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
51073
51074 SDValue Ptr = MemIntrin->getBasePtr();
51075 SDValue Chain = MemIntrin->getChain();
51076 EVT VT = N->getSimpleValueType(0);
51077 EVT MemVT = MemIntrin->getMemoryVT();
51078
51079 // Look at other users of our base pointer and try to find a wider broadcast.
51080 // The input chain and the size of the memory VT must match.
51081 for (SDNode *User : Ptr->uses())
51082 if (User != N && User->getOpcode() == N->getOpcode() &&
51083 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51084 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51085 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51086 MemVT.getSizeInBits() &&
51087 !User->hasAnyUseOfValue(1) &&
51088 User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
51089 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51090 VT.getSizeInBits());
51091 Extract = DAG.getBitcast(VT, Extract);
51092 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51093 }
51094
51095 return SDValue();
51096}
51097
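// Lower vector f32 -> f16 fp_rounds on F16C targets via (V)CVTPS2PH, widening
// the input to at least 4 x f32 and extracting the low elements of the result.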
51098static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
51099 const X86Subtarget &Subtarget) {
51100 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
51101 return SDValue();
51102
51103 EVT VT = N->getValueType(0);
51104 SDValue Src = N->getOperand(0);
51105 EVT SrcVT = Src.getValueType();
51106
51107 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
51108 SrcVT.getVectorElementType() != MVT::f32)
51109 return SDValue();
51110
51111 unsigned NumElts = VT.getVectorNumElements();
51112 if (NumElts == 1 || !isPowerOf2_32(NumElts))
51113 return SDValue();
51114
51115 SDLoc dl(N);
51116
51117 // Widen to at least 4 input elements.
51118 if (NumElts < 4)
51119 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
51120 DAG.getConstantFP(0.0, dl, SrcVT));
51121
51122 // Destination is vXi16 with at least 8 elements.
51123 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51124 std::max(8U, NumElts));
51125 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
51126 DAG.getTargetConstant(4, dl, MVT::i32));
51127
51128 // Extract down to the real number of elements.
51129 if (NumElts < 8) {
51130 EVT IntVT = VT.changeVectorElementTypeToInteger();
51131 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
51132 DAG.getIntPtrConstant(0, dl));
51133 }
51134
51135 return DAG.getBitcast(VT, Cvt);
51136}
51137
51138static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
51139 SDValue Src = N->getOperand(0);
51140
51141 // Turn MOVDQ2Q+simple_load into an mmx load.
51142 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
51143 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
51144
51145 if (LN->isSimple()) {
51146 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
51147 LN->getBasePtr(),
51148 LN->getPointerInfo(),
51149 LN->getOriginalAlign(),
51150 LN->getMemOperand()->getFlags());
51151 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
51152 return NewLd;
51153 }
51154 }
51155
51156 return SDValue();
51157}
51158
51159static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
51160 TargetLowering::DAGCombinerInfo &DCI) {
51161 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
51162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51163 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
51164 APInt::getAllOnesValue(NumBits), DCI))
51165 return SDValue(N, 0);
51166
51167 return SDValue();
51168}
51169
51170SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
51171 DAGCombinerInfo &DCI) const {
51172 SelectionDAG &DAG = DCI.DAG;
51173 switch (N->getOpcode()) {
51174 default: break;
51175 case ISD::SCALAR_TO_VECTOR:
51176 return combineScalarToVector(N, DAG);
51177 case ISD::EXTRACT_VECTOR_ELT:
51178 case X86ISD::PEXTRW:
51179 case X86ISD::PEXTRB:
51180 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
51181 case ISD::CONCAT_VECTORS:
51182 return combineConcatVectors(N, DAG, DCI, Subtarget);
51183 case ISD::INSERT_SUBVECTOR:
51184 return combineInsertSubvector(N, DAG, DCI, Subtarget);
51185 case ISD::EXTRACT_SUBVECTOR:
51186 return combineExtractSubvector(N, DAG, DCI, Subtarget);
51187 case ISD::VSELECT:
51188 case ISD::SELECT:
51189 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
51190 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
51191 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
51192 case X86ISD::CMP: return combineCMP(N, DAG);
51193 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
51194 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
51195 case X86ISD::ADD:
51196 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
51197 case X86ISD::SBB: return combineSBB(N, DAG);
51198 case X86ISD::ADC: return combineADC(N, DAG, DCI);
51199 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
51200 case ISD::SHL: return combineShiftLeft(N, DAG);
51201 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
51202 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
51203 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
51204 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
51205 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
51206 case X86ISD::BEXTR:
51207 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
51208 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
51209 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
51210 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
51211 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
51212 case X86ISD::VEXTRACT_STORE:
51213 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
51214 case ISD::SINT_TO_FP:
51215 case ISD::STRICT_SINT_TO_FP:
51216 return combineSIntToFP(N, DAG, DCI, Subtarget);
51217 case ISD::UINT_TO_FP:
51218 case ISD::STRICT_UINT_TO_FP:
51219 return combineUIntToFP(N, DAG, Subtarget);
51220 case ISD::FADD:
51221 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
51222 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
51223 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
51224 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
51225 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
51226 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
51227 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
51228 case X86ISD::FXOR:
51229 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
51230 case X86ISD::FMIN:
51231 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
51232 case ISD::FMINNUM:
51233 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
51234 case X86ISD::CVTSI2P:
51235 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
51236 case X86ISD::CVTP2SI:
51237 case X86ISD::CVTP2UI:
51238 case X86ISD::STRICT_CVTTP2SI:
51239 case X86ISD::CVTTP2SI:
51240 case X86ISD::STRICT_CVTTP2UI:
51241 case X86ISD::CVTTP2UI:
51242 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
51243 case X86ISD::STRICT_CVTPH2PS:
51244 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
51245 case X86ISD::BT: return combineBT(N, DAG, DCI);
51246 case ISD::ANY_EXTEND:
51247 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
51248 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
51249 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
51250 case ISD::ANY_EXTEND_VECTOR_INREG:
51251 case ISD::SIGN_EXTEND_VECTOR_INREG:
51252 case ISD::ZERO_EXTEND_VECTOR_INREG:
51253 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
51254 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
51255 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
51256 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
51257 case X86ISD::PACKSS:
51258 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
51259 case X86ISD::HADD:
51260 case X86ISD::HSUB:
51261 case X86ISD::FHADD:
51262 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
51263 case X86ISD::VSHL:
51264 case X86ISD::VSRA:
51265 case X86ISD::VSRL:
51266 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
51267 case X86ISD::VSHLI:
51268 case X86ISD::VSRAI:
51269 case X86ISD::VSRLI:
51270 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
51271 case ISD::INSERT_VECTOR_ELT:
51272 case X86ISD::PINSRB:
51273 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
51274 case X86ISD::SHUFP: // Handle all target specific shuffles
51275 case X86ISD::INSERTPS:
51276 case X86ISD::EXTRQI:
51277 case X86ISD::INSERTQI:
51278 case X86ISD::VALIGN:
51279 case X86ISD::PALIGNR:
51280 case X86ISD::VSHLDQ:
51281 case X86ISD::VSRLDQ:
51282 case X86ISD::BLENDI:
51283 case X86ISD::UNPCKH:
51284 case X86ISD::UNPCKL:
51285 case X86ISD::MOVHLPS:
51286 case X86ISD::MOVLHPS:
51287 case X86ISD::PSHUFB:
51288 case X86ISD::PSHUFD:
51289 case X86ISD::PSHUFHW:
51290 case X86ISD::PSHUFLW:
51291 case X86ISD::MOVSHDUP:
51292 case X86ISD::MOVSLDUP:
51293 case X86ISD::MOVDDUP:
51294 case X86ISD::MOVSS:
51295 case X86ISD::MOVSD:
51296 case X86ISD::VBROADCAST:
51297 case X86ISD::VPPERM:
51298 case X86ISD::VPERMI:
51299 case X86ISD::VPERMV:
51300 case X86ISD::VPERMV3:
51301 case X86ISD::VPERMIL2:
51302 case X86ISD::VPERMILPI:
51303 case X86ISD::VPERMILPV:
51304 case X86ISD::VPERM2X128:
51305 case X86ISD::SHUF128:
51306 case X86ISD::VZEXT_MOVL:
51307 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
51308 case X86ISD::FMADD_RND:
51309 case X86ISD::FMSUB:
51310 case X86ISD::STRICT_FMSUB:
51311 case X86ISD::FMSUB_RND:
51312 case X86ISD::FNMADD:
51313 case X86ISD::STRICT_FNMADD:
51314 case X86ISD::FNMADD_RND:
51315 case X86ISD::FNMSUB:
51316 case X86ISD::STRICT_FNMSUB:
51317 case X86ISD::FNMSUB_RND:
51318 case ISD::FMA:
51319 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
51320 case X86ISD::FMADDSUB_RND:
51321 case X86ISD::FMSUBADD_RND:
51322 case X86ISD::FMADDSUB:
51323 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
51324 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
51325 case X86ISD::MGATHER:
51326 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
51327 case ISD::MGATHER:
51328 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
51329 case X86ISD::PCMPEQ:
51330 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
51331 case X86ISD::PMULDQ:
51332 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
51333 case X86ISD::KSHIFTL:
51334 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
51335 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
51336 case ISD::STRICT_FP_EXTEND:
51337 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
51338 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
51339 case X86ISD::VBROADCAST_LOAD:
51340 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
51341 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
51342 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
51343 }
51344
51345 return SDValue();
51346}
51347
51348bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
51349 if (!isTypeLegal(VT))
51350 return false;
51351
51352 // There are no vXi8 shifts.
51353 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
51354 return false;
51355
51356 // TODO: Almost no 8-bit ops are desirable because they have no actual
51357 // size/speed advantages vs. 32-bit ops, but they do have a major
51358 // potential disadvantage by causing partial register stalls.
51359 //
51360 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
51361 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
51362 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
51363 // check for a constant operand to the multiply.
51364 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
51365 return false;
51366
51367 // i16 instruction encodings are longer and some i16 instructions are slow,
51368 // so those are not desirable.
51369 if (VT == MVT::i16) {
51370 switch (Opc) {
51371 default:
51372 break;
51373 case ISD::LOAD:
51374 case ISD::SIGN_EXTEND:
51375 case ISD::ZERO_EXTEND:
51376 case ISD::ANY_EXTEND:
51377 case ISD::SHL:
51378 case ISD::SRA:
51379 case ISD::SRL:
51380 case ISD::SUB:
51381 case ISD::ADD:
51382 case ISD::MUL:
51383 case ISD::AND:
51384 case ISD::OR:
51385 case ISD::XOR:
51386 return false;
51387 }
51388 }
51389
51390 // Any legal type not explicitly accounted for above here is desirable.
51391 return true;
51392}
51393
51394SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
51395 SDValue Value, SDValue Addr,
51396 SelectionDAG &DAG) const {
51397 const Module *M = DAG.getMachineFunction().getMMI().getModule();
51398 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
51399 if (IsCFProtectionSupported) {
51400 // If control-flow branch protection is enabled, we need to add a notrack
51401 // prefix to the indirect branch.
51402 // To do that we create an NT_BRIND SDNode.
51403 // Upon ISel, the pattern will convert it to a jmp with the NoTrack prefix.
51404 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
51405 }
51406
51407 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
51408}
51409
51410bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
51411 EVT VT = Op.getValueType();
51412 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
51413 isa<ConstantSDNode>(Op.getOperand(1));
51414
51415 // i16 is legal, but undesirable since i16 instruction encodings are longer
51416 // and some i16 instructions are slow.
51417 // 8-bit multiply-by-constant can usually be expanded to something cheaper
51418 // using LEA and/or other ALU ops.
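// For example, promoting (i8 mul X, 40) to i32 lets later lowering use
// LEA/shift sequences instead of an 8-bit multiply.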
51419 if (VT != MVT::i16 && !Is8BitMulByConstant)
51420 return false;
51421
51422 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
51423 if (!Op.hasOneUse())
51424 return false;
51425 SDNode *User = *Op->use_begin();
51426 if (!ISD::isNormalStore(User))
51427 return false;
51428 auto *Ld = cast<LoadSDNode>(Load);
51429 auto *St = cast<StoreSDNode>(User);
51430 return Ld->getBasePtr() == St->getBasePtr();
51431 };
51432
51433 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
51434 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
51435 return false;
51436 if (!Op.hasOneUse())
51437 return false;
51438 SDNode *User = *Op->use_begin();
51439 if (User->getOpcode() != ISD::ATOMIC_STORE)
51440 return false;
51441 auto *Ld = cast<AtomicSDNode>(Load);
51442 auto *St = cast<AtomicSDNode>(User);
51443 return Ld->getBasePtr() == St->getBasePtr();
51444 };
51445
51446 bool Commute = false;
51447 switch (Op.getOpcode()) {
51448 default: return false;
51449 case ISD::SIGN_EXTEND:
51450 case ISD::ZERO_EXTEND:
51451 case ISD::ANY_EXTEND:
51452 break;
51453 case ISD::SHL:
51454 case ISD::SRA:
51455 case ISD::SRL: {
51456 SDValue N0 = Op.getOperand(0);
51457 // Look out for (store (shl (load), x)).
51458 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
51459 return false;
51460 break;
51461 }
51462 case ISD::ADD:
51463 case ISD::MUL:
51464 case ISD::AND:
51465 case ISD::OR:
51466 case ISD::XOR:
51467 Commute = true;
51468 LLVM_FALLTHROUGH;
51469 case ISD::SUB: {
51470 SDValue N0 = Op.getOperand(0);
51471 SDValue N1 = Op.getOperand(1);
51472 // Avoid disabling potential load folding opportunities.
51473 if (MayFoldLoad(N1) &&
51474 (!Commute || !isa<ConstantSDNode>(N0) ||
51475 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
51476 return false;
51477 if (MayFoldLoad(N0) &&
51478 ((Commute && !isa<ConstantSDNode>(N1)) ||
51479 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
51480 return false;
51481 if (IsFoldableAtomicRMW(N0, Op) ||
51482 (Commute && IsFoldableAtomicRMW(N1, Op)))
51483 return false;
51484 }
51485 }
51486
51487 PVT = MVT::i32;
51488 return true;
51489}
51490
51491//===----------------------------------------------------------------------===//
51492// X86 Inline Assembly Support
51493//===----------------------------------------------------------------------===//
51494
51495// Helper to match a string separated by whitespace.
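// For example, matchAsm("  bswapl  $0", {"bswapl", "$0"}) succeeds, while
// "bswapl$0" does not, since the pieces must be whitespace-separated.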
51496static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
51497 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
51498
51499 for (StringRef Piece : Pieces) {
51500 if (!S.startswith(Piece)) // Check if the piece matches.
51501 return false;
51502
51503 S = S.substr(Piece.size());
51504 StringRef::size_type Pos = S.find_first_not_of(" \t");
51505 if (Pos == 0) // We matched a prefix.
51506 return false;
51507
51508 S = S.substr(Pos);
51509 }
51510
51511 return S.empty();
51512}
51513
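// Returns true if the split inline-asm constraint string clobbers the flag
// registers, e.g. pieces {"~{cc}", "~{dirflag}", "~{flags}", "~{fpsr}"}.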
51514static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
51515
51516 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
51517 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
51518 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
51519 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
51520
51521 if (AsmPieces.size() == 3)
51522 return true;
51523 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
51524 return true;
51525 }
51526 }
51527 return false;
51528}
51529
51530bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
51531 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
51532
51533 const std::string &AsmStr = IA->getAsmString();
51534
51535 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
51536 if (!Ty || Ty->getBitWidth() % 16 != 0)
51537 return false;
51538
51539 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
51540 SmallVector<StringRef, 4> AsmPieces;
51541 SplitString(AsmStr, AsmPieces, ";\n");
51542
51543 switch (AsmPieces.size()) {
51544 default: return false;
51545 case 1:
51546 // FIXME: this should verify that we are targeting a 486 or better. If not,
51547 // we will turn this bswap into something that will be lowered to logical
51548 // ops instead of emitting the bswap asm. For now, we don't support 486 or
51549 // lower so don't worry about this.
51550 // bswap $0
51551 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
51552 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
51553 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
51554 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
51555 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
51556 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
51557 // No need to check constraints, nothing other than the equivalent of
51558 // "=r,0" would be valid here.
51559 return IntrinsicLowering::LowerToByteSwap(CI);
51560 }
51561
51562 // rorw $$8, ${0:w} --> llvm.bswap.i16
51563 if (CI->getType()->isIntegerTy(16) &&
51564 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51565 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
51566 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
51567 AsmPieces.clear();
51568 StringRef ConstraintsStr = IA->getConstraintString();
51569 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51570 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51571 if (clobbersFlagRegisters(AsmPieces))
51572 return IntrinsicLowering::LowerToByteSwap(CI);
51573 }
51574 break;
51575 case 3:
51576 if (CI->getType()->isIntegerTy(32) &&
51577 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51578 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
51579 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
51580 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
51581 AsmPieces.clear();
51582 StringRef ConstraintsStr = IA->getConstraintString();
51583 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51584 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51585 if (clobbersFlagRegisters(AsmPieces))
51586 return IntrinsicLowering::LowerToByteSwap(CI);
51587 }
51588
51589 if (CI->getType()->isIntegerTy(64)) {
51590 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
51591 if (Constraints.size() >= 2 &&
51592 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
51593 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
51594 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
51595 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
51596 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
51597 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
51598 return IntrinsicLowering::LowerToByteSwap(CI);
51599 }
51600 }
51601 break;
51602 }
51603 return false;
51604}
51605
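// Map an "@cc" flag-output constraint such as "{@ccae}" to the corresponding
// X86 condition code (COND_AE here); anything else yields COND_INVALID.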
51606static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
51607 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
51608 .Case("{@cca}", X86::COND_A)
51609 .Case("{@ccae}", X86::COND_AE)
51610 .Case("{@ccb}", X86::COND_B)
51611 .Case("{@ccbe}", X86::COND_BE)
51612 .Case("{@ccc}", X86::COND_B)
51613 .Case("{@cce}", X86::COND_E)
51614 .Case("{@ccz}", X86::COND_E)
51615 .Case("{@ccg}", X86::COND_G)
51616 .Case("{@ccge}", X86::COND_GE)
51617 .Case("{@ccl}", X86::COND_L)
51618 .Case("{@ccle}", X86::COND_LE)
51619 .Case("{@ccna}", X86::COND_BE)
51620 .Case("{@ccnae}", X86::COND_B)
51621 .Case("{@ccnb}", X86::COND_AE)
51622 .Case("{@ccnbe}", X86::COND_A)
51623 .Case("{@ccnc}", X86::COND_AE)
51624 .Case("{@ccne}", X86::COND_NE)
51625 .Case("{@ccnz}", X86::COND_NE)
51626 .Case("{@ccng}", X86::COND_LE)
51627 .Case("{@ccnge}", X86::COND_L)
51628 .Case("{@ccnl}", X86::COND_GE)
51629 .Case("{@ccnle}", X86::COND_G)
51630 .Case("{@ccno}", X86::COND_NO)
51631 .Case("{@ccnp}", X86::COND_NP)
51632 .Case("{@ccns}", X86::COND_NS)
51633 .Case("{@cco}", X86::COND_O)
51634 .Case("{@ccp}", X86::COND_P)
51635 .Case("{@ccs}", X86::COND_S)
51636 .Default(X86::COND_INVALID);
51637 return Cond;
51638}
51639
51640/// Given a constraint letter, return the type of constraint for this target.
51641X86TargetLowering::ConstraintType
51642X86TargetLowering::getConstraintType(StringRef Constraint) const {
51643 if (Constraint.size() == 1) {
51644 switch (Constraint[0]) {
51645 case 'R':
51646 case 'q':
51647 case 'Q':
51648 case 'f':
51649 case 't':
51650 case 'u':
51651 case 'y':
51652 case 'x':
51653 case 'v':
51654 case 'l':
51655 case 'k': // AVX512 masking registers.
51656 return C_RegisterClass;
51657 case 'a':
51658 case 'b':
51659 case 'c':
51660 case 'd':
51661 case 'S':
51662 case 'D':
51663 case 'A':
51664 return C_Register;
51665 case 'I':
51666 case 'J':
51667 case 'K':
51668 case 'N':
51669 case 'G':
51670 case 'L':
51671 case 'M':
51672 return C_Immediate;
51673 case 'C':
51674 case 'e':
51675 case 'Z':
51676 return C_Other;
51677 default:
51678 break;
51679 }
51680 }
51681 else if (Constraint.size() == 2) {
51682 switch (Constraint[0]) {
51683 default:
51684 break;
51685 case 'Y':
51686 switch (Constraint[1]) {
51687 default:
51688 break;
51689 case 'z':
51690 return C_Register;
51691 case 'i':
51692 case 'm':
51693 case 'k':
51694 case 't':
51695 case '2':
51696 return C_RegisterClass;
51697 }
51698 }
51699 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
51700 return C_Other;
51701 return TargetLowering::getConstraintType(Constraint);
51702}
51703
51704/// Examine constraint type and operand type and determine a weight value.
51705/// This object must already have been set up with the operand type
51706/// and the current alternative constraint selected.
51707TargetLowering::ConstraintWeight
51708 X86TargetLowering::getSingleConstraintMatchWeight(
51709 AsmOperandInfo &info, const char *constraint) const {
51710 ConstraintWeight weight = CW_Invalid;
51711 Value *CallOperandVal = info.CallOperandVal;
51712 // If we don't have a value, we can't do a match,
51713 // but allow it at the lowest weight.
51714 if (!CallOperandVal)
51715 return CW_Default;
51716 Type *type = CallOperandVal->getType();
51717 // Look at the constraint type.
51718 switch (*constraint) {
51719 default:
51720 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
51721 LLVM_FALLTHROUGH;
51722 case 'R':
51723 case 'q':
51724 case 'Q':
51725 case 'a':
51726 case 'b':
51727 case 'c':
51728 case 'd':
51729 case 'S':
51730 case 'D':
51731 case 'A':
51732 if (CallOperandVal->getType()->isIntegerTy())
51733 weight = CW_SpecificReg;
51734 break;
51735 case 'f':
51736 case 't':
51737 case 'u':
51738 if (type->isFloatingPointTy())
51739 weight = CW_SpecificReg;
51740 break;
51741 case 'y':
51742 if (type->isX86_MMXTy() && Subtarget.hasMMX())
51743 weight = CW_SpecificReg;
51744 break;
51745 case 'Y':
51746 if (StringRef(constraint).size() != 2)
51747 break;
51748 switch (constraint[1]) {
51749 default:
51750 return CW_Invalid;
51751 // XMM0
51752 case 'z':
51753 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51754 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
51755 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
51756 return CW_SpecificReg;
51757 return CW_Invalid;
51758 // Conditional OpMask regs (AVX512)
51759 case 'k':
51760 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51761 return CW_Register;
51762 return CW_Invalid;
51763 // Any MMX reg
51764 case 'm':
51765 if (type->isX86_MMXTy() && Subtarget.hasMMX())
51766 return weight;
51767 return CW_Invalid;
51768 // Any SSE reg when ISA >= SSE2, same as 'x'
51769 case 'i':
51770 case 't':
51771 case '2':
51772 if (!Subtarget.hasSSE2())
51773 return CW_Invalid;
51774 break;
51775 }
51776 break;
51777 case 'v':
51778 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
51779 weight = CW_Register;
51780 LLVM_FALLTHROUGH;
51781 case 'x':
51782 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51783 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
51784 weight = CW_Register;
51785 break;
51786 case 'k':
51787 // Enable conditional vector operations using %k<#> registers.
51788 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51789 weight = CW_Register;
51790 break;
51791 case 'I':
51792 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
51793 if (C->getZExtValue() <= 31)
51794 weight = CW_Constant;
51795 }
51796 break;
51797 case 'J':
51798 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51799 if (C->getZExtValue() <= 63)
51800 weight = CW_Constant;
51801 }
51802 break;
51803 case 'K':
51804 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51805 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
51806 weight = CW_Constant;
51807 }
51808 break;
51809 case 'L':
51810 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51811 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
51812 weight = CW_Constant;
51813 }
51814 break;
51815 case 'M':
51816 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51817 if (C->getZExtValue() <= 3)
51818 weight = CW_Constant;
51819 }
51820 break;
51821 case 'N':
51822 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51823 if (C->getZExtValue() <= 0xff)
51824 weight = CW_Constant;
51825 }
51826 break;
51827 case 'G':
51828 case 'C':
51829 if (isa<ConstantFP>(CallOperandVal)) {
51830 weight = CW_Constant;
51831 }
51832 break;
51833 case 'e':
51834 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51835 if ((C->getSExtValue() >= -0x80000000LL) &&
51836 (C->getSExtValue() <= 0x7fffffffLL))
51837 weight = CW_Constant;
51838 }
51839 break;
51840 case 'Z':
51841 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51842 if (C->getZExtValue() <= 0xffffffff)
51843 weight = CW_Constant;
51844 }
51845 break;
51846 }
51847 return weight;
51848}
51849
51850/// Try to replace an X constraint, which matches anything, with another that
51851/// has more specific requirements based on the type of the corresponding
51852/// operand.
51853const char *X86TargetLowering::
51854LowerXConstraint(EVT ConstraintVT) const {
51855 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
51856 // 'f' like normal targets.
51857 if (ConstraintVT.isFloatingPoint()) {
51858 if (Subtarget.hasSSE1())
51859 return "x";
51860 }
51861
51862 return TargetLowering::LowerXConstraint(ConstraintVT);
51863}
51864
51865// Lower @cc targets via setcc.
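// For example, an "={@ccc}" flag output is read from EFLAGS as a SETcc with
// COND_B and then zero-extended to the constraint's integer type.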
51866SDValue X86TargetLowering::LowerAsmOutputForConstraint(
51867 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
51868 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
51869 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
51870 if (Cond == X86::COND_INVALID)
51871 return SDValue();
51872 // Check that return type is valid.
51873 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
51874 OpInfo.ConstraintVT.getSizeInBits() < 8)
51875 report_fatal_error("Flag output operand is of invalid type");
51876
51877 // Get EFLAGS register. Only update chain when copyfrom is glued.
51878 if (Flag.getNode()) {
51879 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
51880 Chain = Flag.getValue(1);
51881 } else
51882 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
51883 // Extract CC code.
51884 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
51885 // Extend to 32-bits
51886 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
51887
51888 return Result;
51889}
51890
51891/// Lower the specified operand into the Ops vector.
51892/// If it is invalid, don't add anything to Ops.
51893void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
51894 std::string &Constraint,
51895 std::vector<SDValue>&Ops,
51896 SelectionDAG &DAG) const {
51897 SDValue Result;
51898
51899 // Only support length 1 constraints for now.
51900 if (Constraint.length() > 1) return;
51901
51902 char ConstraintLetter = Constraint[0];
51903 switch (ConstraintLetter) {
51904 default: break;
51905 case 'I':
51906 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51907 if (C->getZExtValue() <= 31) {
51908 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51909 Op.getValueType());
51910 break;
51911 }
51912 }
51913 return;
51914 case 'J':
51915 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51916 if (C->getZExtValue() <= 63) {
51917 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51918 Op.getValueType());
51919 break;
51920 }
51921 }
51922 return;
51923 case 'K':
51924 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51925 if (isInt<8>(C->getSExtValue())) {
51926 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51927 Op.getValueType());
51928 break;
51929 }
51930 }
51931 return;
51932 case 'L':
51933 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51934 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
51935 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
51936 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
51937 Op.getValueType());
51938 break;
51939 }
51940 }
51941 return;
51942 case 'M':
51943 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51944 if (C->getZExtValue() <= 3) {
51945 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51946 Op.getValueType());
51947 break;
51948 }
51949 }
51950 return;
51951 case 'N':
51952 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51953 if (C->getZExtValue() <= 255) {
51954 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51955 Op.getValueType());
51956 break;
51957 }
51958 }
51959 return;
51960 case 'O':
51961 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51962 if (C->getZExtValue() <= 127) {
51963 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51964 Op.getValueType());
51965 break;
51966 }
51967 }
51968 return;
51969 case 'e': {
51970 // 32-bit signed value
51971 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51972 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51973 C->getSExtValue())) {
51974 // Widen to 64 bits here to get it sign extended.
51975 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
51976 break;
51977 }
51978 // FIXME gcc accepts some relocatable values here too, but only in certain
51979 // memory models; it's complicated.
51980 }
51981 return;
51982 }
51983 case 'Z': {
51984 // 32-bit unsigned value
51985 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51986 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51987 C->getZExtValue())) {
51988 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51989 Op.getValueType());
51990 break;
51991 }
51992 }
51993 // FIXME gcc accepts some relocatable values here too, but only in certain
51994 // memory models; it's complicated.
51995 return;
51996 }
51997 case 'i': {
51998 // Literal immediates are always ok.
51999 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
52000 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
52001 BooleanContent BCont = getBooleanContents(MVT::i64);
52002 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
52003 : ISD::SIGN_EXTEND;
52004 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
52005 : CST->getSExtValue();
52006 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
52007 break;
52008 }
52009
52010 // In any sort of PIC mode addresses need to be computed at runtime by
52011 // adding in a register or some sort of table lookup. These can't
52012 // be used as immediates.
52013 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
52014 return;
52015
52016 // If we are in non-pic codegen mode, we allow the address of a global (with
52017 // an optional displacement) to be used with 'i'.
52018 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
52019 // If we require an extra load to get this address, as in PIC mode, we
52020 // can't accept it.
52021 if (isGlobalStubReference(
52022 Subtarget.classifyGlobalReference(GA->getGlobal())))
52023 return;
52024 break;
52025 }
52026 }
52027
52028 if (Result.getNode()) {
52029 Ops.push_back(Result);
52030 return;
52031 }
52032 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
52033}
52034
52035/// Check if \p RC is a general purpose register class.
52036/// I.e., GR* or one of their variant.
52037static bool isGRClass(const TargetRegisterClass &RC) {
52038 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
52039 RC.hasSuperClassEq(&X86::GR16RegClass) ||
52040 RC.hasSuperClassEq(&X86::GR32RegClass) ||
52041 RC.hasSuperClassEq(&X86::GR64RegClass) ||
52042 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
52043}
52044
52045/// Check if \p RC is a vector register class.
52046/// I.e., FR* / VR* or one of their variant.
52047static bool isFRClass(const TargetRegisterClass &RC) {
52048 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
52049 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
52050 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
52051 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
52052 RC.hasSuperClassEq(&X86::VR512RegClass);
52053}
52054
52055/// Check if \p RC is a mask register class.
52056/// I.e., VK* or one of their variant.
52057static bool isVKClass(const TargetRegisterClass &RC) {
52058 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
52059 RC.hasSuperClassEq(&X86::VK2RegClass) ||
52060 RC.hasSuperClassEq(&X86::VK4RegClass) ||
52061 RC.hasSuperClassEq(&X86::VK8RegClass) ||
52062 RC.hasSuperClassEq(&X86::VK16RegClass) ||
52063 RC.hasSuperClassEq(&X86::VK32RegClass) ||
52064 RC.hasSuperClassEq(&X86::VK64RegClass);
52065}
52066
52067std::pair<unsigned, const TargetRegisterClass *>
52068X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
52069 StringRef Constraint,
52070 MVT VT) const {
52071 // First, see if this is a constraint that directly corresponds to an LLVM
52072 // register class.
52073 if (Constraint.size() == 1) {
52074 // GCC Constraint Letters
52075 switch (Constraint[0]) {
52076 default: break;
52077 // 'A' means [ER]AX + [ER]DX.
52078 case 'A':
52079 if (Subtarget.is64Bit())
52080 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
52081 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
52082 "Expecting 64, 32 or 16 bit subtarget");
52083 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52084
52085 // TODO: Slight differences here in allocation order and leaving
52086 // RIP in the class. Do they matter any more here than they do
52087 // in the normal allocation?
52088 case 'k':
52089 if (Subtarget.hasAVX512()) {
52090 if (VT == MVT::i1)
52091 return std::make_pair(0U, &X86::VK1RegClass);
52092 if (VT == MVT::i8)
52093 return std::make_pair(0U, &X86::VK8RegClass);
52094 if (VT == MVT::i16)
52095 return std::make_pair(0U, &X86::VK16RegClass);
52096 }
52097 if (Subtarget.hasBWI()) {
52098 if (VT == MVT::i32)
52099 return std::make_pair(0U, &X86::VK32RegClass);
52100 if (VT == MVT::i64)
52101 return std::make_pair(0U, &X86::VK64RegClass);
52102 }
52103 break;
52104 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
52105 if (Subtarget.is64Bit()) {
52106 if (VT == MVT::i8 || VT == MVT::i1)
52107 return std::make_pair(0U, &X86::GR8RegClass);
52108 if (VT == MVT::i16)
52109 return std::make_pair(0U, &X86::GR16RegClass);
52110 if (VT == MVT::i32 || VT == MVT::f32)
52111 return std::make_pair(0U, &X86::GR32RegClass);
52112 if (VT != MVT::f80 && !VT.isVector())
52113 return std::make_pair(0U, &X86::GR64RegClass);
52114 break;
52115 }
52116 LLVM_FALLTHROUGH;
52117 // 32-bit fallthrough
52118 case 'Q': // Q_REGS
52119 if (VT == MVT::i8 || VT == MVT::i1)
52120 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
52121 if (VT == MVT::i16)
52122 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
52123 if (VT == MVT::i32 || VT == MVT::f32 ||
52124 (!VT.isVector() && !Subtarget.is64Bit()))
52125 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
52126 if (VT != MVT::f80 && !VT.isVector())
52127 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
52128 break;
52129 case 'r': // GENERAL_REGS
52130 case 'l': // INDEX_REGS
52131 if (VT == MVT::i8 || VT == MVT::i1)
52132 return std::make_pair(0U, &X86::GR8RegClass);
52133 if (VT == MVT::i16)
52134 return std::make_pair(0U, &X86::GR16RegClass);
52135 if (VT == MVT::i32 || VT == MVT::f32 ||
52136 (!VT.isVector() && !Subtarget.is64Bit()))
52137 return std::make_pair(0U, &X86::GR32RegClass);
52138 if (VT != MVT::f80 && !VT.isVector())
52139 return std::make_pair(0U, &X86::GR64RegClass);
52140 break;
52141 case 'R': // LEGACY_REGS
52142 if (VT == MVT::i8 || VT == MVT::i1)
52143 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
52144 if (VT == MVT::i16)
52145 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
52146 if (VT == MVT::i32 || VT == MVT::f32 ||
52147 (!VT.isVector() && !Subtarget.is64Bit()))
52148 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
52149 if (VT != MVT::f80 && !VT.isVector())
52150 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
52151 break;
52152 case 'f': // FP Stack registers.
52153 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
52154 // value to the correct fpstack register class.
52155 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
52156 return std::make_pair(0U, &X86::RFP32RegClass);
52157 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
52158 return std::make_pair(0U, &X86::RFP64RegClass);
52159 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
52160 return std::make_pair(0U, &X86::RFP80RegClass);
52161 break;
52162 case 'y': // MMX_REGS if MMX allowed.
52163 if (!Subtarget.hasMMX()) break;
52164 return std::make_pair(0U, &X86::VR64RegClass);
52165 case 'v':
52166 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
52167 if (!Subtarget.hasSSE1()) break;
52168 bool VConstraint = (Constraint[0] == 'v');
52169
52170 switch (VT.SimpleTy) {
52171 default: break;
52172 // Scalar SSE types.
52173 case MVT::f32:
52174 case MVT::i32:
52175 if (VConstraint && Subtarget.hasVLX())
52176 return std::make_pair(0U, &X86::FR32XRegClass);
52177 return std::make_pair(0U, &X86::FR32RegClass);
52178 case MVT::f64:
52179 case MVT::i64:
52180 if (VConstraint && Subtarget.hasVLX())
52181 return std::make_pair(0U, &X86::FR64XRegClass);
52182 return std::make_pair(0U, &X86::FR64RegClass);
52183 case MVT::i128:
52184 if (Subtarget.is64Bit()) {
52185 if (VConstraint && Subtarget.hasVLX())
52186 return std::make_pair(0U, &X86::VR128XRegClass);
52187 return std::make_pair(0U, &X86::VR128RegClass);
52188 }
52189 break;
52190 // Vector types and fp128.
52191 case MVT::f128:
52192 case MVT::v16i8:
52193 case MVT::v8i16:
52194 case MVT::v4i32:
52195 case MVT::v2i64:
52196 case MVT::v4f32:
52197 case MVT::v2f64:
52198 if (VConstraint && Subtarget.hasVLX())
52199 return std::make_pair(0U, &X86::VR128XRegClass);
52200 return std::make_pair(0U, &X86::VR128RegClass);
52201 // AVX types.
52202 case MVT::v32i8:
52203 case MVT::v16i16:
52204 case MVT::v8i32:
52205 case MVT::v4i64:
52206 case MVT::v8f32:
52207 case MVT::v4f64:
52208 if (VConstraint && Subtarget.hasVLX())
52209 return std::make_pair(0U, &X86::VR256XRegClass);
52210 if (Subtarget.hasAVX())
52211 return std::make_pair(0U, &X86::VR256RegClass);
52212 break;
52213 case MVT::v64i8:
52214 case MVT::v32i16:
52215 case MVT::v8f64:
52216 case MVT::v16f32:
52217 case MVT::v16i32:
52218 case MVT::v8i64:
52219 if (!Subtarget.hasAVX512()) break;
52220 if (VConstraint)
52221 return std::make_pair(0U, &X86::VR512RegClass);
52222 return std::make_pair(0U, &X86::VR512_0_15RegClass);
52223 }
52224 break;
52225 }
52226 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
52227 switch (Constraint[1]) {
52228 default:
52229 break;
52230 case 'i':
52231 case 't':
52232 case '2':
52233 return getRegForInlineAsmConstraint(TRI, "x", VT);
52234 case 'm':
52235 if (!Subtarget.hasMMX()) break;
52236 return std::make_pair(0U, &X86::VR64RegClass);
52237 case 'z':
52238 if (!Subtarget.hasSSE1()) break;
52239 switch (VT.SimpleTy) {
52240 default: break;
52241 // Scalar SSE types.
52242 case MVT::f32:
52243 case MVT::i32:
52244 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
52245 case MVT::f64:
52246 case MVT::i64:
52247 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
52248 case MVT::f128:
52249 case MVT::v16i8:
52250 case MVT::v8i16:
52251 case MVT::v4i32:
52252 case MVT::v2i64:
52253 case MVT::v4f32:
52254 case MVT::v2f64:
52255 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
52256 // AVX types.
52257 case MVT::v32i8:
52258 case MVT::v16i16:
52259 case MVT::v8i32:
52260 case MVT::v4i64:
52261 case MVT::v8f32:
52262 case MVT::v4f64:
52263 if (Subtarget.hasAVX())
52264 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
52265 break;
52266 case MVT::v64i8:
52267 case MVT::v32i16:
52268 case MVT::v8f64:
52269 case MVT::v16f32:
52270 case MVT::v16i32:
52271 case MVT::v8i64:
52272 if (Subtarget.hasAVX512())
52273 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
52274 break;
52275 }
52276 break;
52277 case 'k':
52278 // This register class doesn't allocate k0 for masked vector operations.
52279 if (Subtarget.hasAVX512()) {
52280 if (VT == MVT::i1)
52281 return std::make_pair(0U, &X86::VK1WMRegClass);
52282 if (VT == MVT::i8)
52283 return std::make_pair(0U, &X86::VK8WMRegClass);
52284 if (VT == MVT::i16)
52285 return std::make_pair(0U, &X86::VK16WMRegClass);
52286 }
52287 if (Subtarget.hasBWI()) {
52288 if (VT == MVT::i32)
52289 return std::make_pair(0U, &X86::VK32WMRegClass);
52290 if (VT == MVT::i64)
52291 return std::make_pair(0U, &X86::VK64WMRegClass);
52292 }
52293 break;
52294 }
52295 }
52296
52297 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
52298 return std::make_pair(0U, &X86::GR32RegClass);
52299
52300 // Use the default implementation in TargetLowering to convert the register
52301 // constraint into a member of a register class.
52302 std::pair<Register, const TargetRegisterClass*> Res;
52303 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
52304
52305 // Not found as a standard register?
52306 if (!Res.second) {
52307 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
52308 // to/from f80.
52309 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
52310 // Map st(0) -> st(7) -> ST0
52311 if (Constraint.size() == 7 && Constraint[0] == '{' &&
52312 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
52313 Constraint[3] == '(' &&
52314 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
52315 Constraint[5] == ')' && Constraint[6] == '}') {
52316 // st(7) is not allocatable and thus not a member of RFP80. Return
52317 // singleton class in cases where we have a reference to it.
52318 if (Constraint[4] == '7')
52319 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
52320 return std::make_pair(X86::FP0 + Constraint[4] - '0',
52321 &X86::RFP80RegClass);
52322 }
52323
52324 // GCC allows "st(0)" to be called just plain "st".
52325 if (StringRef("{st}").equals_insensitive(Constraint))
52326 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
52327 }
52328
52329 // flags -> EFLAGS
52330 if (StringRef("{flags}").equals_insensitive(Constraint))
52331 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
52332
52333 // dirflag -> DF
52334 // Only allow for clobber.
52335 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
52336 VT == MVT::Other)
52337 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
52338
52339 // fpsr -> FPSW
52340 if (StringRef("{fpsr}").equals_insensitive(Constraint))
52341 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
52342
52343 return Res;
52344 }
52345
52346 // Make sure it isn't a register that requires 64-bit mode.
52347 if (!Subtarget.is64Bit() &&
52348 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
52349 TRI->getEncodingValue(Res.first) >= 8) {
52350 // Register requires REX prefix, but we're in 32-bit mode.
52351 return std::make_pair(0, nullptr);
52352 }
52353
52354 // Make sure it isn't a register that requires AVX512.
52355 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
52356 TRI->getEncodingValue(Res.first) & 0x10) {
52357 // Register requires EVEX prefix.
52358 return std::make_pair(0, nullptr);
52359 }
52360
52361 // Otherwise, check to see if this is a register class of the wrong value
52362 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
52363 // turn into {ax},{dx}.
52364 // MVT::Other is used to specify clobber names.
52365 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
52366 return Res; // Correct type already, nothing to do.
52367
52368 // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
52369 // return "eax". This should even work for things like getting 64-bit integer
52370 // registers when given an f64 type.
52371 const TargetRegisterClass *Class = Res.second;
52372 // The generic code will match the first register class that contains the
52373 // given register. Thus, based on the ordering of the tablegened file,
52374 // the "plain" GR classes might not come first.
52375 // Therefore, use a helper method.
52376 if (isGRClass(*Class)) {
52377 unsigned Size = VT.getSizeInBits();
52378 if (Size == 1) Size = 8;
52379 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
52380 if (DestReg > 0) {
52381 bool is64Bit = Subtarget.is64Bit();
52382 const TargetRegisterClass *RC =
52383 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
52384 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
52385 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
52386 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
52387 : nullptr;
52388 if (Size == 64 && !is64Bit) {
52389 // Model GCC's behavior here and select a fixed pair of 32-bit
52390 // registers.
52391 switch (DestReg) {
52392 case X86::RAX:
52393 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52394 case X86::RDX:
52395 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
52396 case X86::RCX:
52397 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
52398 case X86::RBX:
52399 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
52400 case X86::RSI:
52401 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
52402 case X86::RDI:
52403 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
52404 case X86::RBP:
52405 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
52406 default:
52407 return std::make_pair(0, nullptr);
52408 }
52409 }
52410 if (RC && RC->contains(DestReg))
52411 return std::make_pair(DestReg, RC);
52412 return Res;
52413 }
52414 // No register found/type mismatch.
52415 return std::make_pair(0, nullptr);
52416 } else if (isFRClass(*Class)) {
52417 // Handle references to XMM physical registers that got mapped into the
52418 // wrong class. This can happen with constraints like {xmm0} where the
52419 // target independent register mapper will just pick the first match it can
52420 // find, ignoring the required type.
52421
52422 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
52423 if (VT == MVT::f32 || VT == MVT::i32)
52424 Res.second = &X86::FR32XRegClass;
52425 else if (VT == MVT::f64 || VT == MVT::i64)
52426 Res.second = &X86::FR64XRegClass;
52427 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
52428 Res.second = &X86::VR128XRegClass;
52429 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
52430 Res.second = &X86::VR256XRegClass;
52431 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
52432 Res.second = &X86::VR512RegClass;
52433 else {
52434 // Type mismatch and not a clobber: return an error.
52435 Res.first = 0;
52436 Res.second = nullptr;
52437 }
52438 } else if (isVKClass(*Class)) {
52439 if (VT == MVT::i1)
52440 Res.second = &X86::VK1RegClass;
52441 else if (VT == MVT::i8)
52442 Res.second = &X86::VK8RegClass;
52443 else if (VT == MVT::i16)
52444 Res.second = &X86::VK16RegClass;
52445 else if (VT == MVT::i32)
52446 Res.second = &X86::VK32RegClass;
52447 else if (VT == MVT::i64)
52448 Res.second = &X86::VK64RegClass;
52449 else {
52450 // Type mismatch and not a clobber: return an error.
52451 Res.first = 0;
52452 Res.second = nullptr;
52453 }
52454 }
52455
52456 return Res;
52457}
52458
52459InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
52460 const AddrMode &AM,
52461 Type *Ty,
52462 unsigned AS) const {
52463 // Scaling factors are not free at all.
52464 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
52465 // will take 2 allocations in the out of order engine instead of 1
52466 // for plain addressing mode, i.e. inst (reg1).
52467 // E.g.,
52468 // vaddps (%rsi,%rdx), %ymm0, %ymm1
52469 // Requires two allocations (one for the load, one for the computation)
52470 // whereas:
52471 // vaddps (%rsi), %ymm0, %ymm1
52472 // Requires just 1 allocation, i.e., freeing allocations for other operations
52473 // and having less micro operations to execute.
52474 //
52475 // For some X86 architectures, this is even worse because for instance for
52476 // stores, the complex addressing mode forces the instruction to use the
52477 // "load" ports instead of the dedicated "store" port.
52478 // E.g., on Haswell:
52479 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
52480 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
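// For example, a plain [%rsi] address adds no cost, while an indexed form
// like [%rsi + 4*%rdx] costs 1; illegal addressing modes return -1 below.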
52481 if (isLegalAddressingMode(DL, AM, Ty, AS))
52482 // Scale represents reg2 * scale, thus account for 1
52483 // as soon as we use a second register.
52484 return AM.Scale != 0;
52485 return -1;
52486}
52487
52488bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
52489 // Integer division on x86 is expensive. However, when aggressively optimizing
52490 // for code size, we prefer to use a div instruction, as it is usually smaller
52491 // than the alternative sequence.
52492 // The exception to this is vector division. Since x86 doesn't have vector
52493 // integer division, leaving the division as-is is a loss even in terms of
52494 // size, because it will have to be scalarized, while the alternative code
52495 // sequence can be performed in vector form.
52496 bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
52497 return OptSize && !VT.isVector();
52498}
52499
52500void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
52501 if (!Subtarget.is64Bit())
52502 return;
52503
52504 // Update IsSplitCSR in X86MachineFunctionInfo.
52505 X86MachineFunctionInfo *AFI =
52506 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
52507 AFI->setIsSplitCSR(true);
52508}
52509
52510void X86TargetLowering::insertCopiesSplitCSR(
52511 MachineBasicBlock *Entry,
52512 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
52513 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
52514 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
52515 if (!IStart)
52516 return;
52517
52518 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
52519 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
52520 MachineBasicBlock::iterator MBBI = Entry->begin();
52521 for (const MCPhysReg *I = IStart; *I; ++I) {
52522 const TargetRegisterClass *RC = nullptr;
52523 if (X86::GR64RegClass.contains(*I))
52524 RC = &X86::GR64RegClass;
52525 else
52526 llvm_unreachable("Unexpected register class in CSRsViaCopy!")__builtin_unreachable();
52527
52528 Register NewVR = MRI->createVirtualRegister(RC);
52529 // Create copy from CSR to a virtual register.
52530 // FIXME: this currently does not emit CFI pseudo-instructions, it works
52531 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
52532 // nounwind. If we want to generalize this later, we may need to emit
52533 // CFI pseudo-instructions.
52534 assert(((void)0)
52535 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&((void)0)
52536 "Function should be nounwind in insertCopiesSplitCSR!")((void)0);
52537 Entry->addLiveIn(*I);
52538 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
52539 .addReg(*I);
52540
52541 // Insert the copy-back instructions right before the terminator.
52542 for (auto *Exit : Exits)
52543 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
52544 TII->get(TargetOpcode::COPY), *I)
52545 .addReg(NewVR);
52546 }
52547}
52548
52549bool X86TargetLowering::supportSwiftError() const {
52550 return Subtarget.is64Bit();
52551}
52552
52553/// Returns true if stack probing through a function call is requested.
52554bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
52555 return !getStackProbeSymbolName(MF).empty();
52556}
52557
52558/// Returns true if stack probing through inline assembly is requested.
52559bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
52560
52561 // No inline stack probe for Windows, they have their own mechanism.
52562 if (Subtarget.isOSWindows() ||
52563 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52564 return false;
52565
52566 // If the function specifically requests inline stack probes, emit them.
52567 if (MF.getFunction().hasFnAttribute("probe-stack"))
52568 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
52569 "inline-asm";
52570
52571 return false;
52572}
52573
52574/// Returns the name of the symbol used to emit stack probes or the empty
52575/// string if not applicable.
52576StringRef
52577X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
52578 // Inline stack probes disable the stack probe call.
52579 if (hasInlineStackProbe(MF))
52580 return "";
52581
52582 // If the function specifically requests stack probes, emit them.
52583 if (MF.getFunction().hasFnAttribute("probe-stack"))
52584 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
52585
52586 // Generally, if we aren't on Windows, the platform ABI does not include
52587 // support for stack probes, so don't emit them.
52588 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
52589 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52590 return "";
52591
52592 // We need a stack probe to conform to the Windows ABI. Choose the right
52593 // symbol.
52594 if (Subtarget.is64Bit())
52595 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
52596 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
52597}
52598
52599unsigned
52600X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
52601 // The default stack probe size is 4096 if the function has no
52602 // "stack-probe-size" attribute.
52603 unsigned StackProbeSize = 4096;
52604 const Function &Fn = MF.getFunction();
52605 if (Fn.hasFnAttribute("stack-probe-size"))
52606 Fn.getFnAttribute("stack-probe-size")
52607 .getValueAsString()
52608 .getAsInteger(0, StackProbeSize);
52609 return StackProbeSize;
52610}
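
The stack-probe hooks above are driven entirely by string function attributes ("probe-stack", "stack-probe-size", "no-stack-arg-probe"). A minimal sketch, not from the analyzed sources, of how a front end might attach them; requestInlineProbes and the value 8192 are purely illustrative.

#include "llvm/IR/Function.h"

void requestInlineProbes(llvm::Function *F) {
  // Makes hasInlineStackProbe() return true, so getStackProbeSymbolName()
  // returns the empty string.
  F->addFnAttr("probe-stack", "inline-asm");
  // Parsed by getStackProbeSize() via getAsInteger(0, StackProbeSize).
  F->addFnAttr("stack-probe-size", "8192");
}
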
52611
52612Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
52613 if (ML->isInnermost() &&
52614 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
52615 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
52616 return TargetLowering::getPrefLoopAlignment();
52617}

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/CodeGen/SelectionDAGNodes.h

1//===- llvm/CodeGen/SelectionDAGNodes.h - SelectionDAG Nodes ----*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file declares the SDNode class and derived classes, which are used to
10// represent the nodes and operations present in a SelectionDAG. These nodes
11// and operations are machine code level operations, with some similarities to
12// the GCC RTL representation.
13//
14// Clients should include the SelectionDAG.h file instead of this file directly.
15//
16//===----------------------------------------------------------------------===//
17
18#ifndef LLVM_CODEGEN_SELECTIONDAGNODES_H
19#define LLVM_CODEGEN_SELECTIONDAGNODES_H
20
21#include "llvm/ADT/APFloat.h"
22#include "llvm/ADT/ArrayRef.h"
23#include "llvm/ADT/BitVector.h"
24#include "llvm/ADT/FoldingSet.h"
25#include "llvm/ADT/GraphTraits.h"
26#include "llvm/ADT/SmallPtrSet.h"
27#include "llvm/ADT/SmallVector.h"
28#include "llvm/ADT/ilist_node.h"
29#include "llvm/ADT/iterator.h"
30#include "llvm/ADT/iterator_range.h"
31#include "llvm/CodeGen/ISDOpcodes.h"
32#include "llvm/CodeGen/MachineMemOperand.h"
33#include "llvm/CodeGen/Register.h"
34#include "llvm/CodeGen/ValueTypes.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DebugLoc.h"
37#include "llvm/IR/Instruction.h"
38#include "llvm/IR/Instructions.h"
39#include "llvm/IR/Metadata.h"
40#include "llvm/IR/Operator.h"
41#include "llvm/Support/AlignOf.h"
42#include "llvm/Support/AtomicOrdering.h"
43#include "llvm/Support/Casting.h"
44#include "llvm/Support/ErrorHandling.h"
45#include "llvm/Support/MachineValueType.h"
46#include "llvm/Support/TypeSize.h"
47#include <algorithm>
48#include <cassert>
49#include <climits>
50#include <cstddef>
51#include <cstdint>
52#include <cstring>
53#include <iterator>
54#include <string>
55#include <tuple>
56
57namespace llvm {
58
59class APInt;
60class Constant;
61template <typename T> struct DenseMapInfo;
62class GlobalValue;
63class MachineBasicBlock;
64class MachineConstantPoolValue;
65class MCSymbol;
66class raw_ostream;
67class SDNode;
68class SelectionDAG;
69class Type;
70class Value;
71
72void checkForCycles(const SDNode *N, const SelectionDAG *DAG = nullptr,
73 bool force = false);
74
75/// This represents a list of ValueType's that has been intern'd by
76/// a SelectionDAG. Instances of this simple value class are returned by
77/// SelectionDAG::getVTList(...).
78///
79struct SDVTList {
80 const EVT *VTs;
81 unsigned int NumVTs;
82};
83
84namespace ISD {
85
86 /// Node predicates
87
88/// If N is a BUILD_VECTOR or SPLAT_VECTOR node whose elements are all the
89/// same constant or undefined, return true and return the constant value in
90/// \p SplatValue.
91bool isConstantSplatVector(const SDNode *N, APInt &SplatValue);
92
93/// Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where
94/// all of the elements are ~0 or undef. If \p BuildVectorOnly is set to
95/// true, it only checks BUILD_VECTOR.
96bool isConstantSplatVectorAllOnes(const SDNode *N,
97 bool BuildVectorOnly = false);
98
99/// Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where
100/// all of the elements are 0 or undef. If \p BuildVectorOnly is set to true, it
101/// only checks BUILD_VECTOR.
102bool isConstantSplatVectorAllZeros(const SDNode *N,
103 bool BuildVectorOnly = false);
104
105/// Return true if the specified node is a BUILD_VECTOR where all of the
106/// elements are ~0 or undef.
107bool isBuildVectorAllOnes(const SDNode *N);
108
109/// Return true if the specified node is a BUILD_VECTOR where all of the
110/// elements are 0 or undef.
111bool isBuildVectorAllZeros(const SDNode *N);
112
113/// Return true if the specified node is a BUILD_VECTOR node of all
114/// ConstantSDNode or undef.
115bool isBuildVectorOfConstantSDNodes(const SDNode *N);
116
117/// Return true if the specified node is a BUILD_VECTOR node of all
118/// ConstantFPSDNode or undef.
119bool isBuildVectorOfConstantFPSDNodes(const SDNode *N);
120
121/// Return true if the node has at least one operand and all operands of the
122/// specified node are ISD::UNDEF.
123bool allOperandsUndef(const SDNode *N);
124
125} // end namespace ISD
126
127//===----------------------------------------------------------------------===//
128/// Unlike LLVM values, Selection DAG nodes may return multiple
129/// values as the result of a computation. Many nodes return multiple values,
130/// from loads (which define a token and a return value) to ADDC (which returns
131/// a result and a carry value), to calls (which may return an arbitrary number
132/// of values).
133///
134/// As such, each use of a SelectionDAG computation must indicate the node that
135/// computes it as well as which return value to use from that node. This pair
136/// of information is represented with the SDValue value type.
137///
138class SDValue {
139 friend struct DenseMapInfo<SDValue>;
140
141 SDNode *Node = nullptr; // The node defining the value we are using.
4
Null pointer value stored to 'PreservedSrc.Node'
142 unsigned ResNo = 0; // Which return value of the node we are using.
143
144public:
145 SDValue() = default;
5
Returning without writing to 'this->Node'
146 SDValue(SDNode *node, unsigned resno);
147
148 /// get the index which selects a specific result in the SDNode
149 unsigned getResNo() const { return ResNo; }
150
151 /// get the SDNode which holds the desired result
152 SDNode *getNode() const { return Node; }
153
154 /// set the SDNode
155 void setNode(SDNode *N) { Node = N; }
156
157 inline SDNode *operator->() const { return Node; }
158
159 bool operator==(const SDValue &O) const {
160 return Node == O.Node && ResNo == O.ResNo;
161 }
162 bool operator!=(const SDValue &O) const {
163 return !operator==(O);
164 }
165 bool operator<(const SDValue &O) const {
166 return std::tie(Node, ResNo) < std::tie(O.Node, O.ResNo);
167 }
168 explicit operator bool() const {
169 return Node != nullptr;
170 }
171
172 SDValue getValue(unsigned R) const {
173 return SDValue(Node, R);
174 }
175
176 /// Return true if this node is an operand of N.
177 bool isOperandOf(const SDNode *N) const;
178
179 /// Return the ValueType of the referenced return value.
180 inline EVT getValueType() const;
181
182 /// Return the simple ValueType of the referenced return value.
183 MVT getSimpleValueType() const {
184 return getValueType().getSimpleVT();
185 }
186
187 /// Returns the size of the value in bits.
188 ///
189 /// If the value type is a scalable vector type, the scalable property will
190 /// be set and the runtime size will be a positive integer multiple of the
191 /// base size.
192 TypeSize getValueSizeInBits() const {
193 return getValueType().getSizeInBits();
194 }
195
196 uint64_t getScalarValueSizeInBits() const {
197 return getValueType().getScalarType().getFixedSizeInBits();
198 }
199
200 // Forwarding methods - These forward to the corresponding methods in SDNode.
201 inline unsigned getOpcode() const;
202 inline unsigned getNumOperands() const;
203 inline const SDValue &getOperand(unsigned i) const;
204 inline uint64_t getConstantOperandVal(unsigned i) const;
205 inline const APInt &getConstantOperandAPInt(unsigned i) const;
206 inline bool isTargetMemoryOpcode() const;
207 inline bool isTargetOpcode() const;
208 inline bool isMachineOpcode() const;
209 inline bool isUndef() const;
210 inline unsigned getMachineOpcode() const;
211 inline const DebugLoc &getDebugLoc() const;
212 inline void dump() const;
213 inline void dump(const SelectionDAG *G) const;
214 inline void dumpr() const;
215 inline void dumpr(const SelectionDAG *G) const;
216
217 /// Return true if this operand (which must be a chain) reaches the
218 /// specified operand without crossing any side-effecting instructions.
219 /// In practice, this looks through token factors and non-volatile loads.
220 /// In order to remain efficient, this only
221 /// looks a couple of nodes in; it does not do an exhaustive search.
222 bool reachesChainWithoutSideEffects(SDValue Dest,
223 unsigned Depth = 2) const;
224
225 /// Return true if there are no nodes using value ResNo of Node.
226 inline bool use_empty() const;
227
228 /// Return true if there is exactly one node using value ResNo of Node.
229 inline bool hasOneUse() const;
230};
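
A minimal sketch, not from this header, of the node/result-number pairing and the explicit operator bool defined above; inspect() and its parameter are illustrative.

void inspect(SDValue V) {
  if (!V)                         // default-constructed SDValue: Node == nullptr
    return;
  SDNode *N = V.getNode();        // node defining the value
  unsigned ResNo = V.getResNo();  // which of N's results is selected
  EVT VT = V.getValueType();      // forwards to N->getValueType(ResNo)
  (void)N; (void)ResNo; (void)VT;
}
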
231
232template<> struct DenseMapInfo<SDValue> {
233 static inline SDValue getEmptyKey() {
234 SDValue V;
235 V.ResNo = -1U;
236 return V;
237 }
238
239 static inline SDValue getTombstoneKey() {
240 SDValue V;
241 V.ResNo = -2U;
242 return V;
243 }
244
245 static unsigned getHashValue(const SDValue &Val) {
246 return ((unsigned)((uintptr_t)Val.getNode() >> 4) ^
247 (unsigned)((uintptr_t)Val.getNode() >> 9)) + Val.getResNo();
248 }
249
250 static bool isEqual(const SDValue &LHS, const SDValue &RHS) {
251 return LHS == RHS;
252 }
253};
254
255/// Allow casting operators to work directly on
256/// SDValues as if they were SDNode*'s.
257template<> struct simplify_type<SDValue> {
258 using SimpleType = SDNode *;
259
260 static SimpleType getSimplifiedValue(SDValue &Val) {
261 return Val.getNode();
262 }
263};
264template<> struct simplify_type<const SDValue> {
265 using SimpleType = /*const*/ SDNode *;
266
267 static SimpleType getSimplifiedValue(const SDValue &Val) {
268 return Val.getNode();
269 }
270};
271
272/// Represents a use of a SDNode. This class holds an SDValue,
273/// which records the SDNode being used and the result number, a
274/// pointer to the SDNode using the value, and Next and Prev pointers,
275/// which link together all the uses of an SDNode.
276///
277class SDUse {
278 /// Val - The value being used.
279 SDValue Val;
280 /// User - The user of this value.
281 SDNode *User = nullptr;
282 /// Prev, Next - Pointers to the uses list of the SDNode referred by
283 /// this operand.
284 SDUse **Prev = nullptr;
285 SDUse *Next = nullptr;
286
287public:
288 SDUse() = default;
289 SDUse(const SDUse &U) = delete;
290 SDUse &operator=(const SDUse &) = delete;
291
292 /// Normally SDUse will just implicitly convert to an SDValue that it holds.
293 operator const SDValue&() const { return Val; }
294
295 /// If implicit conversion to SDValue doesn't work, the get() method returns
296 /// the SDValue.
297 const SDValue &get() const { return Val; }
298
299 /// This returns the SDNode that contains this Use.
300 SDNode *getUser() { return User; }
301
302 /// Get the next SDUse in the use list.
303 SDUse *getNext() const { return Next; }
304
305 /// Convenience function for get().getNode().
306 SDNode *getNode() const { return Val.getNode(); }
307 /// Convenience function for get().getResNo().
308 unsigned getResNo() const { return Val.getResNo(); }
309 /// Convenience function for get().getValueType().
310 EVT getValueType() const { return Val.getValueType(); }
311
312 /// Convenience function for get().operator==
313 bool operator==(const SDValue &V) const {
314 return Val == V;
315 }
316
317 /// Convenience function for get().operator!=
318 bool operator!=(const SDValue &V) const {
319 return Val != V;
320 }
321
322 /// Convenience function for get().operator<
323 bool operator<(const SDValue &V) const {
324 return Val < V;
325 }
326
327private:
328 friend class SelectionDAG;
329 friend class SDNode;
330 // TODO: unfriend HandleSDNode once we fix its operand handling.
331 friend class HandleSDNode;
332
333 void setUser(SDNode *p) { User = p; }
334
335 /// Remove this use from its existing use list, assign it the
336 /// given value, and add it to the new value's node's use list.
337 inline void set(const SDValue &V);
338 /// Like set, but only supports initializing a newly-allocated
339 /// SDUse with a non-null value.
340 inline void setInitial(const SDValue &V);
341 /// Like set, but only sets the Node portion of the value,
342 /// leaving the ResNo portion unmodified.
343 inline void setNode(SDNode *N);
344
345 void addToList(SDUse **List) {
346 Next = *List;
347 if (Next) Next->Prev = &Next;
348 Prev = List;
349 *List = this;
350 }
351
352 void removeFromList() {
353 *Prev = Next;
354 if (Next) Next->Prev = Prev;
355 }
356};
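
A short illustrative sketch of what the Prev/Next links above support: SDNode::use_iterator, declared later in this header, walks that list one SDUse at a time. countUsesOfResult is not part of the header (hasNUsesOfValue already covers the common case).

unsigned countUsesOfResult(const SDNode *N, unsigned ResNo) {
  unsigned Count = 0;
  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
       UI != UE; ++UI)
    if (UI.getUse().getResNo() == ResNo)  // each SDUse records node + result no.
      ++Count;
  return Count;
}
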
357
358/// simplify_type specializations - Allow casting operators to work directly on
359/// SDValues as if they were SDNode*'s.
360template<> struct simplify_type<SDUse> {
361 using SimpleType = SDNode *;
362
363 static SimpleType getSimplifiedValue(SDUse &Val) {
364 return Val.getNode();
365 }
366};
367
368/// These are IR-level optimization flags that may be propagated to SDNodes.
369/// TODO: This data structure should be shared by the IR optimizer and the
370/// backend.
371struct SDNodeFlags {
372private:
373 bool NoUnsignedWrap : 1;
374 bool NoSignedWrap : 1;
375 bool Exact : 1;
376 bool NoNaNs : 1;
377 bool NoInfs : 1;
378 bool NoSignedZeros : 1;
379 bool AllowReciprocal : 1;
380 bool AllowContract : 1;
381 bool ApproximateFuncs : 1;
382 bool AllowReassociation : 1;
383
384 // We assume instructions do not raise floating-point exceptions by default,
385 // and only those marked explicitly may do so. We could choose to represent
386 // this via a positive "FPExcept" flags like on the MI level, but having a
387 // negative "NoFPExcept" flag here (that defaults to true) makes the flag
388 // intersection logic more straightforward.
389 bool NoFPExcept : 1;
390
391public:
392 /// Default constructor turns off all optimization flags.
393 SDNodeFlags()
394 : NoUnsignedWrap(false), NoSignedWrap(false), Exact(false), NoNaNs(false),
395 NoInfs(false), NoSignedZeros(false), AllowReciprocal(false),
396 AllowContract(false), ApproximateFuncs(false),
397 AllowReassociation(false), NoFPExcept(false) {}
398
399 /// Propagate the fast-math-flags from an IR FPMathOperator.
400 void copyFMF(const FPMathOperator &FPMO) {
401 setNoNaNs(FPMO.hasNoNaNs());
402 setNoInfs(FPMO.hasNoInfs());
403 setNoSignedZeros(FPMO.hasNoSignedZeros());
404 setAllowReciprocal(FPMO.hasAllowReciprocal());
405 setAllowContract(FPMO.hasAllowContract());
406 setApproximateFuncs(FPMO.hasApproxFunc());
407 setAllowReassociation(FPMO.hasAllowReassoc());
408 }
409
410 // These are mutators for each flag.
411 void setNoUnsignedWrap(bool b) { NoUnsignedWrap = b; }
412 void setNoSignedWrap(bool b) { NoSignedWrap = b; }
413 void setExact(bool b) { Exact = b; }
414 void setNoNaNs(bool b) { NoNaNs = b; }
415 void setNoInfs(bool b) { NoInfs = b; }
416 void setNoSignedZeros(bool b) { NoSignedZeros = b; }
417 void setAllowReciprocal(bool b) { AllowReciprocal = b; }
418 void setAllowContract(bool b) { AllowContract = b; }
419 void setApproximateFuncs(bool b) { ApproximateFuncs = b; }
420 void setAllowReassociation(bool b) { AllowReassociation = b; }
421 void setNoFPExcept(bool b) { NoFPExcept = b; }
422
423 // These are accessors for each flag.
424 bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
425 bool hasNoSignedWrap() const { return NoSignedWrap; }
426 bool hasExact() const { return Exact; }
427 bool hasNoNaNs() const { return NoNaNs; }
428 bool hasNoInfs() const { return NoInfs; }
429 bool hasNoSignedZeros() const { return NoSignedZeros; }
430 bool hasAllowReciprocal() const { return AllowReciprocal; }
431 bool hasAllowContract() const { return AllowContract; }
432 bool hasApproximateFuncs() const { return ApproximateFuncs; }
433 bool hasAllowReassociation() const { return AllowReassociation; }
434 bool hasNoFPExcept() const { return NoFPExcept; }
435
436 /// Clear any flags in this flag set that aren't also set in Flags. All
437 /// flags will be cleared if Flags are undefined.
438 void intersectWith(const SDNodeFlags Flags) {
439 NoUnsignedWrap &= Flags.NoUnsignedWrap;
440 NoSignedWrap &= Flags.NoSignedWrap;
441 Exact &= Flags.Exact;
442 NoNaNs &= Flags.NoNaNs;
443 NoInfs &= Flags.NoInfs;
444 NoSignedZeros &= Flags.NoSignedZeros;
445 AllowReciprocal &= Flags.AllowReciprocal;
446 AllowContract &= Flags.AllowContract;
447 ApproximateFuncs &= Flags.ApproximateFuncs;
448 AllowReassociation &= Flags.AllowReassociation;
449 NoFPExcept &= Flags.NoFPExcept;
450 }
451};
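
A small sketch, not from this header, of how these flags are typically populated and merged; buildFlags and mergeFlags are illustrative names, and FPMO is an FPMathOperator the caller already has.

SDNodeFlags buildFlags(const FPMathOperator &FPMO, bool NoUnsignedWrap) {
  SDNodeFlags Flags;
  Flags.copyFMF(FPMO);                     // import the IR fast-math flags
  Flags.setNoUnsignedWrap(NoUnsignedWrap);
  return Flags;
}

void mergeFlags(SDNodeFlags &Dst, SDNodeFlags Src) {
  Dst.intersectWith(Src);                  // keep only flags set on both
}
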
452
453/// Represents one node in the SelectionDAG.
454///
455class SDNode : public FoldingSetNode, public ilist_node<SDNode> {
456private:
457 /// The operation that this node performs.
458 int16_t NodeType;
459
460protected:
461 // We define a set of mini-helper classes to help us interpret the bits in our
462 // SubclassData. These are designed to fit within a uint16_t so they pack
463 // with NodeType.
464
465#if defined(_AIX) && (!defined(__GNUC__4) || defined(__clang__1))
466// Except for GCC; by default, AIX compilers store bit-fields in 4-byte words
467// and give the `pack` pragma push semantics.
468#define BEGIN_TWO_BYTE_PACK() _Pragma("pack(2)")pack(2)
469#define END_TWO_BYTE_PACK() _Pragma("pack(pop)")pack(pop)
470#else
471#define BEGIN_TWO_BYTE_PACK()
472#define END_TWO_BYTE_PACK()
473#endif
474
475BEGIN_TWO_BYTE_PACK()
476 class SDNodeBitfields {
477 friend class SDNode;
478 friend class MemIntrinsicSDNode;
479 friend class MemSDNode;
480 friend class SelectionDAG;
481
482 uint16_t HasDebugValue : 1;
483 uint16_t IsMemIntrinsic : 1;
484 uint16_t IsDivergent : 1;
485 };
486 enum { NumSDNodeBits = 3 };
487
488 class ConstantSDNodeBitfields {
489 friend class ConstantSDNode;
490
491 uint16_t : NumSDNodeBits;
492
493 uint16_t IsOpaque : 1;
494 };
495
496 class MemSDNodeBitfields {
497 friend class MemSDNode;
498 friend class MemIntrinsicSDNode;
499 friend class AtomicSDNode;
500
501 uint16_t : NumSDNodeBits;
502
503 uint16_t IsVolatile : 1;
504 uint16_t IsNonTemporal : 1;
505 uint16_t IsDereferenceable : 1;
506 uint16_t IsInvariant : 1;
507 };
508 enum { NumMemSDNodeBits = NumSDNodeBits + 4 };
509
510 class LSBaseSDNodeBitfields {
511 friend class LSBaseSDNode;
512 friend class MaskedLoadStoreSDNode;
513 friend class MaskedGatherScatterSDNode;
514
515 uint16_t : NumMemSDNodeBits;
516
517 // This storage is shared between disparate class hierarchies to hold an
518 // enumeration specific to the class hierarchy in use.
519 // LSBaseSDNode => enum ISD::MemIndexedMode
520 // MaskedLoadStoreBaseSDNode => enum ISD::MemIndexedMode
521 // MaskedGatherScatterSDNode => enum ISD::MemIndexType
522 uint16_t AddressingMode : 3;
523 };
524 enum { NumLSBaseSDNodeBits = NumMemSDNodeBits + 3 };
525
526 class LoadSDNodeBitfields {
527 friend class LoadSDNode;
528 friend class MaskedLoadSDNode;
529 friend class MaskedGatherSDNode;
530
531 uint16_t : NumLSBaseSDNodeBits;
532
533 uint16_t ExtTy : 2; // enum ISD::LoadExtType
534 uint16_t IsExpanding : 1;
535 };
536
537 class StoreSDNodeBitfields {
538 friend class StoreSDNode;
539 friend class MaskedStoreSDNode;
540 friend class MaskedScatterSDNode;
541
542 uint16_t : NumLSBaseSDNodeBits;
543
544 uint16_t IsTruncating : 1;
545 uint16_t IsCompressing : 1;
546 };
547
548 union {
549 char RawSDNodeBits[sizeof(uint16_t)];
550 SDNodeBitfields SDNodeBits;
551 ConstantSDNodeBitfields ConstantSDNodeBits;
552 MemSDNodeBitfields MemSDNodeBits;
553 LSBaseSDNodeBitfields LSBaseSDNodeBits;
554 LoadSDNodeBitfields LoadSDNodeBits;
555 StoreSDNodeBitfields StoreSDNodeBits;
556 };
557END_TWO_BYTE_PACK()
558#undef BEGIN_TWO_BYTE_PACK
559#undef END_TWO_BYTE_PACK
560
561 // RawSDNodeBits must cover the entirety of the union. This means that all of
562 // the union's members must have size <= RawSDNodeBits. We write the RHS as
563 // "2" instead of sizeof(RawSDNodeBits) because MSVC can't handle the latter.
564 static_assert(sizeof(SDNodeBitfields) <= 2, "field too wide");
565 static_assert(sizeof(ConstantSDNodeBitfields) <= 2, "field too wide");
566 static_assert(sizeof(MemSDNodeBitfields) <= 2, "field too wide");
567 static_assert(sizeof(LSBaseSDNodeBitfields) <= 2, "field too wide");
568 static_assert(sizeof(LoadSDNodeBitfields) <= 2, "field too wide");
569 static_assert(sizeof(StoreSDNodeBitfields) <= 2, "field too wide");
570
571private:
572 friend class SelectionDAG;
573 // TODO: unfriend HandleSDNode once we fix its operand handling.
574 friend class HandleSDNode;
575
576 /// Unique id per SDNode in the DAG.
577 int NodeId = -1;
578
579 /// The values that are used by this operation.
580 SDUse *OperandList = nullptr;
581
582 /// The types of the values this node defines. SDNode's may
583 /// define multiple values simultaneously.
584 const EVT *ValueList;
585
586 /// List of uses for this SDNode.
587 SDUse *UseList = nullptr;
588
589 /// The number of entries in the Operand/Value list.
590 unsigned short NumOperands = 0;
591 unsigned short NumValues;
592
593 // The ordering of the SDNodes. It roughly corresponds to the ordering of the
594 // original LLVM instructions.
595 // This is used for turning off scheduling, because we'll forgo
596 // the normal scheduling algorithms and output the instructions according to
597 // this ordering.
598 unsigned IROrder;
599
600 /// Source line information.
601 DebugLoc debugLoc;
602
603 /// Return a pointer to the specified value type.
604 static const EVT *getValueTypeList(EVT VT);
605
606 SDNodeFlags Flags;
607
608public:
609 /// Unique and persistent id per SDNode in the DAG.
610 /// Used for debug printing.
611 uint16_t PersistentId;
612
613 //===--------------------------------------------------------------------===//
614 // Accessors
615 //
616
617 /// Return the SelectionDAG opcode value for this node. For
618 /// pre-isel nodes (those for which isMachineOpcode returns false), these
619 /// are the opcode values in the ISD and <target>ISD namespaces. For
620 /// post-isel opcodes, see getMachineOpcode.
621 unsigned getOpcode() const { return (unsigned short)NodeType; }
622
623 /// Test if this node has a target-specific opcode (in the
624 /// \<target\>ISD namespace).
625 bool isTargetOpcode() const { return NodeType >= ISD::BUILTIN_OP_END; }
626
627 /// Test if this node has a target-specific opcode that may raise
628 /// FP exceptions (in the \<target\>ISD namespace and greater than
629 /// FIRST_TARGET_STRICTFP_OPCODE). Note that all target memory
630 /// opcodes are currently automatically considered to possibly raise
631 /// FP exceptions as well.
632 bool isTargetStrictFPOpcode() const {
633 return NodeType >= ISD::FIRST_TARGET_STRICTFP_OPCODE;
634 }
635
636 /// Test if this node has a target-specific
637 /// memory-referencing opcode (in the \<target\>ISD namespace and
638 /// greater than FIRST_TARGET_MEMORY_OPCODE).
639 bool isTargetMemoryOpcode() const {
640 return NodeType >= ISD::FIRST_TARGET_MEMORY_OPCODE;
641 }
642
643 /// Return true if this node is an ISD::UNDEF node.
644 bool isUndef() const { return NodeType == ISD::UNDEF; }
645
646 /// Test if this node is a memory intrinsic (with valid pointer information).
647 /// INTRINSIC_W_CHAIN and INTRINSIC_VOID nodes are sometimes created for
648 /// non-memory intrinsics (with chains) that are not really instances of
649 /// MemSDNode. For such nodes, we need some extra state to determine the
650 /// proper classof relationship.
651 bool isMemIntrinsic() const {
652 return (NodeType == ISD::INTRINSIC_W_CHAIN ||
653 NodeType == ISD::INTRINSIC_VOID) &&
654 SDNodeBits.IsMemIntrinsic;
655 }
656
657 /// Test if this node is a strict floating point pseudo-op.
658 bool isStrictFPOpcode() {
659 switch (NodeType) {
660 default:
661 return false;
662 case ISD::STRICT_FP16_TO_FP:
663 case ISD::STRICT_FP_TO_FP16:
664#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
665 case ISD::STRICT_##DAGN:
666#include "llvm/IR/ConstrainedOps.def"
667 return true;
668 }
669 }
670
671 /// Test if this node has a post-isel opcode, directly
672 /// corresponding to a MachineInstr opcode.
673 bool isMachineOpcode() const { return NodeType < 0; }
674
675 /// This may only be called if isMachineOpcode returns
676 /// true. It returns the MachineInstr opcode value that the node's opcode
677 /// corresponds to.
678 unsigned getMachineOpcode() const {
679 assert(isMachineOpcode() && "Not a MachineInstr opcode!")((void)0);
680 return ~NodeType;
681 }
682
683 bool getHasDebugValue() const { return SDNodeBits.HasDebugValue; }
684 void setHasDebugValue(bool b) { SDNodeBits.HasDebugValue = b; }
685
686 bool isDivergent() const { return SDNodeBits.IsDivergent; }
687
688 /// Return true if there are no uses of this node.
689 bool use_empty() const { return UseList == nullptr; }
690
691 /// Return true if there is exactly one use of this node.
692 bool hasOneUse() const { return hasSingleElement(uses()); }
693
694 /// Return the number of uses of this node. This method takes
695 /// time proportional to the number of uses.
696 size_t use_size() const { return std::distance(use_begin(), use_end()); }
697
698 /// Return the unique node id.
699 int getNodeId() const { return NodeId; }
700
701 /// Set unique node id.
702 void setNodeId(int Id) { NodeId = Id; }
703
704 /// Return the node ordering.
705 unsigned getIROrder() const { return IROrder; }
706
707 /// Set the node ordering.
708 void setIROrder(unsigned Order) { IROrder = Order; }
709
710 /// Return the source location info.
711 const DebugLoc &getDebugLoc() const { return debugLoc; }
712
713 /// Set source location info. Try to avoid this, putting
714 /// it in the constructor is preferable.
715 void setDebugLoc(DebugLoc dl) { debugLoc = std::move(dl); }
716
717 /// This class provides iterator support for SDUse
718 /// operands that use a specific SDNode.
719 class use_iterator {
720 friend class SDNode;
721
722 SDUse *Op = nullptr;
723
724 explicit use_iterator(SDUse *op) : Op(op) {}
725
726 public:
727 using iterator_category = std::forward_iterator_tag;
728 using value_type = SDUse;
729 using difference_type = std::ptrdiff_t;
730 using pointer = value_type *;
731 using reference = value_type &;
732
733 use_iterator() = default;
734 use_iterator(const use_iterator &I) : Op(I.Op) {}
735
736 bool operator==(const use_iterator &x) const {
737 return Op == x.Op;
738 }
739 bool operator!=(const use_iterator &x) const {
740 return !operator==(x);
741 }
742
743 /// Return true if this iterator is at the end of uses list.
744 bool atEnd() const { return Op == nullptr; }
745
746 // Iterator traversal: forward iteration only.
747 use_iterator &operator++() { // Preincrement
748 assert(Op && "Cannot increment end iterator!")((void)0);
749 Op = Op->getNext();
750 return *this;
751 }
752
753 use_iterator operator++(int) { // Postincrement
754 use_iterator tmp = *this; ++*this; return tmp;
755 }
756
757 /// Retrieve a pointer to the current user node.
758 SDNode *operator*() const {
759 assert(Op && "Cannot dereference end iterator!")((void)0);
760 return Op->getUser();
761 }
762
763 SDNode *operator->() const { return operator*(); }
764
765 SDUse &getUse() const { return *Op; }
766
767 /// Retrieve the operand # of this use in its user.
768 unsigned getOperandNo() const {
769 assert(Op && "Cannot dereference end iterator!")((void)0);
770 return (unsigned)(Op - Op->getUser()->OperandList);
771 }
772 };
773
774 /// Provide iteration support to walk over all uses of an SDNode.
775 use_iterator use_begin() const {
776 return use_iterator(UseList);
777 }
778
779 static use_iterator use_end() { return use_iterator(nullptr); }
780
781 inline iterator_range<use_iterator> uses() {
782 return make_range(use_begin(), use_end());
783 }
784 inline iterator_range<use_iterator> uses() const {
785 return make_range(use_begin(), use_end());
786 }
787
788 /// Return true if there are exactly NUSES uses of the indicated value.
789 /// This method ignores uses of other values defined by this operation.
790 bool hasNUsesOfValue(unsigned NUses, unsigned Value) const;
791
792 /// Return true if there is any use of the indicated value.
793 /// This method ignores uses of other values defined by this operation.
794 bool hasAnyUseOfValue(unsigned Value) const;
795
796 /// Return true if this node is the only use of N.
797 bool isOnlyUserOf(const SDNode *N) const;
798
799 /// Return true if this node is an operand of N.
800 bool isOperandOf(const SDNode *N) const;
801
802 /// Return true if this node is a predecessor of N.
803 /// NOTE: Implemented on top of hasPredecessor and every bit as
804 /// expensive. Use carefully.
805 bool isPredecessorOf(const SDNode *N) const {
806 return N->hasPredecessor(this);
807 }
808
809 /// Return true if N is a predecessor of this node.
810 /// N is either an operand of this node, or can be reached by recursively
811 /// traversing up the operands.
812 /// NOTE: This is an expensive method. Use it carefully.
813 bool hasPredecessor(const SDNode *N) const;
814
815 /// Returns true if N is a predecessor of any node in Worklist. This
816 /// helper keeps Visited and Worklist sets externally to allow union
817 /// searches to be performed in parallel, caching of results across
818 /// queries and incremental addition to Worklist. Stops early if N is
819 /// found but will resume. Remember to clear Visited and Worklists
820 /// if DAG changes. MaxSteps gives a maximum number of nodes to visit before
821 /// giving up. The TopologicalPrune flag signals that positive NodeIds are
822 /// topologically ordered (Operands have strictly smaller node id) and search
823 /// can be pruned leveraging this.
824 static bool hasPredecessorHelper(const SDNode *N,
825 SmallPtrSetImpl<const SDNode *> &Visited,
826 SmallVectorImpl<const SDNode *> &Worklist,
827 unsigned int MaxSteps = 0,
828 bool TopologicalPrune = false) {
829 SmallVector<const SDNode *, 8> DeferredNodes;
830 if (Visited.count(N))
831 return true;
832
833 // Node Id's are assigned in three places: As a topological
834 // ordering (> 0), during legalization (results in values set to
835 // 0), new nodes (set to -1). If N has a topological id then we
836 // know that all nodes with ids smaller than it cannot be
837 // successors and we need not check them. Filter out all nodes
838 // that can't be matches. We add them to the worklist before exit
839 // in case of multiple calls. Note that during selection the topological id
840 // may be violated if a node's predecessor is selected before it. We mark
841 // this at selection, negating the id of unselected successors and
842 // restricting topological pruning to positive ids.
843
844 int NId = N->getNodeId();
845 // If we Invalidated the Id, reconstruct original NId.
846 if (NId < -1)
847 NId = -(NId + 1);
848
849 bool Found = false;
850 while (!Worklist.empty()) {
851 const SDNode *M = Worklist.pop_back_val();
852 int MId = M->getNodeId();
853 if (TopologicalPrune && M->getOpcode() != ISD::TokenFactor && (NId > 0) &&
854 (MId > 0) && (MId < NId)) {
855 DeferredNodes.push_back(M);
856 continue;
857 }
858 for (const SDValue &OpV : M->op_values()) {
859 SDNode *Op = OpV.getNode();
860 if (Visited.insert(Op).second)
861 Worklist.push_back(Op);
862 if (Op == N)
863 Found = true;
864 }
865 if (Found)
866 break;
867 if (MaxSteps != 0 && Visited.size() >= MaxSteps)
868 break;
869 }
870 // Push deferred nodes back on worklist.
871 Worklist.append(DeferredNodes.begin(), DeferredNodes.end());
872 // If we bailed early, conservatively report N as found.
873 if (MaxSteps != 0 && Visited.size() >= MaxSteps)
874 return true;
875 return Found;
876 }
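
An illustrative use of the helper above, assuming the caller wants to know whether folding Candidate into N would form a cycle; wouldCreateCycle is not part of the header.

static bool wouldCreateCycle(SDNode *N, SDValue Candidate) {
  SmallPtrSet<const SDNode *, 16> Visited;
  SmallVector<const SDNode *, 8> Worklist;
  Worklist.push_back(Candidate.getNode());
  // True if N is reachable by walking up Candidate's operands, i.e. a cycle
  // would form if N came to depend on Candidate.
  return SDNode::hasPredecessorHelper(N, Visited, Worklist);
}
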
877
878 /// Return true if all the users of N are contained in Nodes.
879 /// NOTE: Requires at least one match, but doesn't require them all.
880 static bool areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N);
881
882 /// Return the number of values used by this operation.
883 unsigned getNumOperands() const { return NumOperands; }
884
885 /// Return the maximum number of operands that a SDNode can hold.
886 static constexpr size_t getMaxNumOperands() {
887 return std::numeric_limits<decltype(SDNode::NumOperands)>::max();
888 }
889
890 /// Helper method returns the integer value of a ConstantSDNode operand.
891 inline uint64_t getConstantOperandVal(unsigned Num) const;
892
893 /// Helper method returns the APInt of a ConstantSDNode operand.
894 inline const APInt &getConstantOperandAPInt(unsigned Num) const;
895
896 const SDValue &getOperand(unsigned Num) const {
897 assert(Num < NumOperands && "Invalid child # of SDNode!")((void)0);
898 return OperandList[Num];
899 }
900
901 using op_iterator = SDUse *;
902
903 op_iterator op_begin() const { return OperandList; }
904 op_iterator op_end() const { return OperandList+NumOperands; }
905 ArrayRef<SDUse> ops() const { return makeArrayRef(op_begin(), op_end()); }
906
907 /// Iterator for directly iterating over the operand SDValue's.
908 struct value_op_iterator
909 : iterator_adaptor_base<value_op_iterator, op_iterator,
910 std::random_access_iterator_tag, SDValue,
911 ptrdiff_t, value_op_iterator *,
912 value_op_iterator *> {
913 explicit value_op_iterator(SDUse *U = nullptr)
914 : iterator_adaptor_base(U) {}
915
916 const SDValue &operator*() const { return I->get(); }
917 };
918
919 iterator_range<value_op_iterator> op_values() const {
920 return make_range(value_op_iterator(op_begin()),
921 value_op_iterator(op_end()));
922 }
923
924 SDVTList getVTList() const {
925 SDVTList X = { ValueList, NumValues };
926 return X;
927 }
928
929 /// If this node has a glue operand, return the node
930 /// to which the glue operand points. Otherwise return NULL.
931 SDNode *getGluedNode() const {
932 if (getNumOperands() != 0 &&
933 getOperand(getNumOperands()-1).getValueType() == MVT::Glue)
934 return getOperand(getNumOperands()-1).getNode();
935 return nullptr;
936 }
937
938 /// If this node has a glue value with a user, return
939 /// the user (there is at most one). Otherwise return NULL.
940 SDNode *getGluedUser() const {
941 for (use_iterator UI = use_begin(), UE = use_end(); UI != UE; ++UI)
942 if (UI.getUse().get().getValueType() == MVT::Glue)
943 return *UI;
944 return nullptr;
945 }
946
947 SDNodeFlags getFlags() const { return Flags; }
948 void setFlags(SDNodeFlags NewFlags) { Flags = NewFlags; }
949
950 /// Clear any flags in this node that aren't also set in Flags.
951 /// If Flags is not in a defined state then this has no effect.
952 void intersectFlagsWith(const SDNodeFlags Flags);
953
954 /// Return the number of values defined/returned by this operator.
955 unsigned getNumValues() const { return NumValues; }
956
957 /// Return the type of a specified result.
958 EVT getValueType(unsigned ResNo) const {
959 assert(ResNo < NumValues && "Illegal result number!")((void)0);
960 return ValueList[ResNo];
961 }
962
963 /// Return the type of a specified result as a simple type.
964 MVT getSimpleValueType(unsigned ResNo) const {
965 return getValueType(ResNo).getSimpleVT();
966 }
967
968 /// Returns MVT::getSizeInBits(getValueType(ResNo)).
969 ///
970 /// If the value type is a scalable vector type, the scalable property will
971 /// be set and the runtime size will be a positive integer multiple of the
972 /// base size.
973 TypeSize getValueSizeInBits(unsigned ResNo) const {
974 return getValueType(ResNo).getSizeInBits();
975 }
976
977 using value_iterator = const EVT *;
978
979 value_iterator value_begin() const { return ValueList; }
980 value_iterator value_end() const { return ValueList+NumValues; }
981 iterator_range<value_iterator> values() const {
982 return llvm::make_range(value_begin(), value_end());
983 }
984
985 /// Return the opcode of this operation for printing.
986 std::string getOperationName(const SelectionDAG *G = nullptr) const;
987 static const char* getIndexedModeName(ISD::MemIndexedMode AM);
988 void print_types(raw_ostream &OS, const SelectionDAG *G) const;
989 void print_details(raw_ostream &OS, const SelectionDAG *G) const;
990 void print(raw_ostream &OS, const SelectionDAG *G = nullptr) const;
991 void printr(raw_ostream &OS, const SelectionDAG *G = nullptr) const;
992
993 /// Print a SelectionDAG node and all children down to
994 /// the leaves. The given SelectionDAG allows target-specific nodes
995 /// to be printed in human-readable form. Unlike printr, this will
996 /// print the whole DAG, including children that appear multiple
997 /// times.
998 ///
999 void printrFull(raw_ostream &O, const SelectionDAG *G = nullptr) const;
1000
1001 /// Print a SelectionDAG node and children up to
1002 /// depth "depth." The given SelectionDAG allows target-specific
1003 /// nodes to be printed in human-readable form. Unlike printr, this
1004 /// will print children that appear multiple times wherever they are
1005 /// used.
1006 ///
1007 void printrWithDepth(raw_ostream &O, const SelectionDAG *G = nullptr,
1008 unsigned depth = 100) const;
1009
1010 /// Dump this node, for debugging.
1011 void dump() const;
1012
1013 /// Dump (recursively) this node and its use-def subgraph.
1014 void dumpr() const;
1015
1016 /// Dump this node, for debugging.
1017 /// The given SelectionDAG allows target-specific nodes to be printed
1018 /// in human-readable form.
1019 void dump(const SelectionDAG *G) const;
1020
1021 /// Dump (recursively) this node and its use-def subgraph.
1022 /// The given SelectionDAG allows target-specific nodes to be printed
1023 /// in human-readable form.
1024 void dumpr(const SelectionDAG *G) const;
1025
1026 /// printrFull to dbgs(). The given SelectionDAG allows
1027 /// target-specific nodes to be printed in human-readable form.
1028 /// Unlike dumpr, this will print the whole DAG, including children
1029 /// that appear multiple times.
1030 void dumprFull(const SelectionDAG *G = nullptr) const;
1031
1032 /// printrWithDepth to dbgs(). The given
1033 /// SelectionDAG allows target-specific nodes to be printed in
1034 /// human-readable form. Unlike dumpr, this will print children
1035 /// that appear multiple times wherever they are used.
1036 ///
1037 void dumprWithDepth(const SelectionDAG *G = nullptr,
1038 unsigned depth = 100) const;
1039
1040 /// Gather unique data for the node.
1041 void Profile(FoldingSetNodeID &ID) const;
1042
1043 /// This method should only be used by the SDUse class.
1044 void addUse(SDUse &U) { U.addToList(&UseList); }
1045
1046protected:
1047 static SDVTList getSDVTList(EVT VT) {
1048 SDVTList Ret = { getValueTypeList(VT), 1 };
1049 return Ret;
1050 }
1051
1052 /// Create an SDNode.
1053 ///
1054 /// SDNodes are created without any operands, and never own the operand
1055 /// storage. To add operands, see SelectionDAG::createOperands.
1056 SDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs)
1057 : NodeType(Opc), ValueList(VTs.VTs), NumValues(VTs.NumVTs),
1058 IROrder(Order), debugLoc(std::move(dl)) {
1059 memset(&RawSDNodeBits, 0, sizeof(RawSDNodeBits));
1060 assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor")((void)0);
1061 assert(NumValues == VTs.NumVTs &&((void)0)
1062 "NumValues wasn't wide enough for its operands!")((void)0);
1063 }
1064
1065 /// Release the operands and set this node to have zero operands.
1066 void DropOperands();
1067};
1068
1069/// Wrapper class for IR location info (IR ordering and DebugLoc) to be passed
1070/// into SDNode creation functions.
1071/// When an SDNode is created from the DAGBuilder, the DebugLoc is extracted
1072/// from the original Instruction, and IROrder is the ordinal position of
1073/// the instruction.
1074/// When an SDNode is created after the DAG is being built, both DebugLoc and
1075/// the IROrder are propagated from the original SDNode.
1076/// So the SDLoc class provides two constructors besides the default one: one to
1077/// be used by the DAGBuilder, the other to be used by everyone else.
1078class SDLoc {
1079private:
1080 DebugLoc DL;
1081 int IROrder = 0;
1082
1083public:
1084 SDLoc() = default;
1085 SDLoc(const SDNode *N) : DL(N->getDebugLoc()), IROrder(N->getIROrder()) {}
1086 SDLoc(const SDValue V) : SDLoc(V.getNode()) {}
1087 SDLoc(const Instruction *I, int Order) : IROrder(Order) {
1088 assert(Order >= 0 && "bad IROrder")((void)0);
1089 if (I)
1090 DL = I->getDebugLoc();
1091 }
1092
1093 unsigned getIROrder() const { return IROrder; }
1094 const DebugLoc &getDebugLoc() const { return DL; }
1095};
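
A brief sketch of the SDNode-based constructor in use (the Instruction/Order form is analogous); SelectionDAG::getNode comes from SelectionDAG.h, and emitAdd together with its parameters is illustrative.

SDValue emitAdd(SelectionDAG &DAG, SDNode *N, EVT VT, SDValue LHS, SDValue RHS) {
  SDLoc DL(N);                            // carries N's DebugLoc and IR order
  return DAG.getNode(ISD::ADD, DL, VT, LHS, RHS);
}
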
1096
1097// Define inline functions from the SDValue class.
1098
1099inline SDValue::SDValue(SDNode *node, unsigned resno)
1100 : Node(node), ResNo(resno) {
1101 // Explicitly check for !ResNo to avoid use-after-free, because there are
1102 // callers that use SDValue(N, 0) with a deleted N to indicate successful
1103 // combines.
1104 assert((!Node || !ResNo || ResNo < Node->getNumValues()) &&((void)0)
1105 "Invalid result number for the given node!")((void)0);
1106 assert(ResNo < -2U && "Cannot use result numbers reserved for DenseMaps.")((void)0);
1107}
1108
1109inline unsigned SDValue::getOpcode() const {
1110 return Node->getOpcode();
1111}
1112
1113inline EVT SDValue::getValueType() const {
1114 return Node->getValueType(ResNo);
1115}
1116
1117inline unsigned SDValue::getNumOperands() const {
1118 return Node->getNumOperands();
1119}
1120
1121inline const SDValue &SDValue::getOperand(unsigned i) const {
1122 return Node->getOperand(i);
1123}
1124
1125inline uint64_t SDValue::getConstantOperandVal(unsigned i) const {
1126 return Node->getConstantOperandVal(i);
1127}
1128
1129inline const APInt &SDValue::getConstantOperandAPInt(unsigned i) const {
1130 return Node->getConstantOperandAPInt(i);
1131}
1132
1133inline bool SDValue::isTargetOpcode() const {
1134 return Node->isTargetOpcode();
1135}
1136
1137inline bool SDValue::isTargetMemoryOpcode() const {
1138 return Node->isTargetMemoryOpcode();
1139}
1140
1141inline bool SDValue::isMachineOpcode() const {
1142 return Node->isMachineOpcode();
1143}
1144
1145inline unsigned SDValue::getMachineOpcode() const {
1146 return Node->getMachineOpcode();
1147}
1148
1149inline bool SDValue::isUndef() const {
1150 return Node->isUndef();
14
Called C++ object pointer is null
1151}
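
The sketch below restates the diagnostic path that ends at line 1150 above (steps 4, 5 and 14) without reproducing the actual X86ISelLowering.cpp code: a default-constructed SDValue (named PreservedSrc in the report) keeps Node == nullptr, and an unguarded isUndef() call then dereferences it. HasPassThru, Op and the handler body are illustrative.

void sketchOfReportedPath(bool HasPassThru, SDValue Op) {
  SDValue PreservedSrc;                   // steps 4/5: Node stays nullptr
  if (HasPassThru)
    PreservedSrc = Op;
  // Calling PreservedSrc.isUndef() unconditionally at this point is the
  // reported defect: line 1150 executes Node->isUndef() with Node == nullptr.
  // Testing the SDValue first (its explicit operator bool) avoids that:
  if (PreservedSrc && PreservedSrc.isUndef()) {
    // ... handle an explicitly undef pass-through value ...
  }
}
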
1152
1153inline bool SDValue::use_empty() const {
1154 return !Node->hasAnyUseOfValue(ResNo);
1155}
1156
1157inline bool SDValue::hasOneUse() const {
1158 return Node->hasNUsesOfValue(1, ResNo);
1159}
1160
1161inline const DebugLoc &SDValue::getDebugLoc() const {
1162 return Node->getDebugLoc();
1163}
1164
1165inline void SDValue::dump() const {
1166 return Node->dump();
1167}
1168
1169inline void SDValue::dump(const SelectionDAG *G) const {
1170 return Node->dump(G);
1171}
1172
1173inline void SDValue::dumpr() const {
1174 return Node->dumpr();
1175}
1176
1177inline void SDValue::dumpr(const SelectionDAG *G) const {
1178 return Node->dumpr(G);
1179}
1180
1181// Define inline functions from the SDUse class.
1182
1183inline void SDUse::set(const SDValue &V) {
1184 if (Val.getNode()) removeFromList();
1185 Val = V;
1186 if (V.getNode()) V.getNode()->addUse(*this);
1187}
1188
1189inline void SDUse::setInitial(const SDValue &V) {
1190 Val = V;
1191 V.getNode()->addUse(*this);
1192}
1193
1194inline void SDUse::setNode(SDNode *N) {
1195 if (Val.getNode()) removeFromList();
1196 Val.setNode(N);
1197 if (N) N->addUse(*this);
1198}
1199
1200/// This class is used to form a handle around another node that
1201/// is persistent and is updated across invocations of replaceAllUsesWith on its
1202/// operand. This node should be directly created by end-users and not added to
1203/// the AllNodes list.
1204class HandleSDNode : public SDNode {
1205 SDUse Op;
1206
1207public:
1208 explicit HandleSDNode(SDValue X)
1209 : SDNode(ISD::HANDLENODE, 0, DebugLoc(), getSDVTList(MVT::Other)) {
1210 // HandleSDNodes are never inserted into the DAG, so they won't be
1211 // auto-numbered. Use ID 65535 as a sentinel.
1212 PersistentId = 0xffff;
1213
1214 // Manually set up the operand list. This node type is special in that it's
1215 // always stack allocated and SelectionDAG does not manage its operands.
1216 // TODO: This should either (a) not be in the SDNode hierarchy, or (b) not
1217 // be so special.
1218 Op.setUser(this);
1219 Op.setInitial(X);
1220 NumOperands = 1;
1221 OperandList = &Op;
1222 }
1223 ~HandleSDNode();
1224
1225 const SDValue &getValue() const { return Op; }
1226};
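
A minimal sketch of the stack-allocated handle pattern the comment above describes; combineSomething stands in for any call that may run replaceAllUsesWith and is purely hypothetical.

void combineSomething(SelectionDAG &DAG);   // hypothetical DAG mutation

SDValue keepAliveAcrossCombine(SelectionDAG &DAG, SDValue V) {
  HandleSDNode Handle(V);                   // tracks V even if its node is replaced
  combineSomething(DAG);                    // may RAUW/delete nodes; Handle follows
  return Handle.getValue();                 // the possibly-updated value
}
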
1227
1228class AddrSpaceCastSDNode : public SDNode {
1229private:
1230 unsigned SrcAddrSpace;
1231 unsigned DestAddrSpace;
1232
1233public:
1234 AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl, EVT VT,
1235 unsigned SrcAS, unsigned DestAS);
1236
1237 unsigned getSrcAddressSpace() const { return SrcAddrSpace; }
1238 unsigned getDestAddressSpace() const { return DestAddrSpace; }
1239
1240 static bool classof(const SDNode *N) {
1241 return N->getOpcode() == ISD::ADDRSPACECAST;
1242 }
1243};
1244
1245/// This is an abstract virtual class for memory operations.
1246class MemSDNode : public SDNode {
1247private:
1248 // VT of in-memory value.
1249 EVT MemoryVT;
1250
1251protected:
1252 /// Memory reference information.
1253 MachineMemOperand *MMO;
1254
1255public:
1256 MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl, SDVTList VTs,
1257 EVT memvt, MachineMemOperand *MMO);
1258
1259 bool readMem() const { return MMO->isLoad(); }
1260 bool writeMem() const { return MMO->isStore(); }
1261
1262 /// Returns alignment and volatility of the memory access
1263 Align getOriginalAlign() const { return MMO->getBaseAlign(); }
1264 Align getAlign() const { return MMO->getAlign(); }
1265 // FIXME: Remove once transition to getAlign is over.
1266 unsigned getAlignment() const { return MMO->getAlign().value(); }
1267
1268 /// Return the SubclassData value, without HasDebugValue. This contains an
1269 /// encoding of the volatile flag, as well as bits used by subclasses. This
1270 /// function should only be used to compute a FoldingSetNodeID value.
1271 /// The HasDebugValue bit is masked out because CSE map needs to match
1272 /// nodes with debug info with nodes without debug info. Same is about
1273 /// isDivergent bit.
1274 unsigned getRawSubclassData() const {
1275 uint16_t Data;
1276 union {
1277 char RawSDNodeBits[sizeof(uint16_t)];
1278 SDNodeBitfields SDNodeBits;
1279 };
1280 memcpy(&RawSDNodeBits, &this->RawSDNodeBits, sizeof(this->RawSDNodeBits));
1281 SDNodeBits.HasDebugValue = 0;
1282 SDNodeBits.IsDivergent = false;
1283 memcpy(&Data, &RawSDNodeBits, sizeof(RawSDNodeBits));
1284 return Data;
1285 }
1286
1287 bool isVolatile() const { return MemSDNodeBits.IsVolatile; }
1288 bool isNonTemporal() const { return MemSDNodeBits.IsNonTemporal; }
1289 bool isDereferenceable() const { return MemSDNodeBits.IsDereferenceable; }
1290 bool isInvariant() const { return MemSDNodeBits.IsInvariant; }
1291
1292 // Returns the offset from the location of the access.
1293 int64_t getSrcValueOffset() const { return MMO->getOffset(); }
1294
1295 /// Returns the AA info that describes the dereference.
1296 AAMDNodes getAAInfo() const { return MMO->getAAInfo(); }
1297
1298 /// Returns the Ranges that describes the dereference.
1299 const MDNode *getRanges() const { return MMO->getRanges(); }
1300
1301 /// Returns the synchronization scope ID for this memory operation.
1302 SyncScope::ID getSyncScopeID() const { return MMO->getSyncScopeID(); }
1303
1304 /// Return the atomic ordering requirements for this memory operation. For
1305 /// cmpxchg atomic operations, return the atomic ordering requirements when
1306 /// store occurs.
1307 AtomicOrdering getSuccessOrdering() const {
1308 return MMO->getSuccessOrdering();
1309 }
1310
1311 /// Return a single atomic ordering that is at least as strong as both the
1312 /// success and failure orderings for an atomic operation. (For operations
1313 /// other than cmpxchg, this is equivalent to getSuccessOrdering().)
1314 AtomicOrdering getMergedOrdering() const { return MMO->getMergedOrdering(); }
1315
1316 /// Return true if the memory operation ordering is Unordered or higher.
1317 bool isAtomic() const { return MMO->isAtomic(); }
1318
1319 /// Returns true if the memory operation doesn't imply any ordering
1320 /// constraints on surrounding memory operations beyond the normal memory
1321 /// aliasing rules.
1322 bool isUnordered() const { return MMO->isUnordered(); }
1323
1324 /// Returns true if the memory operation is neither atomic nor volatile.
1325 bool isSimple() const { return !isAtomic() && !isVolatile(); }
1326
1327 /// Return the type of the in-memory value.
1328 EVT getMemoryVT() const { return MemoryVT; }
1329
1330 /// Return a MachineMemOperand object describing the memory
1331 /// reference performed by operation.
1332 MachineMemOperand *getMemOperand() const { return MMO; }
1333
1334 const MachinePointerInfo &getPointerInfo() const {
1335 return MMO->getPointerInfo();
1336 }
1337
1338 /// Return the address space for the associated pointer
1339 unsigned getAddressSpace() const {
1340 return getPointerInfo().getAddrSpace();
1341 }
1342
1343 /// Update this MemSDNode's MachineMemOperand information
1344 /// to reflect the alignment of NewMMO, if it has a greater alignment.
1345 /// This must only be used when the new alignment applies to all users of
1346 /// this MachineMemOperand.
1347 void refineAlignment(const MachineMemOperand *NewMMO) {
1348 MMO->refineAlignment(NewMMO);
1349 }
1350
1351 const SDValue &getChain() const { return getOperand(0); }
1352
1353 const SDValue &getBasePtr() const {
1354 switch (getOpcode()) {
1355 case ISD::STORE:
1356 case ISD::MSTORE:
1357 return getOperand(2);
1358 case ISD::MGATHER:
1359 case ISD::MSCATTER:
1360 return getOperand(3);
1361 default:
1362 return getOperand(1);
1363 }
1364 }
1365
1366 // Methods to support isa and dyn_cast
1367 static bool classof(const SDNode *N) {
1368 // For some targets, we lower some target intrinsics to a MemIntrinsicNode
1369 // with either an intrinsic or a target opcode.
1370 switch (N->getOpcode()) {
1371 case ISD::LOAD:
1372 case ISD::STORE:
1373 case ISD::PREFETCH:
1374 case ISD::ATOMIC_CMP_SWAP:
1375 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
1376 case ISD::ATOMIC_SWAP:
1377 case ISD::ATOMIC_LOAD_ADD:
1378 case ISD::ATOMIC_LOAD_SUB:
1379 case ISD::ATOMIC_LOAD_AND:
1380 case ISD::ATOMIC_LOAD_CLR:
1381 case ISD::ATOMIC_LOAD_OR:
1382 case ISD::ATOMIC_LOAD_XOR:
1383 case ISD::ATOMIC_LOAD_NAND:
1384 case ISD::ATOMIC_LOAD_MIN:
1385 case ISD::ATOMIC_LOAD_MAX:
1386 case ISD::ATOMIC_LOAD_UMIN:
1387 case ISD::ATOMIC_LOAD_UMAX:
1388 case ISD::ATOMIC_LOAD_FADD:
1389 case ISD::ATOMIC_LOAD_FSUB:
1390 case ISD::ATOMIC_LOAD:
1391 case ISD::ATOMIC_STORE:
1392 case ISD::MLOAD:
1393 case ISD::MSTORE:
1394 case ISD::MGATHER:
1395 case ISD::MSCATTER:
1396 return true;
1397 default:
1398 return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
1399 }
1400 }
1401};
1402
1403/// This is an SDNode representing atomic operations.
1404class AtomicSDNode : public MemSDNode {
1405public:
1406 AtomicSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl, SDVTList VTL,
1407 EVT MemVT, MachineMemOperand *MMO)
1408 : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) {
1409 assert(((Opc != ISD::ATOMIC_LOAD && Opc != ISD::ATOMIC_STORE) ||((void)0)
1410 MMO->isAtomic()) && "then why are we using an AtomicSDNode?")((void)0);
1411 }
1412
1413 const SDValue &getBasePtr() const { return getOperand(1); }
1414 const SDValue &getVal() const { return getOperand(2); }
1415
1416 /// Returns true if this SDNode represents cmpxchg atomic operation, false
1417 /// otherwise.
1418 bool isCompareAndSwap() const {
1419 unsigned Op = getOpcode();
1420 return Op == ISD::ATOMIC_CMP_SWAP ||
1421 Op == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS;
1422 }
1423
1424 /// For cmpxchg atomic operations, return the atomic ordering requirements
1425 /// when the store does not occur.
1426 AtomicOrdering getFailureOrdering() const {
1427 assert(isCompareAndSwap() && "Must be cmpxchg operation");
1428 return MMO->getFailureOrdering();
1429 }
1430
1431 // Methods to support isa and dyn_cast
1432 static bool classof(const SDNode *N) {
1433 return N->getOpcode() == ISD::ATOMIC_CMP_SWAP ||
1434 N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS ||
1435 N->getOpcode() == ISD::ATOMIC_SWAP ||
1436 N->getOpcode() == ISD::ATOMIC_LOAD_ADD ||
1437 N->getOpcode() == ISD::ATOMIC_LOAD_SUB ||
1438 N->getOpcode() == ISD::ATOMIC_LOAD_AND ||
1439 N->getOpcode() == ISD::ATOMIC_LOAD_CLR ||
1440 N->getOpcode() == ISD::ATOMIC_LOAD_OR ||
1441 N->getOpcode() == ISD::ATOMIC_LOAD_XOR ||
1442 N->getOpcode() == ISD::ATOMIC_LOAD_NAND ||
1443 N->getOpcode() == ISD::ATOMIC_LOAD_MIN ||
1444 N->getOpcode() == ISD::ATOMIC_LOAD_MAX ||
1445 N->getOpcode() == ISD::ATOMIC_LOAD_UMIN ||
1446 N->getOpcode() == ISD::ATOMIC_LOAD_UMAX ||
1447 N->getOpcode() == ISD::ATOMIC_LOAD_FADD ||
1448 N->getOpcode() == ISD::ATOMIC_LOAD_FSUB ||
1449 N->getOpcode() == ISD::ATOMIC_LOAD ||
1450 N->getOpcode() == ISD::ATOMIC_STORE;
1451 }
1452};
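Usage sketch (not part of the header), assuming a hypothetical AtomicSDNode pointer A obtained via dyn_cast elsewhere:

static void inspectAtomic(llvm::AtomicSDNode *A) {
  using namespace llvm;
  if (A->isCompareAndSwap()) {
    // Only cmpxchg nodes carry a separate failure ordering.
    AtomicOrdering FailOrder = A->getFailureOrdering();
    (void)FailOrder;
  } else {
    SDValue Val = A->getVal(); // value operand of the RMW / store operation
    (void)Val;
  }
}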
1453
1454/// This SDNode is used for target intrinsics that touch
1455/// memory and need an associated MachineMemOperand. Its opcode may be
1456/// INTRINSIC_VOID, INTRINSIC_W_CHAIN, PREFETCH, or a target-specific opcode
1457/// with a value not less than FIRST_TARGET_MEMORY_OPCODE.
1458class MemIntrinsicSDNode : public MemSDNode {
1459public:
1460 MemIntrinsicSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
1461 SDVTList VTs, EVT MemoryVT, MachineMemOperand *MMO)
1462 : MemSDNode(Opc, Order, dl, VTs, MemoryVT, MMO) {
1463 SDNodeBits.IsMemIntrinsic = true;
1464 }
1465
1466 // Methods to support isa and dyn_cast
1467 static bool classof(const SDNode *N) {
1468 // We lower some target intrinsics to their target opcode early, so a
1469 // node with a target opcode can be of this class.
1470 return N->isMemIntrinsic() ||
1471 N->getOpcode() == ISD::PREFETCH ||
1472 N->isTargetMemoryOpcode();
1473 }
1474};
1475
1476/// This SDNode is used to implement the code generator
1477/// support for the llvm IR shufflevector instruction. It combines elements
1478/// from two input vectors into a new input vector, with the selection and
1479/// ordering of elements determined by an array of integers, referred to as
1480/// the shuffle mask. For input vectors of width N, mask indices of 0..N-1
1481 /// refer to elements from the LHS input, and indices N to 2N-1 to elements from the RHS input.
1482/// An index of -1 is treated as undef, such that the code generator may put
1483/// any value in the corresponding element of the result.
1484class ShuffleVectorSDNode : public SDNode {
1485 // The memory for Mask is owned by the SelectionDAG's OperandAllocator, and
1486 // is freed when the SelectionDAG object is destroyed.
1487 const int *Mask;
1488
1489protected:
1490 friend class SelectionDAG;
1491
1492 ShuffleVectorSDNode(EVT VT, unsigned Order, const DebugLoc &dl, const int *M)
1493 : SDNode(ISD::VECTOR_SHUFFLE, Order, dl, getSDVTList(VT)), Mask(M) {}
1494
1495public:
1496 ArrayRef<int> getMask() const {
1497 EVT VT = getValueType(0);
1498 return makeArrayRef(Mask, VT.getVectorNumElements());
1499 }
1500
1501 int getMaskElt(unsigned Idx) const {
1502 assert(Idx < getValueType(0).getVectorNumElements() && "Idx out of range!");
1503 return Mask[Idx];
1504 }
1505
1506 bool isSplat() const { return isSplatMask(Mask, getValueType(0)); }
1507
1508 int getSplatIndex() const {
1509 assert(isSplat() && "Cannot get splat index for non-splat!");
1510 EVT VT = getValueType(0);
1511 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
1512 if (Mask[i] >= 0)
1513 return Mask[i];
1514
1515 // We can choose any index value here and be correct because all elements
1516 // are undefined. Return 0 for better potential for callers to simplify.
1517 return 0;
1518 }
1519
1520 static bool isSplatMask(const int *Mask, EVT VT);
1521
1522 /// Change values in a shuffle permute mask assuming
1523 /// the two vector operands have swapped position.
1524 static void commuteMask(MutableArrayRef<int> Mask) {
1525 unsigned NumElems = Mask.size();
1526 for (unsigned i = 0; i != NumElems; ++i) {
1527 int idx = Mask[i];
1528 if (idx < 0)
1529 continue;
1530 else if (idx < (int)NumElems)
1531 Mask[i] = idx + NumElems;
1532 else
1533 Mask[i] = idx - NumElems;
1534 }
1535 }
1536
1537 static bool classof(const SDNode *N) {
1538 return N->getOpcode() == ISD::VECTOR_SHUFFLE;
1539 }
1540};
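Usage sketch (not part of the header): reading a shuffle mask and commuting it. SVN is an assumed ShuffleVectorSDNode pointer; the sketch assumes the usual llvm/ADT headers are available.

static void inspectShuffle(const llvm::ShuffleVectorSDNode *SVN) {
  using namespace llvm;
  ArrayRef<int> Mask = SVN->getMask();
  if (SVN->isSplat()) {
    int SplatIdx = SVN->getSplatIndex(); // 0 if every element is undef
    (void)SplatIdx;
  }
  // commuteMask rewrites a mask as if the two inputs were swapped, e.g. for
  // 4-wide inputs {0, 5, -1, 7} becomes {4, 1, -1, 3}.
  SmallVector<int, 8> Commuted(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(Commuted);
}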
1541
1542class ConstantSDNode : public SDNode {
1543 friend class SelectionDAG;
1544
1545 const ConstantInt *Value;
1546
1547 ConstantSDNode(bool isTarget, bool isOpaque, const ConstantInt *val, EVT VT)
1548 : SDNode(isTarget ? ISD::TargetConstant : ISD::Constant, 0, DebugLoc(),
1549 getSDVTList(VT)),
1550 Value(val) {
1551 ConstantSDNodeBits.IsOpaque = isOpaque;
1552 }
1553
1554public:
1555 const ConstantInt *getConstantIntValue() const { return Value; }
1556 const APInt &getAPIntValue() const { return Value->getValue(); }
1557 uint64_t getZExtValue() const { return Value->getZExtValue(); }
1558 int64_t getSExtValue() const { return Value->getSExtValue(); }
1559 uint64_t getLimitedValue(uint64_t Limit = UINT64_MAX) {
1560 return Value->getLimitedValue(Limit);
1561 }
1562 MaybeAlign getMaybeAlignValue() const { return Value->getMaybeAlignValue(); }
1563 Align getAlignValue() const { return Value->getAlignValue(); }
1564
1565 bool isOne() const { return Value->isOne(); }
1566 bool isNullValue() const { return Value->isZero(); }
1567 bool isAllOnesValue() const { return Value->isMinusOne(); }
1568 bool isMaxSignedValue() const { return Value->isMaxValue(true); }
1569 bool isMinSignedValue() const { return Value->isMinValue(true); }
1570
1571 bool isOpaque() const { return ConstantSDNodeBits.IsOpaque; }
1572
1573 static bool classof(const SDNode *N) {
1574 return N->getOpcode() == ISD::Constant ||
1575 N->getOpcode() == ISD::TargetConstant;
1576 }
1577};
1578
1579uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
1580 return cast<ConstantSDNode>(getOperand(Num))->getZExtValue();
1581}
1582
1583const APInt &SDNode::getConstantOperandAPInt(unsigned Num) const {
1584 return cast<ConstantSDNode>(getOperand(Num))->getAPIntValue();
1585}
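Usage sketch (not part of the header): pulling an immediate off a node with the convenience accessors defined just above. N is an assumed SDNode pointer.

static void readImmediateOperand(const llvm::SDNode *N) {
  using namespace llvm;
  // getConstantOperandVal/APInt use cast<> internally, so guard with isa<>
  // unless the opcode guarantees a constant operand.
  if (N->getNumOperands() > 1 && isa<ConstantSDNode>(N->getOperand(1))) {
    uint64_t Imm = N->getConstantOperandVal(1);
    const APInt &ImmBits = N->getConstantOperandAPInt(1);
    (void)Imm; (void)ImmBits;
  }
}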
1586
1587class ConstantFPSDNode : public SDNode {
1588 friend class SelectionDAG;
1589
1590 const ConstantFP *Value;
1591
1592 ConstantFPSDNode(bool isTarget, const ConstantFP *val, EVT VT)
1593 : SDNode(isTarget ? ISD::TargetConstantFP : ISD::ConstantFP, 0,
1594 DebugLoc(), getSDVTList(VT)),
1595 Value(val) {}
1596
1597public:
1598 const APFloat& getValueAPF() const { return Value->getValueAPF(); }
1599 const ConstantFP *getConstantFPValue() const { return Value; }
1600
1601 /// Return true if the value is positive or negative zero.
1602 bool isZero() const { return Value->isZero(); }
1603
1604 /// Return true if the value is a NaN.
1605 bool isNaN() const { return Value->isNaN(); }
1606
1607 /// Return true if the value is an infinity
1608 bool isInfinity() const { return Value->isInfinity(); }
1609
1610 /// Return true if the value is negative.
1611 bool isNegative() const { return Value->isNegative(); }
1612
1613 /// We don't rely on operator== working on double values, as
1614 /// it returns true for things that are clearly not equal, like -0.0 and 0.0.
1615 /// As such, this method can be used to do an exact bit-for-bit comparison of
1616 /// two floating point values.
1617
1618 /// We leave the version with the double argument here because it's just so
1619 /// convenient to write "2.0" and the like. Without this function we'd
1620 /// have to duplicate its logic everywhere it's called.
1621 bool isExactlyValue(double V) const {
1622 return Value->getValueAPF().isExactlyValue(V);
1623 }
1624 bool isExactlyValue(const APFloat& V) const;
1625
1626 static bool isValueValidForType(EVT VT, const APFloat& Val);
1627
1628 static bool classof(const SDNode *N) {
1629 return N->getOpcode() == ISD::ConstantFP ||
1630 N->getOpcode() == ISD::TargetConstantFP;
1631 }
1632};
1633
1634/// Returns true if \p V is a constant integer zero.
1635bool isNullConstant(SDValue V);
1636
1637/// Returns true if \p V is an FP constant with a value of positive zero.
1638bool isNullFPConstant(SDValue V);
1639
1640/// Returns true if \p V is an integer constant with all bits set.
1641bool isAllOnesConstant(SDValue V);
1642
1643/// Returns true if \p V is a constant integer one.
1644bool isOneConstant(SDValue V);
1645
1646/// Return the non-bitcasted source operand of \p V if it exists.
1647/// If \p V is not a bitcasted value, it is returned as-is.
1648SDValue peekThroughBitcasts(SDValue V);
1649
1650/// Return the non-bitcasted and one-use source operand of \p V if it exists.
1651/// If \p V is not a bitcasted one-use value, it is returned as-is.
1652SDValue peekThroughOneUseBitcasts(SDValue V);
1653
1654/// Return the non-extracted vector source operand of \p V if it exists.
1655/// If \p V is not an extracted subvector, it is returned as-is.
1656SDValue peekThroughExtractSubvectors(SDValue V);
1657
1658/// Returns true if \p V is a bitwise not operation. Assumes that an all ones
1659/// constant is canonicalized to be operand 1.
1660bool isBitwiseNot(SDValue V, bool AllowUndefs = false);
1661
1662/// Returns the SDNode if it is a constant splat BuildVector or constant int.
1663ConstantSDNode *isConstOrConstSplat(SDValue N, bool AllowUndefs = false,
1664 bool AllowTruncation = false);
1665
1666/// Returns the SDNode if it is a demanded constant splat BuildVector or
1667/// constant int.
1668ConstantSDNode *isConstOrConstSplat(SDValue N, const APInt &DemandedElts,
1669 bool AllowUndefs = false,
1670 bool AllowTruncation = false);
1671
1672/// Returns the SDNode if it is a constant splat BuildVector or constant float.
1673ConstantFPSDNode *isConstOrConstSplatFP(SDValue N, bool AllowUndefs = false);
1674
1675/// Returns the SDNode if it is a demanded constant splat BuildVector or
1676/// constant float.
1677ConstantFPSDNode *isConstOrConstSplatFP(SDValue N, const APInt &DemandedElts,
1678 bool AllowUndefs = false);
1679
1680/// Return true if the value is a constant 0 integer or a splatted vector of
1681/// a constant 0 integer (with no undefs by default).
1682/// Build vector implicit truncation is not an issue for null values.
1683bool isNullOrNullSplat(SDValue V, bool AllowUndefs = false);
1684
1685/// Return true if the value is a constant 1 integer or a splatted vector of a
1686/// constant 1 integer (with no undefs).
1687/// Does not permit build vector implicit truncation.
1688bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false);
1689
1690/// Return true if the value is a constant -1 integer or a splatted vector of a
1691/// constant -1 integer (with no undefs).
1692/// Does not permit build vector implicit truncation.
1693bool isAllOnesOrAllOnesSplat(SDValue V, bool AllowUndefs = false);
1694
1695 /// Return true if \p V is either an integer or FP constant.
1696inline bool isIntOrFPConstant(SDValue V) {
1697 return isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V);
1698}
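Usage sketch (not part of the header): the usual pattern for matching "x op splat(C)" with the constant helpers declared above. Op is an assumed SDValue.

static void matchSplatConstant(llvm::SDValue Op) {
  using namespace llvm;
  // Look through bitcasts first, then try to match a scalar C or splat(C).
  SDValue Src = peekThroughBitcasts(Op);
  bool IsZero = isNullOrNullSplat(Src);
  if (ConstantSDNode *C = isConstOrConstSplat(Src, /*AllowUndefs=*/true)) {
    bool IsPow2Splat = C->getAPIntValue().isPowerOf2();
    (void)IsPow2Splat;
  }
  (void)IsZero;
}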
1699
1700class GlobalAddressSDNode : public SDNode {
1701 friend class SelectionDAG;
1702
1703 const GlobalValue *TheGlobal;
1704 int64_t Offset;
1705 unsigned TargetFlags;
1706
1707 GlobalAddressSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL,
1708 const GlobalValue *GA, EVT VT, int64_t o,
1709 unsigned TF);
1710
1711public:
1712 const GlobalValue *getGlobal() const { return TheGlobal; }
1713 int64_t getOffset() const { return Offset; }
1714 unsigned getTargetFlags() const { return TargetFlags; }
1715 // Return the address space this GlobalAddress belongs to.
1716 unsigned getAddressSpace() const;
1717
1718 static bool classof(const SDNode *N) {
1719 return N->getOpcode() == ISD::GlobalAddress ||
1720 N->getOpcode() == ISD::TargetGlobalAddress ||
1721 N->getOpcode() == ISD::GlobalTLSAddress ||
1722 N->getOpcode() == ISD::TargetGlobalTLSAddress;
1723 }
1724};
1725
1726class FrameIndexSDNode : public SDNode {
1727 friend class SelectionDAG;
1728
1729 int FI;
1730
1731 FrameIndexSDNode(int fi, EVT VT, bool isTarg)
1732 : SDNode(isTarg ? ISD::TargetFrameIndex : ISD::FrameIndex,
1733 0, DebugLoc(), getSDVTList(VT)), FI(fi) {
1734 }
1735
1736public:
1737 int getIndex() const { return FI; }
1738
1739 static bool classof(const SDNode *N) {
1740 return N->getOpcode() == ISD::FrameIndex ||
1741 N->getOpcode() == ISD::TargetFrameIndex;
1742 }
1743};
1744
1745/// This SDNode is used for LIFETIME_START/LIFETIME_END values, which indicate
1746 /// the offset and size that are started/ended in the underlying FrameIndex.
1747class LifetimeSDNode : public SDNode {
1748 friend class SelectionDAG;
1749 int64_t Size;
1750 int64_t Offset; // -1 if offset is unknown.
1751
1752 LifetimeSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
1753 SDVTList VTs, int64_t Size, int64_t Offset)
1754 : SDNode(Opcode, Order, dl, VTs), Size(Size), Offset(Offset) {}
1755public:
1756 int64_t getFrameIndex() const {
1757 return cast<FrameIndexSDNode>(getOperand(1))->getIndex();
1758 }
1759
1760 bool hasOffset() const { return Offset >= 0; }
1761 int64_t getOffset() const {
1762 assert(hasOffset() && "offset is unknown");
1763 return Offset;
1764 }
1765 int64_t getSize() const {
1766 assert(hasOffset() && "offset is unknown");
1767 return Size;
1768 }
1769
1770 // Methods to support isa and dyn_cast
1771 static bool classof(const SDNode *N) {
1772 return N->getOpcode() == ISD::LIFETIME_START ||
1773 N->getOpcode() == ISD::LIFETIME_END;
1774 }
1775};
1776
1777 /// This SDNode is used for PSEUDO_PROBE values, which carry the function GUID
1778 /// and the index of the basic block being probed. A pseudo probe serves as a
1779 /// placeholder and will be removed at the end of compilation. It has no
1780 /// operands because we do not want instruction selection to have to deal with them.
1781class PseudoProbeSDNode : public SDNode {
1782 friend class SelectionDAG;
1783 uint64_t Guid;
1784 uint64_t Index;
1785 uint32_t Attributes;
1786
1787 PseudoProbeSDNode(unsigned Opcode, unsigned Order, const DebugLoc &Dl,
1788 SDVTList VTs, uint64_t Guid, uint64_t Index, uint32_t Attr)
1789 : SDNode(Opcode, Order, Dl, VTs), Guid(Guid), Index(Index),
1790 Attributes(Attr) {}
1791
1792public:
1793 uint64_t getGuid() const { return Guid; }
1794 uint64_t getIndex() const { return Index; }
1795 uint32_t getAttributes() const { return Attributes; }
1796
1797 // Methods to support isa and dyn_cast
1798 static bool classof(const SDNode *N) {
1799 return N->getOpcode() == ISD::PSEUDO_PROBE;
1800 }
1801};
1802
1803class JumpTableSDNode : public SDNode {
1804 friend class SelectionDAG;
1805
1806 int JTI;
1807 unsigned TargetFlags;
1808
1809 JumpTableSDNode(int jti, EVT VT, bool isTarg, unsigned TF)
1810 : SDNode(isTarg ? ISD::TargetJumpTable : ISD::JumpTable,
1811 0, DebugLoc(), getSDVTList(VT)), JTI(jti), TargetFlags(TF) {
1812 }
1813
1814public:
1815 int getIndex() const { return JTI; }
1816 unsigned getTargetFlags() const { return TargetFlags; }
1817
1818 static bool classof(const SDNode *N) {
1819 return N->getOpcode() == ISD::JumpTable ||
1820 N->getOpcode() == ISD::TargetJumpTable;
1821 }
1822};
1823
1824class ConstantPoolSDNode : public SDNode {
1825 friend class SelectionDAG;
1826
1827 union {
1828 const Constant *ConstVal;
1829 MachineConstantPoolValue *MachineCPVal;
1830 } Val;
1831 int Offset; // It's a MachineConstantPoolValue if top bit is set.
1832 Align Alignment; // Minimum alignment requirement of CP.
1833 unsigned TargetFlags;
1834
1835 ConstantPoolSDNode(bool isTarget, const Constant *c, EVT VT, int o,
1836 Align Alignment, unsigned TF)
1837 : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0,
1838 DebugLoc(), getSDVTList(VT)),
1839 Offset(o), Alignment(Alignment), TargetFlags(TF) {
1840 assert(Offset >= 0 && "Offset is too large");
1841 Val.ConstVal = c;
1842 }
1843
1844 ConstantPoolSDNode(bool isTarget, MachineConstantPoolValue *v, EVT VT, int o,
1845 Align Alignment, unsigned TF)
1846 : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0,
1847 DebugLoc(), getSDVTList(VT)),
1848 Offset(o), Alignment(Alignment), TargetFlags(TF) {
1849 assert(Offset >= 0 && "Offset is too large");
1850 Val.MachineCPVal = v;
1851 Offset |= 1 << (sizeof(unsigned)*CHAR_BIT-1);
1852 }
1853
1854public:
1855 bool isMachineConstantPoolEntry() const {
1856 return Offset < 0;
1857 }
1858
1859 const Constant *getConstVal() const {
1860 assert(!isMachineConstantPoolEntry() && "Wrong constantpool type");
1861 return Val.ConstVal;
1862 }
1863
1864 MachineConstantPoolValue *getMachineCPVal() const {
1865 assert(isMachineConstantPoolEntry() && "Wrong constantpool type");
1866 return Val.MachineCPVal;
1867 }
1868
1869 int getOffset() const {
1870 return Offset & ~(1 << (sizeof(unsigned)*CHAR_BIT-1));
1871 }
1872
1873 // Return the alignment of this constant pool object, which is either 0 (for
1874 // default alignment) or the desired value.
1875 Align getAlign() const { return Alignment; }
1876 unsigned getTargetFlags() const { return TargetFlags; }
1877
1878 Type *getType() const;
1879
1880 static bool classof(const SDNode *N) {
1881 return N->getOpcode() == ISD::ConstantPool ||
1882 N->getOpcode() == ISD::TargetConstantPool;
1883 }
1884};
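Usage sketch (not part of the header): querying a ConstantPoolSDNode, which holds either an IR Constant or a MachineConstantPoolValue, tagged via the sign bit of Offset as described above. CP is an assumed pointer.

static void inspectConstantPool(const llvm::ConstantPoolSDNode *CP) {
  using namespace llvm;
  // The two getters assert on the entry kind, so check the tag first.
  if (CP->isMachineConstantPoolEntry()) {
    MachineConstantPoolValue *MCPV = CP->getMachineCPVal();
    (void)MCPV;
  } else {
    const Constant *C = CP->getConstVal();
    (void)C;
  }
  Align A = CP->getAlign();
  int Off = CP->getOffset(); // tag bit already masked off
  (void)A; (void)Off;
}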
1885
1886/// Completely target-dependent object reference.
1887class TargetIndexSDNode : public SDNode {
1888 friend class SelectionDAG;
1889
1890 unsigned TargetFlags;
1891 int Index;
1892 int64_t Offset;
1893
1894public:
1895 TargetIndexSDNode(int Idx, EVT VT, int64_t Ofs, unsigned TF)
1896 : SDNode(ISD::TargetIndex, 0, DebugLoc(), getSDVTList(VT)),
1897 TargetFlags(TF), Index(Idx), Offset(Ofs) {}
1898
1899 unsigned getTargetFlags() const { return TargetFlags; }
1900 int getIndex() const { return Index; }
1901 int64_t getOffset() const { return Offset; }
1902
1903 static bool classof(const SDNode *N) {
1904 return N->getOpcode() == ISD::TargetIndex;
1905 }
1906};
1907
1908class BasicBlockSDNode : public SDNode {
1909 friend class SelectionDAG;
1910
1911 MachineBasicBlock *MBB;
1912
1913 /// Debug info is meaningful and potentially useful here, but we create
1914 /// blocks out of order when they're jumped to, which makes it a bit
1915 /// harder. Let's see if we need it first.
1916 explicit BasicBlockSDNode(MachineBasicBlock *mbb)
1917 : SDNode(ISD::BasicBlock, 0, DebugLoc(), getSDVTList(MVT::Other)), MBB(mbb)
1918 {}
1919
1920public:
1921 MachineBasicBlock *getBasicBlock() const { return MBB; }
1922
1923 static bool classof(const SDNode *N) {
1924 return N->getOpcode() == ISD::BasicBlock;
1925 }
1926};
1927
1928/// A "pseudo-class" with methods for operating on BUILD_VECTORs.
1929class BuildVectorSDNode : public SDNode {
1930public:
1931 // These are constructed as SDNodes and then cast to BuildVectorSDNodes.
1932 explicit BuildVectorSDNode() = delete;
1933
1934 /// Check if this is a constant splat, and if so, find the
1935 /// smallest element size that splats the vector. If MinSplatBits is
1936 /// nonzero, the element size must be at least that large. Note that the
1937 /// splat element may be the entire vector (i.e., a one element vector).
1938 /// Returns the splat element value in SplatValue. Any undefined bits in
1939 /// that value are zero, and the corresponding bits in the SplatUndef mask
1940 /// are set. The SplatBitSize value is set to the splat element size in
1941 /// bits. HasAnyUndefs is set to true if any bits in the vector are
1942 /// undefined. isBigEndian describes the endianness of the target.
1943 bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
1944 unsigned &SplatBitSize, bool &HasAnyUndefs,
1945 unsigned MinSplatBits = 0,
1946 bool isBigEndian = false) const;
1947
1948 /// Returns the demanded splatted value or a null value if this is not a
1949 /// splat.
1950 ///
1951 /// The DemandedElts mask indicates the elements that must be in the splat.
1952 /// If passed a non-null UndefElements bitvector, it will resize it to match
1953 /// the vector width and set the bits where elements are undef.
1954 SDValue getSplatValue(const APInt &DemandedElts,
1955 BitVector *UndefElements = nullptr) const;
1956
1957 /// Returns the splatted value or a null value if this is not a splat.
1958 ///
1959 /// If passed a non-null UndefElements bitvector, it will resize it to match
1960 /// the vector width and set the bits where elements are undef.
1961 SDValue getSplatValue(BitVector *UndefElements = nullptr) const;
1962
1963 /// Find the shortest repeating sequence of values in the build vector.
1964 ///
1965 /// e.g. { u, X, u, X, u, u, X, u } -> { X }
1966 /// { X, Y, u, Y, u, u, X, u } -> { X, Y }
1967 ///
1968 /// Currently this must be a power-of-2 build vector.
1969 /// The DemandedElts mask indicates the elements that must be present,
1970 /// undemanded elements in Sequence may be null (SDValue()). If passed a
1971 /// non-null UndefElements bitvector, it will resize it to match the original
1972 /// vector width and set the bits where elements are undef. If result is
1973 /// false, Sequence will be empty.
1974 bool getRepeatedSequence(const APInt &DemandedElts,
1975 SmallVectorImpl<SDValue> &Sequence,
1976 BitVector *UndefElements = nullptr) const;
1977
1978 /// Find the shortest repeating sequence of values in the build vector.
1979 ///
1980 /// e.g. { u, X, u, X, u, u, X, u } -> { X }
1981 /// { X, Y, u, Y, u, u, X, u } -> { X, Y }
1982 ///
1983 /// Currently this must be a power-of-2 build vector.
1984 /// If passed a non-null UndefElements bitvector, it will resize it to match
1985 /// the original vector width and set the bits where elements are undef.
1986 /// If result is false, Sequence will be empty.
1987 bool getRepeatedSequence(SmallVectorImpl<SDValue> &Sequence,
1988 BitVector *UndefElements = nullptr) const;
1989
1990 /// Returns the demanded splatted constant or null if this is not a constant
1991 /// splat.
1992 ///
1993 /// The DemandedElts mask indicates the elements that must be in the splat.
1994 /// If passed a non-null UndefElements bitvector, it will resize it to match
1995 /// the vector width and set the bits where elements are undef.
1996 ConstantSDNode *
1997 getConstantSplatNode(const APInt &DemandedElts,
1998 BitVector *UndefElements = nullptr) const;
1999
2000 /// Returns the splatted constant or null if this is not a constant
2001 /// splat.
2002 ///
2003 /// If passed a non-null UndefElements bitvector, it will resize it to match
2004 /// the vector width and set the bits where elements are undef.
2005 ConstantSDNode *
2006 getConstantSplatNode(BitVector *UndefElements = nullptr) const;
2007
2008 /// Returns the demanded splatted constant FP or null if this is not a
2009 /// constant FP splat.
2010 ///
2011 /// The DemandedElts mask indicates the elements that must be in the splat.
2012 /// If passed a non-null UndefElements bitvector, it will resize it to match
2013 /// the vector width and set the bits where elements are undef.
2014 ConstantFPSDNode *
2015 getConstantFPSplatNode(const APInt &DemandedElts,
2016 BitVector *UndefElements = nullptr) const;
2017
2018 /// Returns the splatted constant FP or null if this is not a constant
2019 /// FP splat.
2020 ///
2021 /// If passed a non-null UndefElements bitvector, it will resize it to match
2022 /// the vector width and set the bits where elements are undef.
2023 ConstantFPSDNode *
2024 getConstantFPSplatNode(BitVector *UndefElements = nullptr) const;
2025
2026 /// If this is a constant FP splat and the splatted constant FP is an
2027 /// exact power of 2, return the log base 2 integer value. Otherwise,
2028 /// return -1.
2029 ///
2030 /// The BitWidth specifies the necessary bit precision.
2031 int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
2032 uint32_t BitWidth) const;
2033
2034 bool isConstant() const;
2035
2036 static bool classof(const SDNode *N) {
2037 return N->getOpcode() == ISD::BUILD_VECTOR;
2038 }
2039};
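Usage sketch (not part of the header): the common BuildVectorSDNode splat queries documented above. BV is an assumed pointer and IsBigEndian would come from the target's data layout; the sketch assumes the usual ADT headers.

static void inspectBuildVector(const llvm::BuildVectorSDNode *BV,
                               bool IsBigEndian) {
  using namespace llvm;
  APInt SplatValue, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
                          /*MinSplatBits=*/0, IsBigEndian)) {
    // SplatValue holds the splatted bits; undefined bits are zero there and
    // set in SplatUndef, as documented above.
  }
  BitVector UndefElts;
  if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
    (void)C; // UndefElts now has one bit per element of the original vector
  }
}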
2040
2041/// An SDNode that holds an arbitrary LLVM IR Value. This is
2042/// used when the SelectionDAG needs to make a simple reference to something
2043/// in the LLVM IR representation.
2044///
2045class SrcValueSDNode : public SDNode {
2046 friend class SelectionDAG;
2047
2048 const Value *V;
2049
2050 /// Create a SrcValue for a general value.
2051 explicit SrcValueSDNode(const Value *v)
2052 : SDNode(ISD::SRCVALUE, 0, DebugLoc(), getSDVTList(MVT::Other)), V(v) {}
2053
2054public:
2055 /// Return the contained Value.
2056 const Value *getValue() const { return V; }
2057
2058 static bool classof(const SDNode *N) {
2059 return N->getOpcode() == ISD::SRCVALUE;
2060 }
2061};
2062
2063class MDNodeSDNode : public SDNode {
2064 friend class SelectionDAG;
2065
2066 const MDNode *MD;
2067
2068 explicit MDNodeSDNode(const MDNode *md)
2069 : SDNode(ISD::MDNODE_SDNODE, 0, DebugLoc(), getSDVTList(MVT::Other)), MD(md)
2070 {}
2071
2072public:
2073 const MDNode *getMD() const { return MD; }
2074
2075 static bool classof(const SDNode *N) {
2076 return N->getOpcode() == ISD::MDNODE_SDNODE;
2077 }
2078};
2079
2080class RegisterSDNode : public SDNode {
2081 friend class SelectionDAG;
2082
2083 Register Reg;
2084
2085 RegisterSDNode(Register reg, EVT VT)
2086 : SDNode(ISD::Register, 0, DebugLoc(), getSDVTList(VT)), Reg(reg) {}
2087
2088public:
2089 Register getReg() const { return Reg; }
2090
2091 static bool classof(const SDNode *N) {
2092 return N->getOpcode() == ISD::Register;
2093 }
2094};
2095
2096class RegisterMaskSDNode : public SDNode {
2097 friend class SelectionDAG;
2098
2099 // The memory for RegMask is not owned by the node.
2100 const uint32_t *RegMask;
2101
2102 RegisterMaskSDNode(const uint32_t *mask)
2103 : SDNode(ISD::RegisterMask, 0, DebugLoc(), getSDVTList(MVT::Untyped)),
2104 RegMask(mask) {}
2105
2106public:
2107 const uint32_t *getRegMask() const { return RegMask; }
2108
2109 static bool classof(const SDNode *N) {
2110 return N->getOpcode() == ISD::RegisterMask;
2111 }
2112};
2113
2114class BlockAddressSDNode : public SDNode {
2115 friend class SelectionDAG;
2116
2117 const BlockAddress *BA;
2118 int64_t Offset;
2119 unsigned TargetFlags;
2120
2121 BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba,
2122 int64_t o, unsigned Flags)
2123 : SDNode(NodeTy, 0, DebugLoc(), getSDVTList(VT)),
2124 BA(ba), Offset(o), TargetFlags(Flags) {}
2125
2126public:
2127 const BlockAddress *getBlockAddress() const { return BA; }
2128 int64_t getOffset() const { return Offset; }
2129 unsigned getTargetFlags() const { return TargetFlags; }
2130
2131 static bool classof(const SDNode *N) {
2132 return N->getOpcode() == ISD::BlockAddress ||
2133 N->getOpcode() == ISD::TargetBlockAddress;
2134 }
2135};
2136
2137class LabelSDNode : public SDNode {
2138 friend class SelectionDAG;
2139
2140 MCSymbol *Label;
2141
2142 LabelSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl, MCSymbol *L)
2143 : SDNode(Opcode, Order, dl, getSDVTList(MVT::Other)), Label(L) {
2144 assert(LabelSDNode::classof(this) && "not a label opcode")((void)0);
2145 }
2146
2147public:
2148 MCSymbol *getLabel() const { return Label; }
2149
2150 static bool classof(const SDNode *N) {
2151 return N->getOpcode() == ISD::EH_LABEL ||
2152 N->getOpcode() == ISD::ANNOTATION_LABEL;
2153 }
2154};
2155
2156class ExternalSymbolSDNode : public SDNode {
2157 friend class SelectionDAG;
2158
2159 const char *Symbol;
2160 unsigned TargetFlags;
2161
2162 ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned TF, EVT VT)
2163 : SDNode(isTarget ? ISD::TargetExternalSymbol : ISD::ExternalSymbol, 0,
2164 DebugLoc(), getSDVTList(VT)),
2165 Symbol(Sym), TargetFlags(TF) {}
2166
2167public:
2168 const char *getSymbol() const { return Symbol; }
2169 unsigned getTargetFlags() const { return TargetFlags; }
2170
2171 static bool classof(const SDNode *N) {
2172 return N->getOpcode() == ISD::ExternalSymbol ||
2173 N->getOpcode() == ISD::TargetExternalSymbol;
2174 }
2175};
2176
2177class MCSymbolSDNode : public SDNode {
2178 friend class SelectionDAG;
2179
2180 MCSymbol *Symbol;
2181
2182 MCSymbolSDNode(MCSymbol *Symbol, EVT VT)
2183 : SDNode(ISD::MCSymbol, 0, DebugLoc(), getSDVTList(VT)), Symbol(Symbol) {}
2184
2185public:
2186 MCSymbol *getMCSymbol() const { return Symbol; }
2187
2188 static bool classof(const SDNode *N) {
2189 return N->getOpcode() == ISD::MCSymbol;
2190 }
2191};
2192
2193class CondCodeSDNode : public SDNode {
2194 friend class SelectionDAG;
2195
2196 ISD::CondCode Condition;
2197
2198 explicit CondCodeSDNode(ISD::CondCode Cond)
2199 : SDNode(ISD::CONDCODE, 0, DebugLoc(), getSDVTList(MVT::Other)),
2200 Condition(Cond) {}
2201
2202public:
2203 ISD::CondCode get() const { return Condition; }
2204
2205 static bool classof(const SDNode *N) {
2206 return N->getOpcode() == ISD::CONDCODE;
2207 }
2208};
2209
2210 /// This class is used to represent EVTs, which are used
2211/// to parameterize some operations.
2212class VTSDNode : public SDNode {
2213 friend class SelectionDAG;
2214
2215 EVT ValueType;
2216
2217 explicit VTSDNode(EVT VT)
2218 : SDNode(ISD::VALUETYPE, 0, DebugLoc(), getSDVTList(MVT::Other)),
2219 ValueType(VT) {}
2220
2221public:
2222 EVT getVT() const { return ValueType; }
2223
2224 static bool classof(const SDNode *N) {
2225 return N->getOpcode() == ISD::VALUETYPE;
2226 }
2227};
2228
2229/// Base class for LoadSDNode and StoreSDNode
2230class LSBaseSDNode : public MemSDNode {
2231public:
2232 LSBaseSDNode(ISD::NodeType NodeTy, unsigned Order, const DebugLoc &dl,
2233 SDVTList VTs, ISD::MemIndexedMode AM, EVT MemVT,
2234 MachineMemOperand *MMO)
2235 : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
2236 LSBaseSDNodeBits.AddressingMode = AM;
2237 assert(getAddressingMode() == AM && "Value truncated");
2238 }
2239
2240 const SDValue &getOffset() const {
2241 return getOperand(getOpcode() == ISD::LOAD ? 2 : 3);
2242 }
2243
2244 /// Return the addressing mode for this load or store:
2245 /// unindexed, pre-inc, pre-dec, post-inc, or post-dec.
2246 ISD::MemIndexedMode getAddressingMode() const {
2247 return static_cast<ISD::MemIndexedMode>(LSBaseSDNodeBits.AddressingMode);
2248 }
2249
2250 /// Return true if this is a pre/post inc/dec load/store.
2251 bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; }
2252
2253 /// Return true if this is NOT a pre/post inc/dec load/store.
2254 bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; }
2255
2256 static bool classof(const SDNode *N) {
2257 return N->getOpcode() == ISD::LOAD ||
2258 N->getOpcode() == ISD::STORE;
2259 }
2260};
2261
2262/// This class is used to represent ISD::LOAD nodes.
2263class LoadSDNode : public LSBaseSDNode {
2264 friend class SelectionDAG;
2265
2266 LoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2267 ISD::MemIndexedMode AM, ISD::LoadExtType ETy, EVT MemVT,
2268 MachineMemOperand *MMO)
2269 : LSBaseSDNode(ISD::LOAD, Order, dl, VTs, AM, MemVT, MMO) {
2270 LoadSDNodeBits.ExtTy = ETy;
2271 assert(readMem() && "Load MachineMemOperand is not a load!");
2272 assert(!writeMem() && "Load MachineMemOperand is a store!");
2273 }
2274
2275public:
2276 /// Return whether this is a plain node,
2277 /// or one of the varieties of value-extending loads.
2278 ISD::LoadExtType getExtensionType() const {
2279 return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
2280 }
2281
2282 const SDValue &getBasePtr() const { return getOperand(1); }
2283 const SDValue &getOffset() const { return getOperand(2); }
2284
2285 static bool classof(const SDNode *N) {
2286 return N->getOpcode() == ISD::LOAD;
2287 }
2288};
2289
2290/// This class is used to represent ISD::STORE nodes.
2291class StoreSDNode : public LSBaseSDNode {
2292 friend class SelectionDAG;
2293
2294 StoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2295 ISD::MemIndexedMode AM, bool isTrunc, EVT MemVT,
2296 MachineMemOperand *MMO)
2297 : LSBaseSDNode(ISD::STORE, Order, dl, VTs, AM, MemVT, MMO) {
2298 StoreSDNodeBits.IsTruncating = isTrunc;
2299 assert(!readMem() && "Store MachineMemOperand is a load!");
2300 assert(writeMem() && "Store MachineMemOperand is not a store!");
2301 }
2302
2303public:
2304 /// Return true if the op does a truncation before store.
2305 /// For integers this is the same as doing a TRUNCATE and storing the result.
2306 /// For floats, it is the same as doing an FP_ROUND and storing the result.
2307 bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
2308 void setTruncatingStore(bool Truncating) {
2309 StoreSDNodeBits.IsTruncating = Truncating;
2310 }
2311
2312 const SDValue &getValue() const { return getOperand(1); }
2313 const SDValue &getBasePtr() const { return getOperand(2); }
2314 const SDValue &getOffset() const { return getOperand(3); }
2315
2316 static bool classof(const SDNode *N) {
2317 return N->getOpcode() == ISD::STORE;
2318 }
2319};
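Usage sketch (not part of the header): classifying a plain load or store the way the ISD helper functions near the end of this file do. N is an assumed SDNode pointer.

static void classifyLoadStore(const llvm::SDNode *N) {
  using namespace llvm;
  if (auto *Ld = dyn_cast<LoadSDNode>(N)) {
    // Same test that ISD::isNormalLoad (declared further below) performs.
    bool IsPlain = Ld->getExtensionType() == ISD::NON_EXTLOAD &&
                   Ld->isUnindexed();
    SDValue Ptr = Ld->getBasePtr(); // operand 1
    (void)IsPlain; (void)Ptr;
  } else if (auto *St = dyn_cast<StoreSDNode>(N)) {
    bool Truncating = St->isTruncatingStore();
    SDValue Val = St->getValue();   // operand 1 is the stored value
    (void)Truncating; (void)Val;
  }
}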
2320
2321/// This base class is used to represent MLOAD and MSTORE nodes
2322class MaskedLoadStoreSDNode : public MemSDNode {
2323public:
2324 friend class SelectionDAG;
2325
2326 MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order,
2327 const DebugLoc &dl, SDVTList VTs,
2328 ISD::MemIndexedMode AM, EVT MemVT,
2329 MachineMemOperand *MMO)
2330 : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
2331 LSBaseSDNodeBits.AddressingMode = AM;
2332 assert(getAddressingMode() == AM && "Value truncated");
2333 }
2334
2335 // MaskedLoadSDNode (Chain, ptr, offset, mask, passthru)
2336 // MaskedStoreSDNode (Chain, data, ptr, offset, mask)
2337 // Mask is a vector of i1 elements
2338 const SDValue &getOffset() const {
2339 return getOperand(getOpcode() == ISD::MLOAD ? 2 : 3);
2340 }
2341 const SDValue &getMask() const {
2342 return getOperand(getOpcode() == ISD::MLOAD ? 3 : 4);
2343 }
2344
2345 /// Return the addressing mode for this load or store:
2346 /// unindexed, pre-inc, pre-dec, post-inc, or post-dec.
2347 ISD::MemIndexedMode getAddressingMode() const {
2348 return static_cast<ISD::MemIndexedMode>(LSBaseSDNodeBits.AddressingMode);
2349 }
2350
2351 /// Return true if this is a pre/post inc/dec load/store.
2352 bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; }
2353
2354 /// Return true if this is NOT a pre/post inc/dec load/store.
2355 bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; }
2356
2357 static bool classof(const SDNode *N) {
2358 return N->getOpcode() == ISD::MLOAD ||
2359 N->getOpcode() == ISD::MSTORE;
2360 }
2361};
2362
2363/// This class is used to represent an MLOAD node
2364class MaskedLoadSDNode : public MaskedLoadStoreSDNode {
2365public:
2366 friend class SelectionDAG;
2367
2368 MaskedLoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2369 ISD::MemIndexedMode AM, ISD::LoadExtType ETy,
2370 bool IsExpanding, EVT MemVT, MachineMemOperand *MMO)
2371 : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, VTs, AM, MemVT, MMO) {
2372 LoadSDNodeBits.ExtTy = ETy;
2373 LoadSDNodeBits.IsExpanding = IsExpanding;
2374 }
2375
2376 ISD::LoadExtType getExtensionType() const {
2377 return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
2378 }
2379
2380 const SDValue &getBasePtr() const { return getOperand(1); }
2381 const SDValue &getOffset() const { return getOperand(2); }
2382 const SDValue &getMask() const { return getOperand(3); }
2383 const SDValue &getPassThru() const { return getOperand(4); }
2384
2385 static bool classof(const SDNode *N) {
2386 return N->getOpcode() == ISD::MLOAD;
2387 }
2388
2389 bool isExpandingLoad() const { return LoadSDNodeBits.IsExpanding; }
2390};
2391
2392/// This class is used to represent an MSTORE node
2393class MaskedStoreSDNode : public MaskedLoadStoreSDNode {
2394public:
2395 friend class SelectionDAG;
2396
2397 MaskedStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2398 ISD::MemIndexedMode AM, bool isTrunc, bool isCompressing,
2399 EVT MemVT, MachineMemOperand *MMO)
2400 : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, AM, MemVT, MMO) {
2401 StoreSDNodeBits.IsTruncating = isTrunc;
2402 StoreSDNodeBits.IsCompressing = isCompressing;
2403 }
2404
2405 /// Return true if the op does a truncation before store.
2406 /// For integers this is the same as doing a TRUNCATE and storing the result.
2407 /// For floats, it is the same as doing an FP_ROUND and storing the result.
2408 bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
2409
2410 /// Returns true if the op does a compression to the vector before storing.
2411 /// The node contiguously stores the active elements (integers or floats)
2412 /// in src (those with their respective bit set in writemask k) to unaligned
2413 /// memory at base_addr.
2414 bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; }
2415
2416 const SDValue &getValue() const { return getOperand(1); }
2417 const SDValue &getBasePtr() const { return getOperand(2); }
2418 const SDValue &getOffset() const { return getOperand(3); }
2419 const SDValue &getMask() const { return getOperand(4); }
2420
2421 static bool classof(const SDNode *N) {
2422 return N->getOpcode() == ISD::MSTORE;
2423 }
2424};
2425
2426/// This is a base class used to represent
2427/// MGATHER and MSCATTER nodes
2428///
2429class MaskedGatherScatterSDNode : public MemSDNode {
2430public:
2431 friend class SelectionDAG;
2432
2433 MaskedGatherScatterSDNode(ISD::NodeType NodeTy, unsigned Order,
2434 const DebugLoc &dl, SDVTList VTs, EVT MemVT,
2435 MachineMemOperand *MMO, ISD::MemIndexType IndexType)
2436 : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
2437 LSBaseSDNodeBits.AddressingMode = IndexType;
2438 assert(getIndexType() == IndexType && "Value truncated");
2439 }
2440
2441 /// How is Index applied to BasePtr when computing addresses.
2442 ISD::MemIndexType getIndexType() const {
2443 return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
2444 }
2445 void setIndexType(ISD::MemIndexType IndexType) {
2446 LSBaseSDNodeBits.AddressingMode = IndexType;
2447 }
2448 bool isIndexScaled() const {
2449 return (getIndexType() == ISD::SIGNED_SCALED) ||
2450 (getIndexType() == ISD::UNSIGNED_SCALED);
2451 }
2452 bool isIndexSigned() const {
2453 return (getIndexType() == ISD::SIGNED_SCALED) ||
2454 (getIndexType() == ISD::SIGNED_UNSCALED);
2455 }
2456
2457 // In both nodes the mask is operand 2 and the base address is operand 3:
2458 // MaskedGatherSDNode (Chain, passthru, mask, base, index, scale)
2459 // MaskedScatterSDNode (Chain, value, mask, base, index, scale)
2460 // Mask is a vector of i1 elements
2461 const SDValue &getBasePtr() const { return getOperand(3); }
2462 const SDValue &getIndex() const { return getOperand(4); }
2463 const SDValue &getMask() const { return getOperand(2); }
2464 const SDValue &getScale() const { return getOperand(5); }
2465
2466 static bool classof(const SDNode *N) {
2467 return N->getOpcode() == ISD::MGATHER ||
2468 N->getOpcode() == ISD::MSCATTER;
2469 }
2470};
2471
2472/// This class is used to represent an MGATHER node
2473///
2474class MaskedGatherSDNode : public MaskedGatherScatterSDNode {
2475public:
2476 friend class SelectionDAG;
2477
2478 MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2479 EVT MemVT, MachineMemOperand *MMO,
2480 ISD::MemIndexType IndexType, ISD::LoadExtType ETy)
2481 : MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, VTs, MemVT, MMO,
2482 IndexType) {
2483 LoadSDNodeBits.ExtTy = ETy;
2484 }
2485
2486 const SDValue &getPassThru() const { return getOperand(1); }
2487
2488 ISD::LoadExtType getExtensionType() const {
2489 return ISD::LoadExtType(LoadSDNodeBits.ExtTy);
2490 }
2491
2492 static bool classof(const SDNode *N) {
2493 return N->getOpcode() == ISD::MGATHER;
2494 }
2495};
2496
2497/// This class is used to represent an MSCATTER node
2498///
2499class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
2500public:
2501 friend class SelectionDAG;
2502
2503 MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
2504 EVT MemVT, MachineMemOperand *MMO,
2505 ISD::MemIndexType IndexType, bool IsTrunc)
2506 : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, VTs, MemVT, MMO,
2507 IndexType) {
2508 StoreSDNodeBits.IsTruncating = IsTrunc;
2509 }
2510
2511 /// Return true if the op does a truncation before store.
2512 /// For integers this is the same as doing a TRUNCATE and storing the result.
2513 /// For floats, it is the same as doing an FP_ROUND and storing the result.
2514 bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; }
2515
2516 const SDValue &getValue() const { return getOperand(1); }
2517
2518 static bool classof(const SDNode *N) {
2519 return N->getOpcode() == ISD::MSCATTER;
2520 }
2521};
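Usage sketch (not part of the header): reading the operands of an MGATHER node through the accessors defined above. MGT is an assumed MaskedGatherSDNode pointer.

static void inspectGather(const llvm::MaskedGatherSDNode *MGT) {
  using namespace llvm;
  SDValue Chain = MGT->getChain();       // operand 0
  SDValue PassThru = MGT->getPassThru(); // operand 1
  SDValue Mask = MGT->getMask();         // operand 2, a vector of i1
  SDValue Base = MGT->getBasePtr();      // operand 3
  SDValue Index = MGT->getIndex();       // operand 4
  SDValue Scale = MGT->getScale();       // operand 5
  bool SignedIdx = MGT->isIndexSigned();
  bool ScaledIdx = MGT->isIndexScaled();
  (void)Chain; (void)PassThru; (void)Mask; (void)Base;
  (void)Index; (void)Scale; (void)SignedIdx; (void)ScaledIdx;
}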
2522
2523/// An SDNode that represents everything that will be needed
2524/// to construct a MachineInstr. These nodes are created during the
2525/// instruction selection proper phase.
2526///
2527/// Note that the only supported way to set the `memoperands` is by calling the
2528/// `SelectionDAG::setNodeMemRefs` function as the memory management happens
2529/// inside the DAG rather than in the node.
2530class MachineSDNode : public SDNode {
2531private:
2532 friend class SelectionDAG;
2533
2534 MachineSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL, SDVTList VTs)
2535 : SDNode(Opc, Order, DL, VTs) {}
2536
2537 // We use a pointer union between a single `MachineMemOperand` pointer and
2538 // a pointer to an array of `MachineMemOperand` pointers. This is null when
2539 // the number of these is zero, the single pointer variant used when the
2540 // number is one, and the array is used for larger numbers.
2541 //
2542 // The array is allocated via the `SelectionDAG`'s allocator and so will
2543 // always live until the DAG is cleaned up and doesn't require ownership here.
2544 //
2545 // We can't use something simpler like `TinyPtrVector` here because `SDNode`
2546 // subclasses aren't managed in a conforming C++ manner. See the comments on
2547 // `SelectionDAG::MorphNodeTo` which details what all goes on, but the
2548 // constraint here is that these don't manage memory with their constructor or
2549 // destructor and can be initialized to a good state even if they start off
2550 // uninitialized.
2551 PointerUnion<MachineMemOperand *, MachineMemOperand **> MemRefs = {};
2552
2553 // Note that this could be folded into the above `MemRefs` member if doing so
2554 // is advantageous at some point. We don't need to store this in most cases.
2555 // However, at the moment this doesn't appear to make the allocation any
2556 // smaller and makes the code somewhat simpler to read.
2557 int NumMemRefs = 0;
2558
2559public:
2560 using mmo_iterator = ArrayRef<MachineMemOperand *>::const_iterator;
2561
2562 ArrayRef<MachineMemOperand *> memoperands() const {
2563 // Special case the common cases.
2564 if (NumMemRefs == 0)
2565 return {};
2566 if (NumMemRefs == 1)
2567 return makeArrayRef(MemRefs.getAddrOfPtr1(), 1);
2568
2569 // Otherwise we have an actual array.
2570 return makeArrayRef(MemRefs.get<MachineMemOperand **>(), NumMemRefs);
2571 }
2572 mmo_iterator memoperands_begin() const { return memoperands().begin(); }
2573 mmo_iterator memoperands_end() const { return memoperands().end(); }
2574 bool memoperands_empty() const { return memoperands().empty(); }
2575
2576 /// Clear out the memory reference descriptor list.
2577 void clearMemRefs() {
2578 MemRefs = nullptr;
2579 NumMemRefs = 0;
2580 }
2581
2582 static bool classof(const SDNode *N) {
2583 return N->isMachineOpcode();
2584 }
2585};
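Usage sketch (not part of the header): iterating the memory references attached to a MachineSDNode. MN is an assumed pointer; as the class comment notes, the list itself is set via SelectionDAG::setNodeMemRefs.

static void scanMemRefs(const llvm::MachineSDNode *MN) {
  using namespace llvm;
  // memoperands() hides the single-pointer vs. pointer-array representation.
  for (MachineMemOperand *MMO : MN->memoperands()) {
    if (MMO->isVolatile()) {
      // e.g. refuse to reorder other memory operations around this node
    }
  }
}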
2586
2587/// An SDNode that records if a register contains a value that is guaranteed to
2588/// be aligned accordingly.
2589class AssertAlignSDNode : public SDNode {
2590 Align Alignment;
2591
2592public:
2593 AssertAlignSDNode(unsigned Order, const DebugLoc &DL, EVT VT, Align A)
2594 : SDNode(ISD::AssertAlign, Order, DL, getSDVTList(VT)), Alignment(A) {}
2595
2596 Align getAlign() const { return Alignment; }
2597
2598 static bool classof(const SDNode *N) {
2599 return N->getOpcode() == ISD::AssertAlign;
2600 }
2601};
2602
2603class SDNodeIterator {
2604 const SDNode *Node;
2605 unsigned Operand;
2606
2607 SDNodeIterator(const SDNode *N, unsigned Op) : Node(N), Operand(Op) {}
2608
2609public:
2610 using iterator_category = std::forward_iterator_tag;
2611 using value_type = SDNode;
2612 using difference_type = std::ptrdiff_t;
2613 using pointer = value_type *;
2614 using reference = value_type &;
2615
2616 bool operator==(const SDNodeIterator& x) const {
2617 return Operand == x.Operand;
2618 }
2619 bool operator!=(const SDNodeIterator& x) const { return !operator==(x); }
2620
2621 pointer operator*() const {
2622 return Node->getOperand(Operand).getNode();
2623 }
2624 pointer operator->() const { return operator*(); }
2625
2626 SDNodeIterator& operator++() { // Preincrement
2627 ++Operand;
2628 return *this;
2629 }
2630 SDNodeIterator operator++(int) { // Postincrement
2631 SDNodeIterator tmp = *this; ++*this; return tmp;
2632 }
2633 size_t operator-(SDNodeIterator Other) const {
2634 assert(Node == Other.Node &&
2635 "Cannot compare iterators of two different nodes!");
2636 return Operand - Other.Operand;
2637 }
2638
2639 static SDNodeIterator begin(const SDNode *N) { return SDNodeIterator(N, 0); }
2640 static SDNodeIterator end (const SDNode *N) {
2641 return SDNodeIterator(N, N->getNumOperands());
2642 }
2643
2644 unsigned getOperand() const { return Operand; }
2645 const SDNode *getNode() const { return Node; }
2646};
2647
2648template <> struct GraphTraits<SDNode*> {
2649 using NodeRef = SDNode *;
2650 using ChildIteratorType = SDNodeIterator;
2651
2652 static NodeRef getEntryNode(SDNode *N) { return N; }
2653
2654 static ChildIteratorType child_begin(NodeRef N) {
2655 return SDNodeIterator::begin(N);
2656 }
2657
2658 static ChildIteratorType child_end(NodeRef N) {
2659 return SDNodeIterator::end(N);
2660 }
2661};
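Usage sketch (not part of the header): the GraphTraits specialization above is what lets the generic graph iterators walk a DAG starting at any node. Root is an assumed SDNode pointer.

#include "llvm/ADT/DepthFirstIterator.h"

static void walkOperands(llvm::SDNode *Root) {
  // depth_first() visits Root and, transitively, all of its operands,
  // using SDNodeIterator as the child iterator.
  for (llvm::SDNode *N : llvm::depth_first(Root)) {
    (void)N;
  }
}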
2662
2663/// A representation of the largest SDNode, for use in sizeof().
2664///
2665/// This needs to be a union because the largest node differs on 32 bit systems
2666/// with 4 and 8 byte pointer alignment, respectively.
2667using LargestSDNode = AlignedCharArrayUnion<AtomicSDNode, TargetIndexSDNode,
2668 BlockAddressSDNode,
2669 GlobalAddressSDNode,
2670 PseudoProbeSDNode>;
2671
2672/// The SDNode class with the greatest alignment requirement.
2673using MostAlignedSDNode = GlobalAddressSDNode;
2674
2675namespace ISD {
2676
2677 /// Returns true if the specified node is a non-extending and unindexed load.
2678 inline bool isNormalLoad(const SDNode *N) {
2679 const LoadSDNode *Ld = dyn_cast<LoadSDNode>(N);
2680 return Ld && Ld->getExtensionType() == ISD::NON_EXTLOAD &&
2681 Ld->getAddressingMode() == ISD::UNINDEXED;
2682 }
2683
2684 /// Returns true if the specified node is a non-extending load.
2685 inline bool isNON_EXTLoad(const SDNode *N) {
2686 return isa<LoadSDNode>(N) &&
2687 cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
2688 }
2689
2690 /// Returns true if the specified node is a EXTLOAD.
2691 inline bool isEXTLoad(const SDNode *N) {
2692 return isa<LoadSDNode>(N) &&
2693 cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD;
2694 }
2695
2696 /// Returns true if the specified node is a SEXTLOAD.
2697 inline bool isSEXTLoad(const SDNode *N) {
2698 return isa<LoadSDNode>(N) &&
2699 cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
2700 }
2701
2702 /// Returns true if the specified node is a ZEXTLOAD.
2703 inline bool isZEXTLoad(const SDNode *N) {
2704 return isa<LoadSDNode>(N) &&
2705 cast<LoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
2706 }
2707
2708 /// Returns true if the specified node is an unindexed load.
2709 inline bool isUNINDEXEDLoad(const SDNode *N) {
2710 return isa<LoadSDNode>(N) &&
2711 cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
2712 }
2713
2714 /// Returns true if the specified node is a non-truncating
2715 /// and unindexed store.
2716 inline bool isNormalStore(const SDNode *N) {
2717 const StoreSDNode *St = dyn_cast<StoreSDNode>(N);
2718 return St && !St->isTruncatingStore() &&
2719 St->getAddressingMode() == ISD::UNINDEXED;
2720 }
2721
2722 /// Returns true if the specified node is an unindexed store.
2723 inline bool isUNINDEXEDStore(const SDNode *N) {
2724 return isa<StoreSDNode>(N) &&
2725 cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
2726 }
2727
2728 /// Attempt to match a unary predicate against a scalar/splat constant or
2729 /// every element of a constant BUILD_VECTOR.
2730 /// If AllowUndef is true, then UNDEF elements will pass nullptr to Match.
2731 bool matchUnaryPredicate(SDValue Op,
2732 std::function<bool(ConstantSDNode *)> Match,
2733 bool AllowUndefs = false);
2734
2735 /// Attempt to match a binary predicate against a pair of scalar/splat
2736 /// constants or every element of a pair of constant BUILD_VECTORs.
2737 /// If AllowUndef is true, then UNDEF elements will pass nullptr to Match.
2738 /// If AllowTypeMismatch is true then RetType + ArgTypes don't need to match.
2739 bool matchBinaryPredicate(
2740 SDValue LHS, SDValue RHS,
2741 std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match,
2742 bool AllowUndefs = false, bool AllowTypeMismatch = false);
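Usage sketch (not part of the header): matchUnaryPredicate applied to a scalar constant or a constant BUILD_VECTOR. Op and the helper name are hypothetical.

static bool allElementsNonZero(llvm::SDValue Op) {
  using namespace llvm;
  // Succeeds for a non-zero scalar constant or a BUILD_VECTOR whose every
  // element is a non-zero constant.
  return ISD::matchUnaryPredicate(
      Op, [](ConstantSDNode *C) { return C && !C->isNullValue(); },
      /*AllowUndefs=*/false);
}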
2743
2744 /// Returns true if the specified value is the overflow result from one
2745 /// of the overflow intrinsic nodes.
2746 inline bool isOverflowIntrOpRes(SDValue Op) {
2747 unsigned Opc = Op.getOpcode();
2748 return (Op.getResNo() == 1 &&
2749 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2750 Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
2751 }
2752
2753} // end namespace ISD
2754
2755} // end namespace llvm
2756
2757#endif // LLVM_CODEGEN_SELECTIONDAGNODES_H