Bug Summary

File: src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
Warning: line 1122, column 10
Called C++ object pointer is null

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -D PIC -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -D_RET_PROTECTOR -ret-protector -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc 
-fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86/X86ISelLowering.cpp

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86/X86ISelLowering.cpp

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/Instructions.h"
52#include "llvm/IR/Intrinsics.h"
53#include "llvm/IR/IRBuilder.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
58#include "llvm/Support/CommandLine.h"
59#include "llvm/Support/Debug.h"
60#include "llvm/Support/ErrorHandling.h"
61#include "llvm/Support/KnownBits.h"
62#include "llvm/Support/MathExtras.h"
63#include "llvm/Target/TargetOptions.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
72STATISTIC(NumTailCalls, "Number of tail calls");
73
74static cl::opt<int> ExperimentalPrefLoopAlignment(
75 "x86-experimental-pref-loop-alignment", cl::init(4),
76 cl::desc(
77 "Sets the preferable loop alignment for experiments (as log2 bytes)"
78 "(the last x86-experimental-pref-loop-alignment bits"
79 " of the loop header PC will be 0)."),
80 cl::Hidden);
81
82static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
83 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
84 cl::desc(
85 "Sets the preferable loop alignment for experiments (as log2 bytes) "
86 "for innermost loops only. If specified, this option overrides "
87 "alignment set by x86-experimental-pref-loop-alignment."),
88 cl::Hidden);
89
90static cl::opt<bool> MulConstantOptimization(
91 "mul-constant-optimization", cl::init(true),
92 cl::desc("Replace 'mul x, Const' with more effective instructions like "
93 "SHIFT, LEA, etc."),
94 cl::Hidden);
95
96static cl::opt<bool> ExperimentalUnorderedISEL(
97 "x86-experimental-unordered-atomic-isel", cl::init(false),
98 cl::desc("Use LoadSDNode and StoreSDNode instead of "
99 "AtomicSDNode for unordered atomic loads and "
100 "stores respectively."),
101 cl::Hidden);
102
103/// Call this when the user attempts to do something unsupported, like
104/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
105/// report_fatal_error, so calling code should attempt to recover without
106/// crashing.
107static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
108 const char *Msg) {
109 MachineFunction &MF = DAG.getMachineFunction();
110 DAG.getContext()->diagnose(
111 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
112}
113
114X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
115 const X86Subtarget &STI)
116 : TargetLowering(TM), Subtarget(STI) {
117 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
118 X86ScalarSSEf64 = Subtarget.hasSSE2();
119 X86ScalarSSEf32 = Subtarget.hasSSE1();
120 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
121
122 // Set up the TargetLowering object.
123
124 // X86 is weird. It always uses i8 for shift amounts and setcc results.
125 setBooleanContents(ZeroOrOneBooleanContent);
126 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
127 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
128
129 // For 64-bit, since we have so many registers, use the ILP scheduler.
130 // For 32-bit, use the register pressure specific scheduling.
131 // For Atom, always use ILP scheduling.
132 if (Subtarget.isAtom())
133 setSchedulingPreference(Sched::ILP);
134 else if (Subtarget.is64Bit())
135 setSchedulingPreference(Sched::ILP);
136 else
137 setSchedulingPreference(Sched::RegPressure);
138 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
139 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
140
141 // Bypass expensive divides and use cheaper ones.
142 if (TM.getOptLevel() >= CodeGenOpt::Default) {
143 if (Subtarget.hasSlowDivide32())
144 addBypassSlowDiv(32, 8);
145 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
146 addBypassSlowDiv(64, 32);
147 }
148
149 // Setup Windows compiler runtime calls.
150 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
151 static const struct {
152 const RTLIB::Libcall Op;
153 const char * const Name;
154 const CallingConv::ID CC;
155 } LibraryCalls[] = {
156 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
157 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
158 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
159 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
160 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
161 };
162
163 for (const auto &LC : LibraryCalls) {
164 setLibcallName(LC.Op, LC.Name);
165 setLibcallCallingConv(LC.Op, LC.CC);
166 }
167 }
168
169 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
170 // MSVCRT doesn't have powi; fall back to pow
171 setLibcallName(RTLIB::POWI_F32, nullptr);
172 setLibcallName(RTLIB::POWI_F64, nullptr);
173 }
174
175 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
176 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
177 // FIXME: Should we be limiting the atomic size on other configs? Default is
178 // 1024.
179 if (!Subtarget.hasCmpxchg8b())
180 setMaxAtomicSizeInBitsSupported(32);
181
182 // Set up the register classes.
183 addRegisterClass(MVT::i8, &X86::GR8RegClass);
184 addRegisterClass(MVT::i16, &X86::GR16RegClass);
185 addRegisterClass(MVT::i32, &X86::GR32RegClass);
186 if (Subtarget.is64Bit())
187 addRegisterClass(MVT::i64, &X86::GR64RegClass);
188
189 for (MVT VT : MVT::integer_valuetypes())
190 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
191
192 // We don't accept any truncstore of integer registers.
193 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
194 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
197 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
198 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
199
200 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
201
202 // SETOEQ and SETUNE require checking two conditions.
203 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 setCondCodeAction(ISD::SETOEQ, VT, Expand);
205 setCondCodeAction(ISD::SETUNE, VT, Expand);
206 }
207
208 // Integer absolute.
209 if (Subtarget.hasCMov()) {
210 setOperationAction(ISD::ABS , MVT::i16 , Custom);
211 setOperationAction(ISD::ABS , MVT::i32 , Custom);
212 if (Subtarget.is64Bit())
213 setOperationAction(ISD::ABS , MVT::i64 , Custom);
214 }
215
216 // Funnel shifts.
217 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
218 // For slow shld targets we only lower for code size.
219 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
220
221 setOperationAction(ShiftOp , MVT::i8 , Custom);
222 setOperationAction(ShiftOp , MVT::i16 , Custom);
223 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
224 if (Subtarget.is64Bit())
225 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
226 }
227
228 if (!Subtarget.useSoftFloat()) {
229 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
230 // operation.
231 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
232 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
233 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
234 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
235 // We have an algorithm for SSE2, and we turn this into a 64-bit
236 // FILD or VCVTUSI2SS/SD for other targets.
237 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
238 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
239 // We have an algorithm for SSE2->double, and we turn this into a
240 // 64-bit FILD followed by conditional FADD for other targets.
241 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
242 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
243
244 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
245 // this operation.
246 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
247 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
248 // SSE has no i16 to fp conversion, only i32. We promote in the handler
249 // to allow f80 to use i16 and f64 to use i16 with sse1 only
250 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
251 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
252 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
253 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
254 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
255 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
256 // are Legal, f80 is custom lowered.
257 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
258 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
259
260 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
261 // this operation.
262 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
263 // FIXME: This doesn't generate invalid exception when it should. PR44019.
264 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
265 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
266 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
267 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
268 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
271 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
272 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
273
274 // Handle FP_TO_UINT by promoting the destination to a larger signed
275 // conversion.
276 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
278 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
279 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
280 // FIXME: This doesn't generate invalid exception when it should. PR44019.
281 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
282 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
283 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
284 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
285 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
286
287 setOperationAction(ISD::LRINT, MVT::f32, Custom);
288 setOperationAction(ISD::LRINT, MVT::f64, Custom);
289 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
290 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
291
292 if (!Subtarget.is64Bit()) {
293 setOperationAction(ISD::LRINT, MVT::i64, Custom);
294 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
295 }
296 }
297
298 if (Subtarget.hasSSE2()) {
299 // Custom lowering for saturating float to int conversions.
300 // We handle promotion to larger result types manually.
301 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
302 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
303 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
304 }
305 if (Subtarget.is64Bit()) {
306 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
307 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
308 }
309 }
310
311 // Handle address space casts between mixed sized pointers.
312 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
313 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
314
315 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
316 if (!X86ScalarSSEf64) {
317 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
318 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
319 if (Subtarget.is64Bit()) {
320 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
321 // Without SSE, i64->f64 goes through memory.
322 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
323 }
324 } else if (!Subtarget.is64Bit())
325 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
326
327 // Scalar integer divide and remainder are lowered to use operations that
328 // produce two results, to match the available instructions. This exposes
329 // the two-result form to trivial CSE, which is able to combine x/y and x%y
330 // into a single instruction.
331 //
332 // Scalar integer multiply-high is also lowered to use two-result
333 // operations, to match the available instructions. However, plain multiply
334 // (low) operations are left as Legal, as there are single-result
335 // instructions for this in x86. Using the two-result multiply instructions
336 // when both high and low results are needed must be arranged by dagcombine.
337 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
338 setOperationAction(ISD::MULHS, VT, Expand);
339 setOperationAction(ISD::MULHU, VT, Expand);
340 setOperationAction(ISD::SDIV, VT, Expand);
341 setOperationAction(ISD::UDIV, VT, Expand);
342 setOperationAction(ISD::SREM, VT, Expand);
343 setOperationAction(ISD::UREM, VT, Expand);
344 }
345
346 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
347 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
348 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
349 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
350 setOperationAction(ISD::BR_CC, VT, Expand);
351 setOperationAction(ISD::SELECT_CC, VT, Expand);
352 }
353 if (Subtarget.is64Bit())
354 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
355 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
356 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
357 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
358
359 setOperationAction(ISD::FREM , MVT::f32 , Expand);
360 setOperationAction(ISD::FREM , MVT::f64 , Expand);
361 setOperationAction(ISD::FREM , MVT::f80 , Expand);
362 setOperationAction(ISD::FREM , MVT::f128 , Expand);
363
364 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
365 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
366 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
367 }
368
369 // Promote the i8 variants and force them on up to i32 which has a shorter
370 // encoding.
371 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
372 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
373
374 if (Subtarget.hasBMI()) {
375 // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
376 // is enabled.
377 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
378 } else {
379 setOperationAction(ISD::CTTZ, MVT::i16, Custom);
380 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
381 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
382 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
383 if (Subtarget.is64Bit()) {
384 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
385 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
386 }
387 }
388
389 if (Subtarget.hasLZCNT()) {
390 // When promoting the i8 variants, force them to i32 for a shorter
391 // encoding.
392 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
393 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
394 } else {
395 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
396 if (VT == MVT::i64 && !Subtarget.is64Bit())
397 continue;
398 setOperationAction(ISD::CTLZ , VT, Custom);
399 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
400 }
401 }
402
403 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
404 ISD::STRICT_FP_TO_FP16}) {
405 // Special handling for half-precision floating point conversions.
406 // If we don't have F16C support, then lower half float conversions
407 // into library calls.
408 setOperationAction(
409 Op, MVT::f32,
410 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
411 // There's never any support for operations beyond MVT::f32.
412 setOperationAction(Op, MVT::f64, Expand);
413 setOperationAction(Op, MVT::f80, Expand);
414 setOperationAction(Op, MVT::f128, Expand);
415 }
416
417 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
418 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
419 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
420 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
421 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
422 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
423 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
424 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
425
426 setOperationAction(ISD::PARITY, MVT::i8, Custom);
427 if (Subtarget.hasPOPCNT()) {
428 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
429 } else {
430 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
431 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
432 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
433 if (Subtarget.is64Bit())
434 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
435 else
436 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
437
438 setOperationAction(ISD::PARITY, MVT::i16, Custom);
439 setOperationAction(ISD::PARITY, MVT::i32, Custom);
440 if (Subtarget.is64Bit())
441 setOperationAction(ISD::PARITY, MVT::i64, Custom);
442 }
443
444 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
445
446 if (!Subtarget.hasMOVBE())
447 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
448
449 // X86 wants to expand cmov itself.
450 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
451 setOperationAction(ISD::SELECT, VT, Custom);
452 setOperationAction(ISD::SETCC, VT, Custom);
453 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
454 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
455 }
456 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::SELECT, VT, Custom);
460 setOperationAction(ISD::SETCC, VT, Custom);
461 }
462
463 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
464 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
465 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
466
467 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
468 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
469 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
470 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
471 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
472 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
473 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
474 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
475
476 // Darwin ABI issue.
477 for (auto VT : { MVT::i32, MVT::i64 }) {
478 if (VT == MVT::i64 && !Subtarget.is64Bit())
479 continue;
480 setOperationAction(ISD::ConstantPool , VT, Custom);
481 setOperationAction(ISD::JumpTable , VT, Custom);
482 setOperationAction(ISD::GlobalAddress , VT, Custom);
483 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
484 setOperationAction(ISD::ExternalSymbol , VT, Custom);
485 setOperationAction(ISD::BlockAddress , VT, Custom);
486 }
487
488 // 64-bit shl, sra, srl (iff 32-bit x86)
489 for (auto VT : { MVT::i32, MVT::i64 }) {
490 if (VT == MVT::i64 && !Subtarget.is64Bit())
491 continue;
492 setOperationAction(ISD::SHL_PARTS, VT, Custom);
493 setOperationAction(ISD::SRA_PARTS, VT, Custom);
494 setOperationAction(ISD::SRL_PARTS, VT, Custom);
495 }
496
497 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
498 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
499
500 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
501
502 // Expand certain atomics
503 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
504 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
505 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
506 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
507 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
508 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
509 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
510 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
511 }
512
513 if (!Subtarget.is64Bit())
514 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
515
516 if (Subtarget.hasCmpxchg16b()) {
517 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
518 }
519
520 // FIXME - use subtarget debug flags
521 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
522 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
523 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
524 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
525 }
526
527 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
528 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
529
530 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
531 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
532
533 setOperationAction(ISD::TRAP, MVT::Other, Legal);
534 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
535 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
536
537 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
538 setOperationAction(ISD::VASTART , MVT::Other, Custom);
539 setOperationAction(ISD::VAEND , MVT::Other, Expand);
540 bool Is64Bit = Subtarget.is64Bit();
541 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
542 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
543
544 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
545 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
546
547 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
548
549 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
550 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
551 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
552
553 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
554 // f32 and f64 use SSE.
555 // Set up the FP register classes.
556 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
557 : &X86::FR32RegClass);
558 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
559 : &X86::FR64RegClass);
560
561 // Disable f32->f64 extload as we can only generate this in one instruction
562 // under optsize. So it's easier to pattern match (fpext (load)) for that
563 // case instead of needing to emit 2 instructions for extload in the
564 // non-optsize case.
565 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
566
567 for (auto VT : { MVT::f32, MVT::f64 }) {
568 // Use ANDPD to simulate FABS.
569 setOperationAction(ISD::FABS, VT, Custom);
570
571 // Use XORP to simulate FNEG.
572 setOperationAction(ISD::FNEG, VT, Custom);
573
574 // Use ANDPD and ORPD to simulate FCOPYSIGN.
575 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
576
577 // These might be better off as horizontal vector ops.
578 setOperationAction(ISD::FADD, VT, Custom);
579 setOperationAction(ISD::FSUB, VT, Custom);
580
581 // We don't support sin/cos/fmod
582 setOperationAction(ISD::FSIN , VT, Expand);
583 setOperationAction(ISD::FCOS , VT, Expand);
584 setOperationAction(ISD::FSINCOS, VT, Expand);
585 }
586
587 // Lower this to MOVMSK plus an AND.
588 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
589 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
590
591 } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
592 (UseX87 || Is64Bit)) {
593 // Use SSE for f32, x87 for f64.
594 // Set up the FP register classes.
595 addRegisterClass(MVT::f32, &X86::FR32RegClass);
596 if (UseX87)
597 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
598
599 // Use ANDPS to simulate FABS.
600 setOperationAction(ISD::FABS , MVT::f32, Custom);
601
602 // Use XORP to simulate FNEG.
603 setOperationAction(ISD::FNEG , MVT::f32, Custom);
604
605 if (UseX87)
606 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
607
608 // Use ANDPS and ORPS to simulate FCOPYSIGN.
609 if (UseX87)
610 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
611 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
612
613 // We don't support sin/cos/fmod
614 setOperationAction(ISD::FSIN , MVT::f32, Expand);
615 setOperationAction(ISD::FCOS , MVT::f32, Expand);
616 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
617
618 if (UseX87) {
619 // Always expand sin/cos functions even though x87 has an instruction.
620 setOperationAction(ISD::FSIN, MVT::f64, Expand);
621 setOperationAction(ISD::FCOS, MVT::f64, Expand);
622 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
623 }
624 } else if (UseX87) {
625 // f32 and f64 in x87.
626 // Set up the FP register classes.
627 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
628 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
629
630 for (auto VT : { MVT::f32, MVT::f64 }) {
631 setOperationAction(ISD::UNDEF, VT, Expand);
632 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
633
634 // Always expand sin/cos functions even though x87 has an instruction.
635 setOperationAction(ISD::FSIN , VT, Expand);
636 setOperationAction(ISD::FCOS , VT, Expand);
637 setOperationAction(ISD::FSINCOS, VT, Expand);
638 }
639 }
640
641 // Expand FP32 immediates into loads from the stack, save special cases.
642 if (isTypeLegal(MVT::f32)) {
643 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
644 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
645 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
646 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
647 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
648 } else // SSE immediates.
649 addLegalFPImmediate(APFloat(+0.0f)); // xorps
650 }
651 // Expand FP64 immediates into loads from the stack, save special cases.
652 if (isTypeLegal(MVT::f64)) {
653 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
654 addLegalFPImmediate(APFloat(+0.0)); // FLD0
655 addLegalFPImmediate(APFloat(+1.0)); // FLD1
656 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
657 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
658 } else // SSE immediates.
659 addLegalFPImmediate(APFloat(+0.0)); // xorpd
660 }
661 // Handle constrained floating-point operations of scalar.
662 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
663 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
664 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
665 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
666 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
667 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
668 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
669 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
670 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
671 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
672 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
673 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
674 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
675
676 // We don't support FMA.
677 setOperationAction(ISD::FMA, MVT::f64, Expand);
678 setOperationAction(ISD::FMA, MVT::f32, Expand);
679
680 // f80 always uses X87.
681 if (UseX87) {
682 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
683 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
684 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
685 {
686 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
687 addLegalFPImmediate(TmpFlt); // FLD0
688 TmpFlt.changeSign();
689 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
690
691 bool ignored;
692 APFloat TmpFlt2(+1.0);
693 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
694 &ignored);
695 addLegalFPImmediate(TmpFlt2); // FLD1
696 TmpFlt2.changeSign();
697 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
698 }
699
700 // Always expand sin/cos functions even though x87 has an instruction.
701 setOperationAction(ISD::FSIN , MVT::f80, Expand);
702 setOperationAction(ISD::FCOS , MVT::f80, Expand);
703 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
704
705 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
706 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
707 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
708 setOperationAction(ISD::FRINT, MVT::f80, Expand);
709 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
710 setOperationAction(ISD::FMA, MVT::f80, Expand);
711 setOperationAction(ISD::LROUND, MVT::f80, Expand);
712 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
713 setOperationAction(ISD::LRINT, MVT::f80, Custom);
714 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
715
716 // Handle constrained floating-point operations of scalar.
717 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
718 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
719 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
720 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
721 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
722 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
723 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
724 // as Custom.
725 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
726 }
727
728 // f128 uses xmm registers, but most operations require libcalls.
729 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
730 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
731 : &X86::VR128RegClass);
732
733 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
734
735 setOperationAction(ISD::FADD, MVT::f128, LibCall);
736 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
737 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
738 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
739 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
740 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
741 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
742 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
743 setOperationAction(ISD::FMA, MVT::f128, LibCall);
744 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
745
746 setOperationAction(ISD::FABS, MVT::f128, Custom);
747 setOperationAction(ISD::FNEG, MVT::f128, Custom);
748 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
749
750 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
751 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
752 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
753 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
754 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
755 // No STRICT_FSINCOS
756 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
757 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
758
759 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
760 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
761 // We need to custom handle any FP_ROUND with an f128 input, but
762 // LegalizeDAG uses the result type to know when to run a custom handler.
763 // So we have to list all legal floating point result types here.
764 if (isTypeLegal(MVT::f32)) {
765 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
766 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
767 }
768 if (isTypeLegal(MVT::f64)) {
769 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
770 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
771 }
772 if (isTypeLegal(MVT::f80)) {
773 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
774 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
775 }
776
777 setOperationAction(ISD::SETCC, MVT::f128, Custom);
778
779 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
780 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
781 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
782 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
783 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
784 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
785 }
786
787 // Always use a library call for pow.
788 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
789 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
790 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
791 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
792
793 setOperationAction(ISD::FLOG, MVT::f80, Expand);
794 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
795 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
796 setOperationAction(ISD::FEXP, MVT::f80, Expand);
797 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
798 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
799 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
800
801 // Some FP actions are always expanded for vector types.
802 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
803 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
804 setOperationAction(ISD::FSIN, VT, Expand);
805 setOperationAction(ISD::FSINCOS, VT, Expand);
806 setOperationAction(ISD::FCOS, VT, Expand);
807 setOperationAction(ISD::FREM, VT, Expand);
808 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
809 setOperationAction(ISD::FPOW, VT, Expand);
810 setOperationAction(ISD::FLOG, VT, Expand);
811 setOperationAction(ISD::FLOG2, VT, Expand);
812 setOperationAction(ISD::FLOG10, VT, Expand);
813 setOperationAction(ISD::FEXP, VT, Expand);
814 setOperationAction(ISD::FEXP2, VT, Expand);
815 }
816
817 // First set operation action for all vector types to either promote
818 // (for widening) or expand (for scalarization). Then we will selectively
819 // turn on ones that can be effectively codegen'd.
820 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
821 setOperationAction(ISD::SDIV, VT, Expand);
822 setOperationAction(ISD::UDIV, VT, Expand);
823 setOperationAction(ISD::SREM, VT, Expand);
824 setOperationAction(ISD::UREM, VT, Expand);
825 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
826 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
827 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
828 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
829 setOperationAction(ISD::FMA, VT, Expand);
830 setOperationAction(ISD::FFLOOR, VT, Expand);
831 setOperationAction(ISD::FCEIL, VT, Expand);
832 setOperationAction(ISD::FTRUNC, VT, Expand);
833 setOperationAction(ISD::FRINT, VT, Expand);
834 setOperationAction(ISD::FNEARBYINT, VT, Expand);
835 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
836 setOperationAction(ISD::MULHS, VT, Expand);
837 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
838 setOperationAction(ISD::MULHU, VT, Expand);
839 setOperationAction(ISD::SDIVREM, VT, Expand);
840 setOperationAction(ISD::UDIVREM, VT, Expand);
841 setOperationAction(ISD::CTPOP, VT, Expand);
842 setOperationAction(ISD::CTTZ, VT, Expand);
843 setOperationAction(ISD::CTLZ, VT, Expand);
844 setOperationAction(ISD::ROTL, VT, Expand);
845 setOperationAction(ISD::ROTR, VT, Expand);
846 setOperationAction(ISD::BSWAP, VT, Expand);
847 setOperationAction(ISD::SETCC, VT, Expand);
848 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
849 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
850 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
851 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
852 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
853 setOperationAction(ISD::TRUNCATE, VT, Expand);
854 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
855 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
856 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
857 setOperationAction(ISD::SELECT_CC, VT, Expand);
858 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
859 setTruncStoreAction(InnerVT, VT, Expand);
860
861 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
862 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
863
864 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
865 // types, we have to deal with them whether we ask for Expansion or not.
866 // Setting Expand causes its own optimisation problems though, so leave
867 // them legal.
868 if (VT.getVectorElementType() == MVT::i1)
869 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
870
871 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
872 // split/scalarized right now.
873 if (VT.getVectorElementType() == MVT::f16)
874 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
875 }
876 }
877
878 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
879 // with -msoft-float, disable use of MMX as well.
880 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
881 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
882 // No operations on x86mmx supported, everything uses intrinsics.
883 }
884
885 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
890 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
891 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
892 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
893 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
894 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
895 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
896 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
897
898 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
899 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
900
901 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
902 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
903 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
904 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
905 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
906 }
907
908 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
909 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
910 : &X86::VR128RegClass);
911
912 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
913 // registers cannot be used even for integer operations.
914 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
915 : &X86::VR128RegClass);
916 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
917 : &X86::VR128RegClass);
918 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
919 : &X86::VR128RegClass);
920 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
921 : &X86::VR128RegClass);
922
923 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
924 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
925 setOperationAction(ISD::SDIV, VT, Custom);
926 setOperationAction(ISD::SREM, VT, Custom);
927 setOperationAction(ISD::UDIV, VT, Custom);
928 setOperationAction(ISD::UREM, VT, Custom);
929 }
930
931 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
932 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
933 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
934
935 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
936 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
937 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
938 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
939 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
940 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
941 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
942 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
943 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
944 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
945
946 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
947 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
948
949 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
950 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
951 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
952
953 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
954 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
955 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
956 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
957 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
958 }
959
960 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
961 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
962 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
963 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
964 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
965 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
966 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
967 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
968 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
969 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
970
971 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
972 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
973 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
974
975 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
976 setOperationAction(ISD::SETCC, VT, Custom);
977 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
978 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
979 setOperationAction(ISD::CTPOP, VT, Custom);
980 setOperationAction(ISD::ABS, VT, Custom);
981
982 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
983 // setcc all the way to isel and prefer SETGT in some isel patterns.
984 setCondCodeAction(ISD::SETLT, VT, Custom);
985 setCondCodeAction(ISD::SETLE, VT, Custom);
986 }
987
988 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
989 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
990 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
991 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
992 setOperationAction(ISD::VSELECT, VT, Custom);
993 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
994 }
995
996 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
997 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
998 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
999 setOperationAction(ISD::VSELECT, VT, Custom);
1000
1001 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1002 continue;
1003
1004 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1005 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1006 }
1007
1008 // Custom lower v2i64 and v2f64 selects.
1009 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1010 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1011 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1013 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1014
1015 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1016 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1017 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1018 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1019 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
1020 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1021
1022 // Custom legalize these to avoid over promotion or custom promotion.
1023 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1024 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1025 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1026 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1027 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1028 }
1029
1030 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1031 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1032 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1033 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1034
1035 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1036 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1037
1038 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1039 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1040
1041 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1042 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1043 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1044 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1045 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1046
1047 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1048 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1049 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1050 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1051
1052 // We want to legalize this to an f64 load rather than an i64 load on
1053 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1054 // store.
1055 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1056 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1057 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1058 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1059 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1060 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1061
1062 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1063 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1064 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1065 if (!Subtarget.hasAVX512())
1066 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1067
1068 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1069 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1070 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1071
1072 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1073
1074 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1075 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1076 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1077 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1078 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1079 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1080
1081 // In the customized shift lowering, the legal v4i32/v2i64 cases
1082 // in AVX2 will be recognized.
1083 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1084 setOperationAction(ISD::SRL, VT, Custom);
1085 setOperationAction(ISD::SHL, VT, Custom);
1086 setOperationAction(ISD::SRA, VT, Custom);
1087 }
1088
1089 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1090 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1091
1092 // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1093 // shifts) is better.
1094 if (!Subtarget.useAVX512Regs() &&
1095 !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1096 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1097
1098 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1099 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1100 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1101 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1102 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1103 }
1104
1105 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1106 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1107 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1108 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1109 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1110 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1111 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1112 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1113 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1114
1115 // These might be better off as horizontal vector ops.
1116 setOperationAction(ISD::ADD, MVT::i16, Custom);
1117 setOperationAction(ISD::ADD, MVT::i32, Custom);
1118 setOperationAction(ISD::SUB, MVT::i16, Custom);
1119 setOperationAction(ISD::SUB, MVT::i32, Custom);
1120 }
1121
1122 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1123 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1124 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1125 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1126 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1127 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1128 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1129 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1130 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1131 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1132 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1133 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1134 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1135 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1136
1137 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1138 }
1139
1140 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1141 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1142 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1143 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1144 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1145 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1146 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1147 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1148
1149 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1150
1151 // FIXME: Do we need to handle scalar-to-vector here?
1152 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1153
1154 // We directly match byte blends in the backend as they match the VSELECT
1155 // condition form.
1156 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1157
1158 // SSE41 brings specific instructions for doing vector sign extend even in
1159 // cases where we don't have SRA.
1160 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1161 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1162 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1163 }
1164
1165 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1166 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1167 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1168 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1169 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1170 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1171 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1172 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1173 }
1174
1175 // i8 vectors are custom because the source register and source
1176 // memory operand types are not the same width.
1177 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1178
1179 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1180 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1181 // do the pre and post work in the vector domain.
1182 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1183 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1184 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1185 // so that DAG combine doesn't try to turn it into uint_to_fp.
1186 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1187 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1188 }
1189 }
1190
1191 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1192 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1193 }
1194
1195 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1196 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1197 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1198 setOperationAction(ISD::ROTL, VT, Custom);
1199
1200 // XOP can efficiently perform BITREVERSE with VPPERM.
1201 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1202 setOperationAction(ISD::BITREVERSE, VT, Custom);
1203
1204 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1205 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1206 setOperationAction(ISD::BITREVERSE, VT, Custom);
1207 }
1208
1209 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1210 bool HasInt256 = Subtarget.hasInt256();
1211
1212 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1213 : &X86::VR256RegClass);
1214 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1215 : &X86::VR256RegClass);
1216 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1217 : &X86::VR256RegClass);
1218 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1219 : &X86::VR256RegClass);
1220 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1221 : &X86::VR256RegClass);
1222 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1223 : &X86::VR256RegClass);
1224
1225 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1226 setOperationAction(ISD::FFLOOR, VT, Legal);
1227 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1228 setOperationAction(ISD::FCEIL, VT, Legal);
1229 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1230 setOperationAction(ISD::FTRUNC, VT, Legal);
1231 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1232 setOperationAction(ISD::FRINT, VT, Legal);
1233 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1234 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1235 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1236 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1237 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1238
1239 setOperationAction(ISD::FROUND, VT, Custom);
1240
1241 setOperationAction(ISD::FNEG, VT, Custom);
1242 setOperationAction(ISD::FABS, VT, Custom);
1243 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1244 }
1245
1246 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1247 // even though v8i16 is a legal type.
1248 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1249 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1250 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1251 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1252 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1253 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1254 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1255
1256 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1257 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1258
1259 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1260 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1261 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1262 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1263 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1264 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1265 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1266 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1267 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1268 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1269 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1270 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1271
1272 if (!Subtarget.hasAVX512())
1273 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1274
1275 // In the customized shift lowering, the legal v8i32/v4i64 cases
1276 // in AVX2 will be recognized.
1277 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1278 setOperationAction(ISD::SRL, VT, Custom);
1279 setOperationAction(ISD::SHL, VT, Custom);
1280 setOperationAction(ISD::SRA, VT, Custom);
1281 }
1282
1283 // These types need custom splitting if their input is a 128-bit vector.
1284 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1285 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1286 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1287 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1288
1289 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1290 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1291
1292 // With BWI, expanding (and promoting the shifts) is the better option.
1293 if (!Subtarget.useBWIRegs())
1294 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1295
1296 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1297 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1298 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1299 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1300 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1301 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1302
1303 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1304 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1305 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1306 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1307 }
1308
1309 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1310 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1311 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1312 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1313
1314 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1315 setOperationAction(ISD::SETCC, VT, Custom);
1316 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1317 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1318 setOperationAction(ISD::CTPOP, VT, Custom);
1319 setOperationAction(ISD::CTLZ, VT, Custom);
1320
1321 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1322 // setcc all the way to isel and prefer SETGT in some isel patterns.
1323 setCondCodeAction(ISD::SETLT, VT, Custom);
1324 setCondCodeAction(ISD::SETLE, VT, Custom);
1325 }
1326
1327 if (Subtarget.hasAnyFMA()) {
1328 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1329 MVT::v2f64, MVT::v4f64 }) {
1330 setOperationAction(ISD::FMA, VT, Legal);
1331 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1332 }
1333 }
1334
1335 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1336 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1337 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1338 }
1339
1340 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1341 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1342 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1343 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1344
1345 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1346 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1347 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1348 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1349 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1350 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1351
1352 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1353 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1354
1355 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1356 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1357 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1358 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1359 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1360
1361 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1362 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1363 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1364 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1365 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1366 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1367 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1368 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1369 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1370 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1371 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1372 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1373
1374 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1375 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1376 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1377 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1378 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1379 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1380 }
1381
1382 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1383 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1384 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1385 }
1386
1387 if (HasInt256) {
1388 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1389 // when we have a 256bit-wide blend with immediate.
1390 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1391 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1392
1393 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1394 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1395 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1396 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1397 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1398 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1399 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1401 }
1402 }
1403
1404 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1405 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1406 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1407 setOperationAction(ISD::MSTORE, VT, Legal);
1408 }
1409
1410 // Extract subvector is special because the value type
1411 // (result) is 128-bit but the source is 256-bit wide.
1412 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1413 MVT::v4f32, MVT::v2f64 }) {
1414 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1415 }
1416
1417 // Custom lower several nodes for 256-bit types.
1418 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1419 MVT::v8f32, MVT::v4f64 }) {
1420 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1421 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1422 setOperationAction(ISD::VSELECT, VT, Custom);
1423 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1424 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1425 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1426 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1427 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1428 setOperationAction(ISD::STORE, VT, Custom);
1429 }
1430
1431 if (HasInt256) {
1432 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1433
1434 // Custom legalize 2x32 to get a little better code.
1435 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1436 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1437
1438 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1439 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1440 setOperationAction(ISD::MGATHER, VT, Custom);
1441 }
1442 }
1443
1444 // This block controls legalization of the mask vector sizes that are
1445 // available with AVX512. 512-bit vectors are in a separate block controlled
1446 // by useAVX512Regs.
1447 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1448 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1449 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1450 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1451 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1452 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1453
1454 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1455 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1456 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1457
1458 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1459 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1460 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1461 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1462 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1463 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1464 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1465 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1466 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1467 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1468 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1469 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1470
1471 // There is no byte sized k-register load or store without AVX512DQ.
1472 if (!Subtarget.hasDQI()) {
1473 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1474 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1475 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1476 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1477
1478 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1479 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1480 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1481 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1482 }
1483
1484 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1485 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1486 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1487 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1488 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1489 }
1490
1491 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1492 setOperationAction(ISD::VSELECT, VT, Expand);
1493
1494 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1495 setOperationAction(ISD::SETCC, VT, Custom);
1496 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1497 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1498 setOperationAction(ISD::SELECT, VT, Custom);
1499 setOperationAction(ISD::TRUNCATE, VT, Custom);
1500
1501 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1502 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1503 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1504 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1505 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1506 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1507 }
1508
1509 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1510 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1511 }
1512
1513 // This block controls legalization for 512-bit operations with 32/64 bit
1514 // elements. 512-bits can be disabled based on prefer-vector-width and
1515 // required-vector-width function attributes.
1516 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1517 bool HasBWI = Subtarget.hasBWI();
1518
1519 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1520 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1521 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1522 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1523 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1524 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1525
1526 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1527 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1528 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1529 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1530 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1531 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1532 if (HasBWI)
1533 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1534 }
1535
1536 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1537 setOperationAction(ISD::FNEG, VT, Custom);
1538 setOperationAction(ISD::FABS, VT, Custom);
1539 setOperationAction(ISD::FMA, VT, Legal);
1540 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1541 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1542 }
1543
1544 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1545 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1546 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1547 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1548 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1549 }
1550 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1551 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1552 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1553 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1554 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1555 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1556 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1557 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1558
1559 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1560 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1561 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1562 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1563 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1564 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1565 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1566 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1567 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1568 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1569 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1570 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1571
1572 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1573 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1574 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1575 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1576 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1577 if (HasBWI)
1578 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1579
1580 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1581 // to 512-bit rather than use the AVX2 instructions so that we can use
1582 // k-masks.
1583 if (!Subtarget.hasVLX()) {
1584 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1585 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1586 setOperationAction(ISD::MLOAD, VT, Custom);
1587 setOperationAction(ISD::MSTORE, VT, Custom);
1588 }
1589 }
1590
1591 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1592 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1593 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1594 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1595 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1596 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1597 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1598 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1599 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1600 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1601 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1602 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1603 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1604
1605 if (HasBWI) {
1606 // Extends from v64i1 masks to 512-bit vectors.
1607 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1608 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1609 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1610 }
1611
1612 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1613 setOperationAction(ISD::FFLOOR, VT, Legal);
1614 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1615 setOperationAction(ISD::FCEIL, VT, Legal);
1616 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1617 setOperationAction(ISD::FTRUNC, VT, Legal);
1618 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1619 setOperationAction(ISD::FRINT, VT, Legal);
1620 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1621 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1622 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1623 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1624 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1625
1626 setOperationAction(ISD::FROUND, VT, Custom);
1627 }
1628
1629 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1630 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1631 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1632 }
1633
1634 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1635 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1636 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1637 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1638
1639 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1640 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1641 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1642 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1643
1644 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1645 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1646 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1647 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1648 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1649 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1650
1651 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1652 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1653
1654 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1655
1656 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1657 setOperationAction(ISD::SRL, VT, Custom);
1658 setOperationAction(ISD::SHL, VT, Custom);
1659 setOperationAction(ISD::SRA, VT, Custom);
1660 setOperationAction(ISD::SETCC, VT, Custom);
1661
1662 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1663 // setcc all the way to isel and prefer SETGT in some isel patterns.
1664 setCondCodeAction(ISD::SETLT, VT, Custom);
1665 setCondCodeAction(ISD::SETLE, VT, Custom);
1666 }
1667 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1668 setOperationAction(ISD::SMAX, VT, Legal);
1669 setOperationAction(ISD::UMAX, VT, Legal);
1670 setOperationAction(ISD::SMIN, VT, Legal);
1671 setOperationAction(ISD::UMIN, VT, Legal);
1672 setOperationAction(ISD::ABS, VT, Legal);
1673 setOperationAction(ISD::CTPOP, VT, Custom);
1674 setOperationAction(ISD::ROTL, VT, Custom);
1675 setOperationAction(ISD::ROTR, VT, Custom);
1676 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1677 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1678 }
1679
1680 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1681 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1682 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1683 setOperationAction(ISD::CTLZ, VT, Custom);
1684 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1685 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1686 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1687 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1688 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1689 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1690 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1691 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1692 }
1693
1694 if (Subtarget.hasDQI()) {
1695 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1696 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1697 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1698 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1699 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1700 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1701 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1702 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1703
1704 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1705 }
1706
1707 if (Subtarget.hasCDI()) {
1708 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1709 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1710 setOperationAction(ISD::CTLZ, VT, Legal);
1711 }
1712 } // Subtarget.hasCDI()
1713
1714 if (Subtarget.hasVPOPCNTDQ()) {
1715 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1716 setOperationAction(ISD::CTPOP, VT, Legal);
1717 }
1718
1719 // Extract subvector is special because the value type
1720 // (result) is 256-bit but the source is 512-bit wide.
1721 // 128-bit was made Legal under AVX1.
1722 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1723 MVT::v8f32, MVT::v4f64 })
1724 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1725
1726 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1727 MVT::v16f32, MVT::v8f64 }) {
1728 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1729 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1730 setOperationAction(ISD::SELECT, VT, Custom);
1731 setOperationAction(ISD::VSELECT, VT, Custom);
1732 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1733 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1734 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1735 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1736 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1737 }
1738
1739 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1740 setOperationAction(ISD::MLOAD, VT, Legal);
1741 setOperationAction(ISD::MSTORE, VT, Legal);
1742 setOperationAction(ISD::MGATHER, VT, Custom);
1743 setOperationAction(ISD::MSCATTER, VT, Custom);
1744 }
1745 if (HasBWI) {
1746 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1747 setOperationAction(ISD::MLOAD, VT, Legal);
1748 setOperationAction(ISD::MSTORE, VT, Legal);
1749 }
1750 } else {
1751 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1752 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1753 }
1754
1755 if (Subtarget.hasVBMI2()) {
1756 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1757 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1758 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1759 setOperationAction(ISD::FSHL, VT, Custom);
1760 setOperationAction(ISD::FSHR, VT, Custom);
1761 }
1762
1763 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1764 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1765 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1766 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1767 }
1768 }// useAVX512Regs
1769
1770 // This block controls legalization for operations that don't have
1771 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1772 // narrower widths.
1773 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1774 // These operations are handled on non-VLX by artificially widening in
1775 // isel patterns.
1776
1777 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1778 Subtarget.hasVLX() ? Legal : Custom);
1779 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1780 Subtarget.hasVLX() ? Legal : Custom);
1781 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1782 Subtarget.hasVLX() ? Legal : Custom);
1783 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1784 Subtarget.hasVLX() ? Legal : Custom);
1785 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1786 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1787 Subtarget.hasVLX() ? Legal : Custom);
1788 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1789 Subtarget.hasVLX() ? Legal : Custom);
1790 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1791 Subtarget.hasVLX() ? Legal : Custom);
1792 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1793 Subtarget.hasVLX() ? Legal : Custom);
1794
1795 if (Subtarget.hasDQI()) {
1796 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1797 // v2f32 UINT_TO_FP is already custom under SSE2.
1798 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&((void)0)
1799 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&((void)0)
1800 "Unexpected operation action!")((void)0);
1801 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1802 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1803 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1804 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1805 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1806 }
1807
1808 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1809 setOperationAction(ISD::SMAX, VT, Legal);
1810 setOperationAction(ISD::UMAX, VT, Legal);
1811 setOperationAction(ISD::SMIN, VT, Legal);
1812 setOperationAction(ISD::UMIN, VT, Legal);
1813 setOperationAction(ISD::ABS, VT, Legal);
1814 }
1815
1816 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1817 setOperationAction(ISD::ROTL, VT, Custom);
1818 setOperationAction(ISD::ROTR, VT, Custom);
1819 }
1820
1821 // Custom legalize 2x32 to get a little better code.
1822 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1823 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1824
1825 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1826 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1827 setOperationAction(ISD::MSCATTER, VT, Custom);
1828
1829 if (Subtarget.hasDQI()) {
1830 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1831 setOperationAction(ISD::SINT_TO_FP, VT,
1832 Subtarget.hasVLX() ? Legal : Custom);
1833 setOperationAction(ISD::UINT_TO_FP, VT,
1834 Subtarget.hasVLX() ? Legal : Custom);
1835 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1836 Subtarget.hasVLX() ? Legal : Custom);
1837 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1838 Subtarget.hasVLX() ? Legal : Custom);
1839 setOperationAction(ISD::FP_TO_SINT, VT,
1840 Subtarget.hasVLX() ? Legal : Custom);
1841 setOperationAction(ISD::FP_TO_UINT, VT,
1842 Subtarget.hasVLX() ? Legal : Custom);
1843 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1844 Subtarget.hasVLX() ? Legal : Custom);
1845 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1846 Subtarget.hasVLX() ? Legal : Custom);
1847 setOperationAction(ISD::MUL, VT, Legal);
1848 }
1849 }
1850
1851 if (Subtarget.hasCDI()) {
1852 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1853 setOperationAction(ISD::CTLZ, VT, Legal);
1854 }
1855 } // Subtarget.hasCDI()
1856
1857 if (Subtarget.hasVPOPCNTDQ()) {
1858 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1859 setOperationAction(ISD::CTPOP, VT, Legal);
1860 }
1861 }
1862
1863 // This block controls legalization of v32i1/v64i1 which are available with
1864 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1865 // useBWIRegs.
1866 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1867 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1868 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1869
1870 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1871 setOperationAction(ISD::VSELECT, VT, Expand);
1872 setOperationAction(ISD::TRUNCATE, VT, Custom);
1873 setOperationAction(ISD::SETCC, VT, Custom);
1874 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1875 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1876 setOperationAction(ISD::SELECT, VT, Custom);
1877 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1878 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1879 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1880 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1881 }
1882
1883 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1884 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1885
1886 // Extends from v32i1 masks to 256-bit vectors.
1887 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1888 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1889 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1890
1891 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1892 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1893 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1894 }
1895
1896 // These operations are handled on non-VLX by artificially widening in
1897 // isel patterns.
1898 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1899
1900 if (Subtarget.hasBITALG()) {
1901 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1902 setOperationAction(ISD::CTPOP, VT, Legal);
1903 }
1904 }
1905
1906 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1907 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1908 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1909 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1910 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1911 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1912
1913 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1914 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1915 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1916 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1917 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1918
1919 if (Subtarget.hasBWI()) {
1920 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1921 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1922 }
1923
1924 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1925 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1926 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1927 }
1928
1929 if (Subtarget.hasAMXTILE()) {
1930 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
1931 }
1932
1933 // We want to custom lower some of our intrinsics.
1934 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1935 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1936 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1937 if (!Subtarget.is64Bit()) {
1938 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1939 }
1940
1941 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1942 // handle type legalization for these operations here.
1943 //
1944 // FIXME: We really should do custom legalization for addition and
1945 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1946 // than generic legalization for 64-bit multiplication-with-overflow, though.
1947 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1948 if (VT == MVT::i64 && !Subtarget.is64Bit())
1949 continue;
1950 // Add/Sub/Mul with overflow operations are custom lowered.
1951 setOperationAction(ISD::SADDO, VT, Custom);
1952 setOperationAction(ISD::UADDO, VT, Custom);
1953 setOperationAction(ISD::SSUBO, VT, Custom);
1954 setOperationAction(ISD::USUBO, VT, Custom);
1955 setOperationAction(ISD::SMULO, VT, Custom);
1956 setOperationAction(ISD::UMULO, VT, Custom);
1957
1958 // Support carry in as value rather than glue.
1959 setOperationAction(ISD::ADDCARRY, VT, Custom);
1960 setOperationAction(ISD::SUBCARRY, VT, Custom);
1961 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1962 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
1963 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
1964 }
1965
1966 if (!Subtarget.is64Bit()) {
1967 // These libcalls are not available in 32-bit.
1968 setLibcallName(RTLIB::SHL_I128, nullptr);
1969 setLibcallName(RTLIB::SRL_I128, nullptr);
1970 setLibcallName(RTLIB::SRA_I128, nullptr);
1971 setLibcallName(RTLIB::MUL_I128, nullptr);
1972 }
1973
1974 // Combine sin / cos into _sincos_stret if it is available.
1975 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1976 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1977 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1978 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1979 }
1980
1981 if (Subtarget.isTargetWin64()) {
1982 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1983 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1984 setOperationAction(ISD::SREM, MVT::i128, Custom);
1985 setOperationAction(ISD::UREM, MVT::i128, Custom);
1986 }
1987
1988 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1989 // is. We should promote the value to 64-bits to solve this.
1990 // This is what the CRT headers do - `fmodf` is an inline header
1991 // function casting to f64 and calling `fmod`.
1992 if (Subtarget.is32Bit() &&
1993 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1994 for (ISD::NodeType Op :
1995 {ISD::FCEIL, ISD::STRICT_FCEIL,
1996 ISD::FCOS, ISD::STRICT_FCOS,
1997 ISD::FEXP, ISD::STRICT_FEXP,
1998 ISD::FFLOOR, ISD::STRICT_FFLOOR,
1999 ISD::FREM, ISD::STRICT_FREM,
2000 ISD::FLOG, ISD::STRICT_FLOG,
2001 ISD::FLOG10, ISD::STRICT_FLOG10,
2002 ISD::FPOW, ISD::STRICT_FPOW,
2003 ISD::FSIN, ISD::STRICT_FSIN})
2004 if (isOperationExpand(Op, MVT::f32))
2005 setOperationAction(Op, MVT::f32, Promote);
2006
2007 // We have target-specific dag combine patterns for the following nodes:
2008 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2009 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2010 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2011 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2012 setTargetDAGCombine(ISD::CONCAT_VECTORS);
2013 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2014 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2015 setTargetDAGCombine(ISD::BITCAST);
2016 setTargetDAGCombine(ISD::VSELECT);
2017 setTargetDAGCombine(ISD::SELECT);
2018 setTargetDAGCombine(ISD::SHL);
2019 setTargetDAGCombine(ISD::SRA);
2020 setTargetDAGCombine(ISD::SRL);
2021 setTargetDAGCombine(ISD::OR);
2022 setTargetDAGCombine(ISD::AND);
2023 setTargetDAGCombine(ISD::ADD);
2024 setTargetDAGCombine(ISD::FADD);
2025 setTargetDAGCombine(ISD::FSUB);
2026 setTargetDAGCombine(ISD::FNEG);
2027 setTargetDAGCombine(ISD::FMA);
2028 setTargetDAGCombine(ISD::STRICT_FMA);
2029 setTargetDAGCombine(ISD::FMINNUM);
2030 setTargetDAGCombine(ISD::FMAXNUM);
2031 setTargetDAGCombine(ISD::SUB);
2032 setTargetDAGCombine(ISD::LOAD);
2033 setTargetDAGCombine(ISD::MLOAD);
2034 setTargetDAGCombine(ISD::STORE);
2035 setTargetDAGCombine(ISD::MSTORE);
2036 setTargetDAGCombine(ISD::TRUNCATE);
2037 setTargetDAGCombine(ISD::ZERO_EXTEND);
2038 setTargetDAGCombine(ISD::ANY_EXTEND);
2039 setTargetDAGCombine(ISD::SIGN_EXTEND);
2040 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2041 setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2042 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2043 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2044 setTargetDAGCombine(ISD::SINT_TO_FP);
2045 setTargetDAGCombine(ISD::UINT_TO_FP);
2046 setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2047 setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2048 setTargetDAGCombine(ISD::SETCC);
2049 setTargetDAGCombine(ISD::MUL);
2050 setTargetDAGCombine(ISD::XOR);
2051 setTargetDAGCombine(ISD::MSCATTER);
2052 setTargetDAGCombine(ISD::MGATHER);
2053 setTargetDAGCombine(ISD::FP16_TO_FP);
2054 setTargetDAGCombine(ISD::FP_EXTEND);
2055 setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2056 setTargetDAGCombine(ISD::FP_ROUND);
2057
2058 computeRegisterProperties(Subtarget.getRegisterInfo());
2059
2060 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2061 MaxStoresPerMemsetOptSize = 8;
2062 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2063 MaxStoresPerMemcpyOptSize = 4;
2064 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2065 MaxStoresPerMemmoveOptSize = 4;
2066
2067 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2068 // that needs to benchmarked and balanced with the potential use of vector
2069 // load/store types (PR33329, PR33914).
2070 MaxLoadsPerMemcmp = 2;
2071 MaxLoadsPerMemcmpOptSize = 2;
2072
2073 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2074 setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2075
2076 // An out-of-order CPU can speculatively execute past a predictable branch,
2077 // but a conditional move could be stalled by an expensive earlier operation.
2078 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2079 EnableExtLdPromotion = true;
2080 setPrefFunctionAlignment(Align(16));
2081
2082 verifyIntrinsicTables();
2083
2084 // Default to having -disable-strictnode-mutation on
2085 IsStrictFPEnabled = true;
2086}
2087
2088// This has so far only been implemented for 64-bit MachO.
2089bool X86TargetLowering::useLoadStackGuardNode() const {
2090 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2091}
2092
2093bool X86TargetLowering::useStackGuardXorFP() const {
2094 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2095 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2096}
2097
2098SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2099 const SDLoc &DL) const {
2100 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2101 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2102 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2103 return SDValue(Node, 0);
2104}
2105
/// Choose how the type legalizer should handle an illegal vector type.
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
  // Without BWI, wide i1 masks (v32i1/v64i1) have no legal widened form on
  // AVX-512, so they must be split instead.
  if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
      !Subtarget.hasBWI())
    return TypeSplitVector;

  // Prefer widening for other fixed vectors, excluding single-element
  // vectors and i1 mask vectors. (The scalable check must come first:
  // getVectorNumElements is only meaningful on fixed vectors.)
  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
      VT.getVectorElementType() != MVT::i1)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}
2118
/// Map an NumElts-element vXi1 mask type to the {register type, register
/// count} pair used to pass it under calling convention \p CC, or
/// {INVALID_SIMPLE_VALUE_TYPE, 0} when the generic breakdown should apply
/// (e.g. for conventions that pass masks in k-registers).
static std::pair<MVT, unsigned>
handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
                                 const X86Subtarget &Subtarget) {
  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
  // convention is one that uses k registers.
  if (NumElts == 2)
    return {MVT::v2i64, 1};
  if (NumElts == 4)
    return {MVT::v4i32, 1};
  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v8i16, 1};
  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
      CC != CallingConv::Intel_OCL_BI)
    return {MVT::v16i8, 1};
  // v32i1 passes in ymm unless we have BWI and the calling convention is
  // regcall.
  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
    return {MVT::v32i8, 1};
  // Split v64i1 vectors if we don't have v64i8 available.
  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
    if (Subtarget.useAVX512Regs())
      return {MVT::v64i8, 1};
    return {MVT::v32i8, 2};
  }

  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
      NumElts > 64)
    return {MVT::i8, NumElts};

  // Fall-through: the cases excluded above (k-register conventions, BWI
  // regcall v32i1/v64i1) defer to the generic lowering.
  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
}
2152
2153MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2154 CallingConv::ID CC,
2155 EVT VT) const {
2156 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2157 Subtarget.hasAVX512()) {
2158 unsigned NumElts = VT.getVectorNumElements();
2159
2160 MVT RegisterVT;
2161 unsigned NumRegisters;
2162 std::tie(RegisterVT, NumRegisters) =
2163 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2164 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2165 return RegisterVT;
2166 }
2167
2168 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2169}
2170
2171unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2172 CallingConv::ID CC,
2173 EVT VT) const {
2174 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2175 Subtarget.hasAVX512()) {
2176 unsigned NumElts = VT.getVectorNumElements();
2177
2178 MVT RegisterVT;
2179 unsigned NumRegisters;
2180 std::tie(RegisterVT, NumRegisters) =
2181 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2182 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2183 return NumRegisters;
2184 }
2185
2186 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2187}
2188
/// Compute the intermediate/register breakdown for vXi1 argument types that
/// need non-default handling; defers to the generic implementation otherwise.
/// Returns the number of intermediate values.
unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      Subtarget.hasAVX512() &&
      (!isPowerOf2_32(VT.getVectorNumElements()) ||
       (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
       VT.getVectorNumElements() > 64)) {
    // Each i1 element travels in its own i8 register.
    RegisterVT = MVT::i8;
    IntermediateVT = MVT::i1;
    NumIntermediates = VT.getVectorNumElements();
    return NumIntermediates;
  }

  // Split v64i1 vectors if we don't have v64i8 available.
  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
      CC != CallingConv::X86_RegCall) {
    RegisterVT = MVT::v32i8;
    IntermediateVT = MVT::v32i1;
    NumIntermediates = 2;
    return 2;
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
                                                              NumIntermediates, RegisterVT);
}
2216
/// Return the value type produced by a SETCC for operands of type \p VT.
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  // Scalar compares produce an i8 result.
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
    }
  }

  // Default: a mask vector with elements as wide as the compared elements.
  return VT.changeVectorElementTypeToInteger();
}
2245
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment. Recursively raises \p MaxAlign to
/// 16 if \p Ty contains a 128-bit vector anywhere inside it.
static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
  // 16 is the maximum alignment we ever report; stop once it is reached.
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    // 128-bit (SSE-sized) vectors want 16-byte alignment.
    if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
      MaxAlign = Align(16);
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    // An array is as aligned as its element type.
    Align EltAlign;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    // A struct is as aligned as its most-aligned member; bail out early once
    // the cap of 16 is hit.
    for (auto *EltTy : STy->elements()) {
      Align EltAlign;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}
2270
2271/// Return the desired alignment for ByVal aggregate
2272/// function arguments in the caller parameter area. For X86, aggregates
2273/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2274/// are at 4-byte boundaries.
2275unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2276 const DataLayout &DL) const {
2277 if (Subtarget.is64Bit()) {
2278 // Max of 8 and alignment of type.
2279 Align TyAlign = DL.getABITypeAlign(Ty);
2280 if (TyAlign > 8)
2281 return TyAlign.value();
2282 return 8;
2283 }
2284
2285 Align Alignment(4);
2286 if (Subtarget.hasSSE1())
2287 getMaxByValAlign(Ty, Alignment);
2288 return Alignment.value();
2289}
2290
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  // Vector types are only usable when implicit float is allowed.
  if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Op.size() >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
      // Widest first: prefer 512-bit, then 256-bit, then 128-bit vectors,
      // gated on both the feature set and the preferred vector width.
      // FIXME: Check if unaligned 64-byte accesses are slow.
      if (Op.size() >= 64 && Subtarget.hasAVX512() &&
          (Subtarget.getPreferVectorWidth() >= 512)) {
        return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
      }
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Op.size() >= 32 && Subtarget.hasAVX() &&
          (Subtarget.getPreferVectorWidth() >= 256)) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      // If we have SSE1 registers we should be able to use them.
      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
          (Subtarget.getPreferVectorWidth() >= 128))
        return MVT::v4f32;
    } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
               Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Op.size() >= 8)
    return MVT::i64;
  return MVT::i32;
}
2340
2341bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2342 if (VT == MVT::f32)
2343 return X86ScalarSSEf32;
2344 if (VT == MVT::f64)
2345 return X86ScalarSSEf64;
2346 return true;
2347}
2348
/// Report whether a misaligned access of \p VT is allowed, and optionally
/// (via \p Fast) whether it is expected to be fast on this subtarget.
bool X86TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // NonTemporal vector memory ops must be aligned.
  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
    // NT loads can only be vector aligned, so if its less aligned than the
    // minimum vector size (which we can split the vector down to), we might as
    // well use a regular unaligned vector load.
    // We don't have any NT loads pre-SSE41.
    if (!!(Flags & MachineMemOperand::MOLoad))
      return (Alignment < 16 || !Subtarget.hasSSE41());
    // Misaligned non-temporal vector stores are never allowed.
    return false;
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}
2380
2381/// Return the entry encoding for a jump table in the
2382/// current function. The returned value is a member of the
2383/// MachineJumpTableInfo::JTEntryKind enum.
2384unsigned X86TargetLowering::getJumpTableEncoding() const {
2385 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2386 // symbol.
2387 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2388 return MachineJumpTableInfo::EK_Custom32;
2389
2390 // Otherwise, use the normal jump table encoding heuristics.
2391 return TargetLowering::getJumpTableEncoding();
2392}
2393
/// Report whether this target uses soft-float libcalls; simply forwards the
/// subtarget's setting.
bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}
2397
/// Mark leading integer/pointer libcall arguments as in-register on X86-32
/// (C and stdcall conventions only), honoring the module-level register
/// parameter count.
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  // Register budget for parameters, from the module flag (0 when unset).
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as having reg
  for (auto &Arg : Args) {
    Type *T = Arg.Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        // Values wider than 4 bytes consume a register pair.
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        // Stop as soon as the remaining budget cannot hold this argument.
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Arg.IsInReg = true;
      }
  }
}
2425
2426const MCExpr *
2427X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2428 const MachineBasicBlock *MBB,
2429 unsigned uid,MCContext &Ctx) const{
2430 assert(isPositionIndependent() && Subtarget.isPICStyleGOT())((void)0);
2431 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2432 // entries.
2433 return MCSymbolRefExpr::create(MBB->getSymbol(),
2434 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2435}
2436
2437/// Returns relocation base for the given PIC jumptable.
2438SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2439 SelectionDAG &DAG) const {
2440 if (!Subtarget.is64Bit())
2441 // This doesn't have SDLoc associated with it, but is not really the
2442 // same as a Register.
2443 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2444 getPointerTy(DAG.getDataLayout()));
2445 return Table;
2446}
2447
2448/// This returns the relocation base for the given PIC jumptable,
2449/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2450const MCExpr *X86TargetLowering::
2451getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2452 MCContext &Ctx) const {
2453 // X86-64 uses RIP relative addressing based on the jump table label.
2454 if (Subtarget.isPICStyleRIPRel())
2455 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2456
2457 // Otherwise, the reference is relative to the PIC base.
2458 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2459}
2460
/// Pick the representative register class (and a relative spill cost) used
/// by register-pressure heuristics for the given value type.
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    // Scalar integers: GR64 on 64-bit targets, GR32 otherwise.
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    // All scalar FP and vector types are represented by the 128-bit XMM class.
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}
2487
2488unsigned X86TargetLowering::getAddressSpace() const {
2489 if (Subtarget.is64Bit())
2490 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2491 return 256;
2492}
2493
2494static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2495 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2496 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2497}
2498
2499static Constant* SegmentOffset(IRBuilderBase &IRB,
2500 int Offset, unsigned AddressSpace) {
2501 return ConstantExpr::getIntToPtr(
2502 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2503 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2504}
2505
2506Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2507 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2508 // tcbhead_t; use it instead of the usual global variable (see
2509 // sysdeps/{i386,x86_64}/nptl/tls.h)
2510 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2511 if (Subtarget.isTargetFuchsia()) {
2512 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2513 return SegmentOffset(IRB, 0x10, getAddressSpace());
2514 } else {
2515 unsigned AddressSpace = getAddressSpace();
2516 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2517 // Specially, some users may customize the base reg and offset.
2518 int Offset = M->getStackProtectorGuardOffset();
2519 // If we don't set -stack-protector-guard-offset value:
2520 // %fs:0x28, unless we're using a Kernel code model, in which case
2521 // it's %gs:0x28. gs:0x14 on i386.
2522 if (Offset == INT_MAX2147483647)
2523 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2524
2525 StringRef GuardReg = M->getStackProtectorGuardReg();
2526 if (GuardReg == "fs")
2527 AddressSpace = X86AS::FS;
2528 else if (GuardReg == "gs")
2529 AddressSpace = X86AS::GS;
2530 return SegmentOffset(IRB, Offset, AddressSpace);
2531 }
2532 }
2533 return TargetLowering::getIRStackGuard(IRB);
2534}
2535
/// Insert the module-level declarations needed by stack-smashing protection.
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
        "__security_check_cookie", Type::getVoidTy(M.getContext()),
        Type::getInt8PtrTy(M.getContext()));
    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
      // The check function is __fastcall and takes the cookie in a register.
      F->setCallingConv(CallingConv::X86_FastCall);
      F->addAttribute(1, Attribute::AttrKind::InReg);
    }
    return;
  }

  StringRef GuardMode = M.getStackProtectorGuard();

  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  // Nothing to declare when the TLS slot is used (see getIRStackGuard).
  if ((GuardMode == "tls" || GuardMode.empty()) &&
      hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}
2563
2564Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2565 // MSVC CRT has a global variable holding security cookie.
2566 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2567 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2568 return M.getGlobalVariable("__security_cookie");
2569 }
2570 return TargetLowering::getSDagStackGuard(M);
2571}
2572
2573Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2574 // MSVC CRT has a function to validate security cookie.
2575 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2576 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2577 return M.getFunction("__security_check_cookie");
2578 }
2579 return TargetLowering::getSSPStackGuardCheck(M);
2580}
2581
2582Value *
2583X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2584 if (Subtarget.getTargetTriple().isOSContiki())
2585 return getDefaultSafeStackPointerLocation(IRB, false);
2586
2587 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2588 // definition of TLS_SLOT_SAFESTACK in
2589 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2590 if (Subtarget.isTargetAndroid()) {
2591 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2592 // %gs:0x24 on i386
2593 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2594 return SegmentOffset(IRB, Offset, getAddressSpace());
2595 }
2596
2597 // Fuchsia is similar.
2598 if (Subtarget.isTargetFuchsia()) {
2599 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2600 return SegmentOffset(IRB, 0x18, getAddressSpace());
2601 }
2602
2603 return TargetLowering::getSafeStackPointerLocation(IRB);
2604}
2605
2606//===----------------------------------------------------------------------===//
2607// Return Value Calling Convention Implementation
2608//===----------------------------------------------------------------------===//
2609
2610bool X86TargetLowering::CanLowerReturn(
2611 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2612 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2613 SmallVector<CCValAssign, 16> RVLocs;
2614 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2615 return CCInfo.CheckReturn(Outs, RetCC_X86);
2616}
2617
/// Return a null-terminated list of registers usable as scratch registers,
/// independent of the calling convention. Currently just R11.
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}
2622
/// Lowers masks values (v*i1) to the local register values
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  // A single-element mask is just its (extracted) scalar element.
  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, Dl));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast: v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8 -> i32 / i16 -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }

  // Fallback: any-extend the mask value into the location type.
  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
2654
2655/// Breaks v64i1 value into two registers and adds the new node to the DAG
2656static void Passv64i1ArgInRegs(
2657 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2658 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2659 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2660 assert(Subtarget.hasBWI() && "Expected AVX512BW target!")((void)0);
2661 assert(Subtarget.is32Bit() && "Expecting 32 bit target")((void)0);
2662 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value")((void)0);
2663 assert(VA.isRegLoc() && NextVA.isRegLoc() &&((void)0)
2664 "The value should reside in two registers")((void)0);
2665
2666 // Before splitting the value we cast it to i64
2667 Arg = DAG.getBitcast(MVT::i64, Arg);
2668
2669 // Splitting the value into two i32 types
2670 SDValue Lo, Hi;
2671 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2672 DAG.getConstant(0, Dl, MVT::i32));
2673 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2674 DAG.getConstant(1, Dl, MVT::i32));
2675
2676 // Attach the two i32 types into corresponding registers
2677 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2678 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2679}
2680
/// Lower the return value(s) of a function into an X86ISD::RET_FLAG (or
/// X86ISD::IRET for x86 interrupt handlers) node.  Values are copied into
/// the registers chosen by RetCC_X86, with special handling for x87 stack
/// returns (FP0/FP1), MMX values, split v64i1 masks, and implicit sret.
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  // Let the return calling convention assign a location to each out-value.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // (physical register, value) pairs to be emitted as CopyToReg below.
  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      // i1 mask vectors are lowered to an integer register via a dedicated
      // helper rather than a plain any-extend.
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // Report an error if we have attempted to return a value via an XMM
    // register and SSE was disabled.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               ValVT == MVT::f64) {
      // When returning a double via an XMM register, report an error if SSE2
      // is not enabled.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      // Splits the value into two halves; note that this consumes the next
      // location RVLocs[I+1] as well (hence the pre-increment of I).
      Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
                         Subtarget);

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }
  }

  // RET_FLAG operands: chain, bytes-to-pop, copied registers, optional glue.
  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
  for (auto &RetVal : RetVals) {
    if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
      RetOps.push_back(RetVal.second);
      continue; // Don't emit a copytoreg.
    }

    // Glue each copy to the next so the scheduler keeps them together.
    Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B

    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    // LP64 returns the sret pointer in RAX; ILP32 and 32-bit use EAX.
    Register RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  // Registers preserved "via copy" (e.g. for CSRs saved in virtual
  // registers) must be listed as implicit return operands too.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
2884
2885bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2886 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2887 return false;
2888
2889 SDValue TCChain = Chain;
2890 SDNode *Copy = *N->use_begin();
2891 if (Copy->getOpcode() == ISD::CopyToReg) {
2892 // If the copy has a glue operand, we conservatively assume it isn't safe to
2893 // perform a tail call.
2894 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2895 return false;
2896 TCChain = Copy->getOperand(0);
2897 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2898 return false;
2899
2900 bool HasRet = false;
2901 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2902 UI != UE; ++UI) {
2903 if (UI->getOpcode() != X86ISD::RET_FLAG)
2904 return false;
2905 // If we are returning more than one value, we can definitely
2906 // not make a tail call see PR19530
2907 if (UI->getNumOperands() > 4)
2908 return false;
2909 if (UI->getNumOperands() == 4 &&
2910 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2911 return false;
2912 HasRet = true;
2913 }
2914
2915 if (!HasRet)
2916 return false;
2917
2918 Chain = TCChain;
2919 return true;
2920}
2921
2922EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2923 ISD::NodeType ExtendKind) const {
2924 MVT ReturnMVT = MVT::i32;
2925
2926 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2927 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2928 // The ABI does not require i1, i8 or i16 to be extended.
2929 //
2930 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2931 // always extending i8/i16 return values, so keep doing that for now.
2932 // (PR26665).
2933 ReturnMVT = MVT::i8;
2934 }
2935
2936 EVT MinVT = getRegisterType(Context, ReturnMVT);
2937 return VT.bitsLT(MinVT) ? MinVT : VT;
2938}
2939
2940/// Reads two 32 bit registers and creates a 64 bit mask value.
2941/// \param VA The current 32 bit value that need to be assigned.
2942/// \param NextVA The next 32 bit value that need to be assigned.
2943/// \param Root The parent DAG node.
2944/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
2945/// glue purposes. In the case the DAG is already using
2946/// physical register instead of virtual, we should glue
2947/// our new SDValue to InFlag SDvalue.
2948/// \return a new SDvalue of size 64bit.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
  if (nullptr == InFlag) {
    // When no physical register is present,
    // create an intermediate virtual register.
    Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.  Note the glue threading: each CopyFromReg's glue
    // result (value #2) becomes the incoming glue of the next copy, and the
    // last glue is handed back to the caller through *InFlag.
    ArgValueLo =
        DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }

  // Convert the i32 type into v32i1 type.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the i32 type into v32i1 type.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together.
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
2996
2997/// The function will lower a register of various sizes (8/16/32/64)
2998/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2999/// \returns a DAG node contains the operand after lowering to mask type.
3000static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3001 const EVT &ValLoc, const SDLoc &Dl,
3002 SelectionDAG &DAG) {
3003 SDValue ValReturned = ValArg;
3004
3005 if (ValVT == MVT::v1i1)
3006 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3007
3008 if (ValVT == MVT::v64i1) {
3009 // In 32 bit machine, this case is handled by getv64i1Argument
3010 assert(ValLoc == MVT::i64 && "Expecting only i64 locations")((void)0);
3011 // In 64 bit machine, There is no need to truncate the value only bitcast
3012 } else {
3013 MVT maskLen;
3014 switch (ValVT.getSimpleVT().SimpleTy) {
3015 case MVT::v8i1:
3016 maskLen = MVT::i8;
3017 break;
3018 case MVT::v16i1:
3019 maskLen = MVT::i16;
3020 break;
3021 case MVT::v32i1:
3022 maskLen = MVT::i32;
3023 break;
3024 default:
3025 llvm_unreachable("Expecting a vector of i1 types")__builtin_unreachable();
3026 }
3027
3028 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3029 }
3030 return DAG.getBitcast(ValVT, ValReturned);
3031}
3032
3033/// Lower the result values of a call into the
3034/// appropriate copies out of appropriate physical registers.
3035///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.  Clear the bit for the register and all of
    // its sub-registers.
    if (RegMask) {
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // Report an error if there was an attempt to return FP values via XMM
    // registers.
    if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (!Subtarget.hasSSE2() &&
               X86::FR64XRegClass.contains(VA.getLocReg()) &&
               CopyVT == MVT::f64) {
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      if (VA.getLocReg() == X86::XMM1)
        VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
      else
        VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      // Consumes the next location RVLocs[I+1] as well (note the ++I).
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      // Thread the glue (value #2 of CopyFromReg) through so consecutive
      // copies stay adjacent.
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    if (VA.isExtInLoc()) {
      if (VA.getValVT().isVector() &&
          VA.getValVT().getScalarType() == MVT::i1 &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    if (VA.getLocInfo() == CCValAssign::BCvt)
      Val = DAG.getBitcast(VA.getValVT(), Val);

    InVals.push_back(Val);
  }

  return Chain;
}
3129
3130//===----------------------------------------------------------------------===//
3131// C & StdCall & Fast Calling Convention implementation
3132//===----------------------------------------------------------------------===//
3133// StdCall calling convention seems to be standard for many Windows' API
3134// routines and around. It differs from C calling convention just a little:
3135// callee should clean up the stack, not caller. Symbols should be also
3136// decorated in some fancy way :) It doesn't support any vector arguments.
3137// For info on fast calling convention see Fast Calling Convention (tail call)
3138// implementation LowerX86_32FastCCCallTo.
3139
3140/// CallIsStructReturn - Determines whether a call uses struct return
3141/// semantics.
// Classification produced by callIsStructReturn/argsAreStructReturn: whether
// the first argument carries an sret flag and, if so, whether it is also
// marked 'inreg' (or the target is MCU).
enum StructReturnType {
  NotStructReturn,  // First argument (if any) is not sret.
  RegStructReturn,  // sret and inreg, or sret on an MCU target.
  StackStructReturn // sret passed on the stack.
};
3147static StructReturnType
3148callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3149 if (Outs.empty())
3150 return NotStructReturn;
3151
3152 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3153 if (!Flags.isSRet())
3154 return NotStructReturn;
3155 if (Flags.isInReg() || IsMCU)
3156 return RegStructReturn;
3157 return StackStructReturn;
3158}
3159
3160/// Determines whether a function uses struct return semantics.
3161static StructReturnType
3162argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3163 if (Ins.empty())
3164 return NotStructReturn;
3165
3166 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3167 if (!Flags.isSRet())
3168 return NotStructReturn;
3169 if (Flags.isInReg() || IsMCU)
3170 return RegStructReturn;
3171 return StackStructReturn;
3172}
3173
3174/// Make a copy of an aggregate at address specified by "Src" to address
3175/// "Dst" with size and alignment information specified by the specific
3176/// parameter attribute. The copy will be passed as a byval function parameter.
3177static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3178 SDValue Chain, ISD::ArgFlagsTy Flags,
3179 SelectionDAG &DAG, const SDLoc &dl) {
3180 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3181
3182 return DAG.getMemcpy(
3183 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3184 /*isVolatile*/ false, /*AlwaysInline=*/true,
3185 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3186}
3187
3188/// Return true if the calling convention is one that we can guarantee TCO for.
3189static bool canGuaranteeTCO(CallingConv::ID CC) {
3190 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3191 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3192 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3193 CC == CallingConv::SwiftTail);
3194}
3195
3196/// Return true if we might ever do TCO for calls with this calling convention.
3197static bool mayTailCallThisCC(CallingConv::ID CC) {
3198 switch (CC) {
3199 // C calling conventions:
3200 case CallingConv::C:
3201 case CallingConv::Win64:
3202 case CallingConv::X86_64_SysV:
3203 // Callee pop conventions:
3204 case CallingConv::X86_ThisCall:
3205 case CallingConv::X86_StdCall:
3206 case CallingConv::X86_VectorCall:
3207 case CallingConv::X86_FastCall:
3208 // Swift:
3209 case CallingConv::Swift:
3210 return true;
3211 default:
3212 return canGuaranteeTCO(CC);
3213 }
3214}
3215
3216/// Return true if the function is being made into a tailcall target by
3217/// changing its ABI.
3218static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3219 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3220 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3221}
3222
3223bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3224 if (!CI->isTailCall())
3225 return false;
3226
3227 CallingConv::ID CalleeCC = CI->getCallingConv();
3228 if (!mayTailCallThisCC(CalleeCC))
3229 return false;
3230
3231 return true;
3232}
3233
/// Lower a single incoming argument that was assigned a stack (memory)
/// location: create the fixed stack object for the parameter slot and return
/// either a frame index (byval) or a load of the value, attempting copy
/// elision into an existing fixed object where possible.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // absolute size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.

    // FIXME: For now, all byval parameter objects are marked as aliasing. This
    // can be improved with deeper analysis.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
                                   /*isAliased=*/true);
    // byval arguments are passed as the address of the caller's copy.
    return DAG.getFrameIndex(FI, PtrVT);
  }

  EVT ArgVT = Ins[i].ArgVT;

  // If this is a vector that has been split into multiple parts, and the
  // scalar size of the parts don't match the vector element size, then we can't
  // elide the copy. The parts will have padding between them instead of being
  // packed like a vector.
  bool ScalarizedAndExtendedVector =
      ArgVT.isVector() && !VA.getLocVT().isVector() &&
      VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();

  // This is an argument in memory. We might be able to perform copy elision.
  // If the argument is passed directly in memory without any extension, then we
  // can perform copy elision. Large vector types, for example, may be passed
  // indirectly by pointer.
  if (Flags.isCopyElisionCandidate() &&
      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
      !ScalarizedAndExtendedVector) {
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
                                     /*IsImmutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    } else {
      // This is not the first piece of an argument in memory. See if there is
      // already a fixed stack object including this offset. If so, assume it
      // was created by the PartOffset == 0 branch above and create a load from
      // the appropriate offset into it.
      int64_t PartBegin = VA.getLocMemOffset();
      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
      int FI = MFI.getObjectIndexBegin();
      // Linear scan over the fixed objects for one that covers this part.
      for (; MFI.isFixedObjectIndex(FI); ++FI) {
        int64_t ObjBegin = MFI.getObjectOffset(FI);
        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
          break;
      }
      if (MFI.isFixedObjectIndex(FI)) {
        SDValue Addr =
            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
        return DAG.getLoad(
            ValVT, dl, Chain, Addr,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                              Ins[i].PartOffset));
      }
      // No covering object found: fall through to the generic path below.
    }
  }

  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(FI, true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }

  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  // An extended-in-memory i1 mask is loaded at its location type; rebuild the
  // value type via scalar-to-vector (vectors) or truncate (scalars).
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}
3350
3351// FIXME: Get this from tablegen.
3352static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3353 const X86Subtarget &Subtarget) {
3354 assert(Subtarget.is64Bit())((void)0);
3355
3356 if (Subtarget.isCallingConvWin64(CallConv)) {
3357 static const MCPhysReg GPR64ArgRegsWin64[] = {
3358 X86::RCX, X86::RDX, X86::R8, X86::R9
3359 };
3360 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3361 }
3362
3363 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3364 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3365 };
3366 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3367}
3368
3369// FIXME: Get this from tablegen.
3370static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3371 CallingConv::ID CallConv,
3372 const X86Subtarget &Subtarget) {
3373 assert(Subtarget.is64Bit())((void)0);
3374 if (Subtarget.isCallingConvWin64(CallConv)) {
3375 // The XMM registers which might contain var arg parameters are shadowed
3376 // in their paired GPR. So we only need to save the GPR to their home
3377 // slots.
3378 // TODO: __vectorcall will change this.
3379 return None;
3380 }
3381
3382 bool isSoftFloat = Subtarget.useSoftFloat();
3383 if (isSoftFloat || !Subtarget.hasSSE1())
3384 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3385 // registers.
3386 return None;
3387
3388 static const MCPhysReg XMMArgRegs64Bit[] = {
3389 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3390 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3391 };
3392 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3393}
3394
3395#ifndef NDEBUG1
3396static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3397 return llvm::is_sorted(
3398 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3399 return A.getValNo() < B.getValNo();
3400 });
3401}
3402#endif
3403
namespace {
/// This is a helper class for lowering variable arguments parameters.
class VarArgsLoweringHelper {
public:
  VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
                        SelectionDAG &DAG, const X86Subtarget &Subtarget,
                        CallingConv::ID CallConv, CCState &CCInfo)
      : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
        TheMachineFunction(DAG.getMachineFunction()),
        TheFunction(TheMachineFunction.getFunction()),
        FrameInfo(TheMachineFunction.getFrameInfo()),
        FrameLowering(*Subtarget.getFrameLowering()),
        TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
        CCInfo(CCInfo) {}

  // Lower variable arguments parameters.
  void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);

private:
  // Creates the vararg frame index / register-save area and spills the
  // unallocated argument registers into it.
  void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);

  // Forwards registers for musttail calls.
  void forwardMustTailParameters(SDValue &Chain);

  bool is64Bit() const { return Subtarget.is64Bit(); }
  bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }

  X86MachineFunctionInfo *FuncInfo; // Per-function x86 lowering state.
  const SDLoc &DL;                  // Debug location for emitted nodes.
  SelectionDAG &DAG;
  const X86Subtarget &Subtarget;
  MachineFunction &TheMachineFunction;
  const Function &TheFunction;
  MachineFrameInfo &FrameInfo;
  const TargetFrameLowering &FrameLowering;
  const TargetLowering &TargLowering;
  CallingConv::ID CallConv; // Calling convention of the function being lowered.
  CCState &CCInfo;          // Argument assignment state (shared with caller).
};
} // namespace
3443
/// Create the frame index for the start of the vararg area and, for 64-bit
/// conventions, spill the as-yet-unallocated GPR/XMM argument registers to
/// the register save area so va_arg can find them.
void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
    SDValue &Chain, unsigned StackSize) {
  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start. We
  // can skip this if there are no va_start calls.
  if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
                    CallConv != CallingConv::X86_ThisCall)) {
    FuncInfo->setVarArgsFrameIndex(
        FrameInfo.CreateFixedObject(1, StackSize, true));
  }

  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (is64Bit()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs =
        get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);

    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    if (isWin64()) {
      // Get to the caller-allocated home save location. Add 8 to account
      // for the return address.
      int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
    }

    SmallVector<SDValue, 6>
        LiveGPRs; // list of SDValue for GPR registers keeping live input value
    SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
                                         // keeping live input value
    SDValue ALVal; // if applicable keeps SDValue for %al register

    // Gather all the live in physical registers.
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
      Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
      LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
    }
    const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
    if (!AvailableXmms.empty()) {
      // %al holds the number of XMM registers actually used by the caller.
      Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
      ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
      for (MCPhysReg Reg : AvailableXmms) {
        // FastRegisterAllocator spills virtual registers at basic
        // block boundary. That leads to usages of xmm registers
        // outside of check for %al. Pass physical registers to
        // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling.
        TheMachineFunction.getRegInfo().addLiveIn(Reg);
        LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
      }
    }

    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN =
        DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                          TargLowering.getPointerTy(DAG.getDataLayout()));
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    for (SDValue Val : LiveGPRs) {
      SDValue FIN = DAG.getNode(ISD::ADD, DL,
                                TargLowering.getPointerTy(DAG.getDataLayout()),
                                RSFIN, DAG.getIntPtrConstant(Offset, DL));
      SDValue Store =
          DAG.getStore(Val.getValue(1), DL, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(),
                           FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Store);
      Offset += 8;
    }

    // Now store the XMM (fp + vector) parameter registers.
    if (!LiveXMMRegs.empty()) {
      SmallVector<SDValue, 12> SaveXMMOps;
      SaveXMMOps.push_back(Chain);
      SaveXMMOps.push_back(ALVal);
      SaveXMMOps.push_back(
          DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
      SaveXMMOps.push_back(
          DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
      llvm::append_range(SaveXMMOps, LiveXMMRegs);
      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
                                   MVT::Other, SaveXMMOps));
    }

    // Merge all the spill stores into a single token so the chain stays
    // well-formed.
    if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  }
}
3549
3550void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3551 // Find the largest legal vector type.
3552 MVT VecVT = MVT::Other;
3553 // FIXME: Only some x86_32 calling conventions support AVX512.
3554 if (Subtarget.useAVX512Regs() &&
3555 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3556 CallConv == CallingConv::Intel_OCL_BI)))
3557 VecVT = MVT::v16f32;
3558 else if (Subtarget.hasAVX())
3559 VecVT = MVT::v8f32;
3560 else if (Subtarget.hasSSE2())
3561 VecVT = MVT::v4f32;
3562
3563 // We forward some GPRs and some vector types.
3564 SmallVector<MVT, 2> RegParmTypes;
3565 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3566 RegParmTypes.push_back(IntVT);
3567 if (VecVT != MVT::Other)
3568 RegParmTypes.push_back(VecVT);
3569
3570 // Compute the set of forwarded registers. The rest are scratch.
3571 SmallVectorImpl<ForwardedRegister> &Forwards =
3572 FuncInfo->getForwardedMustTailRegParms();
3573 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3574
3575 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3576 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3577 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3578 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3579 }
3580
3581 // Copy all forwards from physical to virtual registers.
3582 for (ForwardedRegister &FR : Forwards) {
3583 // FIXME: Can we use a less constrained schedule?
3584 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3585 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3586 TargLowering.getRegClassFor(FR.VT));
3587 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3588 }
3589}
3590
/// Entry point for lowering the varargs portion of a function's incoming
/// arguments. Dispatches to the va_start register-save-area setup and/or
/// the musttail register-forwarding logic, as required by the frame info.
void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
                                                   unsigned StackSize) {
  // Set FrameIndex to the 0xAAAAAAA sentinel to mark unset state.
  // If necessary, it is overwritten with the correct value later
  // (e.g. by createVarArgAreaAndStoreRegisters below).
  FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);

  // Functions that call va_start need the register save area built and the
  // live-in argument registers spilled into it.
  if (FrameInfo.hasVAStart())
    createVarArgAreaAndStoreRegisters(Chain, StackSize);

  // Functions containing a musttail call in a varargs context must keep the
  // incoming registers alive up to that call.
  if (FrameInfo.hasMustTailInVarArgFunc())
    forwardMustTailParameters(Chain);
}
3604
/// Lower the incoming (formal) arguments of a function being compiled:
/// assign each argument a register or stack location, materialize its value
/// as an SDValue in InVals, and record per-function ABI bookkeeping
/// (callee-pop bytes, sret return register, varargs areas, WinEH PSPSym).
/// Returns the (possibly updated) entry chain.
SDValue X86TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // Cygwin/MinGW's 'main' with external linkage must keep a frame pointer.
  const Function &F = MF.getFunction();
  if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
      F.getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

  assert(
      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64 (32 bytes of caller-provided home space).
  if (IsWin64)
    CCInfo.AllocateStack(32, Align(8));

  CCInfo.AnalyzeArguments(Ins, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
  }

  // The next loop assumes that the locations are in the same order of the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  SDValue ArgValue;
  // I walks ArgLocs (which may advance twice for split v64i1 values);
  // InsIndex walks the original Ins array in lockstep.
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
    assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");

        // v64i1 values, in regcall calling convention, that are
        // compiled to 32 bit arch, are split up into two registers.
        // Note: ++I consumes the second location here.
        ArgValue =
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
      } else {
        // Map the location's MVT onto the register class it lives in.
        const TargetRegisterClass *RC;
        if (RegVT == MVT::i8)
          RC = &X86::GR8RegClass;
        else if (RegVT == MVT::i16)
          RC = &X86::GR16RegClass;
        else if (RegVT == MVT::i32)
          RC = &X86::GR32RegClass;
        else if (Is64Bit && RegVT == MVT::i64)
          RC = &X86::GR64RegClass;
        else if (RegVT == MVT::f32)
          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
        else if (RegVT == MVT::f80)
          RC = &X86::RFP80RegClass;
        else if (RegVT == MVT::f128)
          RC = &X86::VR128RegClass;
        else if (RegVT.is512BitVector())
          RC = &X86::VR512RegClass;
        else if (RegVT.is256BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
        else if (RegVT.is128BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
          RC = &X86::VR64RegClass;
        else if (RegVT == MVT::v1i1)
          RC = &X86::VK1RegClass;
        else if (RegVT == MVT::v8i1)
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
        else if (RegVT == MVT::v32i1)
          RC = &X86::VK32RegClass;
        else if (RegVT == MVT::v64i1)
          RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");

        Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
        else if (VA.getValVT().isVector() &&
                 VA.getValVT().getScalarType() == MVT::i1 &&
                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      // Stack-passed argument: load/address it from its frame slot.
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
      ArgValue =
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

    InVals.push_back(ArgValue);
  }

  // Second pass over Ins for attribute-driven bookkeeping (swiftasync, sret).
  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    if (Ins[I].Flags.isSwiftAsync()) {
      auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
      if (Subtarget.is64Bit())
        X86FI->setHasSwiftAsyncContext(true);
      else {
        // 32-bit: spill the async context into a dedicated 4-byte stack slot.
        int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
        X86FI->setSwiftAsyncContextFrameIdx(FI);
        SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
                                  DAG.getFrameIndex(FI, MVT::i32),
                                  MachinePointerInfo::getFixedStack(MF, FI));
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
      }
    }

    // Swift calling convention does not require we copy the sret argument
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
    if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
      continue;

    // All x86 ABIs require that for returning structs by value we copy the
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
    if (Ins[I].Flags.isSRet()) {
      Register Reg = FuncInfo->getSRetReturnReg();
      if (!Reg) {
        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
      // At most one sret argument exists; stop scanning.
      break;
    }
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (shouldGuaranteeTCO(CallConv,
                         MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  if (IsVarArg)
    VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
        .lowerVarArgsParameters(Chain, StackSize);

  // Some CCs need callee pop.
  if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
        !Subtarget.getTargetTriple().isOSMSVCRT() &&
        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  // Windows EH: CoreCLR needs a PSPSym slot in the parent frame.
  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
    EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
      // that we'd prefer this slot be allocated towards the bottom of the frame
      // (i.e. near the stack pointer after allocating the frame). Every
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
      // offset from the bottom of this and each funclet's frame must be the
      // same, so the size of funclets' (mostly empty) frames is dictated by
      // how far this slot is from the bottom (since they allocate just enough
      // space to accommodate holding this slot at the correct offset).
      int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }

  // RegCall / no_caller_saved_registers: incoming argument registers must not
  // be treated as callee-saved.
  if (CallConv == CallingConv::X86_RegCall ||
      F.hasFnAttribute("no_caller_saved_registers")) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    for (std::pair<Register, Register> Pair : MRI.liveins())
      MRI.disableCalleeSavedRegister(Pair.first);
  }

  return Chain;
}
3842
3843SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3844 SDValue Arg, const SDLoc &dl,
3845 SelectionDAG &DAG,
3846 const CCValAssign &VA,
3847 ISD::ArgFlagsTy Flags,
3848 bool isByVal) const {
3849 unsigned LocMemOffset = VA.getLocMemOffset();
3850 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3851 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3852 StackPtr, PtrOff);
3853 if (isByVal)
3854 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3855
3856 return DAG.getStore(
3857 Chain, dl, Arg, PtrOff,
3858 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3859}
3860
3861/// Emit a load of return address if tail call
3862/// optimization is performed and it is required.
3863SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3864 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3865 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3866 // Adjust the Return address stack slot.
3867 EVT VT = getPointerTy(DAG.getDataLayout());
3868 OutRetAddr = getReturnAddressFrameIndex(DAG);
3869
3870 // Load the "old" Return address.
3871 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3872 return SDValue(OutRetAddr.getNode(), 1);
3873}
3874
3875/// Emit a store of the return address if tail call
3876/// optimization is performed and it is required (FPDiff!=0).
3877static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3878 SDValue Chain, SDValue RetAddrFrIdx,
3879 EVT PtrVT, unsigned SlotSize,
3880 int FPDiff, const SDLoc &dl) {
3881 // Store the return address to the appropriate stack slot.
3882 if (!FPDiff) return Chain;
3883 // Calculate the new stack slot for the return address.
3884 int NewReturnAddrFI =
3885 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3886 false);
3887 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3888 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3889 MachinePointerInfo::getFixedStack(
3890 DAG.getMachineFunction(), NewReturnAddrFI));
3891 return Chain;
3892}
3893
3894/// Returns a vector_shuffle mask for an movs{s|d}, movd
3895/// operation of specified width.
3896static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3897 SDValue V2) {
3898 unsigned NumElems = VT.getVectorNumElements();
3899 SmallVector<int, 8> Mask;
3900 Mask.push_back(NumElems);
3901 for (unsigned i = 1; i != NumElems; ++i)
3902 Mask.push_back(i);
3903 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3904}
3905
3906SDValue
3907X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3908 SmallVectorImpl<SDValue> &InVals) const {
3909 SelectionDAG &DAG = CLI.DAG;
3910 SDLoc &dl = CLI.DL;
3911 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3912 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3913 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3914 SDValue Chain = CLI.Chain;
3915 SDValue Callee = CLI.Callee;
3916 CallingConv::ID CallConv = CLI.CallConv;
3917 bool &isTailCall = CLI.IsTailCall;
3918 bool isVarArg = CLI.IsVarArg;
3919 const auto *CB = CLI.CB;
3920
3921 MachineFunction &MF = DAG.getMachineFunction();
3922 bool Is64Bit = Subtarget.is64Bit();
3923 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3924 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3925 bool IsSibcall = false;
3926 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3927 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
3928 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3929 bool HasNCSR = (CB && isa<CallInst>(CB) &&
3930 CB->hasFnAttr("no_caller_saved_registers"));
3931 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
3932 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
3933 const Module *M = MF.getMMI().getModule();
3934 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3935
3936 MachineFunction::CallSiteInfo CSInfo;
3937 if (CallConv == CallingConv::X86_INTR)
3938 report_fatal_error("X86 interrupts may not be called directly");
3939
3940 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
3941 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
3942 // If we are using a GOT, disable tail calls to external symbols with
3943 // default visibility. Tail calling such a symbol requires using a GOT
3944 // relocation, which forces early binding of the symbol. This breaks code
3945 // that require lazy function symbol resolution. Using musttail or
3946 // GuaranteedTailCallOpt will override this.
3947 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3948 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3949 G->getGlobal()->hasDefaultVisibility()))
3950 isTailCall = false;
3951 }
3952
3953
3954 if (isTailCall && !IsMustTail) {
3955 // Check if it's really possible to do a tail call.
3956 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3957 isVarArg, SR != NotStructReturn,
3958 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3959 Outs, OutVals, Ins, DAG);
3960
3961 // Sibcalls are automatically detected tailcalls which do not require
3962 // ABI changes.
3963 if (!IsGuaranteeTCO && isTailCall)
3964 IsSibcall = true;
3965
3966 if (isTailCall)
3967 ++NumTailCalls;
3968 }
3969
3970 if (IsMustTail && !isTailCall)
3971 report_fatal_error("failed to perform tail call elimination on a call "
3972 "site marked musttail");
3973
3974 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&((void)0)
3975 "Var args not supported with calling convention fastcc, ghc or hipe")((void)0);
3976
3977 // Analyze operands of the call, assigning locations to each operand.
3978 SmallVector<CCValAssign, 16> ArgLocs;
3979 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3980
3981 // Allocate shadow area for Win64.
3982 if (IsWin64)
3983 CCInfo.AllocateStack(32, Align(8));
3984
3985 CCInfo.AnalyzeArguments(Outs, CC_X86);
3986
3987 // In vectorcall calling convention a second pass is required for the HVA
3988 // types.
3989 if (CallingConv::X86_VectorCall == CallConv) {
3990 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3991 }
3992
3993 // Get a count of how many bytes are to be pushed on the stack.
3994 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3995 if (IsSibcall)
3996 // This is a sibcall. The memory operands are available in caller's
3997 // own caller's stack.
3998 NumBytes = 0;
3999 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4000 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4001
4002 int FPDiff = 0;
4003 if (isTailCall &&
4004 shouldGuaranteeTCO(CallConv,
4005 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4006 // Lower arguments at fp - stackoffset + fpdiff.
4007 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4008
4009 FPDiff = NumBytesCallerPushed - NumBytes;
4010
4011 // Set the delta of movement of the returnaddr stackslot.
4012 // But only set if delta is greater than previous delta.
4013 if (FPDiff < X86Info->getTCReturnAddrDelta())
4014 X86Info->setTCReturnAddrDelta(FPDiff);
4015 }
4016
4017 unsigned NumBytesToPush = NumBytes;
4018 unsigned NumBytesToPop = NumBytes;
4019
4020 // If we have an inalloca argument, all stack space has already been allocated
4021 // for us and be right at the top of the stack. We don't support multiple
4022 // arguments passed in memory when using inalloca.
4023 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4024 NumBytesToPush = 0;
4025 if (!ArgLocs.back().isMemLoc())
4026 report_fatal_error("cannot use inalloca attribute on a register "
4027 "parameter");
4028 if (ArgLocs.back().getLocMemOffset() != 0)
4029 report_fatal_error("any parameter with the inalloca attribute must be "
4030 "the only memory argument");
4031 } else if (CLI.IsPreallocated) {
4032 assert(ArgLocs.back().isMemLoc() &&((void)0)
4033 "cannot use preallocated attribute on a register "((void)0)
4034 "parameter")((void)0);
4035 SmallVector<size_t, 4> PreallocatedOffsets;
4036 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4037 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4038 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4039 }
4040 }
4041 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4042 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4043 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4044 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4045 NumBytesToPush = 0;
4046 }
4047
4048 if (!IsSibcall && !IsMustTail)
4049 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4050 NumBytes - NumBytesToPush, dl);
4051
4052 SDValue RetAddrFrIdx;
4053 // Load return address for tail calls.
4054 if (isTailCall && FPDiff)
4055 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4056 Is64Bit, FPDiff, dl);
4057
4058 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4059 SmallVector<SDValue, 8> MemOpChains;
4060 SDValue StackPtr;
4061
4062 // The next loop assumes that the locations are in the same order of the
4063 // input arguments.
4064 assert(isSortedByValueNo(ArgLocs) &&((void)0)
4065 "Argument Location list must be sorted before lowering")((void)0);
4066
4067 // Walk the register/memloc assignments, inserting copies/loads. In the case
4068 // of tail call optimization arguments are handle later.
4069 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4070 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4071 ++I, ++OutIndex) {
4072 assert(OutIndex < Outs.size() && "Invalid Out index")((void)0);
4073 // Skip inalloca/preallocated arguments, they have already been written.
4074 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4075 if (Flags.isInAlloca() || Flags.isPreallocated())
4076 continue;
4077
4078 CCValAssign &VA = ArgLocs[I];
4079 EVT RegVT = VA.getLocVT();
4080 SDValue Arg = OutVals[OutIndex];
4081 bool isByVal = Flags.isByVal();
4082
4083 // Promote the value if needed.
4084 switch (VA.getLocInfo()) {
4085 default: llvm_unreachable("Unknown loc info!")__builtin_unreachable();
4086 case CCValAssign::Full: break;
4087 case CCValAssign::SExt:
4088 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4089 break;
4090 case CCValAssign::ZExt:
4091 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4092 break;
4093 case CCValAssign::AExt:
4094 if (Arg.getValueType().isVector() &&
4095 Arg.getValueType().getVectorElementType() == MVT::i1)
4096 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4097 else if (RegVT.is128BitVector()) {
4098 // Special case: passing MMX values in XMM registers.
4099 Arg = DAG.getBitcast(MVT::i64, Arg);
4100 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4101 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4102 } else
4103 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4104 break;
4105 case CCValAssign::BCvt:
4106 Arg = DAG.getBitcast(RegVT, Arg);
4107 break;
4108 case CCValAssign::Indirect: {
4109 if (isByVal) {
4110 // Memcpy the argument to a temporary stack slot to prevent
4111 // the caller from seeing any modifications the callee may make
4112 // as guaranteed by the `byval` attribute.
4113 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4114 Flags.getByValSize(),
4115 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4116 SDValue StackSlot =
4117 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4118 Chain =
4119 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4120 // From now on treat this as a regular pointer
4121 Arg = StackSlot;
4122 isByVal = false;
4123 } else {
4124 // Store the argument.
4125 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4126 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4127 Chain = DAG.getStore(
4128 Chain, dl, Arg, SpillSlot,
4129 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4130 Arg = SpillSlot;
4131 }
4132 break;
4133 }
4134 }
4135
4136 if (VA.needsCustom()) {
4137 assert(VA.getValVT() == MVT::v64i1 &&((void)0)
4138 "Currently the only custom case is when we split v64i1 to 2 regs")((void)0);
4139 // Split v64i1 value into two registers
4140 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4141 } else if (VA.isRegLoc()) {
4142 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4143 const TargetOptions &Options = DAG.getTarget().Options;
4144 if (Options.EmitCallSiteInfo)
4145 CSInfo.emplace_back(VA.getLocReg(), I);
4146 if (isVarArg && IsWin64) {
4147 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4148 // shadow reg if callee is a varargs function.
4149 Register ShadowReg;
4150 switch (VA.getLocReg()) {
4151 case X86::XMM0: ShadowReg = X86::RCX; break;
4152 case X86::XMM1: ShadowReg = X86::RDX; break;
4153 case X86::XMM2: ShadowReg = X86::R8; break;
4154 case X86::XMM3: ShadowReg = X86::R9; break;
4155 }
4156 if (ShadowReg)
4157 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4158 }
4159 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4160 assert(VA.isMemLoc())((void)0);
4161 if (!StackPtr.getNode())
4162 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4163 getPointerTy(DAG.getDataLayout()));
4164 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4165 dl, DAG, VA, Flags, isByVal));
4166 }
4167 }
4168
4169 if (!MemOpChains.empty())
4170 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4171
4172 if (Subtarget.isPICStyleGOT()) {
4173 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4174 // GOT pointer (except regcall).
4175 if (!isTailCall) {
4176 // Indirect call with RegCall calling convertion may use up all the
4177 // general registers, so it is not suitable to bind EBX reister for
4178 // GOT address, just let register allocator handle it.
4179 if (CallConv != CallingConv::X86_RegCall)
4180 RegsToPass.push_back(std::make_pair(
4181 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4182 getPointerTy(DAG.getDataLayout()))));
4183 } else {
4184 // If we are tail calling and generating PIC/GOT style code load the
4185 // address of the callee into ECX. The value in ecx is used as target of
4186 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4187 // for tail calls on PIC/GOT architectures. Normally we would just put the
4188 // address of GOT into ebx and then call target@PLT. But for tail calls
4189 // ebx would be restored (since ebx is callee saved) before jumping to the
4190 // target@PLT.
4191
4192 // Note: The actual moving to ECX is done further down.
4193 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4194 if (G && !G->getGlobal()->hasLocalLinkage() &&
4195 G->getGlobal()->hasDefaultVisibility())
4196 Callee = LowerGlobalAddress(Callee, DAG);
4197 else if (isa<ExternalSymbolSDNode>(Callee))
4198 Callee = LowerExternalSymbol(Callee, DAG);
4199 }
4200 }
4201
4202 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4203 // From AMD64 ABI document:
4204 // For calls that may call functions that use varargs or stdargs
4205 // (prototype-less calls or calls to functions containing ellipsis (...) in
4206 // the declaration) %al is used as hidden argument to specify the number
4207 // of SSE registers used. The contents of %al do not need to match exactly
4208 // the number of registers, but must be an ubound on the number of SSE
4209 // registers used and is in the range 0 - 8 inclusive.
4210
4211 // Count the number of XMM registers allocated.
4212 static const MCPhysReg XMMArgRegs[] = {
4213 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4214 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4215 };
4216 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4217 assert((Subtarget.hasSSE1() || !NumXMMRegs)((void)0)
4218 && "SSE registers cannot be used when SSE is disabled")((void)0);
4219 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4220 DAG.getConstant(NumXMMRegs, dl,
4221 MVT::i8)));
4222 }
4223
4224 if (isVarArg && IsMustTail) {
4225 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4226 for (const auto &F : Forwards) {
4227 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4228 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4229 }
4230 }
4231
4232 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4233 // don't need this because the eligibility check rejects calls that require
4234 // shuffling arguments passed in memory.
4235 if (!IsSibcall && isTailCall) {
4236 // Force all the incoming stack arguments to be loaded from the stack
4237 // before any new outgoing arguments are stored to the stack, because the
4238 // outgoing stack slots may alias the incoming argument stack slots, and
4239 // the alias isn't otherwise explicit. This is slightly more conservative
4240 // than necessary, because it means that each store effectively depends
4241 // on every argument instead of just those arguments it would clobber.
4242 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4243
4244 SmallVector<SDValue, 8> MemOpChains2;
4245 SDValue FIN;
4246 int FI = 0;
4247 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4248 ++I, ++OutsIndex) {
4249 CCValAssign &VA = ArgLocs[I];
4250
4251 if (VA.isRegLoc()) {
4252 if (VA.needsCustom()) {
4253 assert((CallConv == CallingConv::X86_RegCall) &&((void)0)
4254 "Expecting custom case only in regcall calling convention")((void)0);
4255 // This means that we are in special case where one argument was
4256 // passed through two register locations - Skip the next location
4257 ++I;
4258 }
4259
4260 continue;
4261 }
4262
4263 assert(VA.isMemLoc())((void)0);
4264 SDValue Arg = OutVals[OutsIndex];
4265 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4266 // Skip inalloca/preallocated arguments. They don't require any work.
4267 if (Flags.isInAlloca() || Flags.isPreallocated())
4268 continue;
4269 // Create frame index.
4270 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4271 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4272 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4273 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4274
4275 if (Flags.isByVal()) {
4276 // Copy relative to framepointer.
4277 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4278 if (!StackPtr.getNode())
4279 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4280 getPointerTy(DAG.getDataLayout()));
4281 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4282 StackPtr, Source);
4283
4284 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4285 ArgChain,
4286 Flags, DAG, dl));
4287 } else {
4288 // Store relative to framepointer.
4289 MemOpChains2.push_back(DAG.getStore(
4290 ArgChain, dl, Arg, FIN,
4291 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4292 }
4293 }
4294
4295 if (!MemOpChains2.empty())
4296 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4297
4298 // Store the return address to the appropriate stack slot.
4299 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4300 getPointerTy(DAG.getDataLayout()),
4301 RegInfo->getSlotSize(), FPDiff, dl);
4302 }
4303
4304 // Build a sequence of copy-to-reg nodes chained together with token chain
4305 // and flag operands which copy the outgoing args into registers.
4306 SDValue InFlag;
4307 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4308 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4309 RegsToPass[i].second, InFlag);
4310 InFlag = Chain.getValue(1);
4311 }
4312
4313 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4314 assert(Is64Bit && "Large code model is only legal in 64-bit mode.")((void)0);
4315 // In the 64-bit large code model, we have to make all calls
4316 // through a register, since the call instruction's 32-bit
4317 // pc-relative offset may not be large enough to hold the whole
4318 // address.
4319 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4320 Callee->getOpcode() == ISD::ExternalSymbol) {
4321 // Lower direct calls to global addresses and external symbols. Setting
4322 // ForCall to true here has the effect of removing WrapperRIP when possible
4323 // to allow direct calls to be selected without first materializing the
4324 // address into a register.
4325 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4326 } else if (Subtarget.isTarget64BitILP32() &&
4327 Callee->getValueType(0) == MVT::i32) {
4328 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4329 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4330 }
4331
4332 // Returns a chain & a flag for retval copy to use.
4333 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4334 SmallVector<SDValue, 8> Ops;
4335
4336 if (!IsSibcall && isTailCall && !IsMustTail) {
4337 Chain = DAG.getCALLSEQ_END(Chain,
4338 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4339 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4340 InFlag = Chain.getValue(1);
4341 }
4342
4343 Ops.push_back(Chain);
4344 Ops.push_back(Callee);
4345
4346 if (isTailCall)
4347 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4348
4349 // Add argument registers to the end of the list so that they are known live
4350 // into the call.
4351 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4352 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4353 RegsToPass[i].second.getValueType()));
4354
4355 // Add a register mask operand representing the call-preserved registers.
4356 const uint32_t *Mask = [&]() {
4357 auto AdaptedCC = CallConv;
4358 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4359 // use X86_INTR calling convention because it has the same CSR mask
4360 // (same preserved registers).
4361 if (HasNCSR)
4362 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4363 // If NoCalleeSavedRegisters is requested, than use GHC since it happens
4364 // to use the CSR_NoRegs_RegMask.
4365 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4366 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4367 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4368 }();
4369 assert(Mask && "Missing call preserved mask for calling convention")((void)0);
4370
4371 // If this is an invoke in a 32-bit function using a funclet-based
4372 // personality, assume the function clobbers all registers. If an exception
4373 // is thrown, the runtime will not restore CSRs.
4374 // FIXME: Model this more precisely so that we can register allocate across
4375 // the normal edge and spill and fill across the exceptional edge.
4376 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4377 const Function &CallerFn = MF.getFunction();
4378 EHPersonality Pers =
4379 CallerFn.hasPersonalityFn()
4380 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4381 : EHPersonality::Unknown;
4382 if (isFuncletEHPersonality(Pers))
4383 Mask = RegInfo->getNoPreservedMask();
4384 }
4385
4386 // Define a new register mask from the existing mask.
4387 uint32_t *RegMask = nullptr;
4388
4389 // In some calling conventions we need to remove the used physical registers
4390 // from the reg mask.
4391 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4392 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4393
4394 // Allocate a new Reg Mask and copy Mask.
4395 RegMask = MF.allocateRegMask();
4396 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4397 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4398
4399 // Make sure all sub registers of the argument registers are reset
4400 // in the RegMask.
4401 for (auto const &RegPair : RegsToPass)
4402 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4403 SubRegs.isValid(); ++SubRegs)
4404 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4405
4406 // Create the RegMask Operand according to our updated mask.
4407 Ops.push_back(DAG.getRegisterMask(RegMask));
4408 } else {
4409 // Create the RegMask Operand according to the static mask.
4410 Ops.push_back(DAG.getRegisterMask(Mask));
4411 }
4412
4413 if (InFlag.getNode())
4414 Ops.push_back(InFlag);
4415
4416 if (isTailCall) {
4417 // We used to do:
4418 //// If this is the first return lowered for this function, add the regs
4419 //// to the liveout set for the function.
4420 // This isn't right, although it's probably harmless on x86; liveouts
4421 // should be computed from returns not tail calls. Consider a void
4422 // function making a tail call to a function returning int.
4423 MF.getFrameInfo().setHasTailCall();
4424 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4425 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4426 return Ret;
4427 }
4428
4429 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4430 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4431 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4432 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4433 // expanded to the call, directly followed by a special marker sequence and
4434 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4435 assert(!isTailCall &&((void)0)
4436 "tail calls cannot be marked with clang.arc.attachedcall")((void)0);
4437 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode")((void)0);
4438
4439 // Add target constant to select ObjC runtime call just before the call
4440 // target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue,
4441 // RuntimeCallType == 1 selects objc_unsafeClaimAutoreleasedReturnValue when
4442 // expanding the pseudo.
4443 unsigned RuntimeCallType =
4444 objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
4445 Ops.insert(Ops.begin() + 1,
4446 DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
4447 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4448 } else {
4449 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4450 }
4451
4452 InFlag = Chain.getValue(1);
4453 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4454 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4455
4456 // Save heapallocsite metadata.
4457 if (CLI.CB)
4458 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4459 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4460
4461 // Create the CALLSEQ_END node.
4462 unsigned NumBytesForCalleeToPop;
4463 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4464 DAG.getTarget().Options.GuaranteedTailCallOpt))
4465 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4466 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4467 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4468 SR == StackStructReturn)
4469 // If this is a call to a struct-return function, the callee
4470 // pops the hidden struct pointer, so we have to push it back.
4471 // This is common for Darwin/X86, Linux & Mingw32 targets.
4472 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4473 NumBytesForCalleeToPop = 4;
4474 else
4475 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4476
4477 // Returns a flag for retval copy to use.
4478 if (!IsSibcall) {
4479 Chain = DAG.getCALLSEQ_END(Chain,
4480 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4481 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4482 true),
4483 InFlag, dl);
4484 InFlag = Chain.getValue(1);
4485 }
4486
4487 // Handle result values, copying them out of physregs into vregs that we
4488 // return.
4489 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4490 InVals, RegMask);
4491}
4492
4493//===----------------------------------------------------------------------===//
4494// Fast Calling Convention (tail call) implementation
4495//===----------------------------------------------------------------------===//
4496
4497// Like std call, callee cleans arguments, convention except that ECX is
4498// reserved for storing the tail called function address. Only 2 registers are
4499// free for argument passing (inreg). Tail call optimization is performed
4500// provided:
4501// * tailcallopt is enabled
4502// * caller/callee are fastcc
4503// On X86_64 architecture with GOT-style position independent code only local
4504// (within module) calls are supported at the moment.
4505// To keep the stack aligned according to platform abi the function
4506// GetAlignedArgumentStackSize ensures that argument delta is always multiples
4507// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
4508// If a tail called function callee has more arguments than the caller the
4509// caller needs to make sure that there is room to move the RETADDR to. This is
4510// achieved by reserving an area the size of the argument delta right after the
4511// original RETADDR, but before the saved framepointer or the spilled registers
4512// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4513// stack layout:
4514// arg1
4515// arg2
4516// RETADDR
4517// [ new RETADDR
4518// move area ]
4519// (possible EBP)
4520// ESI
4521// EDI
4522// local1 ..
4523
4524/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
4525/// requirement.
4526unsigned
4527X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4528 SelectionDAG &DAG) const {
4529 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4530 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4531 assert(StackSize % SlotSize == 0 &&((void)0)
4532 "StackSize must be a multiple of SlotSize")((void)0);
4533 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4534}
4535
4536/// Return true if the given stack call argument is already available in the
4537/// same position (relatively) of the caller's incoming argument stack.
///
/// Used by the tail-call eligibility check: if every stack argument of the
/// callee is just a reload of the caller's own incoming argument at the same
/// offset, no stack shuffling is needed for a sibcall.
4538static
4539bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4540 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4541 const X86InstrInfo *TII, const CCValAssign &VA) {
4542 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4543
4544 for (;;) {
4545 // Look through nodes that don't alter the bits of the incoming value.
4546 unsigned Op = Arg.getOpcode();
4547 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4548 Arg = Arg.getOperand(0);
4549 continue;
4550 }
4551 if (Op == ISD::TRUNCATE) {
// A TRUNCATE is only transparent when it exactly undoes an AssertZext of
// the same type, i.e. the truncated bits were known to be zero anyway.
4552 const SDValue &TruncInput = Arg.getOperand(0);
4553 if (TruncInput.getOpcode() == ISD::AssertZext &&
4554 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4555 Arg.getValueType()) {
4556 Arg = TruncInput.getOperand(0);
4557 continue;
4558 }
4559 }
4560 break;
4561 }
4562
// FI is a sentinel until one of the three cases below identifies the fixed
// stack slot backing this argument; any other producer disqualifies it.
4563 int FI = INT_MAX2147483647;
4564 if (Arg.getOpcode() == ISD::CopyFromReg) {
// Case 1: the value lives in a virtual register — trace it back to its
// defining instruction and require a reload from a stack slot (or, for
// byval, an LEA of a frame index).
4565 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4566 if (!VR.isVirtual())
4567 return false;
4568 MachineInstr *Def = MRI->getVRegDef(VR);
4569 if (!Def)
4570 return false;
4571 if (!Flags.isByVal()) {
4572 if (!TII->isLoadFromStackSlot(*Def, FI))
4573 return false;
4574 } else {
4575 unsigned Opcode = Def->getOpcode();
4576 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4577 Opcode == X86::LEA64_32r) &&
4578 Def->getOperand(1).isFI()) {
4579 FI = Def->getOperand(1).getIndex();
// For byval the compared size is the aggregate's size, not the
// pointer's.
4580 Bytes = Flags.getByValSize();
4581 } else
4582 return false;
4583 }
4584 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
// Case 2: a direct DAG load — its base pointer must be a frame index.
4585 if (Flags.isByVal())
4586 // ByVal argument is passed in as a pointer but it's now being
4587 // dereferenced. e.g.
4588 // define @foo(%struct.X* %A) {
4589 // tail call @bar(%struct.X* byval %A)
4590 // }
4591 return false;
4592 SDValue Ptr = Ld->getBasePtr();
4593 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4594 if (!FINode)
4595 return false;
4596 FI = FINode->getIndex();
4597 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
// Case 3: a byval argument forwarded as a frame index directly.
4598 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4599 FI = FINode->getIndex();
4600 Bytes = Flags.getByValSize();
4601 } else
4602 return false;
4603
// The slot must be a fixed (incoming-argument) object at exactly the
// requested offset.
4604 assert(FI != INT_MAX)((void)0);
4605 if (!MFI.isFixedObjectIndex(FI))
4606 return false;
4607
4608 if (Offset != MFI.getObjectOffset(FI))
4609 return false;
4610
4611 // If this is not byval, check that the argument stack object is immutable.
4612 // inalloca and argument copy elision can create mutable argument stack
4613 // objects. Byval objects can be mutated, but a byval call intends to pass the
4614 // mutated memory.
4615 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4616 return false;
4617
4618 if (VA.getLocVT().getFixedSizeInBits() >
4619 Arg.getValueSizeInBits().getFixedSize()) {
4620 // If the argument location is wider than the argument type, check that any
4621 // extension flags match.
4622 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4623 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4624 return false;
4625 }
4626 }
4627
// Finally, the object must be exactly the size of the outgoing argument.
4628 return Bytes == MFI.getObjectSize(FI);
4629}
4630
4631/// Check whether the call is eligible for tail call optimization. Targets
4632/// that want to do tail call optimization should implement this function.
4633bool X86TargetLowering::IsEligibleForTailCallOptimization(
4634 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4635 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4636 const SmallVectorImpl<ISD::OutputArg> &Outs,
4637 const SmallVectorImpl<SDValue> &OutVals,
4638 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4639 if (!mayTailCallThisCC(CalleeCC))
4640 return false;
4641
4642 // If -tailcallopt is specified, make fastcc functions tail-callable.
4643 MachineFunction &MF = DAG.getMachineFunction();
4644 const Function &CallerF = MF.getFunction();
4645
4646 // If the function return type is x86_fp80 and the callee return type is not,
4647 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4648 // perform a tailcall optimization here.
4649 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4650 return false;
4651
// Gather calling-convention facts about caller and callee up front.
4652 CallingConv::ID CallerCC = CallerF.getCallingConv();
4653 bool CCMatch = CallerCC == CalleeCC;
4654 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4655 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4656 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4657 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4658
4659 // Win64 functions have extra shadow space for argument homing. Don't do the
4660 // sibcall if the caller and callee have mismatched expectations for this
4661 // space.
4662 if (IsCalleeWin64 != IsCallerWin64)
4663 return false;
4664
// Under guaranteed TCO, eligibility reduces to: the callee's convention
// supports guaranteed TCO and matches the caller's. None of the sibcall
// checks below apply.
4665 if (IsGuaranteeTCO) {
4666 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4667 return true;
4668 return false;
4669 }
4670
4671 // Look for obvious safe cases to perform tail call optimization that do not
4672 // require ABI changes. This is what gcc calls sibcall.
4673
4674 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4675 // emit a special epilogue.
4676 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4677 if (RegInfo->hasStackRealignment(MF))
4678 return false;
4679
4680 // Also avoid sibcall optimization if either caller or callee uses struct
4681 // return semantics.
4682 if (isCalleeStructRet || isCallerStructRet)
4683 return false;
4684
4685 // Do not sibcall optimize vararg calls unless all arguments are passed via
4686 // registers.
4687 LLVMContext &C = *DAG.getContext();
4688 if (isVarArg && !Outs.empty()) {
4689 // Optimizing for varargs on Win64 is unlikely to be safe without
4690 // additional testing.
4691 if (IsCalleeWin64 || IsCallerWin64)
4692 return false;
4693
4694 SmallVector<CCValAssign, 16> ArgLocs;
4695 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4696
4697 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4698 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4699 if (!ArgLocs[i].isRegLoc())
4700 return false;
4701 }
4702
4703 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4704 // stack. Therefore, if it's not used by the call it is not safe to optimize
4705 // this into a sibcall.
4706 bool Unused = false;
4707 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4708 if (!Ins[i].Used) {
4709 Unused = true;
4710 break;
4711 }
4712 }
4713 if (Unused) {
4714 SmallVector<CCValAssign, 16> RVLocs;
4715 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4716 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4717 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4718 CCValAssign &VA = RVLocs[i];
4719 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4720 return false;
4721 }
4722 }
4723
4724 // Check that the call results are passed in the same way.
4725 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4726 RetCC_X86, RetCC_X86))
4727 return false;
4728 // The callee has to preserve all registers the caller needs to preserve.
4729 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4730 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4731 if (!CCMatch) {
4732 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4733 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4734 return false;
4735 }
4736
4737 unsigned StackArgsSize = 0;
4738
4739 // If the callee takes no arguments then go on to check the results of the
4740 // call.
4741 if (!Outs.empty()) {
4742 // Check if stack adjustment is needed. For now, do not do this if any
4743 // argument is passed on the stack.
4744 SmallVector<CCValAssign, 16> ArgLocs;
4745 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4746
4747 // Allocate shadow area for Win64
4748 if (IsCalleeWin64)
4749 CCInfo.AllocateStack(32, Align(8));
4750
4751 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4752 StackArgsSize = CCInfo.getNextStackOffset();
4753
4754 if (CCInfo.getNextStackOffset()) {
4755 // Check if the arguments are already laid out in the right way as
4756 // the caller's fixed stack objects.
4757 MachineFrameInfo &MFI = MF.getFrameInfo();
4758 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4759 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4760 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4761 CCValAssign &VA = ArgLocs[i];
4762 SDValue Arg = OutVals[i];
4763 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4764 if (VA.getLocInfo() == CCValAssign::Indirect)
4765 return false;
4766 if (!VA.isRegLoc()) {
4767 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4768 MFI, MRI, TII, VA))
4769 return false;
4770 }
4771 }
4772 }
4773
4774 bool PositionIndependent = isPositionIndependent();
4775 // If the tailcall address may be in a register, then make sure it's
4776 // possible to register allocate for it. In 32-bit, the call address can
4777 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4778 // callee-saved registers are restored. These happen to be the same
4779 // registers used to pass 'inreg' arguments so watch out for those.
4780 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4781 !isa<ExternalSymbolSDNode>(Callee)) ||
4782 PositionIndependent)) {
4783 unsigned NumInRegs = 0;
4784 // In PIC we need an extra register to formulate the address computation
4785 // for the callee.
4786 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4787
4788 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4789 CCValAssign &VA = ArgLocs[i];
4790 if (!VA.isRegLoc())
4791 continue;
4792 Register Reg = VA.getLocReg();
4793 switch (Reg) {
4794 default: break;
4795 case X86::EAX: case X86::EDX: case X86::ECX:
4796 if (++NumInRegs == MaxInRegs)
4797 return false;
4798 break;
4799 }
4800 }
4801 }
4802
4803 const MachineRegisterInfo &MRI = MF.getRegInfo();
4804 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4805 return false;
4806 }
4807
// The caller and callee must agree on who pops the stack arguments:
// any pending bytes-to-pop in the caller must match what the callee pops.
4808 bool CalleeWillPop =
4809 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4810 MF.getTarget().Options.GuaranteedTailCallOpt);
4811
4812 if (unsigned BytesToPop =
4813 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4814 // If we have bytes to pop, the callee must pop them.
4815 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4816 if (!CalleePopMatches)
4817 return false;
4818 } else if (CalleeWillPop && StackArgsSize > 0) {
4819 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4820 return false;
4821 }
4822
4823 return true;
4824}
4825
/// Create the X86-specific FastISel instance; simply forwards to the
/// target's FastISel factory function.
4826FastISel *
4827X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4828 const TargetLibraryInfo *libInfo) const {
4829 return X86::createFastISel(funcInfo, libInfo);
4830}
4831
4832//===----------------------------------------------------------------------===//
4833// Other Lowering Hooks
4834//===----------------------------------------------------------------------===//
4835
4836static bool MayFoldLoad(SDValue Op) {
4837 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4838}
4839
4840static bool MayFoldIntoStore(SDValue Op) {
4841 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4842}
4843
4844static bool MayFoldIntoZeroExtend(SDValue Op) {
4845 if (Op.hasOneUse()) {
4846 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4847 return (ISD::ZERO_EXTEND == Opcode);
4848 }
4849 return false;
4850}
4851
/// Return true if Opcode is one of the X86-specific DAG nodes that represent
/// a vector shuffle operation.
4852static bool isTargetShuffle(unsigned Opcode) {
4853 switch(Opcode) {
4854 default: return false;
4855 case X86ISD::BLENDI:
4856 case X86ISD::PSHUFB:
4857 case X86ISD::PSHUFD:
4858 case X86ISD::PSHUFHW:
4859 case X86ISD::PSHUFLW:
4860 case X86ISD::SHUFP:
4861 case X86ISD::INSERTPS:
4862 case X86ISD::EXTRQI:
4863 case X86ISD::INSERTQI:
4864 case X86ISD::VALIGN:
4865 case X86ISD::PALIGNR:
4866 case X86ISD::VSHLDQ:
4867 case X86ISD::VSRLDQ:
4868 case X86ISD::MOVLHPS:
4869 case X86ISD::MOVHLPS:
4870 case X86ISD::MOVSHDUP:
4871 case X86ISD::MOVSLDUP:
4872 case X86ISD::MOVDDUP:
4873 case X86ISD::MOVSS:
4874 case X86ISD::MOVSD:
4875 case X86ISD::UNPCKL:
4876 case X86ISD::UNPCKH:
4877 case X86ISD::VBROADCAST:
4878 case X86ISD::VPERMILPI:
4879 case X86ISD::VPERMILPV:
4880 case X86ISD::VPERM2X128:
4881 case X86ISD::SHUF128:
4882 case X86ISD::VPERMIL2:
4883 case X86ISD::VPERMI:
4884 case X86ISD::VPPERM:
4885 case X86ISD::VPERMV:
4886 case X86ISD::VPERMV3:
4887 case X86ISD::VZEXT_MOVL:
4888 return true;
4889 }
4890}
4891
/// Return true if Opcode is a shuffle whose mask comes from a variable
/// operand rather than being fixed in the node, including the 'faux'
/// shuffles formed from bitwise logic ops.
4892static bool isTargetShuffleVariableMask(unsigned Opcode) {
4893 switch (Opcode) {
4894 default: return false;
4895 // Target Shuffles.
4896 case X86ISD::PSHUFB:
4897 case X86ISD::VPERMILPV:
4898 case X86ISD::VPERMIL2:
4899 case X86ISD::VPPERM:
4900 case X86ISD::VPERMV:
4901 case X86ISD::VPERMV3:
4902 return true;
4903 // 'Faux' Target Shuffles.
4904 case ISD::OR:
4905 case ISD::AND:
4906 case X86ISD::ANDNP:
4907 return true;
4908 }
4909}
4910
4911static bool isTargetShuffleSplat(SDValue Op) {
4912 unsigned Opcode = Op.getOpcode();
4913 if (Opcode == ISD::EXTRACT_SUBVECTOR)
4914 return isTargetShuffleSplat(Op.getOperand(0));
4915 return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
4916}
4917
/// Return a frame index addressing this function's return-address slot,
/// creating the fixed stack object lazily on first use and caching its index
/// in the function info.
4918SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4919 MachineFunction &MF = DAG.getMachineFunction();
4920 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4921 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4922 int ReturnAddrIndex = FuncInfo->getRAIndex();
4923
// An RAIndex of 0 serves as the "not yet created" sentinel here.
4924 if (ReturnAddrIndex == 0) {
4925 // Set up a frame object for the return address.
4926 unsigned SlotSize = RegInfo->getSlotSize();
// The return address sits one slot above the incoming frame, hence the
// fixed object at offset -SlotSize (immutable = false).
4927 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4928 -(int64_t)SlotSize,
4929 false);
4930 FuncInfo->setRAIndex(ReturnAddrIndex);
4931 }
4932
4933 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4934}
4935
4936bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4937 bool hasSymbolicDisplacement) {
4938 // Offset should fit into 32 bit immediate field.
4939 if (!isInt<32>(Offset))
4940 return false;
4941
4942 // If we don't have a symbolic displacement - we don't have any extra
4943 // restrictions.
4944 if (!hasSymbolicDisplacement)
4945 return true;
4946
4947 // FIXME: Some tweaks might be needed for medium code model.
4948 if (M != CodeModel::Small && M != CodeModel::Kernel)
4949 return false;
4950
4951 // For small code model we assume that latest object is 16MB before end of 31
4952 // bits boundary. We may also accept pretty large negative constants knowing
4953 // that all objects are in the positive half of address space.
4954 if (M == CodeModel::Small && Offset < 16*1024*1024)
4955 return true;
4956
4957 // For kernel code model we know that all object resist in the negative half
4958 // of 32bits address space. We may not accept negative offsets, since they may
4959 // be just off and we may accept pretty large positive ones.
4960 if (M == CodeModel::Kernel && Offset >= 0)
4961 return true;
4962
4963 return false;
4964}
4965
4966/// Determines whether the callee is required to pop its own arguments.
4967/// Callee pop is necessary to support tail calls.
4968bool X86::isCalleePop(CallingConv::ID CallingConv,
4969 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4970 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4971 // can guarantee TCO.
4972 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4973 return true;
4974
4975 switch (CallingConv) {
4976 default:
4977 return false;
4978 case CallingConv::X86_StdCall:
4979 case CallingConv::X86_FastCall:
4980 case CallingConv::X86_ThisCall:
4981 case CallingConv::X86_VectorCall:
4982 return !is64Bit;
4983 }
4984}
4985
4986/// Return true if the condition is an signed comparison operation.
4987static bool isX86CCSigned(unsigned X86CC) {
4988 switch (X86CC) {
4989 default:
4990 llvm_unreachable("Invalid integer condition!")__builtin_unreachable();
4991 case X86::COND_E:
4992 case X86::COND_NE:
4993 case X86::COND_B:
4994 case X86::COND_A:
4995 case X86::COND_BE:
4996 case X86::COND_AE:
4997 return false;
4998 case X86::COND_G:
4999 case X86::COND_GE:
5000 case X86::COND_L:
5001 case X86::COND_LE:
5002 return true;
5003 }
5004}
5005
/// Map an integer-comparison ISD::CondCode onto the corresponding X86
/// condition code. Must only be called with one of the integer condition
/// codes handled below; anything else is unreachable.
5006static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5007 switch (SetCCOpcode) {
5008 default: llvm_unreachable("Invalid integer condition!")__builtin_unreachable();
5009 case ISD::SETEQ: return X86::COND_E;
5010 case ISD::SETGT: return X86::COND_G;
5011 case ISD::SETGE: return X86::COND_GE;
5012 case ISD::SETLT: return X86::COND_L;
5013 case ISD::SETLE: return X86::COND_LE;
5014 case ISD::SETNE: return X86::COND_NE;
5015 case ISD::SETULT: return X86::COND_B;
5016 case ISD::SETUGT: return X86::COND_A;
5017 case ISD::SETULE: return X86::COND_BE;
5018 case ISD::SETUGE: return X86::COND_AE;
5019 }
5020}
5021
5022/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5023/// condition code, returning the condition code and the LHS/RHS of the
5024/// comparison to make.
///
/// \param SetCCOpcode  the generic condition being lowered.
/// \param isFP         true when comparing floating-point operands.
/// \param LHS,RHS      comparison operands; may be swapped or have RHS
///                     replaced in place by this routine.
/// \returns the X86 condition code, or X86::COND_INVALID for FP conditions
///          (SETOEQ/SETUNE) that have no single-flag equivalent.
5025static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5026 bool isFP, SDValue &LHS, SDValue &RHS,
5027 SelectionDAG &DAG) {
5028 if (!isFP) {
// Integer path: first try sign-flag tricks against constant RHS, then
// fall back to the direct table translation.
5029 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5030 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
5031 // X > -1 -> X == 0, jump !sign.
5032 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5033 return X86::COND_NS;
5034 }
5035 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
5036 // X < 0 -> X == 0, jump on sign.
5037 return X86::COND_S;
5038 }
5039 if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
5040 // X >= 0 -> X == 0, jump on !sign.
5041 return X86::COND_NS;
5042 }
5043 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5044 // X < 1 -> X <= 0
5045 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5046 return X86::COND_LE;
5047 }
5048 }
5049
5050 return TranslateIntegerX86CC(SetCCOpcode);
5051 }
5052
5053 // First determine if it is required or is profitable to flip the operands.
5054
5055 // If LHS is a foldable load, but RHS is not, flip the condition.
5056 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5057 !ISD::isNON_EXTLoad(RHS.getNode())) {
5058 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5059 std::swap(LHS, RHS);
5060 }
5061
// These orderings are only expressible with the flags below after the
// operands are swapped (see the "flipped" cases in the final switch).
5062 switch (SetCCOpcode) {
5063 default: break;
5064 case ISD::SETOLT:
5065 case ISD::SETOLE:
5066 case ISD::SETUGT:
5067 case ISD::SETUGE:
5068 std::swap(LHS, RHS);
5069 break;
5070 }
5071
5072 // On a floating point condition, the flags are set as follows:
5073 // ZF PF CF op
5074 // 0 | 0 | 0 | X > Y
5075 // 0 | 0 | 1 | X < Y
5076 // 1 | 0 | 0 | X == Y
5077 // 1 | 1 | 1 | unordered
5078 switch (SetCCOpcode) {
5079 default: llvm_unreachable("Condcode should be pre-legalized away")__builtin_unreachable();
5080 case ISD::SETUEQ:
5081 case ISD::SETEQ: return X86::COND_E;
5082 case ISD::SETOLT: // flipped
5083 case ISD::SETOGT:
5084 case ISD::SETGT: return X86::COND_A;
5085 case ISD::SETOLE: // flipped
5086 case ISD::SETOGE:
5087 case ISD::SETGE: return X86::COND_AE;
5088 case ISD::SETUGT: // flipped
5089 case ISD::SETULT:
5090 case ISD::SETLT: return X86::COND_B;
5091 case ISD::SETUGE: // flipped
5092 case ISD::SETULE:
5093 case ISD::SETLE: return X86::COND_BE;
5094 case ISD::SETONE:
5095 case ISD::SETNE: return X86::COND_NE;
5096 case ISD::SETUO: return X86::COND_P;
5097 case ISD::SETO: return X86::COND_NP;
5098 case ISD::SETOEQ:
5099 case ISD::SETUNE: return X86::COND_INVALID;
5100 }
5101}
5102
5103/// Is there a floating point cmov for the specific X86 condition code?
5104/// Current x86 isa includes the following FP cmov instructions:
5105/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5106static bool hasFPCMov(unsigned X86CC) {
5107 switch (X86CC) {
5108 default:
5109 return false;
5110 case X86::COND_B:
5111 case X86::COND_BE:
5112 case X86::COND_E:
5113 case X86::COND_P:
5114 case X86::COND_A:
5115 case X86::COND_AE:
5116 case X86::COND_NE:
5117 case X86::COND_NP:
5118 return true;
5119 }
5120}
5121
5122
/// Describe the memory access performed by target intrinsic \p Intrinsic so
/// the DAG builder can attach a MachineMemOperand. Returns true and fills
/// \p Info when the intrinsic touches memory, false otherwise.
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  // Start from a clean slate; each case below ORs in the flags it needs.
  Info.flags = MachineMemOperand::MONone;
  Info.offset = 0;

  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
  if (!IntrData) {
    // Not in the chained-intrinsic table: handle the Key Locker AES
    // intrinsics explicitly. Each loads a key handle from memory; the memVT
    // widths (48/64 bits) model that access — TODO confirm these widths
    // against the Key Locker handle sizes.
    switch (Intrinsic) {
    case Intrinsic::x86_aesenc128kl:
    case Intrinsic::x86_aesdec128kl:
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      // Argument 1 is the key-handle pointer for the non-wide 128-bit forms.
      Info.ptrVal = I.getArgOperand(1);
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
      Info.align = Align(1);
      Info.flags |= MachineMemOperand::MOLoad;
      return true;
    case Intrinsic::x86_aesenc256kl:
    case Intrinsic::x86_aesdec256kl:
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.ptrVal = I.getArgOperand(1);
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
      Info.align = Align(1);
      Info.flags |= MachineMemOperand::MOLoad;
      return true;
    case Intrinsic::x86_aesencwide128kl:
    case Intrinsic::x86_aesdecwide128kl:
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      // The "wide" forms take the key-handle pointer as argument 0.
      Info.ptrVal = I.getArgOperand(0);
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
      Info.align = Align(1);
      Info.flags |= MachineMemOperand::MOLoad;
      return true;
    case Intrinsic::x86_aesencwide256kl:
    case Intrinsic::x86_aesdecwide256kl:
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.ptrVal = I.getArgOperand(0);
      Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
      Info.align = Align(1);
      Info.flags |= MachineMemOperand::MOLoad;
      return true;
    }
    // Any other non-tabled intrinsic does not access memory.
    return false;
  }

  switch (IntrData->Type) {
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    // Truncating store: the stored memVT narrows each vector element to
    // i8/i16/i32 while keeping the element count of the source operand.
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = I.getArgOperand(0);
    MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
      ScalarVT = MVT::i8;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;

    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }
  case GATHER:
  case GATHER_AVX2: {
    // Gather: no single base pointer (indexed access), so ptrVal is null.
    // The effective element count is the smaller of data and index widths.
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getType());
    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
                                IndexVT.getVectorNumElements());
    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOLoad;
    break;
  }
  case SCATTER: {
    // Scatter: mirror of gather, but the data comes from argument 3 and the
    // access is a store.
    Info.opc = ISD::INTRINSIC_VOID;
    Info.ptrVal = nullptr;
    MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
                                IndexVT.getVectorNumElements());
    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
    Info.align = Align(1);
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }
  default:
    return false;
  }

  return true;
}
5220
5221/// Returns true if the target can instruction select the
5222/// specified FP immediate natively. If false, the legalizer will
5223/// materialize the FP immediate as a load from a constant pool.
5224bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5225 bool ForCodeSize) const {
5226 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5227 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5228 return true;
5229 }
5230 return false;
5231}
5232
/// Decide whether narrowing \p Load down to \p NewVT is profitable.
/// Returning false keeps the load at its original width.
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
                                              EVT NewVT) const {
  // Only simple (non-atomic, non-volatile) loads may be narrowed.
  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");

  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocation target a movq or addq instruction: don't let the load shrink.
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;

  // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
  // those uses are extracted directly into a store, then the extract + store
  // can be store-folded. Therefore, it's probably not worth splitting the load.
  EVT VT = Load->getValueType(0);
  if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
    for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
      // Skip uses of the chain value. Result 0 of the node is the load value.
      if (UI.getUse().getResNo() != 0)
        continue;

      // If this use is not an extract + store, it's probably worth splitting.
      // NOTE(review): UI->use_begin() is dereferenced without checking that
      // the extract has any uses; hasOneUse() on the previous operand
      // guarantees exactly one use here, so the dereference is safe.
      if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
          UI->use_begin()->getOpcode() != ISD::STORE)
        return true;
    }
    // All non-chain uses are extract + store.
    return false;
  }

  // Default: narrowing is allowed.
  return true;
}
5266
5267/// Returns true if it is beneficial to convert a load of a constant
5268/// to just the constant itself.
5269bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5270 Type *Ty) const {
5271 assert(Ty->isIntegerTy())((void)0);
5272
5273 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5274 if (BitSize == 0 || BitSize > 64)
5275 return false;
5276 return true;
5277}
5278
5279bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5280 // If we are using XMM registers in the ABI and the condition of the select is
5281 // a floating-point compare and we have blendv or conditional move, then it is
5282 // cheaper to select instead of doing a cross-register move and creating a
5283 // load that depends on the compare result.
5284 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5285 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5286}
5287
5288bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5289 // TODO: It might be a win to ease or lift this restriction, but the generic
5290 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5291 if (VT.isVector() && Subtarget.hasAVX512())
5292 return false;
5293
5294 return true;
5295}
5296
/// Return true when a multiply by the splatted constant \p C is better
/// decomposed into shift+add/sub than left as a MUL node.
bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                               SDValue C) const {
  // TODO: We handle scalars using custom code, but generic combining could make
  // that unnecessary.
  APInt MulC;
  // Only constant-splat vector multiplicands are considered here.
  if (!ISD::isConstantSplatVector(C.getNode(), MulC))
    return false;

  // Find the type this will be legalized to. Otherwise we might prematurely
  // convert this to shl+add/sub and then still have to type legalize those ops.
  // Another choice would be to defer the decision for illegal types until
  // after type legalization. But constant splat vectors of i64 can't make it
  // through type legalization on 32-bit targets so we would need to special
  // case vXi64.
  while (getTypeAction(Context, VT) != TypeLegal)
    VT = getTypeToTransformTo(Context, VT);

  // If vector multiply is legal, assume that's faster than shl + add/sub.
  // TODO: Multiply is a complex op with higher latency and lower throughput in
  // most implementations, so this check could be loosened based on type
  // and/or a CPU attribute.
  if (isOperationLegal(ISD::MUL, VT))
    return false;

  // shl+add, shl+sub, shl+add+neg
  // Profitable when the constant (or its negation) is one away from a power
  // of two in either direction.
  return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
         (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
5325
5326bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5327 unsigned Index) const {
5328 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5329 return false;
5330
5331 // Mask vectors support all subregister combinations and operations that
5332 // extract half of vector.
5333 if (ResVT.getVectorElementType() == MVT::i1)
5334 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5335 (Index == ResVT.getVectorNumElements()));
5336
5337 return (Index % ResVT.getVectorNumElements()) == 0;
5338}
5339
5340bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5341 unsigned Opc = VecOp.getOpcode();
5342
5343 // Assume target opcodes can't be scalarized.
5344 // TODO - do we have any exceptions?
5345 if (Opc >= ISD::BUILTIN_OP_END)
5346 return false;
5347
5348 // If the vector op is not supported, try to convert to scalar.
5349 EVT VecVT = VecOp.getValueType();
5350 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5351 return true;
5352
5353 // If the vector op is supported, but the scalar op is not, the transform may
5354 // not be worthwhile.
5355 EVT ScalarVT = VecVT.getScalarType();
5356 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5357}
5358
5359bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5360 bool) const {
5361 // TODO: Allow vectors?
5362 if (VT.isVector())
5363 return false;
5364 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5365}
5366
5367bool X86TargetLowering::isCheapToSpeculateCttz() const {
5368 // Speculate cttz only if we can directly use TZCNT.
5369 return Subtarget.hasBMI();
5370}
5371
5372bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5373 // Speculate ctlz only if we can directly use LZCNT.
5374 return Subtarget.hasLZCNT();
5375}
5376
5377bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5378 const SelectionDAG &DAG,
5379 const MachineMemOperand &MMO) const {
5380 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5381 BitcastVT.getVectorElementType() == MVT::i1)
5382 return false;
5383
5384 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5385 return false;
5386
5387 // If both types are legal vectors, it's always ok to convert them.
5388 if (LoadVT.isVector() && BitcastVT.isVector() &&
5389 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5390 return true;
5391
5392 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5393}
5394
5395bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5396 const SelectionDAG &DAG) const {
5397 // Do not merge to float value size (128 bytes) if no implicit
5398 // float attribute is set.
5399 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5400 Attribute::NoImplicitFloat);
5401
5402 if (NoFloat) {
5403 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5404 return (MemVT.getSizeInBits() <= MaxIntSize);
5405 }
5406 // Make sure we don't merge greater than our preferred vector
5407 // width.
5408 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5409 return false;
5410
5411 return true;
5412}
5413
5414bool X86TargetLowering::isCtlzFast() const {
5415 return Subtarget.hasFastLZCNT();
5416}
5417
5418bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5419 const Instruction &AndI) const {
5420 return true;
5421}
5422
5423bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5424 EVT VT = Y.getValueType();
5425
5426 if (VT.isVector())
5427 return false;
5428
5429 if (!Subtarget.hasBMI())
5430 return false;
5431
5432 // There are only 32-bit and 64-bit forms for 'andn'.
5433 if (VT != MVT::i32 && VT != MVT::i64)
5434 return false;
5435
5436 return !isa<ConstantSDNode>(Y);
5437}
5438
5439bool X86TargetLowering::hasAndNot(SDValue Y) const {
5440 EVT VT = Y.getValueType();
5441
5442 if (!VT.isVector())
5443 return hasAndNotCompare(Y);
5444
5445 // Vector.
5446
5447 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5448 return false;
5449
5450 if (VT == MVT::v4i32)
5451 return true;
5452
5453 return Subtarget.hasSSE2();
5454}
5455
5456bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5457 return X.getValueType().isScalarInteger(); // 'bt'
5458}
5459
5460bool X86TargetLowering::
5461 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5462 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5463 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5464 SelectionDAG &DAG) const {
5465 // Does baseline recommend not to perform the fold by default?
5466 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5467 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5468 return false;
5469 // For scalars this transform is always beneficial.
5470 if (X.getValueType().isScalarInteger())
5471 return true;
5472 // If all the shift amounts are identical, then transform is beneficial even
5473 // with rudimentary SSE2 shifts.
5474 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5475 return true;
5476 // If we have AVX2 with it's powerful shift operations, then it's also good.
5477 if (Subtarget.hasAVX2())
5478 return true;
5479 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5480 return NewShiftOpcode == ISD::SHL;
5481}
5482
5483bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5484 const SDNode *N, CombineLevel Level) const {
5485 assert(((N->getOpcode() == ISD::SHL &&((void)0)
5486 N->getOperand(0).getOpcode() == ISD::SRL) ||((void)0)
5487 (N->getOpcode() == ISD::SRL &&((void)0)
5488 N->getOperand(0).getOpcode() == ISD::SHL)) &&((void)0)
5489 "Expected shift-shift mask")((void)0);
5490 EVT VT = N->getValueType(0);
5491 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5492 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5493 // Only fold if the shift values are equal - so it folds to AND.
5494 // TODO - we should fold if either is a non-uniform vector but we don't do
5495 // the fold for non-splats yet.
5496 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5497 }
5498 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5499}
5500
5501bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5502 EVT VT = Y.getValueType();
5503
5504 // For vectors, we don't have a preference, but we probably want a mask.
5505 if (VT.isVector())
5506 return false;
5507
5508 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5509 if (VT == MVT::i64 && !Subtarget.is64Bit())
5510 return false;
5511
5512 return true;
5513}
5514
5515bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5516 SDNode *N) const {
5517 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5518 !Subtarget.isOSWindows())
5519 return false;
5520 return true;
5521}
5522
5523bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5524 // Any legal vector type can be splatted more efficiently than
5525 // loading/spilling from memory.
5526 return isTypeLegal(VT);
5527}
5528
5529MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5530 MVT VT = MVT::getIntegerVT(NumBits);
5531 if (isTypeLegal(VT))
5532 return VT;
5533
5534 // PMOVMSKB can handle this.
5535 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5536 return MVT::v16i8;
5537
5538 // VPMOVMSKB can handle this.
5539 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5540 return MVT::v32i8;
5541
5542 // TODO: Allow 64-bit type for 32-bit target.
5543 // TODO: 512-bit types should be allowed, but make sure that those
5544 // cases are handled in combineVectorSizedSetCCEquality().
5545
5546 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5547}
5548
5549/// Val is the undef sentinel value or equal to the specified value.
5550static bool isUndefOrEqual(int Val, int CmpVal) {
5551 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5552}
5553
5554/// Return true if every element in Mask is the undef sentinel value or equal to
5555/// the specified value..
5556static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5557 return llvm::all_of(Mask, [CmpVal](int M) {
5558 return (M == SM_SentinelUndef) || (M == CmpVal);
5559 });
5560}
5561
5562/// Val is either the undef or zero sentinel value.
5563static bool isUndefOrZero(int Val) {
5564 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5565}
5566
5567/// Return true if every element in Mask, beginning from position Pos and ending
5568/// in Pos+Size is the undef sentinel value.
5569static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5570 return llvm::all_of(Mask.slice(Pos, Size),
5571 [](int M) { return M == SM_SentinelUndef; });
5572}
5573
5574/// Return true if the mask creates a vector whose lower half is undefined.
5575static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5576 unsigned NumElts = Mask.size();
5577 return isUndefInRange(Mask, 0, NumElts / 2);
5578}
5579
5580/// Return true if the mask creates a vector whose upper half is undefined.
5581static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5582 unsigned NumElts = Mask.size();
5583 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5584}
5585
/// Return true if Val falls within the half-open range [Low, Hi).
/// (The previous comment claimed the range was (L, H]; the implementation is
/// and always was inclusive of Low and exclusive of Hi.)
static bool isInRange(int Val, int Low, int Hi) {
  return Low <= Val && Val < Hi;
}
5590
5591/// Return true if the value of any element in Mask falls within the specified
5592/// range (L, H].
5593static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5594 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5595}
5596
5597/// Return true if the value of any element in Mask is the zero sentinel value.
5598static bool isAnyZero(ArrayRef<int> Mask) {
5599 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5600}
5601
5602/// Return true if the value of any element in Mask is the zero or undef
5603/// sentinel values.
5604static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5605 return llvm::any_of(Mask, [](int M) {
5606 return M == SM_SentinelZero || M == SM_SentinelUndef;
5607 });
5608}
5609
5610/// Return true if Val is undef or if its value falls within the
5611/// specified range (L, H].
5612static bool isUndefOrInRange(int Val, int Low, int Hi) {
5613 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5614}
5615
5616/// Return true if every element in Mask is undef or if its value
5617/// falls within the specified range (L, H].
5618static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5619 return llvm::all_of(
5620 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5621}
5622
5623/// Return true if Val is undef, zero or if its value falls within the
5624/// specified range (L, H].
5625static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5626 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5627}
5628
5629/// Return true if every element in Mask is undef, zero or if its value
5630/// falls within the specified range (L, H].
5631static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5632 return llvm::all_of(
5633 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5634}
5635
5636/// Return true if every element in Mask, beginning
5637/// from position Pos and ending in Pos + Size, falls within the specified
5638/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5639static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5640 unsigned Size, int Low, int Step = 1) {
5641 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5642 if (!isUndefOrEqual(Mask[i], Low))
5643 return false;
5644 return true;
5645}
5646
5647/// Return true if every element in Mask, beginning
5648/// from position Pos and ending in Pos+Size, falls within the specified
5649/// sequential range (Low, Low+Size], or is undef or is zero.
5650static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5651 unsigned Size, int Low,
5652 int Step = 1) {
5653 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5654 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5655 return false;
5656 return true;
5657}
5658
5659/// Return true if every element in Mask, beginning
5660/// from position Pos and ending in Pos+Size is undef or is zero.
5661static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5662 unsigned Size) {
5663 return llvm::all_of(Mask.slice(Pos, Size),
5664 [](int M) { return isUndefOrZero(M); });
5665}
5666
/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  // Walk the mask in adjacent pairs; each pair must collapse to one wide
  // mask value, otherwise widening is impossible.
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, its trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      // A zero lane paired with a real element cannot be represented wide.
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
5725
5726static bool canWidenShuffleElements(ArrayRef<int> Mask,
5727 const APInt &Zeroable,
5728 bool V2IsZero,
5729 SmallVectorImpl<int> &WidenedMask) {
5730 // Create an alternative mask with info about zeroable elements.
5731 // Here we do not set undef elements as zeroable.
5732 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5733 if (V2IsZero) {
5734 assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!")((void)0);
5735 for (int i = 0, Size = Mask.size(); i != Size; ++i)
5736 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5737 ZeroableMask[i] = SM_SentinelZero;
5738 }
5739 return canWidenShuffleElements(ZeroableMask, WidenedMask);
5740}
5741
5742static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5743 SmallVector<int, 32> WidenedMask;
5744 return canWidenShuffleElements(Mask, WidenedMask);
5745}
5746
// Attempt to narrow/widen shuffle mask until it matches the target number of
// elements. Returns true and fills ScaledMask on success; the element counts
// must be integer multiples of each other in one direction.
static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
                                 SmallVectorImpl<int> &ScaledMask) {
  unsigned NumSrcElts = Mask.size();
  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
         "Illegal shuffle scale factor");

  // Narrowing is guaranteed to work.
  if (NumDstElts >= NumSrcElts) {
    int Scale = NumDstElts / NumSrcElts;
    llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
    return true;
  }

  // We have to repeat the widening until we reach the target size, but we can
  // split out the first widening as it sets up ScaledMask for us.
  if (canWidenShuffleElements(Mask, ScaledMask)) {
    // Each successful widening halves the element count; widening may fail
    // at any intermediate step, in which case the whole scale fails.
    while (ScaledMask.size() > NumDstElts) {
      SmallVector<int, 16> WidenedMask;
      if (!canWidenShuffleElements(ScaledMask, WidenedMask))
        return false;
      ScaledMask = std::move(WidenedMask);
    }
    return true;
  }

  return false;
}
5776
5777/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5778bool X86::isZeroNode(SDValue Elt) {
5779 return isNullConstant(Elt) || isNullFPConstant(Elt);
5780}
5781
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {

  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  // When i64 is not legal (32-bit mode), build the vector as twice as many
  // i32 elements and bitcast back at the end.
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    // Negative values only mean undef when building a mask vector.
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
                     DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      // High 32 bits of the split element are zero (assumes the values fit
      // in 32 bits — TODO confirm; callers pass small mask constants).
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}
5814
/// Build a constant vector of type VT from raw bit patterns in Bits; lanes
/// whose bit in Undefs is set become UNDEF. 64-bit integer elements are split
/// into i32 pairs when i64 is not legal.
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  // Without legal i64 (32-bit mode), build as 2x i32 and bitcast back.
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      // One UNDEF per lane, or two when each lane is split into i32 halves.
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      // Low word first, then high word.
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      // Reinterpret the raw bits as an IEEE single.
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      // Reinterpret the raw bits as an IEEE double.
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
5855
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    // Pre-SSE2: only FP vectors are available, so zero via v4f32.
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.isFloatingPoint()) {
    Vec = DAG.getConstantFP(+0.0, dl, VT);
  } else if (VT.getVectorElementType() == MVT::i1) {
    // Mask (vXi1) vectors are zeroed directly in their own type.
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    // Canonical integer zero: an all-zero <N x i32> of the same total width.
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  // Bitcast back to the requested type (no-op when types already match).
  return DAG.getBitcast(VT, Vec);
}
5881
/// Extract a vectorWidth-bit chunk of Vec containing element IdxVal; the
/// index is rounded down to a chunk boundary.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  // Result keeps the element type but holds 1/Factor of the elements.
  unsigned Factor = VT.getSizeInBits() / vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements() / Factor);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(ResultVT, dl,
                              Vec->ops().slice(IdxVal, ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
5906
5907/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5908/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5909/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5910/// instructions or a simple subregister reference. Idx is an index in the
5911/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5912/// lowering EXTRACT_VECTOR_ELT operations easier.
5913static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5914 SelectionDAG &DAG, const SDLoc &dl) {
5915 assert((Vec.getValueType().is256BitVector() ||((void)0)
5916 Vec.getValueType().is512BitVector()) && "Unexpected vector size!")((void)0);
5917 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5918}
5919
5920/// Generate a DAG to grab 256-bits from a 512-bit vector.
5921static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5922 SelectionDAG &DAG, const SDLoc &dl) {
5923 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!")((void)0);
5924 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5925}
5926
/// Insert Vec into Result at the vectorWidth-bit chunk containing element
/// IdxVal; the index is rounded down to a chunk boundary.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result
  if (Vec.isUndef())
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
5950
5951/// Generate a DAG to put 128-bits into a vector > 128 bits. This
5952/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5953/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5954/// simple superregister reference. Idx is an index in the 128 bits
5955/// we want. It need not be aligned to a 128-bit boundary. That makes
5956/// lowering INSERT_VECTOR_ELT operations easier.
5957static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5958 SelectionDAG &DAG, const SDLoc &dl) {
5959 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!")((void)0);
5960 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5961}
5962
5963/// Widen a vector to a larger size with the same scalar type, with the new
5964/// elements either zero or undef.
5965static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5966 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5967 const SDLoc &dl) {
5968 assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&((void)0)
5969 Vec.getValueType().getScalarType() == VT.getScalarType() &&((void)0)
5970 "Unsupported vector widening type")((void)0);
5971 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5972 : DAG.getUNDEF(VT);
5973 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5974 DAG.getIntPtrConstant(0, dl));
5975}
5976
5977/// Widen a vector to a larger size with the same scalar type, with the new
5978/// elements either zero or undef.
5979static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5980 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5981 const SDLoc &dl, unsigned WideSizeInBits) {
5982 assert(Vec.getValueSizeInBits() < WideSizeInBits &&((void)0)
5983 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&((void)0)
5984 "Unsupported vector widening type")((void)0);
5985 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5986 MVT SVT = Vec.getSimpleValueType().getScalarType();
5987 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5988 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5989}
5990
// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
// Returns true (and fills Ops) only for the exact patterns below; any other
// node shape returns false and leaves Ops empty.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
  assert(Ops.empty() && "Expected an empty ops vector");

  // Trivial case: the node is already a concatenation.
  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
    Ops.append(N->op_begin(), N->op_end());
    return true;
  }

  if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
    SDValue Src = N->getOperand(0);
    SDValue Sub = N->getOperand(1);
    const APInt &Idx = N->getConstantOperandAPInt(2);
    EVT VT = Src.getValueType();
    EVT SubVT = Sub.getValueType();

    // TODO - Handle more general insert_subvector chains.
    // Only the two-halves case is recognized: the subvector is exactly half
    // the result width and is inserted into the upper half.
    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
        Idx == (VT.getVectorNumElements() / 2)) {
      // insert_subvector(insert_subvector(undef, x, lo), y, hi)
      // -> concat(x, y). Note: the inner source is not required to be undef
      // here; only the inner insert at index 0 with a matching subvector type
      // is checked, since those low lanes are fully overwritten by x.
      if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
          Src.getOperand(1).getValueType() == SubVT &&
          isNullConstant(Src.getOperand(2))) {
        Ops.push_back(Src.getOperand(1));
        Ops.push_back(Sub);
        return true;
      }
      // insert_subvector(x, extract_subvector(x, lo), hi)
      // -> concat(lo-half-of-x, lo-half-of-x), i.e. a splat of the low half.
      if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
        Ops.append(2, Sub);
        return true;
      }
    }
  }

  return false;
}
6031
6032static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6033 const SDLoc &dl) {
6034 EVT VT = Op.getValueType();
6035 unsigned NumElems = VT.getVectorNumElements();
6036 unsigned SizeInBits = VT.getSizeInBits();
6037 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&((void)0)
6038 "Can't split odd sized vector")((void)0);
6039
6040 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6041 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6042 return std::make_pair(Lo, Hi);
6043}
6044
6045// Split an unary integer op into 2 half sized ops.
6046static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6047 EVT VT = Op.getValueType();
6048
6049 // Make sure we only try to split 256/512-bit types to avoid creating
6050 // narrow vectors.
6051 assert((Op.getOperand(0).getValueType().is256BitVector() ||((void)0)
6052 Op.getOperand(0).getValueType().is512BitVector()) &&((void)0)
6053 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!")((void)0);
6054 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==((void)0)
6055 VT.getVectorNumElements() &&((void)0)
6056 "Unexpected VTs!")((void)0);
6057
6058 SDLoc dl(Op);
6059
6060 // Extract the Lo/Hi vectors
6061 SDValue Lo, Hi;
6062 std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
6063
6064 EVT LoVT, HiVT;
6065 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6066 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6067 DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
6068 DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
6069}
6070
6071/// Break a binary integer operation into 2 half sized ops and then
6072/// concatenate the result back.
6073static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6074 EVT VT = Op.getValueType();
6075
6076 // Sanity check that all the types match.
6077 assert(Op.getOperand(0).getValueType() == VT &&((void)0)
6078 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!")((void)0);
6079 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!")((void)0);
6080
6081 SDLoc dl(Op);
6082
6083 // Extract the LHS Lo/Hi vectors
6084 SDValue LHS1, LHS2;
6085 std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
6086
6087 // Extract the RHS Lo/Hi vectors
6088 SDValue RHS1, RHS2;
6089 std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
6090
6091 EVT LoVT, HiVT;
6092 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6093 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6094 DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
6095 DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
6096}
6097
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
// CheckBWI selects which AVX-512 feature gates 512-bit operation: BWI for
// byte/word ops (default), plain AVX512F otherwise.
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
                         F Builder, bool CheckBWI = true) {
  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
  // Determine the widest legal chunk (512/256/128 bits) and how many chunks
  // VT must be split into. NumSubs == 1 means no splitting is required.
  unsigned NumSubs = 1;
  if ((CheckBWI && Subtarget.useBWIRegs()) ||
      (!CheckBWI && Subtarget.useAVX512Regs())) {
    if (VT.getSizeInBits() > 512) {
      NumSubs = VT.getSizeInBits() / 512;
      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
    }
  } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256) {
      NumSubs = VT.getSizeInBits() / 256;
      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
    }
  } else {
    if (VT.getSizeInBits() > 128) {
      NumSubs = VT.getSizeInBits() / 128;
      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
    }
  }

  // Fast path: the whole operation fits in one legal register.
  if (NumSubs == 1)
    return Builder(DAG, DL, Ops);

  // Otherwise extract the i'th slice of every operand, build each slice
  // independently, and concatenate the per-slice results.
  SmallVector<SDValue, 4> Subs;
  for (unsigned i = 0; i != NumSubs; ++i) {
    SmallVector<SDValue, 2> SubOps;
    for (SDValue Op : Ops) {
      // Each operand is sliced proportionally to its own width, so operands
      // of different types than VT still split into NumSubs pieces.
      EVT OpVT = Op.getValueType();
      unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
      unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
      SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
    }
    Subs.push_back(Builder(DAG, DL, SubOps));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
}
6145
/// Insert i1-subvector to i1-vector.
/// Lowers INSERT_SUBVECTOR on AVX-512 mask (vXi1) types. The general
/// strategy is to widen to a natively kshift-able width, isolate the bits
/// around the insertion point with KSHIFTL/KSHIFTR pairs, OR the pieces
/// together, then narrow back to the original type.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  unsigned IdxVal = Op.getConstantOperandVal(2);

  // Inserting undef is a nop. We can just return the original vector.
  if (SubVec.isUndef())
    return Vec;

  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;

  MVT OpVT = Op.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();
  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

  // Extend to natively supported kshift.
  // Without DQI the narrowest kshift-able mask is v16i1; with DQI, v8i1.
  MVT WideOpVT = OpVT;
  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
  // if necessary.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // May need to promote to a legal type.
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                     DAG.getConstant(0, dl, WideOpVT),
                     SubVec, Idx);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  SDValue Undef = DAG.getUNDEF(WideOpVT);

  if (IdxVal == 0) {
    // Zero lower bits of the Vec
    // (shift right then left by SubVecNumElems clears the low lanes).
    SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
                      ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                         DAG.getConstant(0, dl, WideOpVT),
                         SubVec, ZeroIdx);
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Widen the subvector into the low lanes of an otherwise-undef wide vector.
  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                       Undef, SubVec, ZeroIdx);

  if (Vec.isUndef()) {
    assert(IdxVal != 0 && "Unexpected index");
    // Only the subvector's bits matter; shift them to their position.
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    assert(IdxVal != 0 && "Unexpected index");
    // Shift left to the top then right to the final slot: this zero-extends
    // the subvector on both sides, matching insertion into a zero vector.
    NumElems = WideOpVT.getVectorNumElements();
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
    if (ShiftRight != 0)
      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                           DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  // Simple case when we put subvector in the upper part
  if (IdxVal + SubVecNumElems == NumElems) {
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    if (SubVecNumElems * 2 == NumElems) {
      // Special case, use legal zero extending insert_subvector. This allows
      // isel to optimize when bits are known zero.
      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        DAG.getConstant(0, dl, WideOpVT),
                        Vec, ZeroIdx);
    } else {
      // Otherwise use explicit shifts to zero the bits.
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        Undef, Vec, ZeroIdx);
      NumElems = WideOpVT.getVectorNumElements();
      SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    }
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Inserting into the middle is more complicated.

  NumElems = WideOpVT.getVectorNumElements();

  // Widen the vector if needed.
  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);

  unsigned ShiftLeft = NumElems - SubVecNumElems;
  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

  // Do an optimization for the most frequently used types.
  // (v64i1 on 32-bit targets can't materialize the i64 AND mask below.)
  if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
    // Clear the destination lanes with an AND mask, position the subvector
    // with a shift pair, then OR the two together.
    APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
    Mask0.flipAllBits();
    SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
    SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
    Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
    SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

    // Reduce to original width if needed.
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Clear the upper bits of the subvector and move it to its insert position.
  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));

  // Isolate the bits below the insertion point.
  unsigned LowShift = NumElems - IdxVal;
  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
                    DAG.getTargetConstant(LowShift, dl, MVT::i8));

  // Isolate the bits after the last inserted bit.
  unsigned HighShift = IdxVal + SubVecNumElems;
  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
                     DAG.getTargetConstant(HighShift, dl, MVT::i8));

  // Now OR all 3 pieces together.
  Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
  SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);

  // Reduce to original width if needed.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
6306
6307static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6308 const SDLoc &dl) {
6309 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch")((void)0);
6310 EVT SubVT = V1.getValueType();
6311 EVT SubSVT = SubVT.getScalarType();
6312 unsigned SubNumElts = SubVT.getVectorNumElements();
6313 unsigned SubVectorWidth = SubVT.getSizeInBits();
6314 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6315 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6316 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6317}
6318
6319/// Returns a vector of specified type with all bits set.
6320/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6321/// Then bitcast to their original type, ensuring they get CSE'd.
6322static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6323 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&((void)0)
6324 "Expected a 128/256/512-bit vector type")((void)0);
6325
6326 APInt Ones = APInt::getAllOnesValue(32);
6327 unsigned NumElts = VT.getSizeInBits() / 32;
6328 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6329 return DAG.getBitcast(VT, Vec);
6330}
6331
6332// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6333static unsigned getOpcode_EXTEND(unsigned Opcode) {
6334 switch (Opcode) {
6335 case ISD::ANY_EXTEND:
6336 case ISD::ANY_EXTEND_VECTOR_INREG:
6337 return ISD::ANY_EXTEND;
6338 case ISD::ZERO_EXTEND:
6339 case ISD::ZERO_EXTEND_VECTOR_INREG:
6340 return ISD::ZERO_EXTEND;
6341 case ISD::SIGN_EXTEND:
6342 case ISD::SIGN_EXTEND_VECTOR_INREG:
6343 return ISD::SIGN_EXTEND;
6344 }
6345 llvm_unreachable("Unknown opcode")__builtin_unreachable();
6346}
6347
6348// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6349static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6350 switch (Opcode) {
6351 case ISD::ANY_EXTEND:
6352 case ISD::ANY_EXTEND_VECTOR_INREG:
6353 return ISD::ANY_EXTEND_VECTOR_INREG;
6354 case ISD::ZERO_EXTEND:
6355 case ISD::ZERO_EXTEND_VECTOR_INREG:
6356 return ISD::ZERO_EXTEND_VECTOR_INREG;
6357 case ISD::SIGN_EXTEND:
6358 case ISD::SIGN_EXTEND_VECTOR_INREG:
6359 return ISD::SIGN_EXTEND_VECTOR_INREG;
6360 }
6361 llvm_unreachable("Unknown opcode")__builtin_unreachable();
6362}
6363
/// Build an extension node from In to VT, choosing between the plain
/// *_EXTEND opcode and its *_EXTEND_VECTOR_INREG variant depending on
/// whether the (possibly narrowed) input has the same element count as VT.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
                                      SDValue In, SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
          ISD::ZERO_EXTEND == Opcode) &&
         "Unknown extension opcode");

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (InVT.getSizeInBits() > 128) {
    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
           "Expected VTs to be the same size!");
    // Keep just enough low input bits to produce all of VT's elements,
    // but never go below a 128-bit subvector.
    unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    In = extractSubVector(In, 0, DAG, DL,
                          std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
    InVT = In.getValueType();
  }

  // If the input now has more elements than the result, only the low
  // elements are extended - that is exactly the *_VECTOR_INREG semantic.
  if (VT.getVectorNumElements() != InVT.getVectorNumElements())
    Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);

  return DAG.getNode(Opcode, DL, VT, In);
}
6388
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
// Returns the un-inverted value on a match, or an empty SDValue otherwise.
// NOTE(review): the static analyzer reports a potential null SDNode deref
// (SelectionDAGNodes.h:1122) on a path through this matcher; V is produced
// by peekThroughBitcasts and appears non-null for non-null inputs - looks
// like a false positive, but confirm callers never pass an empty SDValue.
static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
  V = peekThroughBitcasts(V);
  // Direct NOT: xor with an all-ones build vector.
  if (V.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
    return V.getOperand(0);
  // Look through extract_subvector: recurse on the source and re-extract.
  // Restricted to index 0 or single-use sources to avoid duplicating work.
  if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
    if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
      Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
                         Not, V.getOperand(1));
    }
  }
  // Look through concatenations: every piece must itself be a NOT.
  SmallVector<SDValue, 2> CatOps;
  if (collectConcatOps(V.getNode(), CatOps)) {
    for (SDValue &CatOp : CatOps) {
      SDValue NotCat = IsNOT(CatOp, DAG);
      if (!NotCat) return SDValue();
      CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
  }
  return SDValue();
}
6416
6417void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6418 bool Lo, bool Unary) {
6419 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&((void)0)
6420 "Illegal vector type to unpack")((void)0);
6421 assert(Mask.empty() && "Expected an empty shuffle mask vector")((void)0);
6422 int NumElts = VT.getVectorNumElements();
6423 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6424 for (int i = 0; i < NumElts; ++i) {
6425 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6426 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6427 Pos += (Unary ? 0 : NumElts * (i % 2));
6428 Pos += (Lo ? 0 : NumEltsInLane / 2);
6429 Mask.push_back(Pos);
6430 }
6431}
6432
6433/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6434/// imposed by AVX and specific to the unary pattern. Example:
6435/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6436/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6437void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6438 bool Lo) {
6439 assert(Mask.empty() && "Expected an empty shuffle mask vector")((void)0);
6440 int NumElts = VT.getVectorNumElements();
6441 for (int i = 0; i < NumElts; ++i) {
6442 int Pos = i / 2;
6443 Pos += (Lo ? 0 : NumElts / 2);
6444 Mask.push_back(Pos);
6445 }
6446}
6447
6448/// Returns a vector_shuffle node for an unpackl operation.
6449static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6450 SDValue V1, SDValue V2) {
6451 SmallVector<int, 8> Mask;
6452 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6453 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6454}
6455
6456/// Returns a vector_shuffle node for an unpackh operation.
6457static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6458 SDValue V1, SDValue V2) {
6459 SmallVector<int, 8> Mask;
6460 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6461 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6462}
6463
6464/// Return a vector_shuffle of the specified vector of zero or undef vector.
6465/// This produces a shuffle where the low element of V2 is swizzled into the
6466/// zero/undef vector, landing at element Idx.
6467/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6468static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6469 bool IsZero,
6470 const X86Subtarget &Subtarget,
6471 SelectionDAG &DAG) {
6472 MVT VT = V2.getSimpleValueType();
6473 SDValue V1 = IsZero
6474 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6475 int NumElems = VT.getVectorNumElements();
6476 SmallVector<int, 16> MaskVec(NumElems);
6477 for (int i = 0; i != NumElems; ++i)
6478 // If this is the insertion idx, put the low elt of V2 here.
6479 MaskVec[i] = (i == Idx) ? NumElems : i;
6480 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6481}
6482
6483static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6484 if (Ptr.getOpcode() == X86ISD::Wrapper ||
6485 Ptr.getOpcode() == X86ISD::WrapperRIP)
6486 Ptr = Ptr.getOperand(0);
6487
6488 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6489 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6490 return nullptr;
6491
6492 return CNode->getConstVal();
6493}
6494
6495static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6496 if (!Load || !ISD::isNormalLoad(Load))
6497 return nullptr;
6498 return getTargetConstantFromBasePtr(Load->getBasePtr());
6499}
6500
6501static const Constant *getTargetConstantFromNode(SDValue Op) {
6502 Op = peekThroughBitcasts(Op);
6503 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6504}
6505
/// TargetLowering hook: return the IR Constant backing \p LD when it is a
/// normal load directly from the constant pool (zero offset, no machine
/// constant-pool entry), or nullptr if no such constant can be recovered.
const Constant *
X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
  assert(LD && "Unexpected null LoadSDNode");
  return getTargetConstantFromNode(LD);
}
6511
6512// Extract raw constant bits from constant pools.
6513static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6514 APInt &UndefElts,
6515 SmallVectorImpl<APInt> &EltBits,
6516 bool AllowWholeUndefs = true,
6517 bool AllowPartialUndefs = true) {
6518 assert(EltBits.empty() && "Expected an empty EltBits vector")((void)0);
6519
6520 Op = peekThroughBitcasts(Op);
6521
6522 EVT VT = Op.getValueType();
6523 unsigned SizeInBits = VT.getSizeInBits();
6524 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!")((void)0);
6525 unsigned NumElts = SizeInBits / EltSizeInBits;
6526
6527 // Bitcast a source array of element bits to the target size.
6528 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6529 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6530 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6531 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&((void)0)
6532 "Constant bit sizes don't match")((void)0);
6533
6534 // Don't split if we don't allow undef bits.
6535 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6536 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6537 return false;
6538
6539 // If we're already the right size, don't bother bitcasting.
6540 if (NumSrcElts == NumElts) {
6541 UndefElts = UndefSrcElts;
6542 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6543 return true;
6544 }
6545
6546 // Extract all the undef/constant element data and pack into single bitsets.
6547 APInt UndefBits(SizeInBits, 0);
6548 APInt MaskBits(SizeInBits, 0);
6549
6550 for (unsigned i = 0; i != NumSrcElts; ++i) {
6551 unsigned BitOffset = i * SrcEltSizeInBits;
6552 if (UndefSrcElts[i])
6553 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6554 MaskBits.insertBits(SrcEltBits[i], BitOffset);
6555 }
6556
6557 // Split the undef/constant single bitset data into the target elements.
6558 UndefElts = APInt(NumElts, 0);
6559 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6560
6561 for (unsigned i = 0; i != NumElts; ++i) {
6562 unsigned BitOffset = i * EltSizeInBits;
6563 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6564
6565 // Only treat an element as UNDEF if all bits are UNDEF.
6566 if (UndefEltBits.isAllOnesValue()) {
6567 if (!AllowWholeUndefs)
6568 return false;
6569 UndefElts.setBit(i);
6570 continue;
6571 }
6572
6573 // If only some bits are UNDEF then treat them as zero (or bail if not
6574 // supported).
6575 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6576 return false;
6577
6578 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6579 }
6580 return true;
6581 };
6582
6583 // Collect constant bits and insert into mask/undef bit masks.
6584 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6585 unsigned UndefBitIndex) {
6586 if (!Cst)
6587 return false;
6588 if (isa<UndefValue>(Cst)) {
6589 Undefs.setBit(UndefBitIndex);
6590 return true;
6591 }
6592 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6593 Mask = CInt->getValue();
6594 return true;
6595 }
6596 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6597 Mask = CFP->getValueAPF().bitcastToAPInt();
6598 return true;
6599 }
6600 return false;
6601 };
6602
6603 // Handle UNDEFs.
6604 if (Op.isUndef()) {
6605 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6606 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6607 return CastBitData(UndefSrcElts, SrcEltBits);
6608 }
6609
6610 // Extract scalar constant bits.
6611 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6612 APInt UndefSrcElts = APInt::getNullValue(1);
6613 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6614 return CastBitData(UndefSrcElts, SrcEltBits);
6615 }
6616 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6617 APInt UndefSrcElts = APInt::getNullValue(1);
6618 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6619 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6620 return CastBitData(UndefSrcElts, SrcEltBits);
6621 }
6622
6623 // Extract constant bits from build vector.
6624 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6625 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6626 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6627
6628 APInt UndefSrcElts(NumSrcElts, 0);
6629 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6630 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6631 const SDValue &Src = Op.getOperand(i);
6632 if (Src.isUndef()) {
6633 UndefSrcElts.setBit(i);
6634 continue;
6635 }
6636 auto *Cst = cast<ConstantSDNode>(Src);
6637 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6638 }
6639 return CastBitData(UndefSrcElts, SrcEltBits);
6640 }
6641 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6642 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6643 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6644
6645 APInt UndefSrcElts(NumSrcElts, 0);
6646 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6647 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6648 const SDValue &Src = Op.getOperand(i);
6649 if (Src.isUndef()) {
6650 UndefSrcElts.setBit(i);
6651 continue;
6652 }
6653 auto *Cst = cast<ConstantFPSDNode>(Src);
6654 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6655 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6656 }
6657 return CastBitData(UndefSrcElts, SrcEltBits);
6658 }
6659
6660 // Extract constant bits from constant pool vector.
6661 if (auto *Cst = getTargetConstantFromNode(Op)) {
6662 Type *CstTy = Cst->getType();
6663 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6664 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6665 return false;
6666
6667 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6668 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6669
6670 APInt UndefSrcElts(NumSrcElts, 0);
6671 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6672 for (unsigned i = 0; i != NumSrcElts; ++i)
6673 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6674 UndefSrcElts, i))
6675 return false;
6676
6677 return CastBitData(UndefSrcElts, SrcEltBits);
6678 }
6679
6680 // Extract constant bits from a broadcasted constant pool scalar.
6681 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6682 EltSizeInBits <= VT.getScalarSizeInBits()) {
6683 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6684 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6685 return false;
6686
6687 SDValue Ptr = MemIntr->getBasePtr();
6688 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
6689 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6690 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6691
6692 APInt UndefSrcElts(NumSrcElts, 0);
6693 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6694 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6695 if (UndefSrcElts[0])
6696 UndefSrcElts.setBits(0, NumSrcElts);
6697 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6698 return CastBitData(UndefSrcElts, SrcEltBits);
6699 }
6700 }
6701 }
6702
6703 // Extract constant bits from a subvector broadcast.
6704 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
6705 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6706 SDValue Ptr = MemIntr->getBasePtr();
6707 // The source constant may be larger than the subvector broadcast,
6708 // ensure we extract the correct subvector constants.
6709 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
6710 Type *CstTy = Cst->getType();
6711 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6712 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
6713 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
6714 (SizeInBits % SubVecSizeInBits) != 0)
6715 return false;
6716 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
6717 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
6718 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
6719 APInt UndefSubElts(NumSubElts, 0);
6720 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
6721 APInt(CstEltSizeInBits, 0));
6722 for (unsigned i = 0; i != NumSubElts; ++i) {
6723 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
6724 UndefSubElts, i))
6725 return false;
6726 for (unsigned j = 1; j != NumSubVecs; ++j)
6727 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
6728 }
6729 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
6730 UndefSubElts);
6731 return CastBitData(UndefSubElts, SubEltBits);
6732 }
6733 }
6734
6735 // Extract a rematerialized scalar constant insertion.
6736 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6737 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6738 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6739 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6740 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6741
6742 APInt UndefSrcElts(NumSrcElts, 0);
6743 SmallVector<APInt, 64> SrcEltBits;
6744 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6745 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6746 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6747 return CastBitData(UndefSrcElts, SrcEltBits);
6748 }
6749
6750 // Insert constant bits from a base and sub vector sources.
6751 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
6752 // If bitcasts to larger elements we might lose track of undefs - don't
6753 // allow any to be safe.
6754 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6755 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
6756
6757 APInt UndefSrcElts, UndefSubElts;
6758 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
6759 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
6760 UndefSubElts, EltSubBits,
6761 AllowWholeUndefs && AllowUndefs,
6762 AllowPartialUndefs && AllowUndefs) &&
6763 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
6764 UndefSrcElts, EltSrcBits,
6765 AllowWholeUndefs && AllowUndefs,
6766 AllowPartialUndefs && AllowUndefs)) {
6767 unsigned BaseIdx = Op.getConstantOperandVal(2);
6768 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
6769 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6770 EltSrcBits[BaseIdx + i] = EltSubBits[i];
6771 return CastBitData(UndefSrcElts, EltSrcBits);
6772 }
6773 }
6774
6775 // Extract constant bits from a subvector's source.
6776 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6777 // TODO - support extract_subvector through bitcasts.
6778 if (EltSizeInBits != VT.getScalarSizeInBits())
6779 return false;
6780
6781 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6782 UndefElts, EltBits, AllowWholeUndefs,
6783 AllowPartialUndefs)) {
6784 EVT SrcVT = Op.getOperand(0).getValueType();
6785 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6786 unsigned NumSubElts = VT.getVectorNumElements();
6787 unsigned BaseIdx = Op.getConstantOperandVal(1);
6788 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6789 if ((BaseIdx + NumSubElts) != NumSrcElts)
6790 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6791 if (BaseIdx != 0)
6792 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6793 return true;
6794 }
6795 }
6796
6797 // Extract constant bits from shuffle node sources.
6798 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6799 // TODO - support shuffle through bitcasts.
6800 if (EltSizeInBits != VT.getScalarSizeInBits())
6801 return false;
6802
6803 ArrayRef<int> Mask = SVN->getMask();
6804 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6805 llvm::any_of(Mask, [](int M) { return M < 0; }))
6806 return false;
6807
6808 APInt UndefElts0, UndefElts1;
6809 SmallVector<APInt, 32> EltBits0, EltBits1;
6810 if (isAnyInRange(Mask, 0, NumElts) &&
6811 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6812 UndefElts0, EltBits0, AllowWholeUndefs,
6813 AllowPartialUndefs))
6814 return false;
6815 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6816 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6817 UndefElts1, EltBits1, AllowWholeUndefs,
6818 AllowPartialUndefs))
6819 return false;
6820
6821 UndefElts = APInt::getNullValue(NumElts);
6822 for (int i = 0; i != (int)NumElts; ++i) {
6823 int M = Mask[i];
6824 if (M < 0) {
6825 UndefElts.setBit(i);
6826 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6827 } else if (M < (int)NumElts) {
6828 if (UndefElts0[M])
6829 UndefElts.setBit(i);
6830 EltBits.push_back(EltBits0[M]);
6831 } else {
6832 if (UndefElts1[M - NumElts])
6833 UndefElts.setBit(i);
6834 EltBits.push_back(EltBits1[M - NumElts]);
6835 }
6836 }
6837 return true;
6838 }
6839
6840 return false;
6841}
6842
6843namespace llvm {
6844namespace X86 {
6845bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
6846 APInt UndefElts;
6847 SmallVector<APInt, 16> EltBits;
6848 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6849 UndefElts, EltBits, true,
6850 AllowPartialUndefs)) {
6851 int SplatIndex = -1;
6852 for (int i = 0, e = EltBits.size(); i != e; ++i) {
6853 if (UndefElts[i])
6854 continue;
6855 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6856 SplatIndex = -1;
6857 break;
6858 }
6859 SplatIndex = i;
6860 }
6861 if (0 <= SplatIndex) {
6862 SplatVal = EltBits[SplatIndex];
6863 return true;
6864 }
6865 }
6866
6867 return false;
6868}
6869} // namespace X86
6870} // namespace llvm
6871
6872static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6873 unsigned MaskEltSizeInBits,
6874 SmallVectorImpl<uint64_t> &RawMask,
6875 APInt &UndefElts) {
6876 // Extract the raw target constant bits.
6877 SmallVector<APInt, 64> EltBits;
6878 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6879 EltBits, /* AllowWholeUndefs */ true,
6880 /* AllowPartialUndefs */ false))
6881 return false;
6882
6883 // Insert the extracted elements into the mask.
6884 for (const APInt &Elt : EltBits)
6885 RawMask.push_back(Elt.getZExtValue());
6886
6887 return true;
6888}
6889
/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
/// Note: This ignores saturation, so inputs must be checked first.
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
                                  bool Unary, unsigned NumStages = 1) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector")((void)0);
  unsigned NumElts = VT.getVectorNumElements();
  // PACK instructions operate independently within each 128-bit lane.
  unsigned NumLanes = VT.getSizeInBits() / 128;
  unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
  // A unary pack reads both halves from operand 0; otherwise the second half
  // of each lane indexes into operand 1 (elements offset by NumElts).
  unsigned Offset = Unary ? 0 : NumElts;
  // Each additional stage repeats the per-lane pattern and doubles the
  // element stride (each stage halves the element width again).
  unsigned Repetitions = 1u << (NumStages - 1);
  unsigned Increment = 1u << NumStages;
  assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction")((void)0);

  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
      // Low half of the lane: strided elements from the first source.
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + (Lane * NumEltsPerLane));
      // High half of the lane: same stride from the second source (or the
      // same source again when Unary).
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
    }
  }
}
6913
6914// Split the demanded elts of a PACKSS/PACKUS node between its operands.
6915static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6916 APInt &DemandedLHS, APInt &DemandedRHS) {
6917 int NumLanes = VT.getSizeInBits() / 128;
6918 int NumElts = DemandedElts.getBitWidth();
6919 int NumInnerElts = NumElts / 2;
6920 int NumEltsPerLane = NumElts / NumLanes;
6921 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6922
6923 DemandedLHS = APInt::getNullValue(NumInnerElts);
6924 DemandedRHS = APInt::getNullValue(NumInnerElts);
6925
6926 // Map DemandedElts to the packed operands.
6927 for (int Lane = 0; Lane != NumLanes; ++Lane) {
6928 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6929 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6930 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6931 if (DemandedElts[OuterIdx])
6932 DemandedLHS.setBit(InnerIdx);
6933 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6934 DemandedRHS.setBit(InnerIdx);
6935 }
6936 }
6937}
6938
6939// Split the demanded elts of a HADD/HSUB node between its operands.
6940static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6941 APInt &DemandedLHS, APInt &DemandedRHS) {
6942 int NumLanes = VT.getSizeInBits() / 128;
6943 int NumElts = DemandedElts.getBitWidth();
6944 int NumEltsPerLane = NumElts / NumLanes;
6945 int HalfEltsPerLane = NumEltsPerLane / 2;
6946
6947 DemandedLHS = APInt::getNullValue(NumElts);
6948 DemandedRHS = APInt::getNullValue(NumElts);
6949
6950 // Map DemandedElts to the horizontal operands.
6951 for (int Idx = 0; Idx != NumElts; ++Idx) {
6952 if (!DemandedElts[Idx])
6953 continue;
6954 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6955 int LocalIdx = Idx % NumEltsPerLane;
6956 if (LocalIdx < HalfEltsPerLane) {
6957 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6958 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6959 } else {
6960 LocalIdx -= HalfEltsPerLane;
6961 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6962 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6963 }
6964 }
6965}
6966
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned MaskEltSize = VT.getScalarSizeInBits();
  // Scratch storage for variable-mask shuffles whose control comes from a
  // constant operand rather than an immediate.
  SmallVector<uint64_t, 32> RawMask;
  APInt RawUndefs;
  uint64_t ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector")((void)0);
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector")((void)0);

  IsUnary = false;
  // IsFakeUnary: a binary shuffle whose two operands are the same node; the
  // mask is remapped into the first input after the switch.
  bool IsFakeUnary = false;
  switch (N->getOpcode()) {
  // Immediate-controlled binary shuffles.
  case X86ISD::BLENDI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeBLENDMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUFP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::INSERTPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeINSERTPSMask(ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  // EXTRQI/INSERTQI only decode when their bit-length/bit-index operands are
  // constants; otherwise Mask stays empty and we fail below.
  case X86ISD::EXTRQI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    if (isa<ConstantSDNode>(N->getOperand(1)) &&
        isa<ConstantSDNode>(N->getOperand(2))) {
      int BitLen = N->getConstantOperandVal(1);
      int BitIdx = N->getConstantOperandVal(2);
      DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = true;
    }
    break;
  case X86ISD::INSERTQI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    if (isa<ConstantSDNode>(N->getOperand(2)) &&
        isa<ConstantSDNode>(N->getOperand(3))) {
      int BitLen = N->getConstantOperandVal(2);
      int BitIdx = N->getConstantOperandVal(3);
      DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
      IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    }
    break;
  // Fixed-pattern binary shuffles (no immediate operand).
  case X86ISD::UNPCKH:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  // VALIGN/PALIGNR decode with their operands swapped, so push Ops explicitly
  // in (1, 0) order here rather than relying on the default at the end.
  case X86ISD::VALIGN:
    assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&((void)0)
           "Only 32-bit and 64-bit elements are supported!")((void)0);
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeVALIGNMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((void)0);
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePALIGNRMask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  // Immediate-controlled unary shuffles.
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((void)0);
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePSLLDQMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((void)0);
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePSRLDQMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePSHUFHWMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodePSHUFLWMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    DecodeZeroMoveLowMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::VBROADCAST:
    // We only decode broadcasts of same-sized vectors, peeking through to
    // extracted subvectors is likely to cause hasOneUse issues with
    // SimplifyDemandedBits etc.
    if (N->getOperand(0).getValueType() == VT) {
      DecodeVectorBroadcast(NumElems, Mask);
      IsUnary = true;
      break;
    }
    return false;
  // Variable-mask shuffles: the control comes from an operand that must be a
  // decodable constant, otherwise we fail.
  case X86ISD::VPERMILPV: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")((void)0);
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
      DecodePSHUFBMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeVPERMMask(NumElems, ImmN, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUF128:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
    decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    DecodeMOVSLDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    DecodeMOVSHDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    DecodeMOVDDUPMask(NumElems, Mask);
    IsUnary = true;
    break;
  case X86ISD::VPERMIL2: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    SDValue MaskNode = N->getOperand(2);
    SDValue CtrlNode = N->getOperand(3);
    // Requires both a constant selection-control immediate and a constant
    // mask vector.
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                      RawUndefs)) {
        DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
                            Mask);
        break;
      }
    }
    return false;
  }
  case X86ISD::VPPERM: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    SDValue MaskNode = N->getOperand(2);
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
      DecodeVPPERMMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")((void)0);
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N->getOperand(1));
    SDValue MaskNode = N->getOperand(0);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMVMask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV3: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")((void)0);
    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type")((void)0);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
    Ops.push_back(N->getOperand(0));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(1);
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
                                    RawUndefs)) {
      DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
      break;
    }
    return false;
  }
  default: llvm_unreachable("unknown target shuffle node")__builtin_unreachable();
  }

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zero'd elements.
  if (!AllowSentinelZero && isAnyZero(Mask))
    return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}
7273
7274// Wrapper for getTargetShuffleMask with InUnary;
7275static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7276 SmallVectorImpl<SDValue> &Ops,
7277 SmallVectorImpl<int> &Mask) {
7278 bool IsUnary;
7279 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7280}
7281
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
static void computeZeroableShuffleElements(ArrayRef<int> Mask,
                                           SDValue V1, SDValue V2,
                                           APInt &KnownUndef, APInt &KnownZero) {
  int Size = Mask.size();
  KnownUndef = KnownZero = APInt::getNullValue(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  // Element size of the *source* vectors (post-bitcast), which may differ
  // from the shuffle's element size.
  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Size;
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size")((void)0);

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0) {
      KnownUndef.setBit(i);
      continue;
    }
    if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      // Scale source elements up: each source element covers Scale shuffle
      // elements, so inspect the sub-bits this lane aliases.
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef())
        KnownUndef.setBit(i);
      if (X86::isZeroNode(Op))
        KnownZero.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        // Extract the portion of the constant this shuffle element refers to
        // and test it for zero.
        APInt Val = Cst->getAPIntValue();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        // Same for FP constants, via their bit pattern.
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      // Each shuffle element aliases Scale consecutive source elements; all
      // of them must agree for the lane to be undef/zero.
      int Scale = V->getNumOperands() / Size;
      bool AllUndef = true;
      bool AllZero = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllUndef &= Op.isUndef();
        AllZero &= X86::isZeroNode(Op);
      }
      if (AllUndef)
        KnownUndef.setBit(i);
      if (AllZero)
        KnownZero.setBit(i);
      continue;
    }
  }
}
7368
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
/// FIXME: Merge this with computeZeroableShuffleElements?
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops,
                                         APInt &KnownUndef, APInt &KnownZero) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  // NOTE(review): the static analyzer reports a null SDNode dereference
  // reached through this file (SelectionDAGNodes.h:1122); if Ops can ever
  // come back with a null/short entry from getTargetShuffleMask, V1/V2 below
  // would be the access — TODO confirm against the analyzer path.
  int Size = Mask.size();
  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];
  KnownUndef = KnownZero = APInt::getNullValue(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Size) == 0 &&((void)0)
         "Illegal split of shuffle value type")((void)0);
  unsigned EltSizeInBits = VT.getSizeInBits() / Size;

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], true, false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], true, false)};

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0) {
      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!")((void)0);
      if (SM_SentinelUndef == M)
        KnownUndef.setBit(i);
      if (SM_SentinelZero == M)
        KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    // SrcIdx (0 or 1) selects which source's constant data to consult below.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      KnownUndef.setBit(i);
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        KnownUndef.setBit(i);
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        KnownZero.setBit(i);
      continue;
    }

    // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
    // base vectors.
    if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
      SDValue Vec = V.getOperand(0);
      int NumVecElts = Vec.getValueType().getVectorNumElements();
      if (Vec.isUndef() && Size == NumVecElts) {
        // Elements outside the inserted subvector come from the undef base.
        int Idx = V.getConstantOperandVal(2);
        int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
        if (M < Idx || (Idx + NumSubElts) <= M)
          KnownUndef.setBit(i);
      }
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        KnownUndef.setBit(i);
      else if (SrcEltBits[SrcIdx][M] == 0)
        KnownZero.setBit(i);
    }
  }

  assert(VT.getVectorNumElements() == (unsigned)Size &&((void)0)
         "Different mask size from vector size!")((void)0);
  return true;
}
7471
// Replace target shuffle mask elements with known undef/zero sentinels.
// KnownUndef always wins over KnownZero for an element that is both; zero
// resolution can be suppressed via ResolveKnownZeros.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
                                              const APInt &KnownUndef,
                                              const APInt &KnownZero,
                                              bool ResolveKnownZeros = true) {
  unsigned NumElts = Mask.size();
  assert(KnownUndef.getBitWidth() == NumElts &&((void)0)
         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch")((void)0);

  for (unsigned i = 0; i != NumElts; ++i) {
    if (KnownUndef[i])
      Mask[i] = SM_SentinelUndef;
    else if (ResolveKnownZeros && KnownZero[i])
      Mask[i] = SM_SentinelZero;
  }
}
7488
7489// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7490static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7491 APInt &KnownUndef,
7492 APInt &KnownZero) {
7493 unsigned NumElts = Mask.size();
7494 KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7495
7496 for (unsigned i = 0; i != NumElts; ++i) {
7497 int M = Mask[i];
7498 if (SM_SentinelUndef == M)
7499 KnownUndef.setBit(i);
7500 if (SM_SentinelZero == M)
7501 KnownZero.setBit(i);
7502 }
7503}
7504
7505// Forward declaration (for getFauxShuffleMask recursive check).
7506// TODO: Use DemandedElts variant.
7507static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7508 SmallVectorImpl<int> &Mask,
7509 const SelectionDAG &DAG, unsigned Depth,
7510 bool ResolveKnownElts);
7511
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
//
// On success, the shuffle source vectors are appended to Ops and Mask is
// filled with indices into the (virtual) concatenation of those sources, or
// with SM_SentinelUndef/SM_SentinelZero sentinels. NOTE: the mask granularity
// may differ from VT's element count (e.g. byte-level masks for bit shifts).
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                               SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<SDValue> &Ops,
                               const SelectionDAG &DAG, unsigned Depth,
                               bool ResolveKnownElts) {
  Mask.clear();
  Ops.clear();

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  // Several cases below decode at byte granularity, so only handle
  // byte-sized elements/vectors.
  if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
    return false;
  assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size")((void)0);
  unsigned NumSizeInBytes = NumSizeInBits / 8;
  unsigned NumBytesPerElt = NumBitsPerElt / 8;

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::VECTOR_SHUFFLE: {
    // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
    ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
    if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
      Mask.append(ShuffleMask.begin(), ShuffleMask.end());
      Ops.push_back(N.getOperand(0));
      Ops.push_back(N.getOperand(1));
      return true;
    }
    return false;
  }
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask.
    APInt UndefElts;
    SmallVector<APInt, 32> EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    // For ANDNP the first operand is inverted, so a 0xFF byte selects zero.
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      if (UndefElts[i]) {
        Mask.push_back(SM_SentinelUndef);
        continue;
      }
      const APInt &ByteBits = EltBits[i];
      // Each constant byte must either keep (0xFF) or clear (0x00) the lane.
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::OR: {
    // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
    // is a valid shuffle index.
    SDValue N0 = peekThroughBitcasts(N.getOperand(0));
    SDValue N1 = peekThroughBitcasts(N.getOperand(1));
    if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
      return false;
    SmallVector<int, 64> SrcMask0, SrcMask1;
    SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
    if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
                                true) ||
        !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
                                true))
      return false;

    // Rescale both masks to a common (finer) granularity before merging.
    size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
    SmallVector<int, 64> Mask0, Mask1;
    narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
    narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
    for (int i = 0; i != (int)MaskSize; ++i) {
      // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
      // loops converting between OR and BLEND shuffles due to
      // canWidenShuffleElements merging away undef elements, meaning we
      // fail to recognise the OR as the undef element isn't known zero.
      if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
        Mask.push_back(SM_SentinelZero);
      else if (Mask1[i] == SM_SentinelZero)
        Mask.push_back(i);
      else if (Mask0[i] == SM_SentinelZero)
        Mask.push_back(i + MaskSize);
      else
        return false;
    }
    Ops.push_back(N0);
    Ops.push_back(N1);
    return true;
  }
  case ISD::INSERT_SUBVECTOR: {
    SDValue Src = N.getOperand(0);
    SDValue Sub = N.getOperand(1);
    EVT SubVT = Sub.getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    if (!N->isOnlyUserOf(Sub.getNode()))
      return false;
    uint64_t InsertIdx = N.getConstantOperandVal(2);
    // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
    if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        Sub.getOperand(0).getValueType() == VT) {
      uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
      // Identity mask for SRC0, then splice in SRC1's extracted elements.
      for (int i = 0; i != (int)NumElts; ++i)
        Mask.push_back(i);
      for (int i = 0; i != (int)NumSubElts; ++i)
        Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
      Ops.push_back(Src);
      Ops.push_back(Sub.getOperand(0));
      return true;
    }
    // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
    SmallVector<int, 64> SubMask;
    SmallVector<SDValue, 2> SubInputs;
    if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
                                SubMask, DAG, Depth + 1, ResolveKnownElts))
      return false;

    // Subvector shuffle inputs must not be larger than the subvector.
    if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
          return SubVT.getFixedSizeInBits() <
                 SubInput.getValueSizeInBits().getFixedSize();
        }))
      return false;

    // Bring the recursed submask and our own element count to the same
    // granularity, widening/narrowing whichever side is coarser.
    if (SubMask.size() != NumSubElts) {
      assert(((SubMask.size() % NumSubElts) == 0 ||((void)0)
              (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale")((void)0);
      if ((NumSubElts % SubMask.size()) == 0) {
        int Scale = NumSubElts / SubMask.size();
        SmallVector<int,64> ScaledSubMask;
        narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
        SubMask = ScaledSubMask;
      } else {
        int Scale = SubMask.size() / NumSubElts;
        NumSubElts = SubMask.size();
        NumElts *= Scale;
        InsertIdx *= Scale;
      }
    }
    Ops.push_back(Src);
    Ops.append(SubInputs.begin(), SubInputs.end());
    if (ISD::isBuildVectorAllZeros(Src.getNode()))
      Mask.append(NumElts, SM_SentinelZero);
    else
      for (int i = 0; i != (int)NumElts; ++i)
        Mask.push_back(i);
    for (int i = 0; i != (int)NumSubElts; ++i) {
      int M = SubMask[i];
      if (0 <= M) {
        // Renumber subshuffle indices to point past Src into SubInputs.
        int InputIdx = M / NumSubElts;
        M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
      }
      Mask[i + InsertIdx] = M;
    }
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:
  case ISD::SCALAR_TO_VECTOR:
  case ISD::INSERT_VECTOR_ELT: {
    // Match against a insert_vector_elt/scalar_to_vector of an extract from a
    // vector, for matching src/dst vector types.
    SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);

    unsigned DstIdx = 0;
    if (Opcode != ISD::SCALAR_TO_VECTOR) {
      // Check we have an in-range constant insertion index.
      if (!isa<ConstantSDNode>(N.getOperand(2)) ||
          N.getConstantOperandAPInt(2).uge(NumElts))
        return false;
      DstIdx = N.getConstantOperandVal(2);

      // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
      if (X86::isZeroNode(Scl)) {
        Ops.push_back(N.getOperand(0));
        for (unsigned i = 0; i != NumElts; ++i)
          Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
        return true;
      }
    }

    // Peek through trunc/aext/zext.
    // TODO: aext shouldn't require SM_SentinelZero padding.
    // TODO: handle shift of scalars.
    unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
    while (Scl.getOpcode() == ISD::TRUNCATE ||
           Scl.getOpcode() == ISD::ANY_EXTEND ||
           Scl.getOpcode() == ISD::ZERO_EXTEND) {
      Scl = Scl.getOperand(0);
      MinBitsPerElt =
          std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
    }
    if ((MinBitsPerElt % 8) != 0)
      return false;

    // Attempt to find the source vector the scalar was extracted from.
    SDValue SrcExtract;
    if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
         Scl.getOpcode() == X86ISD::PEXTRW ||
         Scl.getOpcode() == X86ISD::PEXTRB) &&
        Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
      SrcExtract = Scl;
    }
    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
      return false;

    SDValue SrcVec = SrcExtract.getOperand(0);
    EVT SrcVT = SrcVec.getValueType();
    if (!SrcVT.getScalarType().isByteSized())
      return false;
    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
    unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
    unsigned DstByte = DstIdx * NumBytesPerElt;
    MinBitsPerElt =
        std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());

    // Create 'identity' byte level shuffle mask and then add inserted bytes.
    if (Opcode == ISD::SCALAR_TO_VECTOR) {
      Ops.push_back(SrcVec);
      Mask.append(NumSizeInBytes, SM_SentinelUndef);
    } else {
      Ops.push_back(SrcVec);
      Ops.push_back(N.getOperand(0));
      for (int i = 0; i != (int)NumSizeInBytes; ++i)
        Mask.push_back(NumSizeInBytes + i);
    }

    unsigned MinBytesPerElts = MinBitsPerElt / 8;
    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
    // Inserted bytes come from the extracted element; any remaining bytes of
    // the destination element are zero (from the zext/trunc peeking above).
    for (unsigned i = 0; i != MinBytesPerElts; ++i)
      Mask[DstByte + i] = SrcByte + i;
    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
      Mask[DstByte + i] = SM_SentinelZero;
    return true;
  }
  case X86ISD::PACKSS:
  case X86ISD::PACKUS: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&((void)0)
           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&((void)0)
           "Unexpected input value type")((void)0);

    APInt EltsLHS, EltsRHS;
    getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);

    // If we know input saturation won't happen (or we don't care for particular
    // lanes), we can treat this as a truncation shuffle.
    bool Offset0 = false, Offset1 = false;
    if (Opcode == X86ISD::PACKSS) {
      // PACKSS only behaves like a truncation if each demanded input already
      // fits in the narrower element (enough sign bits).
      if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
           DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
          (!(N1.isUndef() || EltsRHS.isNullValue()) &&
           DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
        return false;
      // We can't easily fold ASHR into a shuffle, but if it was feeding a
      // PACKSS then it was likely being used for sign-extension for a
      // truncation, so just peek through and adjust the mask accordingly.
      if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
          N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
        Offset0 = true;
        N0 = N0.getOperand(0);
      }
      if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
          N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
        Offset1 = true;
        N1 = N1.getOperand(0);
      }
    } else {
      // PACKUS truncates if the demanded input high bits are known zero.
      APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
      if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
           !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
          (!(N1.isUndef() || EltsRHS.isNullValue()) &&
           !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
        return false;
    }

    bool IsUnary = (N0 == N1);

    Ops.push_back(N0);
    if (!IsUnary)
      Ops.push_back(N1);

    createPackShuffleMask(VT, Mask, IsUnary);

    if (Offset0 || Offset1) {
      // The peeked-through VSRAI means we want the odd (upper) sub-elements,
      // so bump the relevant mask indices by one.
      for (int &M : Mask)
        if ((Offset0 && isInRange(M, 0, NumElts)) ||
            (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
          ++M;
    }
    return true;
  }
  case X86ISD::VTRUNC: {
    SDValue Src = N.getOperand(0);
    EVT SrcVT = Src.getValueType();
    // Truncated source must be a simple vector.
    if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
        (SrcVT.getScalarSizeInBits() % 8) != 0)
      return false;
    unsigned NumSrcElts = SrcVT.getVectorNumElements();
    unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
    unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
    assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation")((void)0);
    // Select the low part of every source element; VTRUNC zeroes the
    // remaining destination elements.
    for (unsigned i = 0; i != NumSrcElts; ++i)
      Mask.push_back(i * Scale);
    Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
    Ops.push_back(Src);
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }

    // We can only decode 'whole byte' bit shifts as shuffles.
    if ((ShiftVal % 8) != 0)
      break;

    uint64_t ByteShift = ShiftVal / 8;
    Ops.push_back(N.getOperand(0));

    // Clear mask to all zeros and insert the shifted byte indices.
    Mask.append(NumSizeInBytes, SM_SentinelZero);

    if (X86ISD::VSHLI == Opcode) {
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
  case X86ISD::VROTLI:
  case X86ISD::VROTRI: {
    // We can only decode 'whole byte' bit rotates as shuffles.
    uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
    if ((RotateVal % 8) != 0)
      return false;
    Ops.push_back(N.getOperand(0));
    int Offset = RotateVal / 8;
    // Canonicalise to a right-rotate byte offset.
    Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
    for (int i = 0; i != (int)NumElts; ++i) {
      int BaseIdx = i * NumBytesPerElt;
      for (int j = 0; j != (int)NumBytesPerElt; ++j) {
        Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
      }
    }
    return true;
  }
  case X86ISD::VBROADCAST: {
    SDValue Src = N.getOperand(0);
    if (!Src.getSimpleValueType().isVector()) {
      // Scalar broadcast - only handled if the scalar is extract(vec, 0) of a
      // vector with a matching element type.
      if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          !isNullConstant(Src.getOperand(1)) ||
          Src.getOperand(0).getValueType().getScalarType() !=
              VT.getScalarType())
        return false;
      Src = Src.getOperand(0);
    }
    Ops.push_back(Src);
    Mask.append(NumElts, 0);
    return true;
  }
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::ANY_EXTEND_VECTOR_INREG: {
    SDValue Src = N.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // Extended source must be a simple vector.
    if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
        (SrcVT.getScalarSizeInBits() % 8) != 0)
      return false;

    bool IsAnyExtend =
        (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
    DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
                         IsAnyExtend, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}
7911
/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
///
/// Mask indices are interpreted as positions in the concatenation of the
/// Inputs vectors, MaskWidth elements per input. Dropping or merging an
/// input therefore requires renumbering every later index down by MaskWidth.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                              SmallVectorImpl<int> &Mask) {
  int MaskWidth = Mask.size();
  SmallVector<SDValue, 16> UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    // [lo, hi) is the index range Inputs[i] would occupy if kept.
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;

    // Strip UNDEF input usage.
    if (Inputs[i].isUndef())
      for (int &M : Mask)
        if ((lo <= M) && (M < hi))
          M = SM_SentinelUndef;

    // Check for unused inputs.
    if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      // Drop this input: shift all later indices down one slot.
      for (int &M : Mask)
        if (lo <= M)
          M -= MaskWidth;
      continue;
    }

    // Check for repeated inputs.
    bool IsRepeat = false;
    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
      if (UsedInputs[j] != Inputs[i])
        continue;
      // Redirect this input's indices to the earlier duplicate (slot j) and
      // shift all later indices down one slot.
      for (int &M : Mask)
        if (lo <= M)
          M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
      IsRepeat = true;
      break;
    }
    if (IsRepeat)
      continue;

    UsedInputs.push_back(Inputs[i]);
  }
  Inputs = UsedInputs;
}
7953
7954/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7955/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7956/// Returns true if the target shuffle mask was decoded.
7957static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7958 SmallVectorImpl<SDValue> &Inputs,
7959 SmallVectorImpl<int> &Mask,
7960 APInt &KnownUndef, APInt &KnownZero,
7961 const SelectionDAG &DAG, unsigned Depth,
7962 bool ResolveKnownElts) {
7963 EVT VT = Op.getValueType();
7964 if (!VT.isSimple() || !VT.isVector())
7965 return false;
7966
7967 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7968 if (ResolveKnownElts)
7969 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7970 return true;
7971 }
7972 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7973 ResolveKnownElts)) {
7974 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7975 return true;
7976 }
7977 return false;
7978}
7979
7980static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7981 SmallVectorImpl<int> &Mask,
7982 const SelectionDAG &DAG, unsigned Depth = 0,
7983 bool ResolveKnownElts = true) {
7984 EVT VT = Op.getValueType();
7985 if (!VT.isSimple() || !VT.isVector())
7986 return false;
7987
7988 APInt KnownUndef, KnownZero;
7989 unsigned NumElts = Op.getValueType().getVectorNumElements();
7990 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7991 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7992 KnownZero, DAG, Depth, ResolveKnownElts);
7993}
7994
// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
//
// Builds a VBROADCAST_LOAD/SUBV_BROADCAST_LOAD of MemVT from Mem's address
// plus Offset. Returns SDValue() if Mem isn't a plain, cacheable read.
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
                                 EVT MemVT, MemSDNode *Mem, unsigned Offset,
                                 SelectionDAG &DAG) {
  assert((Opcode == X86ISD::VBROADCAST_LOAD ||((void)0)
          Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&((void)0)
         "Unknown broadcast load type")((void)0);

  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
  if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
    return SDValue();

  SDValue Ptr =
      DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
  SDValue Ops[] = {Mem->getChain(), Ptr};
  // Derive the memory operand from the original memop, narrowed/offset to
  // MemVT, so alias info is preserved.
  SDValue BcstLd = DAG.getMemIntrinsicNode(
      Opcode, DL, Tys, Ops, MemVT,
      DAG.getMachineFunction().getMachineMemOperand(
          Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
  // Keep users of the original load's chain ordered after the new load too.
  DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
  return BcstLd;
}
8018
/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
///
/// Walks through shuffles, subvector ops and bitcasts (up to
/// SelectionDAG::MaxRecursionDepth) and returns the originating scalar
/// SDValue, a constant zero, an UNDEF, or SDValue() if it can't be traced.
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
                                   SelectionDAG &DAG, unsigned Depth) {
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return SDValue(); // Limit search depth.

  EVT VT = Op.getValueType();
  unsigned Opcode = Op.getOpcode();
  unsigned NumElems = VT.getVectorNumElements();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    // Mask indices >= NumElems refer to the second operand.
    SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = VT.getSimpleVT();
    MVT ShufSVT = ShufVT.getVectorElementType();
    // NOTE: deliberately shadows the outer (unsigned) NumElems.
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    SmallVector<SDValue, 16> ShuffleOps;
    if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
                              ShuffleMask))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range")((void)0);
    SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into insert_subvector base/sub vector to find scalars.
  if (Opcode == ISD::INSERT_SUBVECTOR) {
    SDValue Vec = Op.getOperand(0);
    SDValue Sub = Op.getOperand(1);
    uint64_t SubIdx = Op.getConstantOperandVal(2);
    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();

    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
      return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
    return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
  }

  // Recurse into concat_vectors sub vector to find scalars.
  if (Opcode == ISD::CONCAT_VECTORS) {
    EVT SubVT = Op.getOperand(0).getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    uint64_t SubIdx = Index / NumSubElts;
    uint64_t SubElt = Index % NumSubElts;
    return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
  }

  // Recurse into extract_subvector src vector to find scalars.
  if (Opcode == ISD::EXTRACT_SUBVECTOR) {
    SDValue Src = Op.getOperand(0);
    uint64_t SrcIdx = Op.getConstantOperandVal(1);
    return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
  }

  // We only peek through bitcasts of the same vector width.
  if (Opcode == ISD::BITCAST) {
    SDValue Src = Op.getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
      return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
    return SDValue();
  }

  // Actual nodes that may contain scalar elements

  // For insert_vector_elt - either return the index matching scalar or recurse
  // into the base vector.
  if (Opcode == ISD::INSERT_VECTOR_ELT &&
      isa<ConstantSDNode>(Op.getOperand(2))) {
    if (Op.getConstantOperandAPInt(2) == Index)
      return Op.getOperand(1);
    return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
  }

  if (Opcode == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? Op.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (Opcode == ISD::BUILD_VECTOR)
    return Op.getOperand(Index);

  return SDValue();
}
8121
8122// Use PINSRB/PINSRW/PINSRD to create a build vector.
8123static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8124 unsigned NumNonZero, unsigned NumZero,
8125 SelectionDAG &DAG,
8126 const X86Subtarget &Subtarget) {
8127 MVT VT = Op.getSimpleValueType();
8128 unsigned NumElts = VT.getVectorNumElements();
8129 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||((void)0)
8130 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&((void)0)
8131 "Illegal vector insertion")((void)0);
8132
8133 SDLoc dl(Op);
8134 SDValue V;
8135 bool First = true;
8136
8137 for (unsigned i = 0; i < NumElts; ++i) {
8138 bool IsNonZero = NonZeroMask[i];
8139 if (!IsNonZero)
8140 continue;
8141
8142 // If the build vector contains zeros or our first insertion is not the
8143 // first index then insert into zero vector to break any register
8144 // dependency else use SCALAR_TO_VECTOR.
8145 if (First) {
8146 First = false;
8147 if (NumZero || 0 != i)
8148 V = getZeroVector(VT, Subtarget, DAG, dl);
8149 else {
8150 assert(0 == i && "Expected insertion into zero-index")((void)0);
8151 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8152 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8153 V = DAG.getBitcast(VT, V);
8154 continue;
8155 }
8156 }
8157 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8158 DAG.getIntPtrConstant(i, dl));
8159 }
8160
8161 return V;
8162}
8163
/// Custom lower build_vector of v16i8.
///
/// With SSE4.1 this defers to PINSRB insertions; otherwise adjacent byte
/// pairs are merged into 16-bit values and inserted with PINSRW.
static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41())
    return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
                                    Subtarget);

  SDLoc dl(Op);
  SDValue V;

  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  for (unsigned i = 0; i < 16; i += 2) {
    bool ThisIsNonZero = NonZeroMask[i];
    bool NextIsNonZero = NonZeroMask[i + 1];
    if (!ThisIsNonZero && !NextIsNonZero)
      continue;

    // FIXME: Investigate combining the first 4 bytes as a i32 instead.
    SDValue Elt;
    if (ThisIsNonZero) {
      // Zero-extend when the upper byte must be clean (zeros required, or
      // the high byte will be OR'ed in below); otherwise any-extend.
      if (NumZero || NextIsNonZero)
        Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
      else
        Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
    }

    if (NextIsNonZero) {
      SDValue NextElt = Op.getOperand(i + 1);
      if (i == 0 && NumZero)
        NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
      else
        NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
      // Place the odd byte in bits [15:8] and combine with the even byte.
      NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
                            DAG.getConstant(8, dl, MVT::i8));
      if (ThisIsNonZero)
        Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
      else
        Elt = NextElt;
    }

    // If our first insertion is not the first index or zeros are needed, then
    // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
    // elements undefined).
    if (!V) {
      if (i != 0 || NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else {
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
        V = DAG.getBitcast(MVT::v8i16, V);
        continue;
      }
    }
    Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
                    DAG.getIntPtrConstant(i / 2, dl));
  }

  return DAG.getBitcast(MVT::v16i8, V);
}
8229
8230/// Custom lower build_vector of v8i16.
8231static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8232 unsigned NumNonZero, unsigned NumZero,
8233 SelectionDAG &DAG,
8234 const X86Subtarget &Subtarget) {
8235 if (NumNonZero > 4 && !Subtarget.hasSSE41())
8236 return SDValue();
8237
8238 // Use PINSRW to insert each byte directly.
8239 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8240 Subtarget);
8241}
8242
/// Custom lower build_vector of v4i32 or v4f32.
///
/// Tries, in order: a MOVDDUP of a repeated element pair, a shuffle-with-zero
/// blend, and finally an INSERTPS. Callers guarantee at least two non-zero
/// elements (see the assert below).
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // If this is a splat of a pair of elements, use MOVDDUP (unless the target
  // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
  // Because we're creating a less complicated build vector here, we may enable
  // further folding of the MOVDDUP via shuffle transforms.
  if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
      Op.getOperand(0) == Op.getOperand(2) &&
      Op.getOperand(1) == Op.getOperand(3) &&
      Op.getOperand(0) != Op.getOperand(1)) {
    SDLoc DL(Op);
    MVT VT = Op.getSimpleValueType();
    MVT EltVT = VT.getVectorElementType();
    // Create a new build vector with the first 2 elements followed by undef
    // padding, bitcast to v2f64, duplicate, and bitcast back.
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
    SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
    SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
    return DAG.getBitcast(VT, Dup);
  }

  // Find all zeroable elements.
  std::bitset<4> Zeroable, Undefs;
  for (int i = 0; i < 4; ++i) {
    SDValue Elt = Op.getOperand(i);
    Undefs[i] = Elt.isUndef();
    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
  }
  assert(Zeroable.size() - Zeroable.count() > 1 &&((void)0)
         "We expect at least two non-zero elements!")((void)0);

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  SDValue FirstNonZero;
  unsigned FirstNonZeroIdx;
  for (unsigned i = 0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!")((void)0);
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero.
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = Elt.getConstantOperandVal(1);
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  if (EltIdx == 4) {
    // Every non-zero element is V1[i] in place - pure blend with zero.
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZeroOrUndef = (Zeroable == Undefs)
                               ? DAG.getUNDEF(VT)
                               : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getBitcast(VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
  }

  // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget.hasSSE41())
    return SDValue();

  // Elt/EltIdx now identify the single element that broke the blend pattern;
  // it becomes the INSERTPS insertion source.
  SDValue V2 = Elt.getOperand(0);
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  // All remaining non-zero elements must be in-place extracts from one vector.
  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!")((void)0);
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getBitcast(MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getBitcast(MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
  unsigned ZMask = Zeroable.to_ulong();

  // INSERTPS immediate: [7:6]=source elt, [5:4]=dest elt, [3:0]=zero mask.
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!")((void)0);
  SDLoc DL(Op);
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                               DAG.getIntPtrConstant(InsertPSMask, DL, true));
  return DAG.getBitcast(VT, Result);
}
8369
8370/// Return a vector logical shift node.
8371static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8372 SelectionDAG &DAG, const TargetLowering &TLI,
8373 const SDLoc &dl) {
8374 assert(VT.is128BitVector() && "Unknown type for VShift")((void)0);
8375 MVT ShVT = MVT::v16i8;
8376 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8377 SrcOp = DAG.getBitcast(ShVT, SrcOp);
8378 assert(NumBits % 8 == 0 && "Only support byte sized shifts")((void)0);
8379 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8380 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8381}
8382
// Try to replace a scalar load feeding a splat with a wide (vector) load of
// the surrounding stack slot plus a shuffle that splats the wanted element.
// Returns SDValue() when the transform does not apply.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    // Only simple (non-volatile/non-atomic), non-extending loads qualify.
    if (!ISD::isNormalLoad(LD) || !LD->isSimple())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    // Only 32-bit scalars are widened here (EltNo below assumes 4 bytes).
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    // The pointer must be a frame index, optionally plus a constant offset.
    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      // NOTE(review): the static analyzer reports a possibly-null 'Ptr' node
      // reaching getOperand() here; isBaseWithConstantOffset() only returns
      // true for a valid ADD/OR node, so this is presumed a false positive —
      // confirm against the analyzer path before changing.
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment,
    // improve this code to support it better.
    Align RequiredAlign(VT.getSizeInBits() / 8);
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
    if (!InferredAlign || *InferredAlign < RequiredAlign) {
      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI.setObjectAlignment(FI, RequiredAlign);
      }
    }

    // (Offset % 16 or 32) must be multiple of 4. Then address is then
    // Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % RequiredAlign.value()) & 3)
      return SDValue();
    // Round the offset down to the vector alignment; the remainder selects
    // which lane of the wide load holds the originally loaded scalar.
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
    if (StartOffset) {
      SDLoc DL(Ptr);
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
    }

    int EltNo = (Offset - StartOffset) >> 2;
    unsigned NumElems = VT.getVectorNumElements();

    // Emit the wide load and splat the lane containing the scalar.
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset));

    SmallVector<int, 8> Mask(NumElems, EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}
8456
8457// Recurse to find a LoadSDNode source and the accumulated ByteOffest.
8458static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8459 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8460 auto *BaseLd = cast<LoadSDNode>(Elt);
8461 if (!BaseLd->isSimple())
8462 return false;
8463 Ld = BaseLd;
8464 ByteOffset = 0;
8465 return true;
8466 }
8467
8468 switch (Elt.getOpcode()) {
8469 case ISD::BITCAST:
8470 case ISD::TRUNCATE:
8471 case ISD::SCALAR_TO_VECTOR:
8472 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8473 case ISD::SRL:
8474 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8475 uint64_t Idx = IdxC->getZExtValue();
8476 if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8477 ByteOffset += Idx / 8;
8478 return true;
8479 }
8480 }
8481 break;
8482 case ISD::EXTRACT_VECTOR_ELT:
8483 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8484 SDValue Src = Elt.getOperand(0);
8485 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8486 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8487 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8488 findEltLoadSrc(Src, Ld, ByteOffset)) {
8489 uint64_t Idx = IdxC->getZExtValue();
8490 ByteOffset += Idx * (SrcSizeInBits / 8);
8491 return true;
8492 }
8493 }
8494 break;
8495 }
8496
8497 return false;
8498}
8499
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool IsAfterLegalize) {
  // Sub-byte elements can't be handled by the byte-offset logic below.
  if ((VT.getScalarSizeInBits() % 8) != 0)
    return SDValue();

  unsigned NumElems = Elts.size();

  int LastLoadedElt = -1;
  // Per-element classification: exactly one of these bits is set per element.
  APInt LoadMask = APInt::getNullValue(NumElems);
  APInt ZeroMask = APInt::getNullValue(NumElems);
  APInt UndefMask = APInt::getNullValue(NumElems);

  // Per-element source load, and the byte offset of the element within it.
  SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
  SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);

  // For each element in the initializer, see if we've found a load, zero or an
  // undef.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();
    if (Elt.isUndef()) {
      UndefMask.setBit(i);
      continue;
    }
    if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
      ZeroMask.setBit(i);
      continue;
    }

    // Each loaded element must be the correct fractional portion of the
    // requested vector load.
    unsigned EltSizeInBits = Elt.getValueSizeInBits();
    if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
      return SDValue();

    if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
      return SDValue();
    // The element must fit entirely inside its source load.
    unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
    if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
      return SDValue();

    LoadMask.setBit(i);
    LastLoadedElt = i;
  }
  assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
          LoadMask.countPopulation()) == NumElems &&
         "Incomplete element masks");

  // Handle Special Cases - all undef or undef/zero.
  if (UndefMask.countPopulation() == NumElems)
    return DAG.getUNDEF(VT);
  if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
                          : DAG.getConstantFP(0.0, DL, VT);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  int FirstLoadedElt = LoadMask.countTrailingZeros();
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
  EVT EltBaseVT = EltBase.getValueType();
  assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
         "Register/Memory size mismatch");
  LoadSDNode *LDBase = Loads[FirstLoadedElt];
  assert(LDBase && "Did not find base load for merging consecutive loads");
  unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
  unsigned BaseSizeInBytes = BaseSizeInBits / 8;
  int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
  int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
  assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");

  // TODO: Support offsetting the base load.
  if (ByteOffsets[FirstLoadedElt] != 0)
    return SDValue();

  // Check to see if the element's load is consecutive to the base load
  // or offset from a previous (already checked) load.
  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
    LoadSDNode *Ld = Loads[EltIdx];
    int64_t ByteOffset = ByteOffsets[EltIdx];
    if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
      // This element is a later chunk of an element-aligned load already
      // validated at BaseIdx; just check it refers to the same load.
      int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
      return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
              Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
    }
    return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
                                              EltIdx - FirstLoadedElt);
  };

  // Consecutive loads can contain UNDEFS but not ZERO elements.
  // Consecutive loads with UNDEFs and ZEROs elements require a
  // an additional shuffle stage to clear the ZERO elements.
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
      if (!CheckConsecutiveLoad(LDBase, i)) {
        IsConsecutiveLoad = false;
        IsConsecutiveLoadWithZeros = false;
        break;
      }
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }

  // Emit a wide load covering the whole vector, preserving the memory
  // ordering of every original element load it replaces.
  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
    assert(LDBase->isSimple() &&
           "Cannot merge volatile or atomic loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
                    MMOFlags);
    for (auto *LD : Loads)
      if (LD)
        DAG.makeEquivalentMemoryOrdering(LD, NewLd);
    return NewLd;
  };

  // Check if the base load is entirely dereferenceable.
  bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
      VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());

  // LOAD - all consecutive load/undefs (must start/end with a load or be
  // entirely dereferenceable). If we have found an entire vector of loads and
  // undefs, then return a large load of the entire vector width starting at the
  // base pointer. If the vector contains zeros, then attempt to shuffle those
  // elements.
  if (FirstLoadedElt == 0 &&
      (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
    // will lower to regular temporal loads and use the cache.
    if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
        VT.is256BitVector() && !Subtarget.hasInt256())
      return SDValue();

    if (NumElems == 1)
      return DAG.getBitcast(VT, Elts[FirstLoadedElt]);

    if (!ZeroMask)
      return CreateLoad(VT, LDBase);

    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
    // vector and a zero vector to clear out the zero elements.
    if (!IsAfterLegalize && VT.isVector()) {
      unsigned NumMaskElts = VT.getVectorNumElements();
      if ((NumMaskElts % NumElems) == 0) {
        unsigned Scale = NumMaskElts / NumElems;
        SmallVector<int, 4> ClearMask(NumMaskElts, -1);
        for (unsigned i = 0; i < NumElems; ++i) {
          if (UndefMask[i])
            continue;
          // Zero lanes select from the zero vector (second shuffle operand).
          int Offset = ZeroMask[i] ? NumMaskElts : 0;
          for (unsigned j = 0; j != Scale; ++j)
            ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
        }
        SDValue V = CreateLoad(VT, LDBase);
        SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                   : DAG.getConstantFP(0.0, DL, VT);
        return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
      }
    }
  }

  // If the upper half of a ymm/zmm load is undef then just load the lower half.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    unsigned HalfNumElems = NumElems / 2;
    if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
      EVT HalfVT =
          EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
      SDValue HalfLD =
          EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
                                   DAG, Subtarget, IsAfterLegalize);
      if (HalfLD)
        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
                           HalfLD, DAG.getIntPtrConstant(0, DL));
    }
  }

  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
                                      : MVT::getIntegerVT(LoadSizeInBits);
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
    // Allow v4f32 on SSE1 only targets.
    // FIXME: Add more isel patterns so we can just use VT directly.
    if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
      VecVT = MVT::v4f32;
    if (TLI.isTypeLegal(VecVT)) {
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode = DAG.getMemIntrinsicNode(
          X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
          LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
      for (auto *LD : Loads)
        if (LD)
          DAG.makeEquivalentMemoryOrdering(LD, ResNode);
      return DAG.getBitcast(VT, ResNode);
    }
  }

  // BROADCAST - match the smallest possible repetition pattern, load that
  // scalar/subvector element and then broadcast to the entire vector.
  if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
      (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
    for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
      unsigned RepeatSize = SubElems * BaseSizeInBits;
      unsigned ScalarSize = std::min(RepeatSize, 64u);
      if (!Subtarget.hasAVX2() && ScalarSize < 32)
        continue;

      // Don't attempt a 1:N subvector broadcast - it should be caught by
      // combineConcatVectorOps, else will cause infinite loops.
      if (RepeatSize > ScalarSize && SubElems == 1)
        continue;

      // Collect the candidate repeated pattern; every load lane must agree
      // with the lane at the same position modulo SubElems.
      bool Match = true;
      SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
      for (unsigned i = 0; i != NumElems && Match; ++i) {
        if (!LoadMask[i])
          continue;
        SDValue Elt = peekThroughBitcasts(Elts[i]);
        if (RepeatedLoads[i % SubElems].isUndef())
          RepeatedLoads[i % SubElems] = Elt;
        else
          Match &= (RepeatedLoads[i % SubElems] == Elt);
      }

      // We must have loads at both ends of the repetition.
      Match &= !RepeatedLoads.front().isUndef();
      Match &= !RepeatedLoads.back().isUndef();
      if (!Match)
        continue;

      EVT RepeatVT =
          VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
              ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
              : EVT::getFloatingPointVT(ScalarSize);
      if (RepeatSize > ScalarSize)
        RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
                                    RepeatSize / ScalarSize);
      EVT BroadcastVT =
          EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
                           VT.getSizeInBits() / ScalarSize);
      if (TLI.isTypeLegal(BroadcastVT)) {
        if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
                RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
          SDValue Broadcast = RepeatLoad;
          if (RepeatSize > ScalarSize) {
            // Subvector pattern: widen by self-concatenation instead of a
            // scalar broadcast.
            while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
              Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
          } else {
            Broadcast =
                DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
          }
          return DAG.getBitcast(VT, Broadcast);
        }
      }
    }
  }

  return SDValue();
}
8776
8777// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
8778// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8779// are consecutive, non-overlapping, and in the right order.
8780static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
8781 SelectionDAG &DAG,
8782 const X86Subtarget &Subtarget,
8783 bool IsAfterLegalize) {
8784 SmallVector<SDValue, 64> Elts;
8785 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8786 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
8787 Elts.push_back(Elt);
8788 continue;
8789 }
8790 return SDValue();
8791 }
8792 assert(Elts.size() == VT.getVectorNumElements())((void)0);
8793 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8794 IsAfterLegalize);
8795}
8796
8797static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8798 unsigned SplatBitSize, LLVMContext &C) {
8799 unsigned ScalarSize = VT.getScalarSizeInBits();
8800 unsigned NumElm = SplatBitSize / ScalarSize;
8801
8802 SmallVector<Constant *, 32> ConstantVec;
8803 for (unsigned i = 0; i < NumElm; i++) {
8804 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8805 Constant *Const;
8806 if (VT.isFloatingPoint()) {
8807 if (ScalarSize == 32) {
8808 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8809 } else {
8810 assert(ScalarSize == 64 && "Unsupported floating point scalar size")((void)0);
8811 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8812 }
8813 } else
8814 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8815 ConstantVec.push_back(Const);
8816 }
8817 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8818}
8819
// Return true if shuffle-like node N has a use into which it can be folded
// (a target shuffle user, or N being single-use), looking through bitcasts.
static bool isFoldableUseOfShuffle(SDNode *N) {
  for (auto *U : N->uses()) {
    unsigned Opc = U->getOpcode();
    // VPERMV/VPERMV3 shuffles can never fold their index operands.
    if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
      return false;
    if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
      return false;
    if (isTargetShuffle(Opc))
      return true;
    if (Opc == ISD::BITCAST) // Ignore bitcasts
      return isFoldableUseOfShuffle(U);
    // NOTE(review): this single-use check is loop-invariant but is evaluated
    // inside the loop, after the VPERMV/VPERMV3 index rejections above take
    // their chance on this use — confirm the ordering is intentional before
    // hoisting it.
    if (N->hasOneUse())
      return true;
  }
  return false;
}
8837
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget.hasAVX())
    return SDValue();

  MVT VT = BVOp->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  SDLoc dl(BVOp);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  // See if the build vector is a repeating sequence of scalars (inc. splat).
  // Ld remains null unless the whole vector repeats a single scalar.
  SDValue Ld;
  BitVector UndefElements;
  SmallVector<SDValue, 16> Sequence;
  if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
    assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
    if (Sequence.size() == 1)
      Ld = Sequence[0];
  }

  // Attempt to use VBROADCASTM
  // From this pattern:
  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
  // b. t1 = (build_vector t0 t0)
  //
  // Create (VBROADCASTM v2i1 X)
  if (!Sequence.empty() && Subtarget.hasCDI()) {
    // If not a splat, are the upper sequence values zeroable?
    unsigned SeqLen = Sequence.size();
    bool UpperZeroOrUndef =
        SeqLen == 1 ||
        llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
          return !V || V.isUndef() || isNullConstant(V);
        });
    SDValue Op0 = Sequence[0];
    if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
                             (Op0.getOpcode() == ISD::ZERO_EXTEND &&
                              Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
      SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
                             ? Op0.getOperand(0)
                             : Op0.getOperand(0).getOperand(0);
      MVT MaskVT = BOperand.getSimpleValueType();
      MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
      if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
          (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
        MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
        if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
          // Without VLX, widen to 512 bits and extract the low part below.
          unsigned Scale = 512 / VT.getSizeInBits();
          BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
        }
        SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
        if (BcstVT.getSizeInBits() != VT.getSizeInBits())
          Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
        return DAG.getBitcast(VT, Bcst);
      }
    }
  }

  unsigned NumUndefElts = UndefElements.count();
  if (!Ld || (NumElts - NumUndefElts) <= 1) {
    APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
    // Check if this is a repeated constant pattern suitable for broadcasting.
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
        SplatBitSize > VT.getScalarSizeInBits() &&
        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with broadcast when it's a use of a shuffle
      // instruction to preserve the present custom lowering of shuffles.
      if (isFoldableUseOfShuffle(BVOp))
        return SDValue();
      // replace BUILD_VECTOR with broadcast of the repeated constants.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      LLVMContext *Ctx = DAG.getContext();
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      if (Subtarget.hasAVX()) {
        if (SplatBitSize == 32 || SplatBitSize == 64 ||
            (SplatBitSize < 32 && Subtarget.hasAVX2())) {
          // Splatted value can fit in one INTEGER constant in constant pool.
          // Load the constant and broadcast it.
          MVT CVT = MVT::getIntegerVT(SplatBitSize);
          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

          Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
          SDVTList Tys =
              DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
          SDValue Ops[] = {DAG.getEntryNode(), CP};
          MachinePointerInfo MPI =
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
          SDValue Brdcst = DAG.getMemIntrinsicNode(
              X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
              MachineMemOperand::MOLoad);
          return DAG.getBitcast(VT, Brdcst);
        }
        if (SplatBitSize > 64) {
          // Load the vector of constants and broadcast it.
          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
                                             *Ctx);
          SDValue VCP = DAG.getConstantPool(VecC, PVT);
          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
          MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
          Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
          SDVTList Tys = DAG.getVTList(VT, MVT::Other);
          SDValue Ops[] = {DAG.getEntryNode(), VCP};
          MachinePointerInfo MPI =
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
          return DAG.getMemIntrinsicNode(
              X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
              MachineMemOperand::MOLoad);
        }
      }
    }

    // If we are moving a scalar into a vector (Ld must be set and all elements
    // but 1 are undef) and that operation is not obviously supported by
    // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
    // That's better than general shuffling and may eliminate a load to GPR and
    // move from scalar to vector register.
    if (!Ld || NumElts - NumUndefElts != 1)
      return SDValue();
    unsigned ScalarSize = Ld.getValueSizeInBits();
    if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
      return SDValue();
  }

  bool ConstSplatVal =
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // TODO: Handle broadcasts of non-constant sequences.

  // Make sure that all of the users of a non-constant load are from the
  // BUILD_VECTOR node.
  // FIXME: Is the use count needed for non-constant, non-load case?
  if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
    return SDValue();

  unsigned ScalarSize = Ld.getValueSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  bool OptForSize = DAG.shouldOptForSize();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();

      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = {DAG.getEntryNode(), CP};
      MachinePointerInfo MPI =
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
                                     MPI, Alignment, MachineMemOperand::MOLoad);
    }
  }

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget.hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  // Make sure the non-chain result is only used by this build vector.
  if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget.hasVLX() && ScalarSize == 64)) {
    // Fold the load directly into a VBROADCAST_LOAD, replacing the chain
    // result of the original load so memory ordering is preserved.
    auto *LN = cast<LoadSDNode>(Ld);
    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
    SDValue BCast =
        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
                                LN->getMemoryVT(), LN->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
    return BCast;
  }

  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
  // double since there is no vbroadcastsd xmm
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
      (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
    auto *LN = cast<LoadSDNode>(Ld);
    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
    SDValue BCast =
        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
                                LN->getMemoryVT(), LN->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
    return BCast;
  }

  // Unsupported broadcast.
  return SDValue();
}
9081
/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
///
/// \param ExtractedFromVec [in,out] the vector operand of the extract;
///        replaced by the shuffle's first input when the extract looks
///        through a vector_shuffle.
/// \param ExtIdx the constant index operand of the extract.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
  // Not extracting from a shuffle: the vector/index pair is already real.
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
    return Idx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  // (extract_vector_elt (v8f32 %1), Constant<6>)
  // to:
  // (extract_vector_elt (vector_shuffle<2,u,u,u>
  // (extract_subvector (v8f32 %0), Constant<4>),
  // undef)
  // Constant<0>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
  SDValue ShuffleVec = SVOp->getOperand(0);
  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
  assert(ShuffleVecVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  // Only look through the shuffle when the mask selects (or is undef) within
  // the first shuffle input.
  int ShuffleIdx = SVOp->getMaskElt(Idx);
  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
    ExtractedFromVec = ShuffleVec;
    return ShuffleIdx;
  }
  return Idx;
}
9116
// Lower a BUILD_VECTOR whose elements are mostly EXTRACT_VECTOR_ELTs from at
// most two source vectors into a vector_shuffle of those sources, followed by
// INSERT_VECTOR_ELT for at most one non-extracted element. Returns SDValue()
// when the pattern doesn't apply.
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;                        // First source vector found.
  SDValue VecIn2;                        // Optional second source vector.
  SmallVector<unsigned, 4> InsertIndices; // Lanes needing INSERT_VECTOR_ELT.
  SmallVector<int, 8> Mask(NumElems, -1); // Shuffle mask; -1 = undef lane.

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 elements need inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    // Record the source vector, allowing at most two distinct ones.
    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    // Lanes from the second input are indexed past NumElems in the mask.
    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  // Patch in the (at most one) element that wasn't an extract.
  for (unsigned Idx : InsertIndices)
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));

  return NV;
}
9188
9189// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {

  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  // All-zeros / all-ones mask vectors are left alone; they are matched
  // directly elsewhere.
  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
      ISD::isBuildVectorAllOnes(Op.getNode()))
    return Op;

  // Classify the operands: fold constant elements into an immediate bitmask,
  // remember the indices of non-constant elements, and track whether all
  // defined elements are identical (a splat).
  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
  bool HasConstElts = false;
  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.isUndef())
      continue;
    if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
      // Only bit 0 of each scalar matters for an i1 element.
      Immediate |= (InC->getZExtValue() & 0x1) << idx;
      HasConstElts = true;
    } else {
      NonConstIdx.push_back(idx);
    }
    // SplatIdx remembers the first defined element; any later mismatch
    // disqualifies the splat.
    if (SplatIdx < 0)
      SplatIdx = idx;
    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
  if (IsSplat) {
    // The build_vector allows the scalar element to be larger than the vector
    // element type. We need to mask it to use as a condition unless we know
    // the upper bits are zero.
    // FIXME: Use computeKnownBits instead of checking specific opcode?
    SDValue Cond = Op.getOperand(SplatIdx);
    assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
    if (Cond.getOpcode() != ISD::SETCC)
      Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
                         DAG.getConstant(1, dl, MVT::i8));

    // Perform the select in the scalar domain so we can use cmov.
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      // 32-bit targets can't hold a 64-bit immediate in one register: build
      // two identical v32i1 halves and concatenate them.
      SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
                                     DAG.getAllOnesConstant(dl, MVT::i32),
                                     DAG.getConstant(0, dl, MVT::i32));
      Select = DAG.getBitcast(MVT::v32i1, Select);
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
    } else {
      // Select an all-ones/all-zeros integer of at least 8 bits, bitcast it
      // to a mask vector, then trim back to VT if VT is narrower than v8i1.
      MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
      SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
                                     DAG.getAllOnesConstant(dl, ImmVT),
                                     DAG.getConstant(0, dl, ImmVT));
      MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
      Select = DAG.getBitcast(VecVT, Select);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
                         DAG.getIntPtrConstant(0, dl));
    }
  }

  // insert elements one by one
  SDValue DstVec;
  if (HasConstElts) {
    // First materialize the constant elements as an immediate mask vector.
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      // Split the 64-bit immediate into two 32-bit halves for 32-bit targets.
      SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
      SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
      ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
      ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
      DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
    } else {
      MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
      SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
      MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
      DstVec = DAG.getBitcast(VecVT, Imm);
      DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
                           DAG.getIntPtrConstant(0, dl));
    }
  } else
    DstVec = DAG.getUNDEF(VT);

  // Then insert each non-constant element on top of the constant base.
  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
    unsigned InsertIdx = NonConstIdx[i];
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}
9282
9283LLVM_ATTRIBUTE_UNUSED__attribute__((__unused__)) static bool isHorizOp(unsigned Opcode) {
9284 switch (Opcode) {
9285 case X86ISD::PACKSS:
9286 case X86ISD::PACKUS:
9287 case X86ISD::FHADD:
9288 case X86ISD::FHSUB:
9289 case X86ISD::HADD:
9290 case X86ISD::HSUB:
9291 return true;
9292 }
9293 return false;
9294}
9295
9296/// This is a helper function of LowerToHorizontalOp().
9297/// This function checks that the build_vector \p N in input implements a
9298/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9299/// may not match the layout of an x86 256-bit horizontal instruction.
9300/// In other words, if this returns true, then some extraction/insertion will
9301/// be required to produce a valid horizontal instruction.
9302///
9303/// Parameter \p Opcode defines the kind of horizontal operation to match.
9304/// For example, if \p Opcode is equal to ISD::ADD, then this function
9305/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9306/// is equal to ISD::SUB, then this function checks if this is a horizontal
9307/// arithmetic sub.
9308///
9309/// This function only analyzes elements of \p N whose indices are
9310/// in range [BaseIdx, LastIdx).
9311///
9312/// TODO: This function was originally used to match both real and fake partial
9313/// horizontal operations, but the index-matching logic is incorrect for that.
9314/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9315/// code because it is only used for partial h-op matching now?
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
                                  SelectionDAG &DAG,
                                  unsigned BaseIdx, unsigned LastIdx,
                                  SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);
  assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  // ADD/FADD are commutative, so the extract operands may appear swapped.
  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->isUndef()) {
      // Update the expected vector extract index.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    // Each element must come from a single-use node of the requested opcode.
    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op0.getOperand(0) == Op1.getOperand(0) &&
               isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = Op0.getConstantOperandVal(1);
    unsigned I1 = Op1.getConstantOperandVal(1);

    // The first half of the result elements must extract from V0, the second
    // half from V1. Capture each source the first time it is seen; reject a
    // source whose type differs from the result type.
    if (i * 2 < NumElts) {
      if (V0.isUndef()) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.isUndef()) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      // Restart the expected extract index when crossing into the V1 half.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

    // Each result element consumes a pair of adjacent source elements.
    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}
9398
9399/// Emit a sequence of two 128-bit horizontal add/sub followed by
9400/// a concat_vector.
9401///
9402/// This is a helper function of LowerToHorizontalOp().
9403/// This function expects two 256-bit vectors called V0 and V1.
9404/// At first, each vector is split into two separate 128-bit vectors.
9405/// Then, the resulting 128-bit vectors are used to implement two
9406/// horizontal binary operations.
9407///
9408/// The kind of horizontal binary operation is defined by \p X86Opcode.
9409///
9410/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
9411/// the two new horizontal binop.
9412/// When Mode is set, the first horizontal binop dag node would take as input
9413/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
9414/// horizontal binop dag node would take as input the lower 128-bit of V1
9415/// and the upper 128-bit of V1.
9416/// Example:
9417/// HADD V0_LO, V0_HI
9418/// HADD V1_LO, V1_HI
9419///
9420/// Otherwise, the first horizontal binop dag node takes as input the lower
9421/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
9422/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
9423/// Example:
9424/// HADD V0_LO, V1_LO
9425/// HADD V0_HI, V1_HI
9426///
9427/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9428/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9429/// the upper 128-bits of the result.
9430static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9431 const SDLoc &DL, SelectionDAG &DAG,
9432 unsigned X86Opcode, bool Mode,
9433 bool isUndefLO, bool isUndefHI) {
9434 MVT VT = V0.getSimpleValueType();
9435 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&((void)0)
9436 "Invalid nodes in input!")((void)0);
9437
9438 unsigned NumElts = VT.getVectorNumElements();
9439 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9440 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9441 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9442 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9443 MVT NewVT = V0_LO.getSimpleValueType();
9444
9445 SDValue LO = DAG.getUNDEF(NewVT);
9446 SDValue HI = DAG.getUNDEF(NewVT);
9447
9448 if (Mode) {
9449 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9450 if (!isUndefLO && !V0->isUndef())
9451 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9452 if (!isUndefHI && !V1->isUndef())
9453 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9454 } else {
9455 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9456 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9457 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9458
9459 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9460 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9461 }
9462
9463 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9464}
9465
9466/// Returns true iff \p BV builds a vector with the result equivalent to
9467/// the result of ADDSUB/SUBADD operation.
9468/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9469/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9470/// \p Opnd0 and \p Opnd1.
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
                             const X86Subtarget &Subtarget, SelectionDAG &DAG,
                             SDValue &Opnd0, SDValue &Opnd1,
                             unsigned &NumExtracts,
                             bool &IsSubAdd) {

  // ADDSUB-style lowering only applies to floating-point vectors and requires
  // at least SSE3.
  MVT VT = BV->getSimpleValueType(0);
  if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  NumExtracts = 0;

  // Odd-numbered elements in the input build vector are obtained from
  // adding/subtracting two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting/adding two integer/float elements.
  // Opc[0]/Opc[1] record the opcode observed for even/odd lanes respectively.
  unsigned Opc[2] = {0, 0};
  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF)
      continue;

    // Early exit if we found an unexpected opcode.
    if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return false;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return false;

    // Result element i must be built from lane i of the source vectors.
    unsigned I0 = Op0.getConstantOperandVal(1);
    if (I0 != i)
      return false;

    // We found a valid add/sub node, make sure its the same opcode as previous
    // elements for this parity.
    if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
      return false;
    Opc[i % 2] = Opcode;

    // Update InVec0 and InVec1 the first time each source is seen; sources
    // whose type differs from the result type are rejected.
    if (InVec0.isUndef()) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getSimpleValueType() != VT)
        return false;
    }
    if (InVec1.isUndef()) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getSimpleValueType() != VT)
        return false;
    }

    // Make sure that operands in input to each add/sub node always
    // come from a same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (Opcode == ISD::FSUB)
        return false;

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return false;
    }

    if (InVec1 != Op1.getOperand(0))
      return false;

    // Increment the number of extractions done.
    ++NumExtracts;
  }

  // Ensure we have found an opcode for both parities and that they are
  // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
  // inputs are undef.
  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
      InVec0.isUndef() || InVec1.isUndef())
    return false;

  // FADD in the even lanes means the pattern is the SUBADD form.
  IsSubAdd = Opc[0] == ISD::FADD;

  Opnd0 = InVec0;
  Opnd1 = InVec1;
  return true;
}
9571
9572/// Returns true if is possible to fold MUL and an idiom that has already been
9573/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9574/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9575/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
9576///
9577/// Prior to calling this function it should be known that there is some
9578/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9579/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9580/// before replacement of such SDNode with ADDSUB operation. Thus the number
9581/// of \p Opnd0 uses is expected to be equal to 2.
9582/// For example, this function may be called for the following IR:
9583/// %AB = fmul fast <2 x double> %A, %B
9584/// %Sub = fsub fast <2 x double> %AB, %C
9585/// %Add = fadd fast <2 x double> %AB, %C
9586/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9587/// <2 x i32> <i32 0, i32 3>
9588/// There is a def for %Addsub here, which potentially can be replaced by
9589/// X86ISD::ADDSUB operation:
9590/// %Addsub = X86ISD::ADDSUB %AB, %C
9591/// and such ADDSUB can further be replaced with FMADDSUB:
9592/// %Addsub = FMADDSUB %A, %B, %C.
9593///
9594/// The main reason why this method is called before the replacement of the
9595/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9596/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9597/// FMADDSUB is.
9598static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9599 SelectionDAG &DAG,
9600 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9601 unsigned ExpectedUses) {
9602 if (Opnd0.getOpcode() != ISD::FMUL ||
9603 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9604 return false;
9605
9606 // FIXME: These checks must match the similar ones in
9607 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9608 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9609 // or MUL + ADDSUB to FMADDSUB.
9610 const TargetOptions &Options = DAG.getTarget().Options;
9611 bool AllowFusion =
9612 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9613 if (!AllowFusion)
9614 return false;
9615
9616 Opnd2 = Opnd1;
9617 Opnd1 = Opnd0.getOperand(1);
9618 Opnd0 = Opnd0.getOperand(0);
9619
9620 return true;
9621}
9622
9623/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
9624/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
9625/// X86ISD::FMSUBADD node.
9626static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9627 const X86Subtarget &Subtarget,
9628 SelectionDAG &DAG) {
9629 SDValue Opnd0, Opnd1;
9630 unsigned NumExtracts;
9631 bool IsSubAdd;
9632 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9633 IsSubAdd))
9634 return SDValue();
9635
9636 MVT VT = BV->getSimpleValueType(0);
9637 SDLoc DL(BV);
9638
9639 // Try to generate X86ISD::FMADDSUB node here.
9640 SDValue Opnd2;
9641 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9642 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9643 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9644 }
9645
9646 // We only support ADDSUB.
9647 if (IsSubAdd)
9648 return SDValue();
9649
9650 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9651 // the ADDSUB idiom has been successfully recognized. There are no known
9652 // X86 targets with 512-bit ADDSUB instructions!
9653 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9654 // recognition.
9655 if (VT.is512BitVector())
9656 return SDValue();
9657
9658 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9659}
9660
/// Match \p BV as a horizontal add/sub. On success, \p HOpcode is the matched
/// X86ISD h-op opcode, and \p V0 / \p V1 are the source vectors feeding the
/// low/high 64-bit halves of each 128-bit result chunk.
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
  // Initialize outputs to known values.
  MVT VT = BV->getSimpleValueType(0);
  HOpcode = ISD::DELETED_NODE; // Sentinel: no opcode selected yet.
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
  // half of the result is calculated independently from the 128-bit halves of
  // the inputs, so that makes the index-checking logic below more complicated.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned GenericOpcode = ISD::DELETED_NODE;
  unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
  unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
  unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i) {
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      // Ignore undef elements.
      SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
      if (Op.isUndef())
        continue;

      // If there's an opcode mismatch, we're done.
      if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
        return false;

      // Initialize horizontal opcode.
      if (HOpcode == ISD::DELETED_NODE) {
        GenericOpcode = Op.getOpcode();
        switch (GenericOpcode) {
        case ISD::ADD: HOpcode = X86ISD::HADD; break;
        case ISD::SUB: HOpcode = X86ISD::HSUB; break;
        case ISD::FADD: HOpcode = X86ISD::FHADD; break;
        case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
        default: return false;
        }
      }

      // Both operands must be single-use constant-index extracts of the same
      // source vector.
      SDValue Op0 = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(1);
      if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op0.getOperand(0) != Op1.getOperand(0) ||
          !isa<ConstantSDNode>(Op0.getOperand(1)) ||
          !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
        return false;

      // The source vector is chosen based on which 64-bit half of the
      // destination vector is being calculated.
      if (j < NumEltsIn64Bits) {
        if (V0.isUndef())
          V0 = Op0.getOperand(0);
      } else {
        if (V1.isUndef())
          V1 = Op0.getOperand(0);
      }

      // All elements of a half must extract from that half's source vector.
      SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
      if (SourceVec != Op0.getOperand(0))
        return false;

      // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
      unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
      unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
      unsigned ExpectedIndex = i * NumEltsIn128Bits +
                               (j % NumEltsIn64Bits) * 2;
      if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
        continue;

      // If this is not a commutative op, this does not match.
      if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
        return false;

      // Addition is commutative, so try swapping the extract indexes.
      // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
      if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
        continue;

      // Extract indexes do not match horizontal requirement.
      return false;
    }
  }
  // We matched. Opcode and operands are returned by reference as arguments.
  return true;
}
9747
9748static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9749 SelectionDAG &DAG, unsigned HOpcode,
9750 SDValue V0, SDValue V1) {
9751 // If either input vector is not the same size as the build vector,
9752 // extract/insert the low bits to the correct size.
9753 // This is free (examples: zmm --> xmm, xmm --> ymm).
9754 MVT VT = BV->getSimpleValueType(0);
9755 unsigned Width = VT.getSizeInBits();
9756 if (V0.getValueSizeInBits() > Width)
9757 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9758 else if (V0.getValueSizeInBits() < Width)
9759 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9760
9761 if (V1.getValueSizeInBits() > Width)
9762 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9763 else if (V1.getValueSizeInBits() < Width)
9764 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9765
9766 unsigned NumElts = VT.getVectorNumElements();
9767 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9768 for (unsigned i = 0; i != NumElts; ++i)
9769 if (BV->getOperand(i).isUndef())
9770 DemandedElts.clearBit(i);
9771
9772 // If we don't need the upper xmm, then perform as a xmm hop.
9773 unsigned HalfNumElts = NumElts / 2;
9774 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9775 MVT HalfVT = VT.getHalfNumVectorElementsVT();
9776 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9777 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9778 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9779 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9780 }
9781
9782 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9783}
9784
9785/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  // We need at least 2 non-undef elements to make this worthwhile by default.
  unsigned NumNonUndefs =
      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
  if (NumNonUndefs < 2)
    return SDValue();

  // There are 4 sets of horizontal math operations distinguished by type:
  // int/FP at 128-bit/256-bit. Each type was introduced with a different
  // subtarget feature. Try to match those "native" patterns first.
  MVT VT = BV->getSimpleValueType(0);
  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
    unsigned HOpcode;
    SDValue V0, V1;
    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
      return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
  }

  // Try harder to match 256-bit ops by using extract/concat.
  if (!Subtarget.hasAVX() || !VT.is256BitVector())
    return SDValue();

  // Count the number of UNDEF operands in the build_vector in input.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned Half = NumElts / 2;
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Integer types: match each 128-bit half as an independent partial h-op,
    // then require that the matched sources pair up consistently (allowing
    // undef halves on either side).
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
                              InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
                                   InVec1) &&
             isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
                                   InVec3) &&
             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binops followed by
      // a concat vector. We must adjust the outputs from the partial horizontal
      // matching calls above to account for undefined vector halves.
      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
                                   isUndefHI);
    }
  }

  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
      VT == MVT::v16i16) {
    // Match the whole build_vector as a single partial horizontal op.
    unsigned X86Opcode;
    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}
9900
9901static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9902 SelectionDAG &DAG);
9903
9904/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9905/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9906/// just apply the bit to the vectors.
9907/// NOTE: Its not in our interest to start make a general purpose vectorizer
9908/// from this, but enough scalar bit operations are created from the later
9909/// legalization + scalarization stages to need basic support.
9910static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9911 const X86Subtarget &Subtarget,
9912 SelectionDAG &DAG) {
9913 SDLoc DL(Op);
9914 MVT VT = Op->getSimpleValueType(0);
9915 unsigned NumElems = VT.getVectorNumElements();
9916 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9917
9918 // Check that all elements have the same opcode.
9919 // TODO: Should we allow UNDEFS and if so how many?
9920 unsigned Opcode = Op->getOperand(0).getOpcode();
9921 for (unsigned i = 1; i < NumElems; ++i)
9922 if (Opcode != Op->getOperand(i).getOpcode())
9923 return SDValue();
9924
9925 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9926 bool IsShift = false;
9927 switch (Opcode) {
9928 default:
9929 return SDValue();
9930 case ISD::SHL:
9931 case ISD::SRL:
9932 case ISD::SRA:
9933 IsShift = true;
9934 break;
9935 case ISD::AND:
9936 case ISD::XOR:
9937 case ISD::OR:
9938 // Don't do this if the buildvector is a splat - we'd replace one
9939 // constant with an entire vector.
9940 if (Op->getSplatValue())
9941 return SDValue();
9942 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9943 return SDValue();
9944 break;
9945 }
9946
9947 SmallVector<SDValue, 4> LHSElts, RHSElts;
9948 for (SDValue Elt : Op->ops()) {
9949 SDValue LHS = Elt.getOperand(0);
9950 SDValue RHS = Elt.getOperand(1);
9951
9952 // We expect the canonicalized RHS operand to be the constant.
9953 if (!isa<ConstantSDNode>(RHS))
9954 return SDValue();
9955
9956 // Extend shift amounts.
9957 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9958 if (!IsShift)
9959 return SDValue();
9960 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9961 }
9962
9963 LHSElts.push_back(LHS);
9964 RHSElts.push_back(RHS);
9965 }
9966
9967 // Limit to shifts by uniform immediates.
9968 // TODO: Only accept vXi8/vXi64 special cases?
9969 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9970 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9971 return SDValue();
9972
9973 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9974 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9975 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9976
9977 if (!IsShift)
9978 return Res;
9979
9980 // Immediately lower the shift to ensure the constant build vector doesn't
9981 // get converted to a constant pool before the shift is lowered.
9982 return LowerShift(Res, Subtarget, DAG);
9983}
9984
9985/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9986/// functionality to do this, so it's all zeros, all ones, or some derivation
9987/// that is cheap to calculate.
9988static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9989 const X86Subtarget &Subtarget) {
9990 SDLoc DL(Op);
9991 MVT VT = Op.getSimpleValueType();
9992
9993 // Vectors containing all zeros can be matched by pxor and xorps.
9994 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9995 return Op;
9996
9997 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9998 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9999 // vpcmpeqd on 256-bit vectors.
10000 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10001 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10002 return Op;
10003
10004 return getOnesVector(VT, DAG, DL);
10005 }
10006
10007 return SDValue();
10008}
10009
/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
///
/// \param VT          Result type of the permute.
/// \param SrcVec      Vector holding the source elements.
/// \param IndicesVec  Vector of (integer) per-element source indices.
/// \returns the lowered permute, or an empty SDValue if no profitable
///          instruction sequence exists for this type/subtarget combination.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
                                     SDLoc &DL, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  MVT ShuffleVT = VT;
  EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Adjust IndicesVec to match VT size.
  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
         "Illegal variable permute mask size");
  if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
    // Narrow/widen the indices vector to the correct size.
    if (IndicesVec.getValueSizeInBits() > SizeInBits)
      IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
                                    NumElts * VT.getScalarSizeInBits());
    else if (IndicesVec.getValueSizeInBits() < SizeInBits)
      IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
                                  SDLoc(IndicesVec), SizeInBits);
    // Zero-extend the index elements within the vector.
    if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
      IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
                               IndicesVT, IndicesVec);
  }
  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

  // Handle SrcVec that don't match VT type.
  if (SrcVec.getValueSizeInBits() != SizeInBits) {
    if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
      // Handle larger SrcVec by treating it as a larger permute.
      // Recurse at the widened type and extract the low subvector back out.
      unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
      VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
      IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
      IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
                                  Subtarget, DAG, SDLoc(IndicesVec));
      SDValue NewSrcVec =
          createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
      if (NewSrcVec)
        return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
      return SDValue();
    } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
      // Widen smaller SrcVec to match VT.
      SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
    } else
      return SDValue();
  }

  // Rescale indices when the permute is performed at a narrower element type
  // than VT (e.g. i32 indices lowered via a byte-wise PSHUFB).
  auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
    EVT SrcVT = Idx.getValueType();
    unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
    uint64_t IndexScale = 0;
    uint64_t IndexOffset = 0;

    // If we're scaling a smaller permute op, then we need to repeat the
    // indices, scaling and offsetting them as well.
    // e.g. v4i32 -> v16i8 (Scale = 4)
    // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
    // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
    for (uint64_t i = 0; i != Scale; ++i) {
      IndexScale |= Scale << (i * NumDstBits);
      IndexOffset |= i << (i * NumDstBits);
    }

    Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
    Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
    return Idx;
  };

  // Pick the best permute opcode (and possibly a different shuffle type)
  // available for this VT on this subtarget. Opcode == 0 means "no match";
  // some cases return a fully custom sequence directly instead.
  unsigned Opcode = 0;
  switch (VT.SimpleTy) {
  default:
    break;
  case MVT::v16i8:
    if (Subtarget.hasSSSE3())
      Opcode = X86ISD::PSHUFB;
    break;
  case MVT::v8i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v4f32:
  case MVT::v4i32:
    if (Subtarget.hasAVX()) {
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v4f32;
    } else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v2f64:
  case MVT::v2i64:
    if (Subtarget.hasAVX()) {
      // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v2f64;
    } else if (Subtarget.hasSSE41()) {
      // SSE41 can compare v2i64 - select between indices 0 and 1.
      return DAG.getSelectCC(
          DL, IndicesVec,
          getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
          ISD::CondCode::SETEQ);
    }
    break;
  case MVT::v32i8:
    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasXOP()) {
      // XOP VPPERM can permute across two 128-bit sources per lane.
      SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
      SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
      return DAG.getNode(
          ISD::CONCAT_VECTORS, DL, VT,
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
    } else if (Subtarget.hasAVX()) {
      SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
      SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
      auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                              ArrayRef<SDValue> Ops) {
        // Permute Lo and Hi and then select based on index range.
        // This works as SHUFB uses bits[3:0] to permute elements and we don't
        // care about the bit[7] as its just an index vector.
        SDValue Idx = Ops[2];
        EVT VT = Idx.getValueType();
        return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
                               ISD::CondCode::SETGT);
      };
      SDValue Ops[] = {LoLo, HiHi, IndicesVec};
      return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
                              PSHUFBBuilder);
    }
    break;
  case MVT::v16i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      // Scale to v32i8 and perform as v32i8.
      IndicesVec = ScaleIndices(IndicesVec, 2);
      return DAG.getBitcast(
          VT, createVariablePermute(
                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
                  DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
    }
    break;
  case MVT::v8f32:
  case MVT::v8i32:
    if (Subtarget.hasAVX2())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
      SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {0, 1, 2, 3, 0, 1, 2, 3});
      SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {4, 5, 6, 7, 4, 5, 6, 7});
      if (Subtarget.hasXOP())
        return DAG.getBitcast(
            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPS only uses index bits[0:1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v4i64:
  case MVT::v4f64:
    if (Subtarget.hasAVX512()) {
      if (!Subtarget.hasVLX()) {
        // No 256-bit VPERMV without VLX: widen to 512 bits, recurse, and
        // extract the low 256 bits of the result.
        MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
        SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
                                SDLoc(SrcVec));
        IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
                                    DAG, SDLoc(IndicesVec));
        SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
                                            DAG, Subtarget);
        return extract256BitVector(Res, 0, DAG, DL);
      }
      Opcode = X86ISD::VPERMV;
    } else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
      SDValue LoLo =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
      SDValue HiHi =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
      // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
      if (Subtarget.hasXOP())
        return DAG.getBitcast(
            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPD only uses index bit[1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v64i8:
    if (Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v32i16:
    if (Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v16f32:
  case MVT::v16i32:
  case MVT::v8f64:
  case MVT::v8i64:
    if (Subtarget.hasAVX512())
      Opcode = X86ISD::VPERMV;
    break;
  }
  if (!Opcode)
    return SDValue();

  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
         "Illegal variable permute shuffle type");

  // If the permute runs at a narrower element type, repeat/scale the indices.
  uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
  if (Scale > 1)
    IndicesVec = ScaleIndices(IndicesVec, Scale);

  EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);

  // Note the operand order: VPERMV takes (indices, src), the other permute
  // nodes take (src, indices).
  SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
  SDValue Res = Opcode == X86ISD::VPERMV
                    ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
                    : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
  return DAG.getBitcast(VT, Res);
}
10269
10270// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10271// reasoned to be a permutation of a vector by indices in a non-constant vector.
10272// (build_vector (extract_elt V, (extract_elt I, 0)),
10273// (extract_elt V, (extract_elt I, 1)),
10274// ...
10275// ->
10276// (vpermv I, V)
10277//
10278// TODO: Handle undefs
10279// TODO: Utilize pshufb and zero mask blending to support more efficient
10280// construction of vectors with constant-0 elements.
10281static SDValue
10282LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10283 const X86Subtarget &Subtarget) {
10284 SDValue SrcVec, IndicesVec;
10285 // Check for a match of the permute source vector and permute index elements.
10286 // This is done by checking that the i-th build_vector operand is of the form:
10287 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10288 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10289 SDValue Op = V.getOperand(Idx);
10290 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10291 return SDValue();
10292
10293 // If this is the first extract encountered in V, set the source vector,
10294 // otherwise verify the extract is from the previously defined source
10295 // vector.
10296 if (!SrcVec)
10297 SrcVec = Op.getOperand(0);
10298 else if (SrcVec != Op.getOperand(0))
10299 return SDValue();
10300 SDValue ExtractedIndex = Op->getOperand(1);
10301 // Peek through extends.
10302 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10303 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10304 ExtractedIndex = ExtractedIndex.getOperand(0);
10305 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10306 return SDValue();
10307
10308 // If this is the first extract from the index vector candidate, set the
10309 // indices vector, otherwise verify the extract is from the previously
10310 // defined indices vector.
10311 if (!IndicesVec)
10312 IndicesVec = ExtractedIndex.getOperand(0);
10313 else if (IndicesVec != ExtractedIndex.getOperand(0))
10314 return SDValue();
10315
10316 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10317 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10318 return SDValue();
10319 }
10320
10321 SDLoc DL(V);
10322 MVT VT = V.getSimpleValueType();
10323 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10324}
10325
/// Custom lowering for ISD::BUILD_VECTOR on x86.
///
/// Tries a long, ordered sequence of strategies: predicate (vXi1) vectors,
/// cheaply-materializable constants, narrowing when the upper half is
/// undef/zero, addsub/horizontal-op/broadcast/bit-op patterns, a constant-pool
/// load plus a single variable-element insert, single-nonzero special cases,
/// splats, variable permutes, consecutive-load merging, and finally generic
/// shuffle/insert expansion. Returns the lowered value, or an empty SDValue
/// to let generic legalization expand the node.
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
    return VectorConstant;

  // Classify every element: undef, zero, non-zero, constant/variable.
  unsigned EVTBits = EltVT.getSizeInBits();
  APInt UndefMask = APInt::getNullValue(NumElems);
  APInt ZeroMask = APInt::getNullValue(NumElems);
  APInt NonZeroMask = APInt::getNullValue(NumElems);
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values; // distinct (non-undef) element values
  unsigned NumConstants = NumElems;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.isUndef()) {
      UndefMask.setBit(i);
      continue;
    }
    Values.insert(Elt);
    if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
      IsAllConstants = false;
      NumConstants--;
    }
    if (X86::isZeroNode(Elt)) {
      ZeroMask.setBit(i);
    } else {
      NonZeroMask.setBit(i);
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NonZeroMask == 0) {
    assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
    return DAG.getUNDEF(VT);
  }

  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());

  // If the upper elts of a ymm/zmm are undef/zero then we might be better off
  // lowering to a smaller build vector and padding with undef/zero.
  if ((VT.is256BitVector() || VT.is512BitVector()) &&
      !isFoldableUseOfShuffle(BV)) {
    unsigned UpperElems = NumElems / 2;
    APInt UndefOrZeroMask = UndefMask | ZeroMask;
    unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
    if (NumUpperUndefsOrZeros >= UpperElems) {
      // For 512-bit vectors with 3/4 of the elements undef/zero, shrink
      // further to a quarter-width build vector.
      if (VT.is512BitVector() &&
          NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
        UpperElems = NumElems - (NumElems / 4);
      bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
      MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
      SDValue NewBV =
          DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
      return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
    }
  }

  // Pattern-based lowerings, in decreasing order of profitability.
  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
    return Broadcast;
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
    return BitOp;

  unsigned NumZero = ZeroMask.countPopulation();
  unsigned NumNonZero = NonZeroMask.countPopulation();

  // If we are inserting one variable into a vector of non-zero constants, try
  // to avoid loading each constant element as a scalar. Load the constants as a
  // vector and then insert the variable scalar element. If insertion is not
  // supported, fall back to a shuffle to get the scalar blended with the
  // constants. Insertion into a zero vector is handled as a special-case
  // somewhere below here.
  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
    // Create an all-constant vector. The variable element in the old
    // build vector is replaced by undef in the constant vector. Save the
    // variable scalar element and its index for use in the insertelement.
    LLVMContext &Context = *DAG.getContext();
    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
    SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
    SDValue VarElt;
    SDValue InsIndex;
    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Elt = Op.getOperand(i);
      if (auto *C = dyn_cast<ConstantSDNode>(Elt))
        ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
      else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
        ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
      else if (!Elt.isUndef()) {
        assert(!VarElt.getNode() && !InsIndex.getNode() &&
               "Expected one variable element in this vector");
        VarElt = Elt;
        InsIndex = DAG.getVectorIdxConstant(i, dl);
      }
    }
    Constant *CV = ConstantVector::get(ConstVecOps);
    SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

    // The constants we just created may not be legal (eg, floating point). We
    // must lower the vector right here because we can not guarantee that we'll
    // legalize it before loading it. This is also why we could not just create
    // a new build vector here. If the build vector contains illegal constants,
    // it could get split back up into a series of insert elements.
    // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
    MachineFunction &MF = DAG.getMachineFunction();
    MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
    SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
    unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
    unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
    if (InsertC < NumEltsInLow128Bits)
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);

    // There's no good way to insert into the high elements of a >128-bit
    // vector, so use shuffles to avoid an extract/insert sequence.
    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
    SmallVector<int, 8> ShuffleMask;
    unsigned NumElts = VT.getVectorNumElements();
    for (unsigned i = 0; i != NumElts; ++i)
      ShuffleMask.push_back(i == InsertC ? NumElts : i);
    SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
    return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
  }

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = NonZeroMask.countTrailingZeros();
    SDValue Item = Op.getOperand(Idx);

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
          (EltVT == MVT::i64 && Subtarget.is64Bit())) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (EltVT == MVT::i16 || EltVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        return DAG.getBitcast(VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = NonZeroMask.countTrailingZeros();
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
    return V;

  // See if we can use a vector load to get all of the elements.
  {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
      return LD;
  }

  // If this is a splat of pairs of 32-bit elements, we can use a narrower
  // build_vector and broadcast it.
  // TODO: We could probably generalize this more.
  if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
    auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
      // Make sure all the even/odd operands match.
      for (unsigned i = 2; i != NumElems; ++i)
        if (Ops[i % 2] != Op.getOperand(i))
          return false;
      return true;
    };
    if (CanSplat(Op, NumElems, Ops)) {
      MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
      MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
      // Create a new build vector and cast to v2i64/v2f64.
      SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
                                     DAG.getBuildVector(NarrowVT, dl, Ops));
      // Broadcast from v2i64/v2f64 and cast to final VT.
      MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
                                            NewBV));
    }
  }

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.getSizeInBits() > 128) {
    MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);

    // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));

    // Recreate the wider vector with the lower and upper part.
    return concatSubVectors(Lower, Upper, DAG, dl);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = NonZeroMask.countTrailingZeros();
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
  if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
      return V;

  // If element VT is == 32 bits, turn it into a number of shuffles.
  if (NumElems == 4 && NumZero > 0) {
    SmallVector<SDValue, 8> Ops(NumElems);
    // Materialize each element as its own vector (zero vector for zero elts).
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !NonZeroMask[i];
      if (isZero)
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    // Combine pairs (0,1) and (2,3) based on which of each pair is non-zero.
    for (unsigned i = 0; i < 2; ++i) {
      switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
        default: llvm_unreachable("Unexpected NonZero count");
        case 0:
          Ops[i] = Ops[i*2];  // Must be a zero vector.
          break;
        case 1:
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
          break;
        case 2:
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
          break;
        case 3:
          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
          break;
      }
    }

    bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
    bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
  }

  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

  // Check for a build vector from mostly shuffle plus few inserting.
  if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
    return Sh;

  // For SSE 4.1, use insertps to put the high elements into the low element.
  if (Subtarget.hasSSE41()) {
    SDValue Result;
    if (!Op.getOperand(0).isUndef())
      Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
    else
      Result = DAG.getUNDEF(VT);

    for (unsigned i = 1; i < NumElems; ++i) {
      if (Op.getOperand(i).isUndef()) continue;
      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                           Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
    }
    return Result;
  }

  // Otherwise, expand into a number of unpckl*, start by extending each of
  // our (non-undef) elements to the full vector width with the element in the
  // bottom slot of the vector (which generates no code for SSE).
  SmallVector<SDValue, 8> Ops(NumElems);
  for (unsigned i = 0; i < NumElems; ++i) {
    if (!Op.getOperand(i).isUndef())
      Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    else
      Ops[i] = DAG.getUNDEF(VT);
  }

  // Next, we iteratively mix elements, e.g. for v4f32:
  //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
  //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
  //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    // Generate scaled UNPCKL shuffle mask.
    SmallVector<int, 16> Mask;
    for(unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems+i);
    Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
  }
  return Ops[0];
}
10715
10716// 256-bit AVX can use the vinsertf128 instruction
10717// to create 256-bit vectors from two other 128-bit ones.
10718// TODO: Detect subvector broadcast here instead of DAG combine?
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  // Classify each operand: count the all-zero subvectors and record which
  // operands are non-zero in the NonZeros bitmask. Undef operands are
  // counted in neither group.
  unsigned NumOperands = Op.getNumOperands();
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      ++NumZero;
    else {
      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
      NonZeros |= 1 << i;
      ++NumNonZero;
    }
  }

  // If we have more than 2 non-zeros, build each half separately.
  if (NumNonZero > 2) {
    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, NumOperands/2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(NumOperands/2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  // Otherwise, build it up through insert_subvectors.
  // Start from a zero vector if any operand was all-zeros so those lanes
  // need no explicit insertion; otherwise an undef base is fine.
  SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
                        : DAG.getUNDEF(ResVT);

  MVT SubVT = Op.getOperand(0).getSimpleValueType();
  unsigned NumSubElems = SubVT.getVectorNumElements();
  for (unsigned i = 0; i != NumOperands; ++i) {
    // Undef/zero subvectors are already represented by the base vector.
    if ((NonZeros & (1 << i)) == 0)
      continue;

    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
                      Op.getOperand(i),
                      DAG.getIntPtrConstant(i * NumSubElems, dl));
  }

  return Vec;
}
10772
10773// Returns true if the given node is a type promotion (by concatenating i1
10774// zeros) of the result of a node that already zeros all upper bits of
10775// k-register.
10776// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG & DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOperands = Op.getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  // Record which operands are known all-zero (Zeros) and which are not
  // (NonZeros) as bitmasks; undef operands appear in neither.
  uint64_t Zeros = 0;
  uint64_t NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      Zeros |= (uint64_t)1 << i;
    else
      NonZeros |= (uint64_t)1 << i;
  }

  unsigned NumElems = ResVT.getVectorNumElements();

  // If we are inserting non-zero vector and there are zeros in LSBs and undef
  // in the MSBs we need to emit a KSHIFTL. The generic lowering to
  // insert_subvector will give us two kshifts.
  if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
      Log2_64(NonZeros) != NumOperands - 1) {
    // Widen the shift type when the mask register is narrower than the
    // smallest type the target's kshift can operate on (no DQI means no
    // v8i1 kshift).
    MVT ShiftVT = ResVT;
    if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
      ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
    unsigned Idx = Log2_64(NonZeros);
    SDValue SubVec = Op.getOperand(Idx);
    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
    // Place the subvector at bit 0 of the shift type, shift it into its
    // final position, then extract the requested result width.
    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
                         DAG.getUNDEF(ShiftVT), SubVec,
                         DAG.getIntPtrConstant(0, dl));
    Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
                     DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
                       DAG.getIntPtrConstant(0, dl));
  }

  // If there are zero or one non-zeros we can handle this very simply.
  if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
    SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
    if (!NonZeros)
      return Vec;
    unsigned Idx = Log2_64(NonZeros);
    SDValue SubVec = Op.getOperand(Idx);
    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
  }

  // More than two operands with multiple non-zeros: split in half and let
  // the CONCAT_VECTORS lowering recurse on each half.
  if (NumOperands > 2) {
    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, NumOperands/2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(NumOperands/2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");

  if (ResVT.getVectorNumElements() >= 16)
    return Op; // The operation is legal with KUNPCK

  // Exactly two non-zero operands: insert both halves into an undef vector.
  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
                            DAG.getUNDEF(ResVT), Op.getOperand(0),
                            DAG.getIntPtrConstant(0, dl));
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
                     DAG.getIntPtrConstant(NumElems/2, dl));
}
10855
10856static SDValue LowerCONCAT_VECTORS(SDValue Op,
10857 const X86Subtarget &Subtarget,
10858 SelectionDAG &DAG) {
10859 MVT VT = Op.getSimpleValueType();
10860 if (VT.getVectorElementType() == MVT::i1)
10861 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10862
10863 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||((void)0)
10864 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||((void)0)
10865 Op.getNumOperands() == 4)))((void)0);
10866
10867 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10868 // from two other 128-bit ones.
10869
10870 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10871 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10872}
10873
10874//===----------------------------------------------------------------------===//
10875// Vector shuffle lowering
10876//
10877// This is an experimental code path for lowering vector shuffles on x86. It is
10878// designed to handle arbitrary vector shuffles and blends, gracefully
10879// degrading performance as necessary. It works hard to recognize idiomatic
10880// shuffles and lower them to optimal instruction patterns without leaving
10881// a framework that allows reasonably efficient handling of all vector shuffle
10882// patterns.
10883//===----------------------------------------------------------------------===//
10884
10885/// Tiny helper function to identify a no-op mask.
10886///
10887/// This is a somewhat boring predicate function. It checks whether the mask
10888/// array input, which is assumed to be a single-input shuffle mask of the kind
10889/// used by the X86 shuffle instructions (not a fully general
10890/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10891/// in-place shuffle are 'no-op's.
10892static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10893 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10894 assert(Mask[i] >= -1 && "Out of bound mask element!")((void)0);
10895 if (Mask[i] >= 0 && Mask[i] != i)
10896 return false;
10897 }
10898 return true;
10899}
10900
10901/// Test whether there are elements crossing LaneSizeInBits lanes in this
10902/// shuffle mask.
10903///
10904/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10905/// and we routinely test for these.
10906static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10907 unsigned ScalarSizeInBits,
10908 ArrayRef<int> Mask) {
10909 assert(LaneSizeInBits && ScalarSizeInBits &&((void)0)
10910 (LaneSizeInBits % ScalarSizeInBits) == 0 &&((void)0)
10911 "Illegal shuffle lane size")((void)0);
10912 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10913 int Size = Mask.size();
10914 for (int i = 0; i < Size; ++i)
10915 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10916 return true;
10917 return false;
10918}
10919
10920/// Test whether there are elements crossing 128-bit lanes in this
10921/// shuffle mask.
10922static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10923 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10924}
10925
/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
/// better support 'repeated mask + lane permute' style shuffles.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
                                   unsigned ScalarSizeInBits,
                                   ArrayRef<int> Mask) {
  assert(LaneSizeInBits && ScalarSizeInBits &&
         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
         "Illegal shuffle lane size");
  int NumElts = Mask.size();
  int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
  int NumLanes = NumElts / NumEltsPerLane;
  // A single-lane vector can never pull from multiple lanes.
  if (NumLanes > 1) {
    for (int i = 0; i != NumLanes; ++i) {
      // Track the single source lane seen so far for destination lane i;
      // -1 means no defined element has been seen yet.
      int SrcLane = -1;
      for (int j = 0; j != NumEltsPerLane; ++j) {
        int M = Mask[(i * NumEltsPerLane) + j];
        if (M < 0)
          continue;
        // Fold second-input indices back into [0, NumElts) before computing
        // the source lane.
        int Lane = (M % NumElts) / NumEltsPerLane;
        if (SrcLane >= 0 && SrcLane != Lane)
          return true;
        SrcLane = Lane;
      }
    }
  }
  return false;
}
10954
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  // Start with every repeated slot undef; fill in as defined elements are
  // encountered.
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
10993
10994/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10995static bool
10996is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10997 SmallVectorImpl<int> &RepeatedMask) {
10998 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10999}
11000
11001static bool
11002is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11003 SmallVector<int, 32> RepeatedMask;
11004 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11005}
11006
11007/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11008static bool
11009is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11010 SmallVectorImpl<int> &RepeatedMask) {
11011 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11012}
11013
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
                                        unsigned EltSizeInBits,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / EltSizeInBits;
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      // A zero element only repeats against an undef or another zero slot.
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
    // later vector indices to start at multiples of LaneSize instead of Size.
    int LaneM = Mask[i] / Size;
    int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
11050
11051/// Test whether a target shuffle mask is equivalent within each sub-lane.
11052/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11053static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11054 ArrayRef<int> Mask,
11055 SmallVectorImpl<int> &RepeatedMask) {
11056 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11057 Mask, RepeatedMask);
11058}
11059
/// Checks whether the vector elements referenced by two shuffle masks are
/// equivalent.
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
                                int Idx, int ExpectedIdx) {
  assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
         ExpectedIdx < MaskSize && "Out of range element index");
  // Both operands must exist and share an opcode before any of the
  // opcode-specific equivalences below can apply.
  if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
    return false;

  switch (Op.getOpcode()) {
  case ISD::BUILD_VECTOR:
    // If the values are build vectors, we can look through them to find
    // equivalent inputs that make the shuffles equivalent.
    // TODO: Handle MaskSize != Op.getNumOperands()?
    if (MaskSize == (int)Op.getNumOperands() &&
        MaskSize == (int)ExpectedOp.getNumOperands())
      return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
    break;
  case X86ISD::VBROADCAST:
  case X86ISD::VBROADCAST_LOAD:
    // For the same broadcast node, any pair of element indices is
    // equivalent (every lane holds the same value).
    // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
    return (Op == ExpectedOp &&
            (int)Op.getValueType().getVectorNumElements() == MaskSize);
  case X86ISD::HADD:
  case X86ISD::HSUB:
  case X86ISD::FHADD:
  case X86ISD::FHSUB:
  case X86ISD::PACKSS:
  case X86ISD::PACKUS:
    // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
    // TODO: Handle MaskSize != NumElts?
    // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
    if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
      MVT VT = Op.getSimpleValueType();
      int NumElts = VT.getVectorNumElements();
      if (MaskSize == NumElts) {
        int NumLanes = VT.getSizeInBits() / 128;
        int NumEltsPerLane = NumElts / NumLanes;
        int NumHalfEltsPerLane = NumEltsPerLane / 2;
        // Equivalent when both indices land in the same 128-bit lane at the
        // same offset within a half-lane.
        bool SameLane =
            (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
        bool SameElt =
            (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
        return SameLane && SameElt;
      }
    }
    break;
  }

  return false;
}
11111
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0}, V1, V2)) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
                                SDValue V1 = SDValue(),
                                SDValue V2 = SDValue()) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    int MaskIdx = Mask[i];
    int ExpectedIdx = ExpectedMask[i];
    if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
      // The indices differ, but they may still reference equivalent
      // elements (e.g. matching operands of build vectors). Map each index
      // to its source vector and local offset, then compare element-wise.
      SDValue MaskV = MaskIdx < Size ? V1 : V2;
      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
      if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
        return false;
    }
  }
  return true;
}
11144
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask,
                                      SDValue V1 = SDValue(),
                                      SDValue V2 = SDValue()) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;
  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
         "Illegal target shuffle mask");

  // Check for out-of-range target shuffle mask indices.
  if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
    return false;

  // Don't use V1/V2 if they're not the same size as the shuffle mask type.
  if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
    V1 = SDValue();
  if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
    V2 = SDValue();

  for (int i = 0; i < Size; ++i) {
    int MaskIdx = Mask[i];
    int ExpectedIdx = ExpectedMask[i];
    // Undef always matches; identical indices (including SM_SentinelZero
    // on both sides) match trivially.
    if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
      continue;
    if (0 <= MaskIdx && 0 <= ExpectedIdx) {
      // Differing non-sentinel indices may still reference equivalent
      // elements; normalize to (source vector, local offset) and compare.
      SDValue MaskV = MaskIdx < Size ? V1 : V2;
      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
      if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
        continue;
    }
    // TODO - handle SM_Sentinel equivalences.
    return false;
  }
  return true;
}
11192
11193// Attempt to create a shuffle mask from a VSELECT condition mask.
11194static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11195 SDValue Cond) {
11196 EVT CondVT = Cond.getValueType();
11197 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11198 unsigned NumElts = CondVT.getVectorNumElements();
11199
11200 APInt UndefElts;
11201 SmallVector<APInt, 32> EltBits;
11202 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11203 true, false))
11204 return false;
11205
11206 Mask.resize(NumElts, SM_SentinelUndef);
11207
11208 for (int i = 0; i != (int)NumElts; ++i) {
11209 Mask[i] = i;
11210 // Arbitrarily choose from the 2nd operand if the select condition element
11211 // is undef.
11212 // TODO: Can we do better by matching patterns such as even/odd?
11213 if (UndefElts[i] || EltBits[i].isNullValue())
11214 Mask[i] += NumElts;
11215 }
11216
11217 return true;
11218}
11219
11220// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11221// instructions.
11222static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11223 if (VT != MVT::v8i32 && VT != MVT::v8f32)
11224 return false;
11225
11226 SmallVector<int, 8> Unpcklwd;
11227 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11228 /* Unary = */ false);
11229 SmallVector<int, 8> Unpckhwd;
11230 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11231 /* Unary = */ false);
11232 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11233 isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11234 return IsUnpackwdMask;
11235}
11236
11237static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11238 // Create 128-bit vector type based on mask size.
11239 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11240 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11241
11242 // We can't assume a canonical shuffle mask, so try the commuted version too.
11243 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11244 ShuffleVectorSDNode::commuteMask(CommutedMask);
11245
11246 // Match any of unary/binary or low/high.
11247 for (unsigned i = 0; i != 4; ++i) {
11248 SmallVector<int, 16> UnpackMask;
11249 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11250 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11251 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11252 return true;
11253 }
11254 return false;
11255}
11256
11257/// Return true if a shuffle mask chooses elements identically in its top and
11258/// bottom halves. For example, any splat mask has the same top and bottom
11259/// halves. If an element is undefined in only one half of the mask, the halves
11260/// are not considered identical.
11261static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11262 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask")((void)0);
11263 unsigned HalfSize = Mask.size() / 2;
11264 for (unsigned i = 0; i != HalfSize; ++i) {
11265 if (Mask[i] != Mask[i + HalfSize])
11266 return false;
11267 }
11268 return true;
11269}
11270
11271/// Get a 4-lane 8-bit shuffle immediate for a mask.
11272///
11273/// This helper function produces an 8-bit shuffle immediate corresponding to
11274/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11275/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11276/// example.
11277///
11278/// NB: We rely heavily on "undef" masks preserving the input lane.
11279static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11280 assert(Mask.size() == 4 && "Only 4-lane shuffle masks")((void)0);
11281 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!")((void)0);
11282 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!")((void)0);
11283 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!")((void)0);
11284 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!")((void)0);
11285
11286 // If the mask only uses one non-undef element, then fully 'splat' it to
11287 // improve later broadcast matching.
11288 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11289 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask")((void)0);
11290
11291 int FirstElt = Mask[FirstIndex];
11292 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11293 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11294
11295 unsigned Imm = 0;
11296 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11297 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11298 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11299 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11300 return Imm;
11301}
11302
11303static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11304 SelectionDAG &DAG) {
11305 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11306}
11307
// Checks whether the shuffle described by (Zeroable, Mask) has the shape of
// an expand result: zeroed elements interleaved with source elements
// a[0], a[1], ... taken in ascending order. Each Zeroable bit corresponds to
// one Mask element, as described in computeZeroableShuffleElements.
//
// Returns true if the non-zeroable elements of Mask are consecutive and
// increasing. IsZeroSideLeft is set from whether the first non-zeroable
// element indexes position 0 (false) or VectorType.getVectorNumElements()
// i.e. the start of the second source (true).
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  // Expected next source index; negative until the first non-zeroable
  // element fixes the starting point.
  int NextElement = -1;
  // Check if the Mask's nonzero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Checks if the mask's zeros elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non zero element
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
11339
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  // The single source vector selected so far; PSHUFB reads only one input.
  SDValue V;
  // Build the PSHUFB control vector byte-by-byte from the element mask.
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    // Translate the element index into an in-lane byte index.
    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  // Perform the byte shuffle in vXi8 and bitcast back to the requested type.
  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
11393
// Forward declaration; as a file-static function its definition appears
// later in this same file.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);
11397
// X86 has dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                    const APInt &Zeroable,
                                    ArrayRef<int> Mask, SDValue &V1,
                                    SDValue &V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  // Only masks whose non-zeroable elements select ascending consecutive
  // source elements can be modeled as a masked expand.
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  // The expand write-mask is the complement of the zeroable elements,
  // widened to at least 8 bits for the integer mask constant.
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  // Zeroed lanes come from an explicit zero vector; the surviving lanes are
  // expanded from whichever input holds the ordered elements.
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
11421
// Attempt to match a target shuffle mask against the UNPCKL/UNPCKH patterns,
// setting UnpackOpcode and updating the V1/V2 operands in place on success.
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                  unsigned &UnpackOpcode, bool IsUnary,
                                  ArrayRef<int> TargetMask, const SDLoc &DL,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  // Determine, per interleave slot (even = first input, odd = second input),
  // whether every referenced element is undef and/or zeroable.
  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
                                (IsUnary ? V1 : V2))) {
    UnpackOpcode = X86ISD::UNPCKL;
    // Replace fully-undef inputs with explicit undef nodes.
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
                                (IsUnary ? V1 : V2))) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      // The zeroable interleave slot becomes an explicit zero vector.
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(Unpckl);
    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(Unpckh);
    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}
11508
11509// X86 has dedicated unpack instructions that can handle specific blend
11510// operations: UNPCKH and UNPCKL.
11511static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11512 ArrayRef<int> Mask, SDValue V1, SDValue V2,
11513 SelectionDAG &DAG) {
11514 SmallVector<int, 8> Unpckl;
11515 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11516 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11517 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11518
11519 SmallVector<int, 8> Unpckh;
11520 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11521 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11522 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11523
11524 // Commute and try again.
11525 ShuffleVectorSDNode::commuteMask(Unpckl);
11526 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11527 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11528
11529 ShuffleVectorSDNode::commuteMask(Unpckh);
11530 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11531 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11532
11533 return SDValue();
11534}
11535
11536/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11537/// followed by unpack 256-bit.
11538static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11539 ArrayRef<int> Mask, SDValue V1,
11540 SDValue V2, SelectionDAG &DAG) {
11541 SmallVector<int, 32> Unpckl, Unpckh;
11542 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11543 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11544
11545 unsigned UnpackOpcode;
11546 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11547 UnpackOpcode = X86ISD::UNPCKL;
11548 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11549 UnpackOpcode = X86ISD::UNPCKH;
11550 else
11551 return SDValue();
11552
11553 // This is a "natural" unpack operation (rather than the 128-bit sectored
11554 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11555 // input in order to use the x86 instruction.
11556 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11557 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11558 V1 = DAG.getBitcast(VT, V1);
11559 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11560}
11561
11562// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11563// source into the lower elements and zeroing the upper elements.
11564static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11565 ArrayRef<int> Mask, const APInt &Zeroable,
11566 const X86Subtarget &Subtarget) {
11567 if (!VT.is512BitVector() && !Subtarget.hasVLX())
11568 return false;
11569
11570 unsigned NumElts = Mask.size();
11571 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11572 unsigned MaxScale = 64 / EltSizeInBits;
11573
11574 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11575 unsigned SrcEltBits = EltSizeInBits * Scale;
11576 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11577 continue;
11578 unsigned NumSrcElts = NumElts / Scale;
11579 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11580 continue;
11581 unsigned UpperElts = NumElts - NumSrcElts;
11582 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11583 continue;
11584 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11585 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11586 DstVT = MVT::getIntegerVT(EltSizeInBits);
11587 if ((NumSrcElts * EltSizeInBits) >= 128) {
11588 // ISD::TRUNCATE
11589 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11590 } else {
11591 // X86ISD::VTRUNC
11592 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11593 }
11594 return true;
11595 }
11596
11597 return false;
11598}
11599
11600// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11601// element padding to the final DstVT.
11602static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11603 const X86Subtarget &Subtarget,
11604 SelectionDAG &DAG, bool ZeroUppers) {
11605 MVT SrcVT = Src.getSimpleValueType();
11606 MVT DstSVT = DstVT.getScalarType();
11607 unsigned NumDstElts = DstVT.getVectorNumElements();
11608 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11609 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11610
11611 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11612 return SDValue();
11613
11614 // Perform a direct ISD::TRUNCATE if possible.
11615 if (NumSrcElts == NumDstElts)
11616 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11617
11618 if (NumSrcElts > NumDstElts) {
11619 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11620 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11621 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11622 }
11623
11624 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11625 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11626 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11627 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11628 DstVT.getSizeInBits());
11629 }
11630
11631 // Non-VLX targets must truncate from a 512-bit type, so we need to
11632 // widen, truncate and then possibly extract the original subvector.
11633 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11634 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11635 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11636 }
11637
11638 // Fallback to a X86ISD::VTRUNC, padding if necessary.
11639 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11640 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11641 if (DstVT != TruncVT)
11642 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11643 DstVT.getSizeInBits());
11644 return Trunc;
11645}
11646
11647// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11648//
11649// An example is the following:
11650//
11651// t0: ch = EntryToken
11652// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11653// t25: v4i32 = truncate t2
11654// t41: v8i16 = bitcast t25
11655// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11656// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11657// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11658// t18: v2i64 = bitcast t51
11659//
11660// One can just use a single vpmovdw instruction, without avx512vl we need to
11661// use the zmm variant and extract the lower subvector, padding with zeroes.
11662// TODO: Merge with lowerShuffleAsVTRUNC.
11663static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11664 SDValue V2, ArrayRef<int> Mask,
11665 const APInt &Zeroable,
11666 const X86Subtarget &Subtarget,
11667 SelectionDAG &DAG) {
11668 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type")((void)0);
11669 if (!Subtarget.hasAVX512())
11670 return SDValue();
11671
11672 unsigned NumElts = VT.getVectorNumElements();
11673 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11674 unsigned MaxScale = 64 / EltSizeInBits;
11675 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11676 unsigned NumSrcElts = NumElts / Scale;
11677 unsigned UpperElts = NumElts - NumSrcElts;
11678 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11679 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11680 continue;
11681
11682 SDValue Src = V1;
11683 if (!Src.hasOneUse())
11684 return SDValue();
11685
11686 Src = peekThroughOneUseBitcasts(Src);
11687 if (Src.getOpcode() != ISD::TRUNCATE ||
11688 Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11689 return SDValue();
11690 Src = Src.getOperand(0);
11691
11692 // VPMOVWB is only available with avx512bw.
11693 MVT SrcVT = Src.getSimpleValueType();
11694 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11695 !Subtarget.hasBWI())
11696 return SDValue();
11697
11698 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11699 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11700 }
11701
11702 return SDValue();
11703}
11704
11705// Attempt to match binary shuffle patterns as a truncate.
11706static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11707 SDValue V2, ArrayRef<int> Mask,
11708 const APInt &Zeroable,
11709 const X86Subtarget &Subtarget,
11710 SelectionDAG &DAG) {
11711 assert((VT.is128BitVector() || VT.is256BitVector()) &&((void)0)
11712 "Unexpected VTRUNC type")((void)0);
11713 if (!Subtarget.hasAVX512())
11714 return SDValue();
11715
11716 unsigned NumElts = VT.getVectorNumElements();
11717 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11718 unsigned MaxScale = 64 / EltSizeInBits;
11719 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11720 // TODO: Support non-BWI VPMOVWB truncations?
11721 unsigned SrcEltBits = EltSizeInBits * Scale;
11722 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11723 continue;
11724
11725 // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11726 // Bail if the V2 elements are undef.
11727 unsigned NumHalfSrcElts = NumElts / Scale;
11728 unsigned NumSrcElts = 2 * NumHalfSrcElts;
11729 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11730 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11731 continue;
11732
11733 // The elements beyond the truncation must be undef/zero.
11734 unsigned UpperElts = NumElts - NumSrcElts;
11735 if (UpperElts > 0 &&
11736 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11737 continue;
11738 bool UndefUppers =
11739 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11740
11741 // As we're using both sources then we need to concat them together
11742 // and truncate from the double-sized src.
11743 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11744 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11745
11746 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11747 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11748 Src = DAG.getBitcast(SrcVT, Src);
11749 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11750 }
11751
11752 return SDValue();
11753}
11754
11755/// Check whether a compaction lowering can be done by dropping even
11756/// elements and compute how many times even elements must be dropped.
11757///
11758/// This handles shuffles which take every Nth element where N is a power of
11759/// two. Example shuffle masks:
11760///
11761/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11762/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11763/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11764/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11765/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11766/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11767///
11768/// Any of these lanes can of course be undef.
11769///
11770/// This routine only supports N <= 3.
11771/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11772/// for larger N.
11773///
11774/// \returns N above, or the number of times even elements must be dropped if
11775/// there is such a number. Otherwise returns zero.
11776static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11777 bool IsSingleInput) {
11778 // The modulus for the shuffle vector entries is based on whether this is
11779 // a single input or not.
11780 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11781 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&((void)0)
11782 "We should only be called with masks with a power-of-2 size!")((void)0);
11783
11784 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11785
11786 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11787 // and 2^3 simultaneously. This is because we may have ambiguity with
11788 // partially undef inputs.
11789 bool ViableForN[3] = {true, true, true};
11790
11791 for (int i = 0, e = Mask.size(); i < e; ++i) {
11792 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11793 // want.
11794 if (Mask[i] < 0)
11795 continue;
11796
11797 bool IsAnyViable = false;
11798 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11799 if (ViableForN[j]) {
11800 uint64_t N = j + 1;
11801
11802 // The shuffle mask must be equal to (i * 2^N) % M.
11803 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11804 IsAnyViable = true;
11805 else
11806 ViableForN[j] = false;
11807 }
11808 // Early exit if we exhaust the possible powers of two.
11809 if (!IsAnyViable)
11810 break;
11811 }
11812
11813 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11814 if (ViableForN[j])
11815 return j + 1;
11816
11817 // Return 0 as there is no viable power of two.
11818 return 0;
11819}
11820
11821// X86 has dedicated pack instructions that can handle specific truncation
11822// operations: PACKSS and PACKUS.
11823// Checks for compaction shuffle masks if MaxStages > 1.
11824// TODO: Add support for matching multiple PACKSS/PACKUS stages.
11825static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11826 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11827 const SelectionDAG &DAG,
11828 const X86Subtarget &Subtarget,
11829 unsigned MaxStages = 1) {
11830 unsigned NumElts = VT.getVectorNumElements();
11831 unsigned BitSize = VT.getScalarSizeInBits();
11832 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&((void)0)
11833 "Illegal maximum compaction")((void)0);
11834
11835 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11836 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11837 unsigned NumPackedBits = NumSrcBits - BitSize;
11838 N1 = peekThroughBitcasts(N1);
11839 N2 = peekThroughBitcasts(N2);
11840 unsigned NumBits1 = N1.getScalarValueSizeInBits();
11841 unsigned NumBits2 = N2.getScalarValueSizeInBits();
11842 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
11843 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
11844 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
11845 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
11846 return false;
11847 if (Subtarget.hasSSE41() || BitSize == 8) {
11848 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11849 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
11850 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
11851 V1 = N1;
11852 V2 = N2;
11853 SrcVT = PackVT;
11854 PackOpcode = X86ISD::PACKUS;
11855 return true;
11856 }
11857 }
11858 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
11859 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
11860 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
11861 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
11862 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
11863 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
11864 V1 = N1;
11865 V2 = N2;
11866 SrcVT = PackVT;
11867 PackOpcode = X86ISD::PACKSS;
11868 return true;
11869 }
11870 return false;
11871 };
11872
11873 // Attempt to match against wider and wider compaction patterns.
11874 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11875 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11876 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11877
11878 // Try binary shuffle.
11879 SmallVector<int, 32> BinaryMask;
11880 createPackShuffleMask(VT, BinaryMask, false, NumStages);
11881 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
11882 if (MatchPACK(V1, V2, PackVT))
11883 return true;
11884
11885 // Try unary shuffle.
11886 SmallVector<int, 32> UnaryMask;
11887 createPackShuffleMask(VT, UnaryMask, true, NumStages);
11888 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
11889 if (MatchPACK(V1, V1, PackVT))
11890 return true;
11891 }
11892
11893 return false;
11894}
11895
11896static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11897 SDValue V1, SDValue V2, SelectionDAG &DAG,
11898 const X86Subtarget &Subtarget) {
11899 MVT PackVT;
11900 unsigned PackOpcode;
11901 unsigned SizeBits = VT.getSizeInBits();
11902 unsigned EltBits = VT.getScalarSizeInBits();
11903 unsigned MaxStages = Log2_32(64 / EltBits);
11904 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11905 Subtarget, MaxStages))
11906 return SDValue();
11907
11908 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11909 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11910
11911 // Don't lower multi-stage packs on AVX512, truncation is better.
11912 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11913 return SDValue();
11914
11915 // Pack to the largest type possible:
11916 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11917 unsigned MaxPackBits = 16;
11918 if (CurrentEltBits > 16 &&
11919 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11920 MaxPackBits = 32;
11921
11922 // Repeatedly pack down to the target size.
11923 SDValue Res;
11924 for (unsigned i = 0; i != NumStages; ++i) {
11925 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11926 unsigned NumSrcElts = SizeBits / SrcEltBits;
11927 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11928 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11929 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11930 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11931 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11932 DAG.getBitcast(SrcVT, V2));
11933 V1 = V2 = Res;
11934 CurrentEltBits /= 2;
11935 }
11936 assert(Res && Res.getValueType() == VT &&((void)0)
11937 "Failed to lower compaction shuffle")((void)0);
11938 return Res;
11939}
11940
11941/// Try to emit a bitmask instruction for a shuffle.
11942///
11943/// This handles cases where we can model a blend exactly as a bitmask due to
11944/// one of the inputs being zeroable.
11945static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11946 SDValue V2, ArrayRef<int> Mask,
11947 const APInt &Zeroable,
11948 const X86Subtarget &Subtarget,
11949 SelectionDAG &DAG) {
11950 MVT MaskVT = VT;
11951 MVT EltVT = VT.getVectorElementType();
11952 SDValue Zero, AllOnes;
11953 // Use f64 if i64 isn't legal.
11954 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11955 EltVT = MVT::f64;
11956 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11957 }
11958
11959 MVT LogicVT = VT;
11960 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11961 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11962 APFloat AllOnesValue = APFloat::getAllOnesValue(
11963 SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
11964 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11965 LogicVT =
11966 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11967 } else {
11968 Zero = DAG.getConstant(0, DL, EltVT);
11969 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11970 }
11971
11972 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11973 SDValue V;
11974 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11975 if (Zeroable[i])
11976 continue;
11977 if (Mask[i] % Size != i)
11978 return SDValue(); // Not a blend.
11979 if (!V)
11980 V = Mask[i] < Size ? V1 : V2;
11981 else if (V != (Mask[i] < Size ? V1 : V2))
11982 return SDValue(); // Can only let one input through the mask.
11983
11984 VMaskOps[i] = AllOnes;
11985 }
11986 if (!V)
11987 return SDValue(); // No non-zeroable elements!
11988
11989 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11990 VMask = DAG.getBitcast(LogicVT, VMask);
11991 V = DAG.getBitcast(LogicVT, V);
11992 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11993 return DAG.getBitcast(VT, And);
11994}
11995
11996/// Try to emit a blend instruction for a shuffle using bit math.
11997///
11998/// This is used as a fallback approach when first class blend instructions are
11999/// unavailable. Currently it is only suitable for integer vectors, but could
12000/// be generalized for floating point vectors if desirable.
12001static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12002 SDValue V2, ArrayRef<int> Mask,
12003 SelectionDAG &DAG) {
12004 assert(VT.isInteger() && "Only supports integer vector types!")((void)0);
12005 MVT EltVT = VT.getVectorElementType();
12006 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12007 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12008 SmallVector<SDValue, 16> MaskOps;
12009 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12010 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12011 return SDValue(); // Shuffled input!
12012 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12013 }
12014
12015 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12016 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12017 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12018 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12019}
12020
12021static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12022 SDValue PreservedSrc,
12023 const X86Subtarget &Subtarget,
12024 SelectionDAG &DAG);
12025
12026static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12027 MutableArrayRef<int> Mask,
12028 const APInt &Zeroable, bool &ForceV1Zero,
12029 bool &ForceV2Zero, uint64_t &BlendMask) {
12030 bool V1IsZeroOrUndef =
12031 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12032 bool V2IsZeroOrUndef =
12033 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12034
12035 BlendMask = 0;
12036 ForceV1Zero = false, ForceV2Zero = false;
12037 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask")((void)0);
12038
12039 // Attempt to generate the binary blend mask. If an input is zero then
12040 // we can use any lane.
12041 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12042 int M = Mask[i];
12043 if (M == SM_SentinelUndef)
12044 continue;
12045 if (M == i)
12046 continue;
12047 if (M == i + Size) {
12048 BlendMask |= 1ull << i;
12049 continue;
12050 }
12051 if (Zeroable[i]) {
12052 if (V1IsZeroOrUndef) {
12053 ForceV1Zero = true;
12054 Mask[i] = i;
12055 continue;
12056 }
12057 if (V2IsZeroOrUndef) {
12058 ForceV2Zero = true;
12059 BlendMask |= 1ull << i;
12060 Mask[i] = i + Size;
12061 continue;
12062 }
12063 }
12064 return false;
12065 }
12066 return true;
12067}
12068
// Widen an element-granularity blend mask: bit i of the input becomes a run
// of Scale consecutive set bits starting at position i * Scale in the result.
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  uint64_t ScaledMask = 0;
  const uint64_t Run = (1ull << Scale) - 1;
  for (int i = 0; i != Size; ++i) {
    if ((BlendMask >> i) & 1)
      ScaledMask |= Run << (i * Scale);
  }
  return ScaledMask;
}
12077
12078/// Try to emit a blend instruction for a shuffle.
12079///
12080/// This doesn't do any checks for the availability of instructions for blending
12081/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12082/// be matched in the backend with the type given. What it does check for is
12083/// that the shuffle mask is a blend, or convertible into a blend with zero.
12084static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12085 SDValue V2, ArrayRef<int> Original,
12086 const APInt &Zeroable,
12087 const X86Subtarget &Subtarget,
12088 SelectionDAG &DAG) {
12089 uint64_t BlendMask = 0;
12090 bool ForceV1Zero = false, ForceV2Zero = false;
12091 SmallVector<int, 64> Mask(Original.begin(), Original.end());
12092 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12093 BlendMask))
12094 return SDValue();
12095
12096 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12097 if (ForceV1Zero)
12098 V1 = getZeroVector(VT, Subtarget, DAG, DL);
12099 if (ForceV2Zero)
12100 V2 = getZeroVector(VT, Subtarget, DAG, DL);
12101
12102 switch (VT.SimpleTy) {
12103 case MVT::v4i64:
12104 case MVT::v8i32:
12105 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!")((void)0);
12106 LLVM_FALLTHROUGH[[gnu::fallthrough]];
12107 case MVT::v4f64:
12108 case MVT::v8f32:
12109 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!")((void)0);
12110 LLVM_FALLTHROUGH[[gnu::fallthrough]];
12111 case MVT::v2f64:
12112 case MVT::v2i64:
12113 case MVT::v4f32:
12114 case MVT::v4i32:
12115 case MVT::v8i16:
12116 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!")((void)0);
12117 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12118 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12119 case MVT::v16i16: {
12120 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!")((void)0);
12121 SmallVector<int, 8> RepeatedMask;
12122 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12123 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12124 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!")((void)0);
12125 BlendMask = 0;
12126 for (int i = 0; i < 8; ++i)
12127 if (RepeatedMask[i] >= 8)
12128 BlendMask |= 1ull << i;
12129 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12130 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12131 }
12132 // Use PBLENDW for lower/upper lanes and then blend lanes.
12133 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12134 // merge to VSELECT where useful.
12135 uint64_t LoMask = BlendMask & 0xFF;
12136 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12137 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12138 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12139 DAG.getTargetConstant(LoMask, DL, MVT::i8));
12140 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12141 DAG.getTargetConstant(HiMask, DL, MVT::i8));
12142 return DAG.getVectorShuffle(
12143 MVT::v16i16, DL, Lo, Hi,
12144 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12145 }
12146 LLVM_FALLTHROUGH[[gnu::fallthrough]];
12147 }
12148 case MVT::v32i8:
12149 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!")((void)0);
12150 LLVM_FALLTHROUGH[[gnu::fallthrough]];
12151 case MVT::v16i8: {
12152 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!")((void)0);
12153
12154 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12155 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12156 Subtarget, DAG))
12157 return Masked;
12158
12159 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12160 MVT IntegerType =
12161 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12162 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12163 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12164 }
12165
12166 // If we have VPTERNLOG, we can use that as a bit blend.
12167 if (Subtarget.hasVLX())
12168 if (SDValue BitBlend =
12169 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12170 return BitBlend;
12171
12172 // Scale the blend by the number of bytes per element.
12173 int Scale = VT.getScalarSizeInBits() / 8;
12174
12175 // This form of blend is always done on bytes. Compute the byte vector
12176 // type.
12177 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12178
12179 // x86 allows load folding with blendvb from the 2nd source operand. But
12180 // we are still using LLVM select here (see comment below), so that's V1.
12181 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12182 // allow that load-folding possibility.
12183 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12184 ShuffleVectorSDNode::commuteMask(Mask);
12185 std::swap(V1, V2);
12186 }
12187
12188 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12189 // mix of LLVM's code generator and the x86 backend. We tell the code
12190 // generator that boolean values in the elements of an x86 vector register
12191 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12192 // mapping a select to operand #1, and 'false' mapping to operand #2. The
12193 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12194 // of the element (the remaining are ignored) and 0 in that high bit would
12195 // mean operand #1 while 1 in the high bit would mean operand #2. So while
12196 // the LLVM model for boolean values in vector elements gets the relevant
12197 // bit set, it is set backwards and over constrained relative to x86's
12198 // actual model.
12199 SmallVector<SDValue, 32> VSELECTMask;
12200 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12201 for (int j = 0; j < Scale; ++j)
12202 VSELECTMask.push_back(
12203 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12204 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12205 MVT::i8));
12206
12207 V1 = DAG.getBitcast(BlendVT, V1);
12208 V2 = DAG.getBitcast(BlendVT, V2);
12209 return DAG.getBitcast(
12210 VT,
12211 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12212 V1, V2));
12213 }
12214 case MVT::v16f32:
12215 case MVT::v8f64:
12216 case MVT::v8i64:
12217 case MVT::v16i32:
12218 case MVT::v32i16:
12219 case MVT::v64i8: {
12220 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12221 bool OptForSize = DAG.shouldOptForSize();
12222 if (!OptForSize) {
12223 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12224 Subtarget, DAG))
12225 return Masked;
12226 }
12227
12228 // Otherwise load an immediate into a GPR, cast to k-register, and use a
12229 // masked move.
12230 MVT IntegerType =
12231 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12232 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12233 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12234 }
12235 default:
12236 llvm_unreachable("Not a supported integer vector type!")__builtin_unreachable();
12237 }
12238}
12239
12240/// Try to lower as a blend of elements from two inputs followed by
12241/// a single-input permutation.
12242///
12243/// This matches the pattern where we can blend elements from two inputs and
12244/// then reduce the shuffle to a single-input permutation.
12245static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12246 SDValue V1, SDValue V2,
12247 ArrayRef<int> Mask,
12248 SelectionDAG &DAG,
12249 bool ImmBlends = false) {
12250 // We build up the blend mask while checking whether a blend is a viable way
12251 // to reduce the shuffle.
12252 SmallVector<int, 32> BlendMask(Mask.size(), -1);
12253 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12254
12255 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12256 if (Mask[i] < 0)
12257 continue;
12258
12259 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.")((void)0);
12260
12261 if (BlendMask[Mask[i] % Size] < 0)
12262 BlendMask[Mask[i] % Size] = Mask[i];
12263 else if (BlendMask[Mask[i] % Size] != Mask[i])
12264 return SDValue(); // Can't blend in the needed input!
12265
12266 PermuteMask[i] = Mask[i] % Size;
12267 }
12268
12269 // If only immediate blends, then bail if the blend mask can't be widened to
12270 // i16.
12271 unsigned EltSize = VT.getScalarSizeInBits();
12272 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12273 return SDValue();
12274
12275 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12276 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12277}
12278
/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;
  int NumHalfLaneElts = NumLaneElts / 2;

  bool MatchLo = true, MatchHi = true;
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

  // Determine UNPCKL/UNPCKH type and operand order.
  for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
    for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
      int M = Mask[Lane + Elt];
      if (M < 0)
        continue;

      // UNPCK interleaves: even result slots come from Ops[0], odd from
      // Ops[1]. Assign each input to a slot, failing if a slot would need
      // elements from both inputs.
      SDValue &Op = Ops[Elt & 1];
      if (M < NumElts && (Op.isUndef() || Op == V1))
        Op = V1;
      else if (NumElts <= M && (Op.isUndef() || Op == V2))
        Op = V2;
      else
        return SDValue();

      // Track whether every referenced element lies in the low half of its
      // 128-bit lane (UNPCKL) or the high half (UNPCKH), from either input.
      int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
      MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
                 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
      MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
                 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
      if (!MatchLo && !MatchHi)
        return SDValue();
    }
  }
  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");

  // Now check that each pair of elts come from the same unpack pair
  // and set the permute mask based on each pair.
  // TODO - Investigate cases where we permute individual elements.
  SmallVector<int, 32> PermuteMask(NumElts, -1);
  for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
    for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
      int M0 = Mask[Lane + Elt + 0];
      int M1 = Mask[Lane + Elt + 1];
      // Adjacent result elements must come from the same unpacked pair
      // (same in-half index) so they stay adjacent after the unpack.
      if (0 <= M0 && 0 <= M1 &&
          (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
        return SDValue();
      if (0 <= M0)
        PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
      if (0 <= M1)
        PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
    }
  }

  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
  SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
  return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}
12344
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
static SDValue lowerShuffleAsByteRotateAndPermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  // PALIGNR (and its per-lane 256/512-bit forms) requires SSSE3/AVX2/BWI.
  if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
      (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
      (VT.is512BitVector() && !Subtarget.hasBWI()))
    return SDValue();

  // We don't currently support lane crossing permutes.
  if (is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  int Scale = VT.getScalarSizeInBits() / 8; // bytes per element
  int NumLanes = VT.getSizeInBits() / 128;
  int NumElts = VT.getVectorNumElements();
  int NumEltsPerLane = NumElts / NumLanes;

  // Determine range of mask elts.
  // Blend1/Blend2 track whether each input's elements are already in place
  // (i.e. the shuffle is a pure blend of that input); Range1/Range2 track the
  // min/max in-lane index referenced from V1/V2 respectively.
  bool Blend1 = true;
  bool Blend2 = true;
  std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
  std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
  for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
    for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
      int M = Mask[Lane + Elt];
      if (M < 0)
        continue;
      if (M < NumElts) {
        Blend1 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range1.first = std::min(Range1.first, M);
        Range1.second = std::max(Range1.second, M);
      } else {
        M -= NumElts;
        Blend2 &= (M == (Lane + Elt));
        assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
        M = M % NumEltsPerLane;
        Range2.first = std::min(Range2.first, M);
        Range2.second = std::max(Range2.second, M);
      }
    }
  }

  // Bail if we don't need both elements.
  // TODO - it might be worth doing this for unary shuffles if the permute
  // can be widened.
  if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
      !(0 <= Range2.first && Range2.second < NumEltsPerLane))
    return SDValue();

  // If either input is already in place a plain blend is cheaper on wider
  // vectors - let other lowerings handle it.
  if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
    return SDValue();

  // Rotate the 2 ops so we can access both ranges, then permute the result.
  auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
    SDValue Rotate = DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
                        DAG.getBitcast(ByteVT, Lo),
                        DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
    SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
    for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
      for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
        int M = Mask[Lane + Elt];
        if (M < 0)
          continue;
        // Map each source index to its position within the rotated vector:
        // V1 elements sit Ofs above the rotation point, V2 elements Ofs below.
        if (M < NumElts)
          PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
        else
          PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
      }
    }
    return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
  };

  // Check if the ranges are small enough to rotate from either direction.
  if (Range2.second < Range1.first)
    return RotateAndPermute(V1, V2, Range1.first, 0);
  if (Range1.second < Range2.first)
    return RotateAndPermute(V2, V1, Range2.first, NumElts);
  return SDValue();
}
12430
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
static SDValue lowerShuffleAsDecomposedShuffleMerge(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumEltsPerLane = NumElts / NumLanes;

  // Shuffle the input elements into the desired positions in V1 and V2 and
  // unpack/blend them together.
  // IsAlternating stays true only while V1 feeds even result slots and V2
  // feeds odd ones - the precondition for the UNPCKL path below.
  bool IsAlternating = true;
  SmallVector<int, 32> V1Mask(NumElts, -1);
  SmallVector<int, 32> V2Mask(NumElts, -1);
  SmallVector<int, 32> FinalMask(NumElts, -1);
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;
      FinalMask[i] = i;
      IsAlternating &= (i & 1) == 0;
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts;
      FinalMask[i] = i + NumElts;
      IsAlternating &= (i & 1) == 1;
    }
  }

  // Try to lower with the simpler initial blend/unpack/rotate strategies unless
  // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
  // the shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
  // pre-shuffle first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
    // Only prefer immediate blends to unpack/rotate.
    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
                                                          DAG, true))
      return BlendPerm;
    if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
                                                           DAG))
      return UnpackPerm;
    if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
            DL, VT, V1, V2, Mask, Subtarget, DAG))
      return RotatePerm;
    // Unpack/rotate failed - try again with variable blends.
    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
                                                          DAG))
      return BlendPerm;
  }

  // If the final mask is an alternating blend of vXi8/vXi16, convert to an
  // UNPCKL(SHUFFLE, SHUFFLE) pattern.
  // TODO: It doesn't have to be alternating - but each lane mustn't have more
  // than half the elements coming from each source.
  if (IsAlternating && VT.getScalarSizeInBits() < 32) {
    // Rebuild the masks: pack each source's elements into the low half of
    // each lane (index j/2), then interleave the halves with FinalMask.
    V1Mask.assign(NumElts, -1);
    V2Mask.assign(NumElts, -1);
    FinalMask.assign(NumElts, -1);
    for (int i = 0; i != NumElts; i += NumEltsPerLane)
      for (int j = 0; j != NumEltsPerLane; ++j) {
        int M = Mask[i + j];
        if (M >= 0 && M < NumElts) {
          V1Mask[i + (j / 2)] = M;
          FinalMask[i + j] = i + (j / 2);
        } else if (M >= NumElts) {
          V2Mask[i + (j / 2)] = M - NumElts;
          FinalMask[i + j] = i + (j / 2) + NumElts;
        }
      }
  }

  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
}
12511
12512/// Try to lower a vector shuffle as a bit rotation.
12513///
12514/// Look for a repeated rotation pattern in each sub group.
12515/// Returns a ISD::ROTL element rotation amount or -1 if failed.
12516static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12517 int NumElts = Mask.size();
12518 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask")((void)0);
12519
12520 int RotateAmt = -1;
12521 for (int i = 0; i != NumElts; i += NumSubElts) {
12522 for (int j = 0; j != NumSubElts; ++j) {
12523 int M = Mask[i + j];
12524 if (M < 0)
12525 continue;
12526 if (!isInRange(M, i, i + NumSubElts))
12527 return -1;
12528 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12529 if (0 <= RotateAmt && Offset != RotateAmt)
12530 return -1;
12531 RotateAmt = Offset;
12532 }
12533 }
12534 return RotateAmt;
12535}
12536
12537static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12538 const X86Subtarget &Subtarget,
12539 ArrayRef<int> Mask) {
12540 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!")((void)0);
12541 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers")((void)0);
12542
12543 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12544 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12545 int MaxSubElts = 64 / EltSizeInBits;
12546 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12547 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12548 if (RotateAmt < 0)
12549 continue;
12550
12551 int NumElts = Mask.size();
12552 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12553 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12554 return RotateAmt * EltSizeInBits;
12555 }
12556
12557 return -1;
12558}
12559
/// Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
                                       ArrayRef<int> Mask,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  // Only XOP + AVX512 targets have bit rotation instructions.
  // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
  // NOTE(review): the comment says SSSE3 but the guard tests hasSSE3() -
  // confirm against upstream before changing; the mismatch exists there too.
  bool IsLegal =
      (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
  if (!IsLegal && Subtarget.hasSSE3())
    return SDValue();

  MVT RotateVT;
  int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
                                          Subtarget, Mask);
  if (RotateAmt < 0)
    return SDValue();

  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
  // expanded to OR(SRL,SHL), will be more efficient, but if they can
  // widen to vXi16 or more then existing lowering should will be better.
  if (!IsLegal) {
    // Whole-i16 rotates are better served by other lowerings.
    if ((RotateAmt % 16) == 0)
      return SDValue();
    // TODO: Use getTargetVShiftByConstNode.
    // Emulate ROTL as OR(SHL(x, amt), SRL(x, width - amt)).
    unsigned ShlAmt = RotateAmt;
    unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
    V1 = DAG.getBitcast(RotateVT, V1);
    SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
                              DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
    SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
                              DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
    SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
    return DAG.getBitcast(VT, Rot);
  }

  SDValue Rot =
      DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
                  DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, Rot);
}
12601
/// Try to match a vector shuffle as an element rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
///
/// On success, rewrites V1/V2 to the low/high rotation inputs and returns the
/// element rotation amount; returns -1 if no consistent rotation exists.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
                                       ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15,  0,  1,  2]
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
  int Rotation = 0;
  // Lo/Hi start as null SDValues; a default-constructed SDValue tests false,
  // which is how "not yet assigned" is detected below.
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      // The identity rotation isn't interesting, stop.
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  // A one-sided rotation duplicates the single input on both sides.
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
12673
12674/// Try to lower a vector shuffle as a byte rotation.
12675///
12676/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12677/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12678/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12679/// try to generically lower a vector shuffle through such an pattern. It
12680/// does not check for the profitability of lowering either as PALIGNR or
12681/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12682/// This matches shuffle vectors that look like:
12683///
12684/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12685///
12686/// Essentially it concatenates V1 and V2, shifts right by some number of
12687/// elements, and takes the low elements as the result. Note that while this is
12688/// specified as a *right shift* because x86 is little-endian, it is a *left
12689/// rotate* of the vector lanes.
12690static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12691 ArrayRef<int> Mask) {
12692 // Don't accept any shuffles with zero elements.
12693 if (isAnyZero(Mask))
12694 return -1;
12695
12696 // PALIGNR works on 128-bit lanes.
12697 SmallVector<int, 16> RepeatedMask;
12698 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12699 return -1;
12700
12701 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12702 if (Rotation <= 0)
12703 return -1;
12704
12705 // PALIGNR rotates bytes, so we need to scale the
12706 // rotation based on how many bytes are in the vector lane.
12707 int NumElts = RepeatedMask.size();
12708 int Scale = 16 / NumElts;
12709 return Rotation * Scale;
12710}
12711
/// Lower a shuffle matched by matchShuffleAsByteRotate: PALIGNR on SSSE3+,
/// otherwise a PSLLDQ/PSRLDQ/POR sequence on plain SSE2 (v16i8 only).
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation: emulate the rotate with two whole-vector
  // byte shifts ORed together.
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift =
      DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                  DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift =
      DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                  DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
12758
12759/// Try to lower a vector shuffle as a dword/qword rotation.
12760///
12761/// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
12762/// rotation of the concatenation of two vectors; This routine will
12763/// try to generically lower a vector shuffle through such an pattern.
12764///
12765/// Essentially it concatenates V1 and V2, shifts right by some number of
12766/// elements, and takes the low elements as the result. Note that while this is
12767/// specified as a *right shift* because x86 is little-endian, it is a *left
12768/// rotate* of the vector lanes.
12769static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12770 SDValue V2, ArrayRef<int> Mask,
12771 const X86Subtarget &Subtarget,
12772 SelectionDAG &DAG) {
12773 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&((void)0)
12774 "Only 32-bit and 64-bit elements are supported!")((void)0);
12775
12776 // 128/256-bit vectors are only supported with VLX.
12777 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))((void)0)
12778 && "VLX required for 128/256-bit vectors")((void)0);
12779
12780 SDValue Lo = V1, Hi = V2;
12781 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12782 if (Rotation <= 0)
12783 return SDValue();
12784
12785 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12786 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12787}
12788
/// Try to lower a vector shuffle as a byte shift sequence.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
  assert(VT.is128BitVector() && "Only 128-bit vectors supported");

  // We need a shuffle that has zeros at one/both ends and a sequential
  // shuffle from one source within.
  unsigned ZeroLo = Zeroable.countTrailingOnes();
  unsigned ZeroHi = Zeroable.countLeadingOnes();
  if (!ZeroLo && !ZeroHi)
    return SDValue();

  unsigned NumElts = Mask.size();
  unsigned Len = NumElts - (ZeroLo + ZeroHi);
  // The non-zero middle must be one consecutive run of elements.
  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
    return SDValue();

  unsigned Scale = VT.getScalarSizeInBits() / 8;
  // The run must come entirely from one input (all V1 or all V2 indices).
  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
  if (!isUndefOrInRange(StubMask, 0, NumElts) &&
      !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
    return SDValue();

  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
  Res = DAG.getBitcast(MVT::v16i8, Res);

  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
  // inner sequential set of elements, possibly offset:
  // 01234567 --> zzzzzz01 --> 1zzzzzzz
  // 01234567 --> 4567zzzz --> zzzzz456
  // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
  if (ZeroLo == 0) {
    // Zeros only at the top: shift left to drop high elements, then right to
    // create the high zeros.
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
  } else if (ZeroHi == 0) {
    // Zeros only at the bottom: shift right then left.
    unsigned Shift = Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
  } else if (!Subtarget.hasSSSE3()) {
    // If we don't have PSHUFB then its worth avoiding an AND constant mask
    // by performing 3 byte shifts. Shuffle combining can kick in above that.
    // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Shift += Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
                      DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
  } else
    return SDValue();

  return DAG.getBitcast(VT, Res);
}
12853
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
///
/// On success sets \p ShiftVT/\p Opcode and returns the shift amount (in
/// elements of ShiftVT, or bytes for the DQ forms); returns -1 on failure.
/// \p MaskOffset selects which input is matched (0 for V1, Size for V2).
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                               unsigned ScalarSizeInBits, ArrayRef<int> Mask,
                               int MaskOffset, const APInt &Zeroable,
                               const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  // Are the positions that the shift would vacate all zeroable?
  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  // Do the surviving elements form a sequential run consistent with shifting
  // each Scale-element group by Shift positions? On match, set ShiftVT/Opcode
  // and return the shift amount.
  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    // Shifts wider than 64 bits must use the whole-lane byte-shift forms.
    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
12938
12939static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12940 SDValue V2, ArrayRef<int> Mask,
12941 const APInt &Zeroable,
12942 const X86Subtarget &Subtarget,
12943 SelectionDAG &DAG) {
12944 int Size = Mask.size();
12945 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size")((void)0);
12946
12947 MVT ShiftVT;
12948 SDValue V = V1;
12949 unsigned Opcode;
12950
12951 // Try to match shuffle against V1 shift.
12952 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12953 Mask, 0, Zeroable, Subtarget);
12954
12955 // If V1 failed, try to match shuffle against V2 shift.
12956 if (ShiftAmt < 0) {
12957 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12958 Mask, Size, Zeroable, Subtarget);
12959 V = V2;
12960 }
12961
12962 if (ShiftAmt < 0)
12963 return SDValue();
12964
12965 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&((void)0)
12966 "Illegal integer vector type")((void)0);
12967 V = DAG.getBitcast(ShiftVT, V);
12968 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12969 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12970 return DAG.getBitcast(VT, V);
12971}
12972
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
//
// On success rewrites V1 to the matched source and sets BitLen/BitIdx (masked
// to 6 bits, as the EXTRQ immediate fields require); returns false otherwise.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                ArrayRef<int> Mask, uint64_t &BitLen,
                                uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefUpperHalf(Mask))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable.
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    SDValue &V = (M < Size ? V1 : V2);
    M = M % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)
      return false;

    // First defined element fixes Src and the extraction offset; every later
    // element must agree with both.
    if (Idx < 0 || (Src == V && Idx == (M - i))) {
      Src = V;
      Idx = M - i;
      continue;
    }
    return false;
  }

  if (!Src || Idx < 0)
    return false;

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
13027
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
//
// On success rewrites V1/V2 to the base/inserted sources and sets
// BitLen/BitIdx (masked to the 6-bit immediate fields); returns false
// otherwise.
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                  ArrayRef<int> Mask, uint64_t &BitLen,
                                  uint64_t &BitIdx) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Upper half must be undefined.
  if (!isUndefUpperHalf(Mask))
    return false;

  // Try every insertion point in the lower half.
  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    // Base stays null when the prefix is all-undef; it may then be fixed by
    // the suffix match below, or left null (caller substitutes UNDEF).
    SDValue Base;

    // Attempt to match first source from mask before insertion point.
    if (isUndefInRange(Mask, 0, Idx)) {
      /* EMPTY */
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
      Base = V1;
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
      Base = V2;
    } else {
      continue;
    }

    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      SDValue Insert;
      int Len = Hi - Idx;

      // Match insertion.
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
        Insert = V1;
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
        Insert = V2;
      } else {
        continue;
      }

      // Match the remaining elements of the lower half.
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
        /* EMPTY */
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
        Base = V1;
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                            Size + Hi)) {
        Base = V2;
      } else {
        continue;
      }

      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
      V1 = Base;
      V2 = Insert;
      return true;
    }
  }

  return false;
}
13095
13096/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13097static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13098 SDValue V2, ArrayRef<int> Mask,
13099 const APInt &Zeroable, SelectionDAG &DAG) {
13100 uint64_t BitLen, BitIdx;
13101 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13102 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13103 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13104 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13105
13106 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13107 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13108 V2 ? V2 : DAG.getUNDEF(VT),
13109 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13110 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13111
13112 return SDValue();
13113}
13114
/// Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// begin and can start from an offsetted element index in the input; to
/// avoid excess shuffling the offset must either being in the bottom lane
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      // Source elements that would cross a 128-bit lane become undef.
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid a/zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);
    InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
                                    DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    // Whether the source word is in the odd or even half decides which
    // PSHUF(L/H)W moves it into place.
    unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(LoIdx, DL, MVT::i8)));

    if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      if ((i % Scale == 0 && SafeOffset(Idx))) {
        PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
        continue;
      }
      // 0x80 in a PSHUFB control byte zeroes the destination byte.
      PSHUFBMask[i] =
          AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    // Each unpack doubles the element width and halves the remaining scale.
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
13273
/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it if
  // valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
                                                 InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    // The entire upper half must be zeroable.
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
13401
13402/// Try to get a scalar value for a specific element of a vector.
13403///
13404/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13405static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13406 SelectionDAG &DAG) {
13407 MVT VT = V.getSimpleValueType();
13408 MVT EltVT = VT.getVectorElementType();
13409 V = peekThroughBitcasts(V);
13410
13411 // If the bitcasts shift the element size, we can't extract an equivalent
13412 // element from it.
13413 MVT NewVT = V.getSimpleValueType();
13414 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13415 return SDValue();
13416
13417 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13418 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13419 // Ensure the scalar operand is the same size as the destination.
13420 // FIXME: Add support for scalar truncation where possible.
13421 SDValue S = V.getOperand(Idx);
13422 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13423 return DAG.getBitcast(EltVT, S);
13424 }
13425
13426 return SDValue();
13427}
13428
13429/// Helper to test for a load that can be folded with x86 shuffles.
13430///
13431/// This is particularly important because the set of instructions varies
13432/// significantly based on whether the operand is a load or not.
13433static bool isShuffleFoldableLoad(SDValue V) {
13434 V = peekThroughBitcasts(V);
13435 return ISD::isNON_EXTLoad(V.getNode());
13436}
13437
/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  // Index of the first mask element taken from V2.
  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  // All other lanes must be zeroable for V1 to act as a zero vector.
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    if (!VT.is128BitVector())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  // Move the scalar into the low element, zeroing the rest of the vector.
  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      // Byte-shift the zero-extended scalar into position.
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
                       DAG.getTargetConstant(
                           V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
13535
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
                                            int BroadcastIdx,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  MVT EltVT = VT.getVectorElementType();
  MVT V0VT = V0.getSimpleValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  MVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  // Scale: how many VT elements fit inside one V0 element.
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  // SCALAR_TO_VECTOR only defines element 0; BUILD_VECTOR defines them all.
  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
13588
13589/// Test whether this can be lowered with a single SHUFPS instruction.
13590///
13591/// This is used to disable more specialized lowerings when the shufps lowering
13592/// will happen to be efficient.
13593static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13594 // This routine only handles 128-bit shufps.
13595 assert(Mask.size() == 4 && "Unsupported mask size!")((void)0);
13596 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!")((void)0);
13597 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!")((void)0);
13598 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!")((void)0);
13599 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!")((void)0);
13600
13601 // To lower with a single SHUFPS we need to have the low half and high half
13602 // each requiring a single input.
13603 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13604 return false;
13605 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13606 return false;
13607
13608 return true;
13609}
13610
/// If we are extracting two 128-bit halves of a vector and shuffling the
/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
/// multi-shuffle lowering.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
                                             SDValue N1, ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  MVT VT = N0.getSimpleValueType();
  assert((VT.is128BitVector() &&
          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
         "VPERM* family of shuffles requires 32-bit or 64-bit elements");

  // Check that both sources are extracts of the same source vector.
  if (!N0.hasOneUse() || !N1.hasOneUse() ||
      N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      N0.getOperand(0) != N1.getOperand(0))
    return SDValue();

  SDValue WideVec = N0.getOperand(0);
  MVT WideVT = WideVec.getSimpleValueType();
  if (!WideVT.is256BitVector())
    return SDValue();

  // Match extracts of each half of the wide source vector. Commute the shuffle
  // if the extract of the low half is N1.
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
  const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
  const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
  if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
    ShuffleVectorSDNode::commuteMask(NewMask);
  else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
    return SDValue();

  // Final bailout: if the mask is simple, we are better off using an extract
  // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
  // because that avoids a constant load from memory.
  if (NumElts == 4 &&
      (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
    return SDValue();

  // Extend the shuffle mask with undef elements.
  NewMask.append(NumElts, -1);

  // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
  SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
                                      NewMask);
  // This is free: ymm -> xmm.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
                     DAG.getIntPtrConstant(0, DL));
}
13662
/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
                                       SDValue V2, ArrayRef<int> Mask,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  // Broadcast support: SSE3 only for v2f64 (MOVDDUP), AVX for FP types,
  // AVX2 for integer types.
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumEltBits = VT.getScalarSizeInBits();
  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
                        ? X86ISD::MOVDDUP
                        : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = getSplatIndex(Mask);
  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  // TODO: Combine this logic with findEltLoadSrc() used by
  // EltsFromConsecutiveLoads().
  int BitOffset = BroadcastIdx * NumEltBits;
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      // Bitcasts don't move bits - keep the same BitOffset.
      V = V.getOperand(0);
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      // Step into the concat operand that holds the broadcast bits.
      int OpBitWidth = V.getOperand(0).getValueSizeInBits();
      int OpIdx = BitOffset / OpBitWidth;
      V = V.getOperand(OpIdx);
      BitOffset %= OpBitWidth;
      continue;
    }
    case ISD::EXTRACT_SUBVECTOR: {
      // The extraction index adds to the existing offset.
      unsigned EltBitWidth = V.getScalarValueSizeInBits();
      unsigned Idx = V.getConstantOperandVal(1);
      unsigned BeginOffset = Idx * EltBitWidth;
      BitOffset += BeginOffset;
      V = V.getOperand(0);
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      // Follow into the inserted subvector only if it covers the offset;
      // otherwise keep looking through the outer vector.
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      int EltBitWidth = VOuter.getScalarValueSizeInBits();
      int Idx = (int)V.getConstantOperandVal(2);
      int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
      int BeginOffset = Idx * EltBitWidth;
      int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
      if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
        BitOffset -= BeginOffset;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    // Any other opcode terminates the walk.
    break;
  }
  assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
  BroadcastIdx = BitOffset / NumEltBits;

  // Do we need to bitcast the source to retrieve the original broadcast index?
  bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // If the original value has a larger element type than the shuffle, the
  // broadcast element is in essence truncated. Make that explicit to ease
  // folding.
  if (BitCastSrc && VT.isInteger())
    if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
            DL, VT, V, BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  // Also check the simpler case, where we can directly reuse the scalar.
  if (!BitCastSrc &&
      ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (ISD::isNormalLoad(V.getNode()) &&
             cast<LoadSDNode>(V)->isSimple()) {
    // We do not check for one-use of the vector load because a broadcast load
    // is expected to be a win for code size, register pressure, and possibly
    // uops even if the original vector load is not eliminated.

    // Reduce the vector load and shuffle to a broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(V);
    SDValue BaseAddr = Ld->getOperand(1);
    MVT SVT = VT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
    SDValue NewAddr =
        DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);

    // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
    // than MOVDDUP.
    // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
    if (Opcode == X86ISD::VBROADCAST) {
      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = {Ld->getChain(), NewAddr};
      V = DAG.getMemIntrinsicNode(
          X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
          DAG.getMachineFunction().getMachineMemOperand(
              Ld->getMemOperand(), Offset, SVT.getStoreSize()));
      DAG.makeEquivalentMemoryOrdering(Ld, V);
      return DAG.getBitcast(VT, V);
    }
    assert(SVT == MVT::f64 && "Unexpected VT!");
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    DAG.makeEquivalentMemoryOrdering(Ld, V);
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BitOffset != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    if ((BitOffset % 128) != 0)
      return SDValue();

    assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
           "Unexpected bit-offset");
    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
           "Unexpected vector size");
    unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
    V = extract128BitVector(V, ExtractIdx, DAG, DL);
  }

  // On AVX we can use VBROADCAST directly for scalar sources.
  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
    V = DAG.getBitcast(MVT::f64, V);
    if (Subtarget.hasAVX()) {
      V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
      return DAG.getBitcast(VT, V);
    }
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
  }

  // If this is a scalar, do the broadcast on this type and bitcast.
  if (!V.getValueType().isVector()) {
    assert(V.getScalarValueSizeInBits() == NumEltBits &&
           "Unexpected scalar size");
    MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
                                       VT.getVectorNumElements());
    return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (V.getValueSizeInBits() > 128)
    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);

  // Otherwise cast V to a vector with the same element type as VT, but
  // possibly narrower than VT. Then perform the broadcast.
  unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
  MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
  return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
}
13854
13855// Check for whether we can use INSERTPS to perform the shuffle. We only use
13856// INSERTPS when the V1 elements are already in the correct locations
13857// because otherwise we can just always use two SHUFPS instructions which
13858// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13859// perform INSERTPS if a single V1 element is out of place and all V2
13860// elements are zeroable.
// On success V1 and V2 are rewritten in-place to the two operands the
// INSERTPS node should take, and InsertPSMask is set to the 8-bit immediate:
// bits [7:6] = source element, bits [5:4] = destination element, bits [3:0]
// = zero mask (see the "VBSrcIndex << 6 | VBDstIndex << 4 | ZMask" packing
// below). On failure (returns false) V1/V2/InsertPSMask are left untouched,
// since the lambda only writes them on its success path.
13861 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13862 unsigned &InsertPSMask,
13863 const APInt &Zeroable,
13864 ArrayRef<int> Mask, SelectionDAG &DAG) {
13865 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!")((void)0);
13866 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!")((void)0);
13867 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
13868
13869 // Attempt to match INSERTPS with one element from VA or VB being
13870 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13871 // are updated.
13872 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13873 ArrayRef<int> CandidateMask) {
13874 unsigned ZMask = 0;
// Destination lane of the single out-of-place element, -1 while unset.
13875 int VADstIndex = -1;
13876 int VBDstIndex = -1;
13877 bool VAUsedInPlace = false;
13878
13879 for (int i = 0; i < 4; ++i) {
13880 // Synthesize a zero mask from the zeroable elements (includes undefs).
13881 if (Zeroable[i]) {
13882 ZMask |= 1 << i;
13883 continue;
13884 }
13885
13886 // Flag if we use any VA inputs in place.
13887 if (i == CandidateMask[i]) {
13888 VAUsedInPlace = true;
13889 continue;
13890 }
13891
13892 // We can only insert a single non-zeroable element.
13893 if (VADstIndex >= 0 || VBDstIndex >= 0)
13894 return false;
13895
13896 if (CandidateMask[i] < 4) {
13897 // VA input out of place for insertion.
13898 VADstIndex = i;
13899 } else {
13900 // VB input for insertion.
13901 VBDstIndex = i;
13902 }
13903 }
13904
13905 // Don't bother if we have no (non-zeroable) element for insertion.
13906 if (VADstIndex < 0 && VBDstIndex < 0)
13907 return false;
13908
13909 // Determine element insertion src/dst indices. The src index is from the
13910 // start of the inserted vector, not the start of the concatenated vector.
13911 unsigned VBSrcIndex = 0;
13912 if (VADstIndex >= 0) {
13913 // If we have a VA input out of place, we use VA as the V2 element
13914 // insertion and don't use the original V2 at all.
13915 VBSrcIndex = CandidateMask[VADstIndex];
13916 VBDstIndex = VADstIndex;
13917 VB = VA;
13918 } else {
13919 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13920 }
13921
13922 // If no V1 inputs are used in place, then the result is created only from
13923 // the zero mask and the V2 insertion - so remove V1 dependency.
13924 if (!VAUsedInPlace)
13925 VA = DAG.getUNDEF(MVT::v4f32);
13926
13927 // Update V1, V2 and InsertPSMask accordingly.
13928 V1 = VA;
13929 V2 = VB;
13930
13931 // Insert the V2 element into the desired position.
// Pack the INSERTPS imm8: src element in [7:6], dst in [5:4], zmask in [3:0].
13932 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13933 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!")((void)0);
13934 return true;
13935 };
13936
13937 if (matchAsInsertPS(V1, V2, Mask))
13938 return true;
13939
13940 // Commute and try again.
13941 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13942 ShuffleVectorSDNode::commuteMask(CommutedMask);
13943 if (matchAsInsertPS(V2, V1, CommutedMask))
13944 return true;
13945
13946 return false;
13947}
13948
// Lower a v4f32 shuffle to a single X86ISD::INSERTPS node when the mask
// (together with the zeroable elements) matches the INSERTPS pattern.
// Returns an empty SDValue on no match so callers can try other strategies.
13949 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13950 ArrayRef<int> Mask, const APInt &Zeroable,
13951 SelectionDAG &DAG) {
13952 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((void)0);
13953 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((void)0);
13954
13955 // Attempt to match the insertps pattern.
// On success the matcher rewrites V1/V2 to the node operands and fills in
// the 8-bit INSERTPS immediate.
13956 unsigned InsertPSMask = 0;
13957 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13958 return SDValue();
13959
13960 // Insert the V2 element into the desired position.
13961 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13962 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13963}
13964
13965/// Try to lower a shuffle as a permute of the inputs followed by an
13966/// UNPCK instruction.
13967///
13968/// This specifically targets cases where we end up with alternating between
13969/// the two inputs, and so can permute them into something that feeds a single
13970/// UNPCK instruction. Note that this routine only targets integer vectors
13971/// because for floating point vectors we have a generalized SHUFPS lowering
13972/// strategy that handles everything that doesn't *exactly* match an unpack,
13973/// making this clever lowering unnecessary.
/// Returns an empty SDValue when no profitable permute+unpack sequence is
/// found at any granularity.
13974 static SDValue lowerShuffleAsPermuteAndUnpack(
13975 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13976 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13977 assert(!VT.isFloatingPoint() &&((void)0)
13978 "This routine only supports integer vectors.")((void)0);
13979 assert(VT.is128BitVector() &&((void)0)
13980 "This routine only works on 128-bit vectors.")((void)0);
13981 assert(!V2.isUndef() &&((void)0)
13982 "This routine should only be used when blending two inputs.")((void)0);
13983 assert(Mask.size() >= 2 && "Single element masks are invalid.")((void)0);
13984
13985 int Size = Mask.size();
13986
// Count how many mask inputs come from the low vs. high half of a source.
13987 int NumLoInputs =
13988 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13989 int NumHiInputs =
13990 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13991
// Unpack whichever half supplies the majority of the mask inputs.
13992 bool UnpackLo = NumLoInputs >= NumHiInputs;
13993
// Attempt the lowering with ScalarSize-bit unpack elements; Scale is the
// number of original mask elements covered by one unpacked element. Returns
// an empty SDValue when this granularity cannot express the mask.
13994 auto TryUnpack = [&](int ScalarSize, int Scale) {
13995 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13996 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13997
13998 for (int i = 0; i < Size; ++i) {
13999 if (Mask[i] < 0)
14000 continue;
14001
14002 // Each element of the unpack contains Scale elements from this mask.
14003 int UnpackIdx = i / Scale;
14004
14005 // We only handle the case where V1 feeds the first slots of the unpack.
14006 // We rely on canonicalization to ensure this is the case.
14007 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14008 return SDValue();
14009
14010 // Setup the mask for this input. The indexing is tricky as we have to
14011 // handle the unpack stride.
14012 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14013 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14014 Mask[i] % Size;
14015 }
14016
14017 // If we will have to shuffle both inputs to use the unpack, check whether
14018 // we can just unpack first and shuffle the result. If so, skip this unpack.
14019 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14020 !isNoopShuffleMask(V2Mask))
14021 return SDValue();
14022
14023 // Shuffle the inputs into place.
14024 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14025 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14026
14027 // Cast the inputs to the type we will use to unpack them.
14028 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14029 V1 = DAG.getBitcast(UnpackVT, V1);
14030 V2 = DAG.getBitcast(UnpackVT, V2);
14031
14032 // Unpack the inputs and cast the result back to the desired type.
14033 return DAG.getBitcast(
14034 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14035 UnpackVT, V1, V2));
14036 };
14037
14038 // We try each unpack from the largest to the smallest to try and find one
14039 // that fits this mask.
14040 int OrigScalarSize = VT.getScalarSizeInBits();
14041 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14042 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14043 return Unpack;
14044
14045 // If we're shuffling with a zero vector then we're better off not doing
14046 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14047 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14048 ISD::isBuildVectorAllZeros(V2.getNode()))
14049 return SDValue();
14050
14051 // If none of the unpack-rooted lowerings worked (or were profitable) try an
14052 // initial unpack.
14053 if (NumLoInputs == 0 || NumHiInputs == 0) {
14054 assert((NumLoInputs > 0 || NumHiInputs > 0) &&((void)0)
14055 "We have to have *some* inputs!")((void)0);
14056 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14057
14058 // FIXME: We could consider the total complexity of the permute of each
14059 // possible unpacking. Or at the least we should consider how many
14060 // half-crossings are created.
14061 // FIXME: We could consider commuting the unpacks.
14062
14063 SmallVector<int, 32> PermMask((unsigned)Size, -1);
14064 for (int i = 0; i < Size; ++i) {
14065 if (Mask[i] < 0)
14066 continue;
14067
14068 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!")((void)0);
14069
// Unpacked results interleave V1/V2 elements, hence the *2 and the
// 0/1 selector for which input the element originally came from.
14070 PermMask[i] =
14071 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14072 }
14073 return DAG.getVectorShuffle(
14074 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14075 DL, VT, V1, V2),
14076 DAG.getUNDEF(VT), PermMask);
14077 }
14078
14079 return SDValue();
14080}
14081
14082/// Handle lowering of 2-lane 64-bit floating point shuffles.
14083///
14084/// This is the basis function for the 2-lane 64-bit shuffles as we have full
14085/// support for floating point shuffles but not integer shuffles. These
14086/// instructions will incur a domain crossing penalty on some chips though so
14087/// it is better to avoid lowering through this for integer vectors where
14088/// possible.
14089 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14090 const APInt &Zeroable, SDValue V1, SDValue V2,
14091 const X86Subtarget &Subtarget,
14092 SelectionDAG &DAG) {
14093 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!")((void)0);
14094 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!")((void)0);
14095 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!")((void)0);
14096
// Single-input case: V2 is undef, so only V1's two lanes matter.
14097 if (V2.isUndef()) {
14098 // Check for being able to broadcast a single element.
14099 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14100 Mask, Subtarget, DAG))
14101 return Broadcast;
14102
14103 // Straight shuffle of a single input vector. Simulate this by using the
14104 // single input as both of the "inputs" to this instruction..
// SHUFPD-style immediate: bit i selects which lane of the source feeds
// result element i.
14105 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14106
14107 if (Subtarget.hasAVX()) {
14108 // If we have AVX, we can use VPERMILPS which will allow folding a load
14109 // into the shuffle.
14110 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14111 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14112 }
14113
14114 return DAG.getNode(
14115 X86ISD::SHUFP, DL, MVT::v2f64,
14116 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14117 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14118 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14119 }
14120 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!")((void)0);
14121 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!")((void)0);
14122 assert(Mask[0] < 2 && "We sort V1 to be the first input.")((void)0);
14123 assert(Mask[1] >= 2 && "We sort V2 to be the second input.")((void)0);
14124
14125 if (Subtarget.hasAVX2())
14126 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14127 return Extract;
14128
14129 // When loading a scalar and then shuffling it into a vector we can often do
14130 // the insertion cheaply.
14131 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14132 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14133 return Insertion;
14134 // Try inverting the insertion since for v2 masks it is easy to do and we
14135 // can't reliably sort the mask one way or the other.
// (Index ^ 2 moves an index between the V1 range [0,2) and the V2 range
// [2,4), matching the swapped (V2, V1) operand order below.)
14136 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14137 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14138 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14139 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14140 return Insertion;
14141
14142 // Try to use one of the special instruction patterns to handle two common
14143 // blend patterns if a zero-blend above didn't work.
14144 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14145 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14146 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14147 // We can either use a special instruction to load over the low double or
14148 // to move just the low double.
14149 return DAG.getNode(
14150 X86ISD::MOVSD, DL, MVT::v2f64, V2,
14151 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14152
14153 if (Subtarget.hasSSE41())
14154 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14155 Zeroable, Subtarget, DAG))
14156 return Blend;
14157
14158 // Use dedicated unpack instructions for masks that match their pattern.
14159 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14160 return V;
14161
// Final fallback: a two-input SHUFPD. Mask[1] is rebased by -2 since it
// indexes the second operand.
14162 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14163 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14164 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14165}
14166
14167/// Handle lowering of 2-lane 64-bit integer shuffles.
14168///
14169/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14170/// the integer unit to minimize domain crossing penalties. However, for blends
14171/// it falls back to the floating point shuffle operation with appropriate bit
14172/// casting.
14173 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14174 const APInt &Zeroable, SDValue V1, SDValue V2,
14175 const X86Subtarget &Subtarget,
14176 SelectionDAG &DAG) {
14177 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!")((void)0);
14178 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!")((void)0);
14179 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!")((void)0);
14180
// Single-input case: V2 is undef, so only V1's two lanes matter.
14181 if (V2.isUndef()) {
14182 // Check for being able to broadcast a single element.
14183 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14184 Mask, Subtarget, DAG))
14185 return Broadcast;
14186
14187 // Straight shuffle of a single input vector. For everything from SSE2
14188 // onward this has a single fast instruction with no scary immediates.
14189 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14190 V1 = DAG.getBitcast(MVT::v4i32, V1);
// Each v2i64 lane k expands to the v4i32 lane pair (2k, 2k+1).
14191 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14192 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14193 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14194 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14195 return DAG.getBitcast(
14196 MVT::v2i64,
14197 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14198 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14199 }
14200 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!")((void)0);
14201 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!")((void)0);
14202 assert(Mask[0] < 2 && "We sort V1 to be the first input.")((void)0);
14203 assert(Mask[1] >= 2 && "We sort V2 to be the second input.")((void)0);
14204
14205 if (Subtarget.hasAVX2())
14206 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14207 return Extract;
14208
14209 // Try to use shift instructions.
14210 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14211 Zeroable, Subtarget, DAG))
14212 return Shift;
14213
14214 // When loading a scalar and then shuffling it into a vector we can often do
14215 // the insertion cheaply.
14216 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14217 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14218 return Insertion;
14219 // Try inverting the insertion since for v2 masks it is easy to do and we
14220 // can't reliably sort the mask one way or the other.
// XOR with 2 moves each index to the other input's range [0,2) <-> [2,4),
// matching the swapped (V2, V1) operand order below.
14221 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14222 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14223 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14224 return Insertion;
14225
14226 // We have different paths for blend lowering, but they all must use the
14227 // *exact* same predicate.
14228 bool IsBlendSupported = Subtarget.hasSSE41();
14229 if (IsBlendSupported)
14230 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14231 Zeroable, Subtarget, DAG))
14232 return Blend;
14233
14234 // Use dedicated unpack instructions for masks that match their pattern.
14235 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14236 return V;
14237
14238 // Try to use byte rotation instructions.
14239 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
14240 if (Subtarget.hasSSSE3()) {
14241 if (Subtarget.hasVLX())
14242 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14243 Subtarget, DAG))
14244 return Rotate;
14245
14246 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14247 Subtarget, DAG))
14248 return Rotate;
14249 }
14250
14251 // If we have direct support for blends, we should lower by decomposing into
14252 // a permute. That will be faster than the domain cross.
14253 if (IsBlendSupported)
14254 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14255 Subtarget, DAG);
14256
14257 // We implement this with SHUFPD which is pretty lame because it will likely
14258 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14259 // However, all the alternatives are still more cycles and newer chips don't
14260 // have this problem. It would be really nice if x86 had better shuffles here.
14261 V1 = DAG.getBitcast(MVT::v2f64, V1);
14262 V2 = DAG.getBitcast(MVT::v2f64, V2);
14263 return DAG.getBitcast(MVT::v2i64,
14264 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14265}
14266
14267/// Lower a vector shuffle using the SHUFPS instruction.
14268///
14269/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14270/// It makes no assumptions about whether this is the *best* lowering, it simply
14271/// uses it.
14272 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14273 ArrayRef<int> Mask, SDValue V1,
14274 SDValue V2, SelectionDAG &DAG) {
// SHUFPS takes its low two result elements from the first operand and its
// high two from the second; LowV/HighV and NewMask are rewritten below so
// the final node at the bottom honors that constraint.
14275 SDValue LowV = V1, HighV = V2;
14276 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
// Count result elements sourced from V2 (mask indices 4-7).
14277 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14278
14279 if (NumV2Elements == 1) {
14280 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14281
14282 // Compute the index adjacent to V2Index and in the same half by toggling
14283 // the low bit.
14284 int V2AdjIndex = V2Index ^ 1;
14285
14286 if (Mask[V2AdjIndex] < 0) {
14287 // Handles all the cases where we have a single V2 element and an undef.
14288 // This will only ever happen in the high lanes because we commute the
14289 // vector otherwise.
14290 if (V2Index < 2)
14291 std::swap(LowV, HighV);
14292 NewMask[V2Index] -= 4;
14293 } else {
14294 // Handle the case where the V2 element ends up adjacent to a V1 element.
14295 // To make this work, blend them together as the first step.
14296 int V1Index = V2AdjIndex;
14297 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14298 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14299 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14300
14301 // Now proceed to reconstruct the final blend as we have the necessary
14302 // high or low half formed.
14303 if (V2Index < 2) {
14304 LowV = V2;
14305 HighV = V1;
14306 } else {
14307 HighV = V2;
14308 }
14309 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14310 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14311 }
14312 } else if (NumV2Elements == 2) {
14313 if (Mask[0] < 4 && Mask[1] < 4) {
14314 // Handle the easy case where we have V1 in the low lanes and V2 in the
14315 // high lanes.
14316 NewMask[2] -= 4;
14317 NewMask[3] -= 4;
14318 } else if (Mask[2] < 4 && Mask[3] < 4) {
14319 // We also handle the reversed case because this utility may get called
14320 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14321 // arrange things in the right direction.
14322 NewMask[0] -= 4;
14323 NewMask[1] -= 4;
14324 HighV = V1;
14325 LowV = V2;
14326 } else {
14327 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14328 // trying to place elements directly, just blend them and set up the final
14329 // shuffle to place them.
14330
14331 // The first two blend mask elements are for V1, the second two are for
14332 // V2.
14333 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14334 Mask[2] < 4 ? Mask[2] : Mask[3],
14335 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14336 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14337 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14338 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14339
14340 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14341 // a blend.
14342 LowV = HighV = V1;
14343 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14344 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14345 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14346 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14347 }
14348 } else if (NumV2Elements == 3) {
14349 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14350 // we can get here due to other paths (e.g repeated mask matching) that we
14351 // don't want to do another round of lowerVECTOR_SHUFFLE.
// After commuting, at most one element refers to the (new) second input,
// so this self-recursion happens at most once.
14352 ShuffleVectorSDNode::commuteMask(NewMask);
14353 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14354 }
14355 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14356 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14357}
14358
14359/// Lower 4-lane 32-bit floating point shuffles.
14360///
14361/// Uses instructions exclusively from the floating point unit to minimize
14362/// domain crossing penalties, as these are sufficient to implement all v4f32
14363/// shuffles.
14364 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14365 const APInt &Zeroable, SDValue V1, SDValue V2,
14366 const X86Subtarget &Subtarget,
14367 SelectionDAG &DAG) {
14368 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((void)0);
14369 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((void)0);
14370 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
14371
// Count result elements sourced from V2 (mask indices 4-7).
14372 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14373
// Single-input case: every element comes from V1 (or is undef/zeroable).
14374 if (NumV2Elements == 0) {
14375 // Check for being able to broadcast a single element.
14376 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14377 Mask, Subtarget, DAG))
14378 return Broadcast;
14379
14380 // Use even/odd duplicate instructions for masks that match their pattern.
14381 if (Subtarget.hasSSE3()) {
14382 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14383 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14384 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14385 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14386 }
14387
14388 if (Subtarget.hasAVX()) {
14389 // If we have AVX, we can use VPERMILPS which will allow folding a load
14390 // into the shuffle.
14391 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14392 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14393 }
14394
14395 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14396 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14397 if (!Subtarget.hasSSE2()) {
14398 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14399 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14400 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14401 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14402 }
14403
14404 // Otherwise, use a straight shuffle of a single input vector. We pass the
14405 // input vector to both operands to simulate this with a SHUFPS.
14406 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14407 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14408 }
14409
14410 if (Subtarget.hasAVX2())
14411 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14412 return Extract;
14413
14414 // There are special ways we can lower some single-element blends. However, we
14415 // have custom ways we can lower more complex single-element blends below that
14416 // we defer to if both this and BLENDPS fail to match, so restrict this to
14417 // when the V2 input is targeting element 0 of the mask -- that is the fast
14418 // case here.
14419 if (NumV2Elements == 1 && Mask[0] >= 4)
14420 if (SDValue V = lowerShuffleAsElementInsertion(
14421 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14422 return V;
14423
14424 if (Subtarget.hasSSE41()) {
14425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14426 Zeroable, Subtarget, DAG))
14427 return Blend;
14428
14429 // Use INSERTPS if we can complete the shuffle efficiently.
14430 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14431 return V;
14432
// Only try blend+permute when a single SHUFPS can't already do the job.
14433 if (!isSingleSHUFPSMask(Mask))
14434 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14435 V2, Mask, DAG))
14436 return BlendPerm;
14437 }
14438
14439 // Use low/high mov instructions. These are only valid in SSE1 because
14440 // otherwise they are widened to v2f64 and never get here.
14441 if (!Subtarget.hasSSE2()) {
14442 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14443 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14444 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14445 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14446 }
14447
14448 // Use dedicated unpack instructions for masks that match their pattern.
14449 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14450 return V;
14451
14452 // Otherwise fall back to a SHUFPS lowering strategy.
14453 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14454}
14455
14456/// Lower 4-lane i32 vector shuffles.
14457///
14458/// We try to handle these with integer-domain shuffles where we can, but for
14459/// blends we use the floating point domain blend instructions.
14460 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14461 const APInt &Zeroable, SDValue V1, SDValue V2,
14462 const X86Subtarget &Subtarget,
14463 SelectionDAG &DAG) {
14464 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!")((void)0);
14465 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!")((void)0);
14466 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
14467
14468 // Whenever we can lower this as a zext, that instruction is strictly faster
14469 // than any alternative. It also allows us to fold memory operands into the
14470 // shuffle in many cases.
14471 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14472 Zeroable, Subtarget, DAG))
14473 return ZExt;
14474
// Count result elements sourced from V2 (mask indices 4-7).
14475 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14476
// Single-input case: every element comes from V1 (or is undef/zeroable).
14477 if (NumV2Elements == 0) {
14478 // Try to use broadcast unless the mask only has one non-undef element.
14479 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14480 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14481 Mask, Subtarget, DAG))
14482 return Broadcast;
14483 }
14484
14485 // Straight shuffle of a single input vector. For everything from SSE2
14486 // onward this has a single fast instruction with no scary immediates.
14487 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14488 // but we aren't actually going to use the UNPCK instruction because doing
14489 // so prevents folding a load into this instruction or making a copy.
14490 const int UnpackLoMask[] = {0, 0, 1, 1};
14491 const int UnpackHiMask[] = {2, 2, 3, 3};
14492 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14493 Mask = UnpackLoMask;
14494 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14495 Mask = UnpackHiMask;
14496
14497 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14498 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14499 }
14500
14501 if (Subtarget.hasAVX2())
14502 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14503 return Extract;
14504
14505 // Try to use shift instructions.
14506 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14507 Zeroable, Subtarget, DAG))
14508 return Shift;
14509
14510 // There are special ways we can lower some single-element blends.
14511 if (NumV2Elements == 1)
14512 if (SDValue V = lowerShuffleAsElementInsertion(
14513 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14514 return V;
14515
14516 // We have different paths for blend lowering, but they all must use the
14517 // *exact* same predicate.
14518 bool IsBlendSupported = Subtarget.hasSSE41();
14519 if (IsBlendSupported)
14520 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14521 Zeroable, Subtarget, DAG))
14522 return Blend;
14523
14524 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14525 Zeroable, Subtarget, DAG))
14526 return Masked;
14527
14528 // Use dedicated unpack instructions for masks that match their pattern.
14529 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14530 return V;
14531
14532 // Try to use byte rotation instructions.
14533 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
14534 if (Subtarget.hasSSSE3()) {
14535 if (Subtarget.hasVLX())
14536 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14537 Subtarget, DAG))
14538 return Rotate;
14539
14540 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14541 Subtarget, DAG))
14542 return Rotate;
14543 }
14544
14545 // Assume that a single SHUFPS is faster than an alternative sequence of
14546 // multiple instructions (even if the CPU has a domain penalty).
14547 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14548 if (!isSingleSHUFPSMask(Mask)) {
14549 // If we have direct support for blends, we should lower by decomposing into
14550 // a permute. That will be faster than the domain cross.
14551 if (IsBlendSupported)
14552 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14553 Subtarget, DAG);
14554
14555 // Try to lower by permuting the inputs into an unpack instruction.
14556 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14557 Mask, Subtarget, DAG))
14558 return Unpack;
14559 }
14560
14561 // We implement this with SHUFPS because it can blend from two vectors.
14562 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14563 // up the inputs, bypassing domain shift penalties that we would incur if we
14564 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14565 // relevant.
14566 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14567 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14568 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14569 return DAG.getBitcast(MVT::v4i32, ShufPS);
14570}
14571
14572/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14573/// shuffle lowering, and the most complex part.
14574///
14575/// The lowering strategy is to try to form pairs of input lanes which are
14576/// targeted at the same half of the final vector, and then use a dword shuffle
14577/// to place them onto the right half, and finally unpack the paired lanes into
14578/// their final position.
14579///
14580/// The exact breakdown of how to form these dword pairs and align them on the
14581/// correct sides is really tricky. See the comments within the function for
14582/// more of the details.
14583///
14584/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14585/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14586/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14587/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputShuffle(
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  // A dword-element type of the same total width, used whenever we fall back
  // to PSHUFD on pairs of i16 lanes.
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  // Split the 8-element mask into its low and high 4-element halves. These
  // are mutable views into Mask: edits below update the caller-visible mask.
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  // Attempt to directly match PSHUFLW or PSHUFHW.
  if (isUndefOrInRange(LoMask, 0, 4) &&
      isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
    return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                       getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
  }
  if (isUndefOrInRange(HiMask, 4, 8) &&
      isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
    // Rebase the high-half entries to [0,4) before encoding the immediate.
    for (int i = 0; i != 4; ++i)
      HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
    return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                       getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
  }

  // Collect the unique, sorted source lanes referenced by each destination
  // half. The "XToY" naming below means: inputs that currently live in source
  // half X and are destined for destination half Y (L = low, H = high).
  SmallVector<int, 4> LoInputs;
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  array_pod_sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  array_pod_sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  // Because the input lists are sorted, lower_bound(…, 4) splits each list at
  // the low/high half boundary (lanes 0-3 vs 4-7).
  int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

  // If we are shuffling values from one half - check how many different DWORD
  // pairs we need to create. If only 1 or 2 then we can perform this as a
  // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
  auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
                               ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
    int PSHUFDMask[4] = { -1, -1, -1, -1 };
    SmallVector<std::pair<int, int>, 4> DWordPairs;
    // When all inputs come from the high half, the word shuffle is a PSHUFHW,
    // so the pairs it forms land in dwords 2-3; hence the offset of 2.
    int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

    // Collect the different DWORD pairs.
    for (int DWord = 0; DWord != 4; ++DWord) {
      int M0 = Mask[2 * DWord + 0];
      int M1 = Mask[2 * DWord + 1];
      // Normalize sources into [0,4) within their (single) source half.
      M0 = (M0 >= 0 ? M0 % 4 : M0);
      M1 = (M1 >= 0 ? M1 % 4 : M1);
      if (M0 < 0 && M1 < 0)
        continue;

      // Reuse an existing pair if it is compatible (undef slots may be
      // refined in place), otherwise record a new pair.
      bool Match = false;
      for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
        auto &DWordPair = DWordPairs[j];
        if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
            (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
          DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
          DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
          PSHUFDMask[DWord] = DOffset + j;
          Match = true;
          break;
        }
      }
      if (!Match) {
        PSHUFDMask[DWord] = DOffset + DWordPairs.size();
        DWordPairs.push_back(std::make_pair(M0, M1));
      }
    }

    if (DWordPairs.size() <= 2) {
      DWordPairs.resize(2, std::make_pair(-1, -1));
      int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
                              DWordPairs[1].first, DWordPairs[1].second};
      if ((NumHToL + NumHToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
      if ((NumLToL + NumLToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
    }
  }

  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine away any sequence of PSHUFD instructions that
  // result into a single instruction. Here is an example of the tricky case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  //
  // "A" is the half with 3-or-1 inputs being balanced; "B" is the other half.
  // AOffset/BOffset are the lane offsets (0 or 4) of those halves.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    bool ThreeAInputs = AToAInputs.size() == 3;

    // Compute the index of dword with only one word among the three inputs in
    // a half by taking the sum of the half with three inputs and subtracting
    // the sum of the actual three inputs. The difference is the remaining
    // slot.
    int ADWord = 0, BDWord = 0;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;

    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
    // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need
      // to balance this to ensure we don't form a 3-1 shuffle in the other
      // half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this bit
          // in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(
              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          // Reflect the word swap we just emitted in the outstanding mask.
          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
              M = FixFreeIdx;
            else if (M >= 0 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }

    // Swap the selected A and B dwords across the half boundary.
    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M >= 0 && M/2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M/2 == BDWord)
        M = 2 * ADWord + M % 2;

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
    // A word slot is "clobbered" when the half mask already routes a
    // different word through it.
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;

    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else {
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          }
          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }
      return;
    }

    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // Relocate the single input into the first free slot of its half.
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is no
          // free slot adjacent to one of the inputs. In this case, we have to
          // swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }

    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

  // Now enact all the shuffles we've computed to move the inputs into their
  // target half.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}
15081
15082/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15083/// blend if only one input is used.
15084static SDValue lowerShuffleAsBlendOfPSHUFBs(
15085 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15086 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15087 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&((void)0)
15088 "Lane crossing shuffle masks not supported")((void)0);
15089
15090 int NumBytes = VT.getSizeInBits() / 8;
15091 int Size = Mask.size();
15092 int Scale = NumBytes / Size;
15093
15094 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15095 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15096 V1InUse = false;
15097 V2InUse = false;
15098
15099 for (int i = 0; i < NumBytes; ++i) {
15100 int M = Mask[i / Scale];
15101 if (M < 0)
15102 continue;
15103
15104 const int ZeroMask = 0x80;
15105 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15106 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15107 if (Zeroable[i / Scale])
15108 V1Idx = V2Idx = ZeroMask;
15109
15110 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15111 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15112 V1InUse |= (ZeroMask != V1Idx);
15113 V2InUse |= (ZeroMask != V2Idx);
15114 }
15115
15116 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15117 if (V1InUse)
15118 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15119 DAG.getBuildVector(ShufVT, DL, V1Mask));
15120 if (V2InUse)
15121 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15122 DAG.getBuildVector(ShufVT, DL, V2Mask));
15123
15124 // If we need shuffled inputs from both, blend the two.
15125 SDValue V;
15126 if (V1InUse && V2InUse)
15127 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15128 else
15129 V = V1InUse ? V1 : V2;
15130
15131 // Cast the result back to the correct type.
15132 return DAG.getBitcast(VT, V);
15133}
15134
15135/// Generic lowering of 8-lane i16 shuffles.
15136///
15137/// This handles both single-input shuffles and combined shuffle/blends with
15138/// two inputs. The single input shuffles are immediately delegated to
15139/// a dedicated lowering routine.
15140///
15141/// The blends are lowered in one of three fundamental ways. If there are few
15142/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15143/// of the input is significantly cheaper when lowered as an interleaving of
15144/// the two inputs, try to interleave them. Otherwise, blend the low and high
15145/// halves of the inputs separately (making them have relatively few inputs)
15146/// and then concatenate them.
15147static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15148 const APInt &Zeroable, SDValue V1, SDValue V2,
15149 const X86Subtarget &Subtarget,
15150 SelectionDAG &DAG) {
15151 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")((void)0);
15152 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")((void)0);
15153 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
15154
15155 // Whenever we can lower this as a zext, that instruction is strictly faster
15156 // than any alternative.
15157 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15158 Zeroable, Subtarget, DAG))
15159 return ZExt;
15160
// Try to lower using a truncation.
15162 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15163 Subtarget, DAG))
15164 return V;
15165
15166 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15167
15168 if (NumV2Inputs == 0) {
15169 // Try to use shift instructions.
15170 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15171 Zeroable, Subtarget, DAG))
15172 return Shift;
15173
15174 // Check for being able to broadcast a single element.
15175 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15176 Mask, Subtarget, DAG))
15177 return Broadcast;
15178
15179 // Try to use bit rotation instructions.
15180 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15181 Subtarget, DAG))
15182 return Rotate;
15183
15184 // Use dedicated unpack instructions for masks that match their pattern.
15185 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15186 return V;
15187
15188 // Use dedicated pack instructions for masks that match their pattern.
15189 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15190 Subtarget))
15191 return V;
15192
15193 // Try to use byte rotation instructions.
15194 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15195 Subtarget, DAG))
15196 return Rotate;
15197
15198 // Make a copy of the mask so it can be modified.
15199 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15200 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15201 Subtarget, DAG);
15202 }
15203
15204 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&((void)0)
15205 "All single-input shuffles should be canonicalized to be V1-input "((void)0)
15206 "shuffles.")((void)0);
15207
15208 // Try to use shift instructions.
15209 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15210 Zeroable, Subtarget, DAG))
15211 return Shift;
15212
15213 // See if we can use SSE4A Extraction / Insertion.
15214 if (Subtarget.hasSSE4A())
15215 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15216 Zeroable, DAG))
15217 return V;
15218
15219 // There are special ways we can lower some single-element blends.
15220 if (NumV2Inputs == 1)
15221 if (SDValue V = lowerShuffleAsElementInsertion(
15222 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15223 return V;
15224
15225 // We have different paths for blend lowering, but they all must use the
15226 // *exact* same predicate.
15227 bool IsBlendSupported = Subtarget.hasSSE41();
15228 if (IsBlendSupported)
15229 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15230 Zeroable, Subtarget, DAG))
15231 return Blend;
15232
15233 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15234 Zeroable, Subtarget, DAG))
15235 return Masked;
15236
15237 // Use dedicated unpack instructions for masks that match their pattern.
15238 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15239 return V;
15240
15241 // Use dedicated pack instructions for masks that match their pattern.
15242 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15243 Subtarget))
15244 return V;
15245
// Try to lower using a truncation.
15247 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15248 Subtarget, DAG))
15249 return V;
15250
15251 // Try to use byte rotation instructions.
15252 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15253 Subtarget, DAG))
15254 return Rotate;
15255
15256 if (SDValue BitBlend =
15257 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15258 return BitBlend;
15259
15260 // Try to use byte shift instructions to mask.
15261 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15262 Zeroable, Subtarget, DAG))
15263 return V;
15264
15265 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
15266 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15267 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15268 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15269 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15270 !Subtarget.hasVLX()) {
15271 SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15272 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15273 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15274 SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15275 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15276 DWordClearMask);
15277 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15278 DWordClearMask);
15279 // Now pack things back together.
15280 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15281 if (NumEvenDrops == 2) {
15282 Result = DAG.getBitcast(MVT::v4i32, Result);
15283 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15284 }
15285 return Result;
15286 }
15287
15288 // Try to lower by permuting the inputs into an unpack instruction.
15289 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15290 Mask, Subtarget, DAG))
15291 return Unpack;
15292
15293 // If we can't directly blend but can use PSHUFB, that will be better as it
15294 // can both shuffle and set up the inefficient blend.
15295 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15296 bool V1InUse, V2InUse;
15297 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15298 Zeroable, DAG, V1InUse, V2InUse);
15299 }
15300
15301 // We can always bit-blend if we have to so the fallback strategy is to
15302 // decompose into single-input permutes and blends/unpacks.
15303 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15304 Mask, Subtarget, DAG);
15305}
15306
15307// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
15308// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
15309// the active subvector is extracted.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  // The variable-permute index vector must be an integer type with the same
  // element count/width as the data being shuffled.
  MVT MaskVT = VT.changeTypeToInteger();
  SDValue MaskNode;
  MVT ShuffleVT = VT;
  if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
    // Without AVX512VL the variable permutes only exist at 512 bits: widen
    // both sources (undef-padded) to 512 bits and shuffle at that width.
    V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
    V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
    ShuffleVT = V1.getSimpleValueType();

    // Adjust mask to correct indices for the second input: after widening,
    // V2's first element sits at the widened element count (Scale * NumElts),
    // not at the original NumElts, so V2 indices must be shifted up.
    int NumElts = VT.getVectorNumElements();
    unsigned Scale = 512 / VT.getSizeInBits();
    SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
    for (int &M : AdjustedMask)
      if (NumElts <= M)
        M += (Scale - 1) * NumElts;
    MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
    // Pad the index vector itself out to 512 bits to match the data width.
    MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
  } else {
    MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
  }

  SDValue Result;
  if (V2.isUndef())
    // Unary shuffle: VPERMV takes (index, source).
    Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
  else
    // Binary shuffle: VPERMV3 takes (src1, index, src2).
    Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);

  // If we widened for the shuffle, extract the original-width low subvector.
  if (VT != ShuffleVT)
    Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());

  return Result;
}
15346
15347/// Generic lowering of v16i8 shuffles.
15348///
15349/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15350/// detect any complexity reducing interleaving. If that doesn't help, it uses
15351/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15352/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15353/// back together.
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // The strategies below are attempted in priority order: cheap single
  // instructions first, then progressively more expensive combinations.
  // NOTE(review): reordering these checks changes which lowering wins for
  // masks matching several patterns — preserve the order.

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
                                       Subtarget))
    return V;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  // Try to use lower using a truncation.
  if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
                                        Subtarget, DAG))
    return V;

  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                          Zeroable, DAG))
      return V;

  // Count how many mask elements select from V2 (indices 16..31).
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use bit rotation instructions.
    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
                                                 Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
      return V;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      // Widening is only viable if each i16-pair of byte mask entries is
      // either undef or the same byte (i.e. a duplicated byte per word).
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      // Collect the distinct, sorted byte indices used from the low half
      // (0..7) and the high half (8..15) of the input.
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      array_pod_sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      array_pod_sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      // Target whichever half already supplies more inputs; the other half's
      // inputs get moved across by the pre-duplication i16 shuffle.
      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      // PreDupI16Shuffle gathers all used bytes into one half, viewed as an
      // i16 shuffle; LaneMap records where each original byte ends up.
      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I/2] = I/2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      bool EvenInUse = false, OddInUse = false;
      for (int i = 0; i < 16; i += 2) {
        EvenInUse |= (Mask[i + 0] >= 0);
        OddInUse |= (Mask[i + 1] >= 0);
        if (EvenInUse && OddInUse)
          break;
      }
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
                       OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));

      // Build the post-duplication i16 shuffle from the recorded lane map.
      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Try to use byte shift instructions to mask.
  if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
    return V;

  // Check for compaction patterns.
  bool IsSingleInput = V2.isUndef();
  int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // If the mask is a binary compaction, we can more efficiently perform this
  // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // preference this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend as
      // an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
        return Unpack;

      // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
      if (Subtarget.hasVBMI())
        return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
                                     DAG);

      // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
      if (Subtarget.hasXOP()) {
        SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
        return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
      }

      // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
      // PALIGNR will be cheaper than the second PSHUFB+OR.
      if (SDValue V = lowerShuffleAsByteRotateAndPermute(
              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
        return V;
    }

    return PSHUFB;
  }

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return Blend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  if (NumEvenDrops) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.

    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // Build an i16 mask that keeps only the low byte of each surviving word.
    SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
    for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
      WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
    SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
    V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
                     WordClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
                       WordClearMask);

    // Now pack things back together.
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
                                 IsSingleInput ? V1 : V2);
    // Each extra drop level needs another PACKUS round on the result.
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }
    return Result;
  }

  // Handle multi-input cases by blending/unpacking single-input shuffles.
  if (NumV2Elements > 0)
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
                                                Subtarget, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  // Split the 16-entry byte mask into two 8-entry word masks, one per half.
  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half into
    // VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

  // PACKUS re-interleaves the two shuffled i16 halves back into v16i8.
  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
15691
15692/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15693///
15694/// This routine breaks down the specific type of 128-bit shuffle and
15695/// dispatches to the lowering routines accordingly.
15696static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15697 MVT VT, SDValue V1, SDValue V2,
15698 const APInt &Zeroable,
15699 const X86Subtarget &Subtarget,
15700 SelectionDAG &DAG) {
15701 switch (VT.SimpleTy) {
15702 case MVT::v2i64:
15703 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15704 case MVT::v2f64:
15705 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15706 case MVT::v4i32:
15707 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15708 case MVT::v4f32:
15709 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15710 case MVT::v8i16:
15711 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15712 case MVT::v16i8:
15713 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15714
15715 default:
15716 llvm_unreachable("Unimplemented!")__builtin_unreachable();
15717 }
15718}
15719
15720/// Generic routine to split vector shuffle into half-sized shuffles.
15721///
15722/// This routine just extracts two subvectors, shuffles them independently, and
15723/// then concatenates them back together. This should work effectively with all
15724/// AVX vector shuffle types.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                    SDValue V2, ArrayRef<int> Mask,
                                    SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  // Split the mask into the halves for the two output subvectors.
  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);

  // Use splitVector/extractSubVector so that split build-vectors just build two
  // narrower build vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    SDValue LoV, HiV;
    std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  // Each output half can draw from up to four sources (Lo/Hi of V1/V2);
  // HalfBlend builds per-source shuffles and a final blend between them.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    // Track which of the four half-width sources this half actually uses,
    // and build a per-source mask plus the final V1-vs-V2 blend mask.
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        // Element comes from V2 (possibly its high half).
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        // Element comes from V1 (possibly its high half).
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend =
        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend =
        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
15818
15819/// Either split a vector in halves or decompose the shuffles and the
15820/// blend/unpack.
15821///
15822/// This is provided as a good fallback for many lowerings of non-single-input
15823/// shuffles with more than one 128-bit lane. In those cases, we want to select
15824/// between splitting the shuffle into 128-bit components and stitching those
15825/// back together vs. extracting the single-input shuffles and blending those
15826/// results.
15827static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15828 SDValue V2, ArrayRef<int> Mask,
15829 const X86Subtarget &Subtarget,
15830 SelectionDAG &DAG) {
15831 assert(!V2.isUndef() && "This routine must not be used to lower single-input "((void)0)
15832 "shuffles as it could then recurse on itself.")((void)0);
15833 int Size = Mask.size();
15834
15835 // If this can be modeled as a broadcast of two elements followed by a blend,
15836 // prefer that lowering. This is especially important because broadcasts can
15837 // often fold with memory operands.
15838 auto DoBothBroadcast = [&] {
15839 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15840 for (int M : Mask)
15841 if (M >= Size) {
15842 if (V2BroadcastIdx < 0)
15843 V2BroadcastIdx = M - Size;
15844 else if (M - Size != V2BroadcastIdx)
15845 return false;
15846 } else if (M >= 0) {
15847 if (V1BroadcastIdx < 0)
15848 V1BroadcastIdx = M;
15849 else if (M != V1BroadcastIdx)
15850 return false;
15851 }
15852 return true;
15853 };
15854 if (DoBothBroadcast())
15855 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15856 DAG);
15857
15858 // If the inputs all stem from a single 128-bit lane of each input, then we
15859 // split them rather than blending because the split will decompose to
15860 // unusually few instructions.
15861 int LaneCount = VT.getSizeInBits() / 128;
15862 int LaneSize = Size / LaneCount;
15863 SmallBitVector LaneInputs[2];
15864 LaneInputs[0].resize(LaneCount, false);
15865 LaneInputs[1].resize(LaneCount, false);
15866 for (int i = 0; i < Size; ++i)
15867 if (Mask[i] >= 0)
15868 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15869 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15870 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15871
15872 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15873 // requires that the decomposed single-input shuffles don't end up here.
15874 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15875 DAG);
15876}
15877
15878// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15879// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15880static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15881 SDValue V1, SDValue V2,
15882 ArrayRef<int> Mask,
15883 SelectionDAG &DAG) {
15884 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles")((void)0);
15885
15886 int LHSMask[4] = {-1, -1, -1, -1};
15887 int RHSMask[4] = {-1, -1, -1, -1};
15888 unsigned SHUFPMask = 0;
15889
15890 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15891 // perform the shuffle once the lanes have been shuffled in place.
15892 for (int i = 0; i != 4; ++i) {
15893 int M = Mask[i];
15894 if (M < 0)
15895 continue;
15896 int LaneBase = i & ~1;
15897 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15898 LaneMask[LaneBase + (M & 1)] = M;
15899 SHUFPMask |= (M & 1) << i;
15900 }
15901
15902 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15903 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15904 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15905 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15906}
15907
15908/// Lower a vector shuffle crossing multiple 128-bit lanes as
15909/// a lane permutation followed by a per-lane permutation.
15910///
15911/// This is mainly for cases where we can have non-repeating permutes
15912/// in each lane.
15913///
15914/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15915/// we should investigate merging them.
static SDValue lowerShuffleAsLanePermuteAndPermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumEltsPerLane = NumElts / NumLanes;
  // Sub-lane-granular cross permutes (vpermq/vpermd) need AVX2 and only
  // apply to single-input shuffles.
  bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();

  /// Attempts to find a sublane permute with the given size
  /// that gets all elements into their target lanes.
  ///
  /// If successful, fills CrossLaneMask and InLaneMask and returns true.
  /// If unsuccessful, returns false and may overwrite InLaneMask.
  auto getSublanePermute = [&](int NumSublanes) -> SDValue {
    int NumSublanesPerLane = NumSublanes / NumLanes;
    int NumEltsPerSublane = NumElts / NumSublanes;

    SmallVector<int, 16> CrossLaneMask;
    SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
    // CrossLaneMask but one entry == one sublane.
    SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);

    for (int i = 0; i != NumElts; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;

      int SrcSublane = M / NumEltsPerSublane;
      int DstLane = i / NumEltsPerLane;

      // We only need to get the elements into the right lane, not sublane.
      // So search all sublanes that make up the destination lane.
      bool Found = false;
      int DstSubStart = DstLane * NumSublanesPerLane;
      int DstSubEnd = DstSubStart + NumSublanesPerLane;
      for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
        // A destination sublane can only be claimed by one source sublane.
        if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
          continue;

        Found = true;
        CrossLaneMaskLarge[DstSublane] = SrcSublane;
        // Record where this element will sit after the cross-lane move so
        // the in-lane permute can finish placing it.
        int DstSublaneOffset = DstSublane * NumEltsPerSublane;
        InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
        break;
      }
      // No free destination sublane: this granularity cannot work.
      if (!Found)
        return SDValue();
    }

    // Fill CrossLaneMask using CrossLaneMaskLarge.
    narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);

    if (!CanUseSublanes) {
      // If we're only shuffling a single lowest lane and the rest are identity
      // then don't bother.
      // TODO - isShuffleMaskInputInPlace could be extended to something like
      // this.
      int NumIdentityLanes = 0;
      bool OnlyShuffleLowestLane = true;
      for (int i = 0; i != NumLanes; ++i) {
        int LaneOffset = i * NumEltsPerLane;
        if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
                                       i * NumEltsPerLane))
          NumIdentityLanes++;
        else if (CrossLaneMask[LaneOffset] != 0)
          OnlyShuffleLowestLane = false;
      }
      if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
        return SDValue();
    }

    // Emit the two-step lowering: cross-lane move, then in-lane permute.
    SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
    return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
                                InLaneMask);
  };

  // First attempt a solution with full lanes.
  if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
    return V;

  // The rest of the solutions use sublanes.
  if (!CanUseSublanes)
    return SDValue();

  // Then attempt a solution with 64-bit sublanes (vpermq).
  if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
    return V;

  // If that doesn't work and we have fast variable cross-lane shuffle,
  // attempt 32-bit sublanes (vpermd).
  if (!Subtarget.hasFastVariableCrossLaneShuffle())
    return SDValue();

  return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
}
16011
16012/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16013/// source with a lane permutation.
16014///
16015/// This lowering strategy results in four instructions in the worst case for a
16016/// single-input cross lane shuffle which is lower than any other fully general
16017/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16018/// shuffle pattern should be handled prior to trying this lowering.
static SDValue lowerShuffleAsLanePermuteAndShuffle(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  // 256-bit vector => exactly two 128-bit lanes of Size/2 elements each.
  int LaneSize = Size / 2;

  // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
  // Only do this if the elements aren't all from the lower lane,
  // otherwise we're (probably) better off doing a split.
  if (VT == MVT::v4f64 &&
      !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
    if (SDValue V =
            lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
      return V;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  if (!Subtarget.hasAVX2()) {
    bool LaneCrossing[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
    if (!LaneCrossing[0] || !LaneCrossing[1])
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  } else {
    // On AVX2 the cheaper split is taken whenever one of the two source
    // lanes is entirely unreferenced, regardless of crossing.
    bool LaneUsed[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0)
        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
    if (!LaneUsed[0] || !LaneUsed[1])
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  }

  // TODO - we could support shuffling V2 in the Flipped input.
  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  // Rewrite the mask so that any element that crosses lanes instead picks the
  // same in-lane slot from the lane-swapped copy of V1 ("Flipped", selected
  // via the '+ Size' second-operand offset), leaving a pure in-lane shuffle.
  SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
  for (int i = 0; i < Size; ++i) {
    int &M = InLaneMask[i];
    if (M < 0)
      continue;
    if (((M % Size) / LaneSize) != (i / LaneSize))
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }
  assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
         "In-lane shuffle mask expected");

  // Flip the lanes, and shuffle the results which should now be in-lane.
  // The {2, 3, 0, 1} permute on a v4f64/v4i64 bitcast swaps the two 128-bit
  // halves of V1 in a single lane permute.
  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
  SDValue Flipped = DAG.getBitcast(PVT, V1);
  Flipped =
      DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
  Flipped = DAG.getBitcast(VT, Flipped);
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
}
16078
16079/// Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                  SDValue V2, ArrayRef<int> Mask,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  if (V2.isUndef()) {
    // Attempt to match VBROADCAST*128 subvector broadcast load.
    bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
    bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
    if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
        MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
      auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
      if (!Ld->isNonTemporal()) {
        // Replace the full load + splat-shuffle with a single 128-bit
        // broadcast-load of whichever half the mask selects.
        MVT MemVT = VT.getHalfNumVectorElementsVT();
        unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
        SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
                                               TypeSize::Fixed(Ofs), DL);
        SDValue Ops[] = {Ld->getChain(), Ptr};
        SDValue BcastLd = DAG.getMemIntrinsicNode(
            X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
            DAG.getMachineFunction().getMachineMemOperand(
                Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
        // Re-route the original load's chain users through the new node so
        // memory ordering is preserved.
        DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
        return BcastLd;
      }
    }

    // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
    if (Subtarget.hasAVX2())
      return SDValue();
  }

  bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());

  // Widen the element mask to one index per 128-bit half; bail if the mask
  // can't be expressed at that granularity.
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
    return SDValue();

  // Low/high bits of Zeroable cover the low/high 128-bit halves.
  bool IsLowZero = (Zeroable & 0x3) == 0x3;
  bool IsHighZero = (Zeroable & 0xc) == 0xc;

  // Try to use an insert into a zero vector.
  if (WidenedMask[0] == 0 && IsHighZero) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
                       DAG.getIntPtrConstant(0, DL));
  }

  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // the zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
                                          Subtarget, DAG))
    return Blend;

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsLowZero && !IsHighZero) {
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
    if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {

      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
      // this will likely become vinsertf128 which can't fold a 256-bit memop.
      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
        SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                     OnlyUsesV1 ? V1 : V2,
                                     DAG.getIntPtrConstant(0, DL));
        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                           DAG.getIntPtrConstant(2, DL));
      }
    }

    // Try to use SHUF128 if possible.
    if (Subtarget.hasVLX()) {
      // Only the "low half from V1, high half from V2" shape is matched here.
      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
                            ((WidenedMask[1] % 2) << 1);
        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
      }
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.

  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination

  assert((WidenedMask[0] >= 0 || IsLowZero) &&
         (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");

  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

  // Check the immediate mask and replace unused sources with undef.
  if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
    V1 = DAG.getUNDEF(VT);
  if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
    V2 = DAG.getUNDEF(VT);

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
16201
16202/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16203/// shuffling each lane.
16204///
16205/// This attempts to create a repeated lane shuffle where each lane uses one
16206/// or two of the lanes of the inputs. The lanes of the input vectors are
16207/// shuffled in one or two independent shuffles to get the lanes into the
16208/// position needed by the final shuffle.
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  // Already a repeated-lane mask - nothing for this strategy to gain.
  if (is128BitLaneRepeatedShuffleMask(VT, Mask))
    return SDValue();

  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = 128 / VT.getScalarSizeInBits();
  // RepeatMask accumulates the single per-lane mask every lane must share.
  SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
  // LaneSrcs[Lane] = up to two source lanes (indexed across both inputs)
  // feeding that destination lane; -1 means not yet assigned.
  SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});

  // First pass will try to fill in the RepeatMask from lanes that need two
  // sources.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Srcs[2] = {-1, -1};
    SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (M < 0)
        continue;
      // Determine which of the possible input lanes (NumLanes from each source)
      // this element comes from. Assign that as one of the sources for this
      // lane. We can assign up to 2 sources for this lane. If we run out
      // sources we can't do anything.
      int LaneSrc = M / NumLaneElts;
      int Src;
      if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
        Src = 0;
      else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
        Src = 1;
      else
        return SDValue();

      Srcs[Src] = LaneSrc;
      // Normalize to lane-local index, using '+ NumElts' to tag elements
      // taken via the second assigned source.
      InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
    }

    // If this lane has two sources, see if it fits with the repeat mask so far.
    if (Srcs[1] < 0)
      continue;

    LaneSrcs[Lane][0] = Srcs[0];
    LaneSrcs[Lane][1] = Srcs[1];

    // Two masks are compatible if they agree on every index both define.
    auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
      assert(M1.size() == M2.size() && "Unexpected mask size");
      for (int i = 0, e = M1.size(); i != e; ++i)
        if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
          return false;
      return true;
    };

    // Overlay defined indices of Mask onto MergedMask (they must agree).
    auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
      assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
      for (int i = 0, e = MergedMask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
               "Unexpected mask element");
        MergedMask[i] = M;
      }
    };

    if (MatchMasks(InLaneMask, RepeatMask)) {
      // Merge this lane mask into the final repeat mask.
      MergeMasks(InLaneMask, RepeatMask);
      continue;
    }

    // Didn't find a match. Swap the operands and try again.
    std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
    ShuffleVectorSDNode::commuteMask(InLaneMask);

    if (MatchMasks(InLaneMask, RepeatMask)) {
      // Merge this lane mask into the final repeat mask.
      MergeMasks(InLaneMask, RepeatMask);
      continue;
    }

    // Couldn't find a match with the operands in either order.
    return SDValue();
  }

  // Now handle any lanes with only one source.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    // If this lane has already been processed, skip it.
    if (LaneSrcs[Lane][0] >= 0)
      continue;

    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (M < 0)
        continue;

      // If RepeatMask isn't defined yet we can define it ourself.
      if (RepeatMask[i] < 0)
        RepeatMask[i] = M % NumLaneElts;

      // RepeatMask entries < NumElts refer to the first source slot,
      // >= NumElts to the second - the lane must match accordingly.
      if (RepeatMask[i] < NumElts) {
        if (RepeatMask[i] != M % NumLaneElts)
          return SDValue();
        LaneSrcs[Lane][0] = M / NumLaneElts;
      } else {
        if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
          return SDValue();
        LaneSrcs[Lane][1] = M / NumLaneElts;
      }
    }

    if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
      return SDValue();
  }

  // Build the first lane-permute: gather each lane's first source lane.
  SmallVector<int, 16> NewMask(NumElts, -1);
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][0];
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = -1;
      if (Src >= 0)
        M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
    }
  }
  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  // Ensure we didn't get back the shuffle we started with.
  // FIXME: This is a hack to make up for some splat handling code in
  // getVectorShuffle.
  if (isa<ShuffleVectorSDNode>(NewV1) &&
      cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
    return SDValue();

  // Second lane-permute: gather each lane's second source lane.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][1];
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = -1;
      if (Src >= 0)
        M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
    }
  }
  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  // Ensure we didn't get back the shuffle we started with.
  // FIXME: This is a hack to make up for some splat handling code in
  // getVectorShuffle.
  if (isa<ShuffleVectorSDNode>(NewV2) &&
      cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
    return SDValue();

  // Finally, apply the repeated mask to each lane of the permuted operands.
  for (int i = 0; i != NumElts; ++i) {
    NewMask[i] = RepeatMask[i % NumLaneElts];
    if (NewMask[i] < 0)
      continue;

    NewMask[i] += (i / NumLaneElts) * NumLaneElts;
  }
  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
16370
16371/// If the input shuffle mask results in a vector that is undefined in all upper
16372/// or lower half elements and that mask accesses only 2 halves of the
16373/// shuffle's operands, return true. A mask of half the width with mask indexes
16374/// adjusted to access the extracted halves of the original shuffle operands is
16375/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16376/// lower half of each input operand is accessed.
16377static bool
16378getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16379 int &HalfIdx1, int &HalfIdx2) {
16380 assert((Mask.size() == HalfMask.size() * 2) &&((void)0)
16381 "Expected input mask to be twice as long as output")((void)0);
16382
16383 // Exactly one half of the result must be undef to allow narrowing.
16384 bool UndefLower = isUndefLowerHalf(Mask);
16385 bool UndefUpper = isUndefUpperHalf(Mask);
16386 if (UndefLower == UndefUpper)
16387 return false;
16388
16389 unsigned HalfNumElts = HalfMask.size();
16390 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16391 HalfIdx1 = -1;
16392 HalfIdx2 = -1;
16393 for (unsigned i = 0; i != HalfNumElts; ++i) {
16394 int M = Mask[i + MaskIndexOffset];
16395 if (M < 0) {
16396 HalfMask[i] = M;
16397 continue;
16398 }
16399
16400 // Determine which of the 4 half vectors this element is from.
16401 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16402 int HalfIdx = M / HalfNumElts;
16403
16404 // Determine the element index into its half vector source.
16405 int HalfElt = M % HalfNumElts;
16406
16407 // We can shuffle with up to 2 half vectors, set the new 'half'
16408 // shuffle mask accordingly.
16409 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16410 HalfMask[i] = HalfElt;
16411 HalfIdx1 = HalfIdx;
16412 continue;
16413 }
16414 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16415 HalfMask[i] = HalfElt + HalfNumElts;
16416 HalfIdx2 = HalfIdx;
16417 continue;
16418 }
16419
16420 // Too many half vectors referenced.
16421 return false;
16422 }
16423
16424 return true;
16425}
16426
16427/// Given the output values from getHalfShuffleMask(), create a half width
16428/// shuffle of extracted vectors followed by an insert back to full width.
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
                                     ArrayRef<int> HalfMask, int HalfIdx1,
                                     int HalfIdx2, bool UndefLower,
                                     SelectionDAG &DAG, bool UseConcat = false) {
  assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
  assert(V1.getValueType().isSimple() && "Expecting only simple types");

  MVT VT = V1.getSimpleValueType();
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  unsigned HalfNumElts = HalfVT.getVectorNumElements();

  // Map a half index (0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2,
  // per getHalfShuffleMask) to the corresponding extracted subvector;
  // a negative index yields undef.
  auto getHalfVector = [&](int HalfIdx) {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts; // element offset of the half
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
                       DAG.getIntPtrConstant(HalfIdx, DL));
  };

  // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
  SDValue Half1 = getHalfVector(HalfIdx1);
  SDValue Half2 = getHalfVector(HalfIdx2);
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
  if (UseConcat) {
    // Express the widening as CONCAT_VECTORS, placing the shuffled half in
    // the low or high position depending on which result half is undef.
    SDValue Op0 = V;
    SDValue Op1 = DAG.getUNDEF(HalfVT);
    if (UndefLower)
      std::swap(Op0, Op1);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
  }

  unsigned Offset = UndefLower ? HalfNumElts : 0;
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
                     DAG.getIntPtrConstant(Offset, DL));
}
16465
16466/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16467/// This allows for fast cases such as subvector extraction/insertion
16468/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  assert((VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected 256-bit or 512-bit vector");

  // Only applicable when exactly one half of the result is undef.
  bool UndefLower = isUndefLowerHalf(Mask);
  if (!UndefLower && !isUndefUpperHalf(Mask))
    return SDValue();

  assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
         "Completely undef shuffle mask should have been simplified already");

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  unsigned HalfNumElts = HalfVT.getVectorNumElements();
  if (!UndefLower &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(HalfNumElts, DL));
  }

  // Otherwise, try to narrow to a half-width shuffle of at most two halves.
  int HalfIdx1, HalfIdx2;
  SmallVector<int, 8> HalfMask(HalfNumElts);
  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
    return SDValue();

  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  // Only shuffle the halves of the inputs when useful.
  // Half indices 0/2 are lower halves, 1/3 are upper halves (of V1/V2).
  unsigned NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  unsigned NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
  assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");

  // Determine the larger pattern of undef/halves, then decide if it's worth
  // splitting the shuffle based on subtarget capabilities and types.
  unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
  if (!UndefLower) {
    // XXXXuuuu: no insert is needed.
    // Always extract lowers when setting lower - these are all free subreg ops.
    if (NumUpperHalves == 0)
      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
                                   UndefLower, DAG);

    if (NumUpperHalves == 1) {
      // AVX2 has efficient 32/64-bit element cross-lane shuffles.
      if (Subtarget.hasAVX2()) {
        // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
        if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
            !is128BitUnpackShuffleMask(HalfMask) &&
            (!isSingleSHUFPSMask(HalfMask) ||
             Subtarget.hasFastVariableCrossLaneShuffle()))
          return SDValue();
        // If this is a unary shuffle (assume that the 2nd operand is
        // canonicalized to undef), then we can use vpermpd. Otherwise, we
        // are better off extracting the upper half of 1 operand and using a
        // narrow shuffle.
        if (EltWidth == 64 && V2.isUndef())
          return SDValue();
      }
      // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
      if (Subtarget.hasAVX512() && VT.is512BitVector())
        return SDValue();
      // Extract + narrow shuffle is better than the wide alternative.
      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
                                   UndefLower, DAG);
    }

    // Don't extract both uppers, instead shuffle and then extract.
    assert(NumUpperHalves == 2 && "Half vector count went wrong");
    return SDValue();
  }

  // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
  if (NumUpperHalves == 0) {
    // AVX2 has efficient 64-bit element cross-lane shuffles.
    // TODO: Refine to account for unary shuffle, splat, and other masks?
    if (Subtarget.hasAVX2() && EltWidth == 64)
      return SDValue();
    // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
    if (Subtarget.hasAVX512() && VT.is512BitVector())
      return SDValue();
    // Narrow shuffle + insert is better than the wide alternative.
    return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
                                 UndefLower, DAG);
  }

  // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
  return SDValue();
}
16575
16576/// Test whether the specified input (0 or 1) is in-place blended by the
16577/// given mask.
16578///
16579/// This returns true if the elements from a particular input are already in the
16580/// slot required by the given mask and require no permutation.
16581static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16582 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.")((void)0);
16583 int Size = Mask.size();
16584 for (int i = 0; i < Size; ++i)
16585 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16586 return false;
16587
16588 return true;
16589}
16590
16591/// Handle case where shuffle sources are coming from the same 128-bit lane and
16592/// every lane can be represented as the same repeating mask - allowing us to
16593/// shuffle the sources with the repeating shuffle and then permute the result
16594/// to the destination lanes.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;

  // On AVX2 we may be able to just shuffle the lowest elements and then
  // broadcast the result.
  if (Subtarget.hasAVX2()) {
    for (unsigned BroadcastSize : {16, 32, 64}) {
      if (BroadcastSize <= VT.getScalarSizeInBits())
        continue;
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only references the lowest 128-bit
      // lane of the inputs.
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
          for (int j = 0; j != NumBroadcastElts; ++j) {
            int M = Mask[i + j];
            if (M < 0)
              continue;
            int &R = RepeatMask[j];
            // Reject elements sourced outside the lowest 128-bit lane.
            if (0 != ((M % NumElts) / NumLaneElts))
              return false;
            // Reject conflicting indices for the same repeat slot.
            if (0 <= R && R != M)
              return false;
            R = M;
          }
        return true;
      };

      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
      if (!FindRepeatingBroadcastMask(RepeatMask))
        continue;

      // Shuffle the (lowest) repeated elements in place for broadcast.
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

      // Shuffle the actual broadcast.
      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
        for (int j = 0; j != NumBroadcastElts; ++j)
          BroadcastMask[i + j] = j;
      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                  BroadcastMask);
    }
  }

  // Bail if the shuffle mask doesn't cross 128-bit lanes.
  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  // Bail if we already have a repeated lane shuffle mask.
  SmallVector<int, 8> RepeatedShuffleMask;
  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
    return SDValue();

  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
  int NumSubLanes = NumLanes * SubLaneScale;
  int NumSubLaneElts = NumLaneElts / SubLaneScale;

  // Check that all the sources are coming from the same lane and see if we can
  // form a repeating shuffle mask (local to each sub-lane). At the same time,
  // determine the source sub-lane for each destination sub-lane.
  int TopSrcSubLane = -1;
  SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
  // One candidate repeating mask per sub-lane position within a lane.
  SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
      SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

  for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
    // Extract the sub-lane mask, check that it all comes from the same lane
    // and normalize the mask entries to come from the first lane.
    int SrcLane = -1;
    SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
      int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
      if (M < 0)
        continue;
      int Lane = (M % NumElts) / NumLaneElts;
      if ((0 <= SrcLane) && (SrcLane != Lane))
        return SDValue();
      SrcLane = Lane;
      // Normalize to lane 0, keeping the V1/V2 distinction via NumElts.
      int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
      SubLaneMask[Elt] = LocalM;
    }

    // Whole sub-lane is UNDEF.
    if (SrcLane < 0)
      continue;

    // Attempt to match against the candidate repeated sub-lane masks.
    for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
      auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
        for (int i = 0; i != NumSubLaneElts; ++i) {
          if (M1[i] < 0 || M2[i] < 0)
            continue;
          if (M1[i] != M2[i])
            return false;
        }
        return true;
      };

      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
      if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
        continue;

      // Merge the sub-lane mask into the matching repeated sub-lane mask.
      for (int i = 0; i != NumSubLaneElts; ++i) {
        int M = SubLaneMask[i];
        if (M < 0)
          continue;
        assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
               "Unexpected mask element");
        RepeatedSubLaneMask[i] = M;
      }

      // Track the top most source sub-lane - by setting the remaining to UNDEF
      // we can greatly simplify shuffle matching.
      int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
      TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
      Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
      break;
    }

    // Bail if we failed to find a matching repeated sub-lane mask.
    if (Dst2SrcSubLanes[DstSubLane] < 0)
      return SDValue();
  }
  assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
         "Unexpected source lane");

  // Create a repeating shuffle mask for the entire vector.
  SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
  for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
    int Lane = SubLane / SubLaneScale;
    auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
    for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
      int M = RepeatedSubLaneMask[Elt];
      if (M < 0)
        continue;
      int Idx = (SubLane * NumSubLaneElts) + Elt;
      RepeatedMask[Idx] = M + (Lane * NumLaneElts);
    }
  }
  SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

  // Shuffle each source sub-lane to its destination.
  SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
  for (int i = 0; i != NumElts; i += NumSubLaneElts) {
    int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
    if (SrcSubLane < 0)
      continue;
    for (int j = 0; j != NumSubLaneElts; ++j)
      SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
  }

  return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
                              SubLaneMask);
}
16760
/// Try to match Mask as a SHUFPD-style shuffle of V1/V2, producing the
/// instruction immediate in ShuffleImm. May swap V1/V2 when only the commuted
/// form matches. ForceV1Zero/ForceV2Zero report operands whose used elements
/// are all zeroable and must be materialized as real zero vectors by the
/// caller. Returns false if no (possibly commuted) SHUFPD pattern fits.
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
                                   bool &ForceV1Zero, bool &ForceV2Zero,
                                   unsigned &ShuffleImm, ArrayRef<int> Mask,
                                   const APInt &Zeroable) {
  int NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() == 64 &&
         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
         "Unexpected data type for VSHUFPD");
  assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
         "Illegal shuffle mask");

  // ZeroLane[0]/[1] - every even/odd result element is zeroable.
  bool ZeroLane[2] = { true, true };
  for (int i = 0; i < NumElts; ++i)
    ZeroLane[i & 1] &= Zeroable[i];

  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
  // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
  ShuffleImm = 0;
  bool ShufpdMask = true;
  bool CommutableMask = true;
  for (int i = 0; i < NumElts; ++i) {
    // Undef or known-zero elements don't constrain the match.
    if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
      continue;
    if (Mask[i] < 0)
      return false;
    // Val/CommutVal: the lower of the two indices SHUFPD can place at slot i
    // for the direct and operand-swapped forms respectively.
    int Val = (i & 6) + NumElts * (i & 1);
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
      CommutableMask = false;
    // Immediate bit i selects the odd element of the source pair.
    ShuffleImm |= (Mask[i] % 2) << i;
  }

  if (!ShufpdMask && !CommutableMask)
    return false;

  // Only the commuted form matched - swap the operands to use it.
  if (!ShufpdMask && CommutableMask)
    std::swap(V1, V2);

  ForceV1Zero = ZeroLane[0];
  ForceV2Zero = ZeroLane[1];
  return true;
}
16805
16806static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16807 SDValue V2, ArrayRef<int> Mask,
16808 const APInt &Zeroable,
16809 const X86Subtarget &Subtarget,
16810 SelectionDAG &DAG) {
16811 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&((void)0)
16812 "Unexpected data type for VSHUFPD")((void)0);
16813
16814 unsigned Immediate = 0;
16815 bool ForceV1Zero = false, ForceV2Zero = false;
16816 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16817 Mask, Zeroable))
16818 return SDValue();
16819
16820 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16821 if (ForceV1Zero)
16822 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16823 if (ForceV2Zero)
16824 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16825
16826 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16827 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16828}
16829
16830// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16831// by zeroable elements in the remaining 24 elements. Turn this into two
16832// vmovqb instructions shuffled together.
16833static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16834 SDValue V1, SDValue V2,
16835 ArrayRef<int> Mask,
16836 const APInt &Zeroable,
16837 SelectionDAG &DAG) {
16838 assert(VT == MVT::v32i8 && "Unexpected type!")((void)0);
16839
16840 // The first 8 indices should be every 8th element.
16841 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16842 return SDValue();
16843
16844 // Remaining elements need to be zeroable.
16845 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
16846 return SDValue();
16847
16848 V1 = DAG.getBitcast(MVT::v4i64, V1);
16849 V2 = DAG.getBitcast(MVT::v4i64, V2);
16850
16851 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16852 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16853
16854 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16855 // the upper bits of the result using an unpckldq.
16856 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16857 { 0, 1, 2, 3, 16, 17, 18, 19,
16858 4, 5, 6, 7, 20, 21, 22, 23 });
16859 // Insert the unpckldq into a zero vector to widen to v32i8.
16860 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16861 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16862 DAG.getIntPtrConstant(0, DL));
16863}
16864
16865
/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
///
/// NOTE(review): the lowering strategies below are tried in decreasing order
/// of profitability, so the statement order is load-bearing.
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // First see if the shuffle is just a permutation/blend of whole 128-bit
  // halves (VPERM2F128-style).
  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      // Each immediate bit selects the high (1) or low (0) element of the
      // corresponding in-lane pair.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
                         DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
    }

    // With AVX2 we have direct support for this permutation.
    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

    // Try to create an in-lane repeating shuffle mask and then shuffle the
    // results into the target lanes.
    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return V;

    // Try to permute the lanes and then use a per-lane permute.
    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
                                                        Mask, DAG, Subtarget))
      return V;

    // Otherwise, fall back.
    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
                                               DAG, Subtarget);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Op;

  // If we have lane crossing shuffles AND they don't all come from the lower
  // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
  // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
  // canonicalize to a blend of splat which isn't necessary for this combine.
  if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
      !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
      (V1.getOpcode() != ISD::BUILD_VECTOR) &&
      (V2.getOpcode() != ISD::BUILD_VECTOR))
    if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
                                                       Mask, DAG))
      return Op;

  // If we have one input in place, then we can permute the other input and
  // blend the result.
  if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
                                                Subtarget, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either inputs are already in place,
  // we will be able to shuffle even across lanes the other input in a single
  // instruction so skip this pattern.
  if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
                                isShuffleMaskInputInPlace(1, Mask))))
    if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return V;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
                                         DAG, Subtarget))
      return V;

  // If we have AVX2 then we always want to lower with a blend because an v4 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
                                                Subtarget, DAG);

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
                                    Subtarget, DAG);
}
16985
/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
///
/// NOTE(review): strategies are attempted in decreasing order of
/// profitability; do not reorder without benchmarking.
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

  // First see if the shuffle is just a permutation/blend of whole 128-bit
  // halves.
  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on both lanes.
    SmallVector<int, 2> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
      // Widen the 2-element repeated mask to a 4-element v8i32 PSHUFD mask.
      SmallVector<int, 4> PSHUFDMask;
      narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v4i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
                      DAG.getBitcast(MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    // AVX2 provides a direct instruction for permuting a single input across
    // lanes.
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // If we have VLX support, we can use VALIGN or VEXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
                                              Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
                                         DAG, Subtarget))
      return V;
  }

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
    return V;

  // If we have one input in place, then we can permute the other input and
  // blend the result.
  if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
                                                Subtarget, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either inputs are already in place,
  // we will be able to shuffle even across lanes the other input in a single
  // instruction so skip this pattern.
  if (!isShuffleMaskInputInPlace(0, Mask) &&
      !isShuffleMaskInputInPlace(1, Mask))
    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
                                              Subtarget, DAG);
}
17083
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
///
/// NOTE(review): strategy order below is intentional (cheapest first).
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    // Single input: one in-lane permute covers both lanes.
    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
      return V;

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends.
    return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes use the variable mask to VPERMILPS.
  if (V2.isUndef()) {
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
    }
    if (Subtarget.hasAVX2()) {
      // Lane-crossing single-input: VPERMPS with a variable mask.
      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
    }
    // Otherwise, fall back.
    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
                                               DAG, Subtarget);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
                                         DAG, Subtarget))
      return V;

  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
  // since after split we get a more efficient code using vpunpcklwd and
  // vpunpckhwd instrs than vblend.
  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
    return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
                                      DAG);

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
                                                Subtarget, DAG);

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
                                    Subtarget, DAG);
}
17182
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
///
/// NOTE(review): Is128BitLaneRepeatedShuffle/RepeatedMask computed early are
/// reused near the end for the SHUFPS fallback - keep them in sync if
/// editing.
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
  // since after split we get a more efficient code than vblend by using
  // vpunpcklwd and vpunpckhwd instrs.
  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
      !Subtarget.hasAVX512())
    return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
                                      DAG);

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // If we have VLX support, we can use VALIGN or EXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
                                              Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
                                         DAG, Subtarget))
      return V;
  }

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;

    // If the shuffle patterns aren't repeated but it's a single input, directly
    // generate a cross-lane VPERMD instruction.
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
    SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
                                            CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v8i32, ShufPS);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
                                              Subtarget, DAG);
}
17297
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
///
/// NOTE(review): strategy order below is intentional (cheapest first).
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
                                       Subtarget))
    return V;

  // Try to lower using a truncation.
  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // Try to use bit rotation instructions.
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
      return Rotate;

    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
      return V;

    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
      if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
              DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
        return V;

      return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
                                                 DAG, Subtarget);
    }

    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v16 case.
      return lowerV8I16GeneralSingleInputShuffle(
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
  if (Subtarget.hasBWI())
    return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Try to permute the lanes and then use a per-lane permute.
  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
    return V;

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
                                    Subtarget, DAG);
}
17412
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
///
/// NOTE(review): strategy order below is intentional (cheapest first).
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
                                       Subtarget))
    return V;

  // Try to lower using a truncation.
  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to use bit rotation instructions.
  if (V2.isUndef())
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
      return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There are no generalized cross-lane shuffle operations available on i8
  // element types.
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
      return V;

    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
      return V;

    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
                                               DAG, Subtarget);
  }

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
  if (Subtarget.hasVBMI())
    return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Try to permute the lanes and then use a per-lane permute.
  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
          DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
    return V;

  // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
  // by zeroable elements in the remaining 24 elements. Turn this into two
  // vmovqb instructions shuffled together.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
                                                  Mask, Zeroable, DAG))
      return V;

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
                                    Subtarget, DAG);
}
17525
17526/// High-level routine to lower various 256-bit x86 vector shuffles.
17527///
17528/// This routine either breaks down the specific type of a 256-bit x86 vector
17529/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17530/// together based on the available instructions.
17531static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17532 SDValue V1, SDValue V2, const APInt &Zeroable,
17533 const X86Subtarget &Subtarget,
17534 SelectionDAG &DAG) {
17535 // If we have a single input to the zero element, insert that into V1 if we
17536 // can do so cheaply.
17537 int NumElts = VT.getVectorNumElements();
17538 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17539
17540 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17541 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17542 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17543 return Insertion;
17544
17545 // Handle special cases where the lower or upper half is UNDEF.
17546 if (SDValue V =
17547 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17548 return V;
17549
17550 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17551 // can check for those subtargets here and avoid much of the subtarget
17552 // querying in the per-vector-type lowering routines. With AVX1 we have
17553 // essentially *zero* ability to manipulate a 256-bit vector with integer
17554 // types. Since we'll use floating point types there eventually, just
17555 // immediately cast everything to a float and operate entirely in that domain.
17556 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17557 int ElementBits = VT.getScalarSizeInBits();
17558 if (ElementBits < 32) {
17559 // No floating point type available, if we can't use the bit operations
17560 // for masking/blending then decompose into 128-bit vectors.
17561 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17562 Subtarget, DAG))
17563 return V;
17564 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17565 return V;
17566 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17567 }
17568
17569 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17570 VT.getVectorNumElements());
17571 V1 = DAG.getBitcast(FpVT, V1);
17572 V2 = DAG.getBitcast(FpVT, V2);
17573 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17574 }
17575
17576 switch (VT.SimpleTy) {
17577 case MVT::v4f64:
17578 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17579 case MVT::v4i64:
17580 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17581 case MVT::v8f32:
17582 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17583 case MVT::v8i32:
17584 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17585 case MVT::v16i16:
17586 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17587 case MVT::v32i8:
17588 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17589
17590 default:
17591 llvm_unreachable("Not a valid 256-bit x86 vector type!")__builtin_unreachable();
17592 }
17593}
17594
/// Try to lower a vector shuffle as a 128-bit shuffles.
///
/// Widens the mask to 128-bit lane granularity (four lanes of a 512-bit
/// vector with 64-bit elements), then matches, in order: insertion into a
/// zero vector, a single 256-bit subvector insertion, insertion of V2's low
/// 128 bits into V1, and finally a single VSHUF64x2/VSHUF32x4 whose immediate
/// picks one source lane per destination slot. Returns an empty SDValue when
/// the mask cannot be widened or no single-instruction form matches.
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128bit shuffle.");

  // To handle 256 bit vector requires VLX and most probably
  // function lowerV2X128VectorShuffle() is better solution.
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

  // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
  // Each widened element selects a whole 128-bit lane (a pair of 64-bit
  // elements); bail out if the mask doesn't decompose that way.
  SmallVector<int, 4> Widened128Mask;
  if (!canWidenShuffleElements(Mask, Widened128Mask))
    return SDValue();
  assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");

  // Try to use an insert into a zero vector.
  // With 8 x i64 elements: 0xf0 means elements 4-7 (the upper 256 bits) are
  // zeroable; 0x0c means elements 2-3 (the second 128-bit lane) are zeroable
  // too, in which case only a 128-bit subvector needs to be kept.
  if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
      (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
    unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Check for patterns which can be matched with a single insert of a 256-bit
  // subvector.
  bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
  if (OnlyUsesV1 ||
      isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
    SDValue SubVec =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
                    DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                       DAG.getIntPtrConstant(4, DL));
  }

  // See if this is an insertion of the lower 128-bits of V2 into V1.
  bool IsInsert = true;
  int V2Index = -1;
  for (int i = 0; i < 4; ++i) {
    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
    if (Widened128Mask[i] < 0)
      continue;

    // Make sure all V1 subvectors are in place.
    if (Widened128Mask[i] < 4) {
      if (Widened128Mask[i] != i) {
        IsInsert = false;
        break;
      }
    } else {
      // Make sure we only have a single V2 index and its the lowest 128-bits.
      if (V2Index >= 0 || Widened128Mask[i] != 4) {
        IsInsert = false;
        break;
      }
      V2Index = i;
    }
  }
  if (IsInsert && V2Index >= 0) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
                                 DAG.getIntPtrConstant(0, DL));
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
  }

  // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
  // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
  // possible we at least ensure the lanes stay sequential to help later
  // combines.
  SmallVector<int, 2> Widened256Mask;
  if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
    Widened128Mask.clear();
    narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
  }

  // Try to lower to vshuf64x2/vshuf32x4.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  unsigned PermMask = 0;
  // Ensure elements came from the same Op: destination lanes 0-1 are taken
  // from Ops[0] and lanes 2-3 from Ops[1] (OpIndex = i / 2), so both lanes of
  // a half must agree on their source; otherwise a single SHUF128 can't do it.
  for (int i = 0; i < 4; ++i) {
    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
    if (Widened128Mask[i] < 0)
      continue;

    SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
    unsigned OpIndex = i / 2;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return SDValue();

    // Convert the 128-bit shuffle mask selection values into 128-bit selection
    // bits defined by a vshuf64x2 instruction's immediate control byte
    // (two bits per destination lane).
    PermMask |= (Widened128Mask[i] % 4) << (i * 2);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
17702
/// Handle lowering of 8-lane 64-bit floating point shuffles.
///
/// Tries progressively more general strategies: MOVDDUP and immediate
/// VPERMILPD for unary masks, a 128-bit lane shuffle, unpacks, SHUFPD,
/// VEXPAND and blends, before falling back to the fully general
/// variable-mask PERMV lowering.
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation. Each lane holds an even/odd element pair,
      // so bit i of the VPERMILPD immediate is set exactly when element i
      // selects the odd member of its pair.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                         DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
    }

    // A mask repeated across both 256-bit halves maps onto a single VPERMI.
    SmallVector<int, 4> RepeatedMask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  }

  // Try shuffling whole 128-bit lanes (VSHUFF64x2 etc.).
  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
                                           V2, Subtarget, DAG))
    return Shuf128;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Unpck;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Op;

  // Expansion of a compressed subvector (VEXPAND), using Zeroable positions.
  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
                                       DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Fully general fallback: variable-mask PERMV lowering.
  return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
}
17756
17757/// Handle lowering of 16-lane 32-bit floating point shuffles.
17758static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17759 const APInt &Zeroable, SDValue V1, SDValue V2,
17760 const X86Subtarget &Subtarget,
17761 SelectionDAG &DAG) {
17762 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")((void)0);
17763 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")((void)0);
17764 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((void)0);
17765
17766 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17767 // options to efficiently lower the shuffle.
17768 SmallVector<int, 4> RepeatedMask;
17769 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17770 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((void)0);
17771
17772 // Use even/odd duplicate instructions for masks that match their pattern.
17773 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17774 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17775 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17776 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17777
17778 if (V2.isUndef())
17779 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17780 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17781
17782 // Use dedicated unpack instructions for masks that match their pattern.
17783 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
17784 return V;
17785
17786 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17787 Zeroable, Subtarget, DAG))
17788 return Blend;
17789
17790 // Otherwise, fall back to a SHUFPS sequence.
17791 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17792 }
17793
17794 // Try to create an in-lane repeating shuffle mask and then shuffle the
17795 // results into the target lanes.
17796 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17797 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17798 return V;
17799
17800 // If we have a single input shuffle with different shuffle patterns in the
17801 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17802 if (V2.isUndef() &&
17803 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17804 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17805 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17806 }
17807
17808 // If we have AVX512F support, we can use VEXPAND.
17809 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
17810 V1, V2, DAG, Subtarget))
17811 return V;
17812
17813 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17814}
17815
/// Handle lowering of 8-lane 64-bit integer shuffles.
///
/// For unary masks, first tries lane-repeated forms that map onto PSHUFD or
/// VPERMQ. Then works through a 128-bit lane shuffle, shifts, VALIGN,
/// PALIGNR (BWI only), unpacks, VEXPAND and blends before the fully general
/// variable-mask PERMV lowering.
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on all four
    // 128-bit lanes.
    SmallVector<int, 2> Repeated128Mask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
      SmallVector<int, 4> PSHUFDMask;
      // Re-express the 2 x i64 per-lane mask as a 4 x i32 mask so it can be
      // fed to PSHUFD through a v16i32 bitcast.
      narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v8i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
                      DAG.getBitcast(MVT::v16i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    // A mask repeated across both 256-bit halves maps onto a single VPERMI.
    SmallVector<int, 4> Repeated256Mask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  }

  // Try shuffling whole 128-bit lanes (VSHUFI64x2 etc.).
  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
                                           V2, Subtarget, DAG))
    return Shuf128;

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
                                            Subtarget, DAG))
    return Rotate;

  // Try to use PALIGNR (512-bit byte rotation is only available with BWI).
  if (Subtarget.hasBWI())
    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Rotate;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Unpck;

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
                                       DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Fully general fallback: variable-mask PERMV lowering.
  return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
}
17880
/// Handle lowering of 16-lane 32-bit integer shuffles.
///
/// Tries zero/any-extension first (strictly fastest when it applies), then
/// lane-repeated PSHUFD/unpack forms, shifts and rotates, a float-domain
/// SHUFPS, lane permutes, VEXPAND and blends, before falling back to the
/// fully general variable-mask PERMV lowering.
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the four 128-bit
  // lanes. NOTE: RepeatedMask is also consumed by the SHUFPS fallback below,
  // guarded by the same Is128BitLaneRepeatedShuffle flag.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
                                            Subtarget, DAG))
    return Rotate;

  // Try to use byte rotation instructions (512-bit rotation needs BWI).
  if (Subtarget.hasBWI())
    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than using a permv shuffle.
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
    SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
                                            CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v16i32, ShufPS);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
                                       DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Fully general fallback: variable-mask PERMV lowering.
  return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
}
17957
17958/// Handle lowering of 32-lane 16-bit integer shuffles.
17959static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17960 const APInt &Zeroable, SDValue V1, SDValue V2,
17961 const X86Subtarget &Subtarget,
17962 SelectionDAG &DAG) {
17963 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")((void)0);
17964 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")((void)0);
17965 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")((void)0);
17966 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!")((void)0);
17967
17968 // Whenever we can lower this as a zext, that instruction is strictly faster
17969 // than any alternative. It also allows us to fold memory operands into the
17970 // shuffle in many cases.
17971 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17972 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17973 return ZExt;
17974
17975 // Use dedicated unpack instructions for masks that match their pattern.
17976 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17977 return V;
17978
17979 // Use dedicated pack instructions for masks that match their pattern.
17980 if (SDValue V =
17981 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17982 return V;
17983
17984 // Try to use shift instructions.
17985 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17986 Zeroable, Subtarget, DAG))
17987 return Shift;
17988
17989 // Try to use byte rotation instructions.
17990 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17991 Subtarget, DAG))
17992 return Rotate;
17993
17994 if (V2.isUndef()) {
17995 // Try to use bit rotation instructions.
17996 if (SDValue Rotate =
17997 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17998 return Rotate;
17999
18000 SmallVector<int, 8> RepeatedMask;
18001 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18002 // As this is a single-input shuffle, the repeated mask should be
18003 // a strictly valid v8i16 mask that we can pass through to the v8i16
18004 // lowering to handle even the v32 case.
18005 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18006 RepeatedMask, Subtarget, DAG);
18007 }
18008 }
18009
18010 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18011 Zeroable, Subtarget, DAG))
18012 return Blend;
18013
18014 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18015 Zeroable, Subtarget, DAG))
18016 return PSHUFB;
18017
18018 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18019}
18020
/// Handle lowering of 64-lane 8-bit integer shuffles.
///
/// Requires AVX-512-BWI. Tries zero/any-extend, unpack, pack, shifts and
/// rotates, AND masking and PSHUFB, then VBMI's VPERMB, repeated-mask lane
/// permutes and blends, before splitting into two 256-bit shuffles (there is
/// no direct generic v64i8 lowering yet — see the FIXME at the end).
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
                                       Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to use bit rotation instructions (unary shuffles only).
  if (V2.isUndef())
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
      return Rotate;

  // Lower as AND if possible.
  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
    return Masked;

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  // VBMI can use VPERMV/VPERMV3 byte shuffles.
  if (Subtarget.hasVBMI())
    return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (!V2.isUndef())
    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
            DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // FIXME: Implement direct support for this type!
  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
18096
/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  MVT VT, SDValue V1, SDValue V2,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/ basic ISA!");

  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // 16-bit and 8-bit element shuffles at 512 bits require AVX512BW; without
  // it, try bit ops for masking and blending before falling back to
  // splitting into two 256-bit shuffles.
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
    if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                          Subtarget, DAG))
      return V;
    if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
      return V;

    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
  }

  // Dispatch to each element type for lowering. If we don't have support for
  // specific element type shuffles at 512 bits, immediately split them and
  // lower them. Each lowering routine of a given type is allowed to assume that
  // the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v64i8:
    return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }
}
18164
18165static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18166 MVT VT, SDValue V1, SDValue V2,
18167 const X86Subtarget &Subtarget,
18168 SelectionDAG &DAG) {
18169 // Shuffle should be unary.
18170 if (!V2.isUndef())
18171 return SDValue();
18172
18173 int ShiftAmt = -1;
18174 int NumElts = Mask.size();
18175 for (int i = 0; i != NumElts; ++i) {
18176 int M = Mask[i];
18177 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&((void)0)
18178 "Unexpected mask index.")((void)0);
18179 if (M < 0)
18180 continue;
18181
18182 // The first non-undef element determines our shift amount.
18183 if (ShiftAmt < 0) {
18184 ShiftAmt = M - i;
18185 // Need to be shifting right.
18186 if (ShiftAmt <= 0)
18187 return SDValue();
18188 }
18189 // All non-undef elements must shift by the same amount.
18190 if (ShiftAmt != M - i)
18191 return SDValue();
18192 }
18193 assert(ShiftAmt >= 0 && "All undef?")((void)0);
18194
18195 // Great we found a shift right.
18196 MVT WideVT = VT;
18197 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18198 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18199 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18200 DAG.getUNDEF(WideVT), V1,
18201 DAG.getIntPtrConstant(0, DL));
18202 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18203 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18204 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18205 DAG.getIntPtrConstant(0, DL));
18206}
18207
18208// Determine if this shuffle can be implemented with a KSHIFT instruction.
18209// Returns the shift amount if possible or -1 if not. This is a simplified
18210// version of matchShuffleAsShift.
18211static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18212 int MaskOffset, const APInt &Zeroable) {
18213 int Size = Mask.size();
18214
18215 auto CheckZeros = [&](int Shift, bool Left) {
18216 for (int j = 0; j < Shift; ++j)
18217 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18218 return false;
18219
18220 return true;
18221 };
18222
18223 auto MatchShift = [&](int Shift, bool Left) {
18224 unsigned Pos = Left ? Shift : 0;
18225 unsigned Low = Left ? 0 : Shift;
18226 unsigned Len = Size - Shift;
18227 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18228 };
18229
18230 for (int Shift = 1; Shift != Size; ++Shift)
18231 for (bool Left : {true, false})
18232 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18233 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18234 return Shift;
18235 }
18236
18237 return -1;
18238}
18239
18240
18241// Lower vXi1 vector shuffles.
// There is no a dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to SIMD
// vector, shuffle and then truncate it back.
//
// Lowers a shuffle of vXi1 mask vectors. Strategies, in order:
//   1. recognize "subvector padded with zeros" and emit extract+insert,
//   2. a plain KSHIFTR with undef fill,
//   3. a KSHIFTL/KSHIFTR pair matched against either input,
//   4. fall back to sign-extending to a full SIMD vector, shuffling there,
//      and truncating (or compare-against-zero) back to a mask.
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                MVT VT, SDValue V1, SDValue V2,
                                const APInt &Zeroable,
                                const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&((void)0)
         "Cannot lower 512-bit vectors w/o basic ISA!")((void)0);

  int NumElts = Mask.size();

  // Try to recognize shuffles that are just padding a subvector with zeros.
  // SubvecElts counts the length of the leading identity run; Src records
  // which input it comes from (0 -> V1, 1 -> V2), -1 until the first
  // non-sentinel mask element is seen.
  int SubvecElts = 0;
  int Src = -1;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] >= 0) {
      // Grab the source from the first valid mask. All subsequent elements need
      // to use this same source.
      if (Src < 0)
        Src = Mask[i] / NumElts;
      if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
        break;
    }

    ++SubvecElts;
  }
  assert(SubvecElts != NumElts && "Identity shuffle?")((void)0);

  // Clip to a power 2.
  // (subvector extracts of i1 vectors are only formed at power-of-2 widths)
  SubvecElts = PowerOf2Floor(SubvecElts);

  // Make sure the number of zeroable bits in the top at least covers the bits
  // not covered by the subvector.
  if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
    assert(Src >= 0 && "Expected a source!")((void)0);
    MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
                                  Src == 0 ? V1 : V2,
                                  DAG.getIntPtrConstant(0, DL));
    // Insert into a zero vector so the tail lanes are guaranteed zero.
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       DAG.getConstant(0, DL, VT),
                       Extract, DAG.getIntPtrConstant(0, DL));
  }

  // Try a simple shift right with undef elements. Later we'll try with zeros.
  if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
                                                DAG))
    return Shift;

  // Try to match KSHIFTs.
  // Offset is the bias added to mask indices for the input being tested
  // (0 for V1, NumElts for V2).
  unsigned Offset = 0;
  for (SDValue V : { V1, V2 }) {
    unsigned Opcode;
    int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
    if (ShiftAmt >= 0) {
      // KSHIFT is only legal on v8i1 (DQI) / v16i1; widen narrower masks by
      // inserting into an undef wide vector first.
      MVT WideVT = VT;
      if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
        WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
                                DAG.getUNDEF(WideVT), V,
                                DAG.getIntPtrConstant(0, DL));
      // Widened right shifts need two shifts to ensure we shift in zeroes.
      if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
        int WideElts = WideVT.getVectorNumElements();
        // Shift left to put the original vector in the MSBs of the new size.
        Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
                          DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
        // Increase the shift amount to account for the left shift.
        ShiftAmt += WideElts - NumElts;
      }

      Res = DAG.getNode(Opcode, DL, WideVT, Res,
                        DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                         DAG.getIntPtrConstant(0, DL));
    }
    Offset += NumElts; // Increment for next iteration.
  }

  // Fallback: pick a SIMD element type wide enough for the subtarget, do the
  // shuffle there, and convert back to a mask.
  MVT ExtVT;
  switch (VT.SimpleTy) {
  default:
    llvm_unreachable("Expected a vector of i1 elements")__builtin_unreachable();
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
    // shuffle.
    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
    break;
  case MVT::v16i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
    break;
  case MVT::v32i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    assert(Subtarget.hasBWI() && "Expected AVX512BW support")((void)0);
    ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
    break;
  case MVT::v64i1:
    // Fall back to scalarization. FIXME: We can do better if the shuffle
    // can be partitioned cleanly.
    if (!Subtarget.useBWIRegs())
      return SDValue();
    ExtVT = MVT::v64i8;
    break;
  }

  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // i1 was sign extended we can use X86ISD::CVT2MASK.
  int NumElems = VT.getVectorNumElements();
  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
      (Subtarget.hasDQI() && (NumElems < 32)))
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
                        Shuffle, ISD::SETGT);

  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
18373
18374/// Helper function that returns true if the shuffle mask should be
18375/// commuted to improve canonicalization.
18376static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18377 int NumElements = Mask.size();
18378
18379 int NumV1Elements = 0, NumV2Elements = 0;
18380 for (int M : Mask)
18381 if (M < 0)
18382 continue;
18383 else if (M < NumElements)
18384 ++NumV1Elements;
18385 else
18386 ++NumV2Elements;
18387
18388 // Commute the shuffle as needed such that more elements come from V1 than
18389 // V2. This allows us to match the shuffle pattern strictly on how many
18390 // elements come from V1 without handling the symmetric cases.
18391 if (NumV2Elements > NumV1Elements)
18392 return true;
18393
18394 assert(NumV1Elements > 0 && "No V1 indices")((void)0);
18395
18396 if (NumV2Elements == 0)
18397 return false;
18398
18399 // When the number of V1 and V2 elements are the same, try to minimize the
18400 // number of uses of V2 in the low half of the vector. When that is tied,
18401 // ensure that the sum of indices for V1 is equal to or lower than the sum
18402 // indices for V2. When those are equal, try to ensure that the number of odd
18403 // indices for V1 is lower than the number of odd indices for V2.
18404 if (NumV1Elements == NumV2Elements) {
18405 int LowV1Elements = 0, LowV2Elements = 0;
18406 for (int M : Mask.slice(0, NumElements / 2))
18407 if (M >= NumElements)
18408 ++LowV2Elements;
18409 else if (M >= 0)
18410 ++LowV1Elements;
18411 if (LowV2Elements > LowV1Elements)
18412 return true;
18413 if (LowV2Elements == LowV1Elements) {
18414 int SumV1Indices = 0, SumV2Indices = 0;
18415 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18416 if (Mask[i] >= NumElements)
18417 SumV2Indices += i;
18418 else if (Mask[i] >= 0)
18419 SumV1Indices += i;
18420 if (SumV2Indices < SumV1Indices)
18421 return true;
18422 if (SumV2Indices == SumV1Indices) {
18423 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18424 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18425 if (Mask[i] >= NumElements)
18426 NumV2OddIndices += i % 2;
18427 else if (Mask[i] >= 0)
18428 NumV1OddIndices += i % 2;
18429 if (NumV2OddIndices < NumV1OddIndices)
18430 return true;
18431 }
18432 }
18433 }
18434
18435 return false;
18436}
18437
/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> OrigMask = SVOp->getMask();
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  int NumElements = VT.getVectorNumElements();
  SDLoc DL(Op);
  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&((void)0)
         "Can't lower MMX shuffles")((void)0);

  bool V1IsUndef = V1.isUndef();
  bool V2IsUndef = V2.isUndef();
  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  // When we create a shuffle node we put the UNDEF node to second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef &&
      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
    SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
    for (int &M : NewMask)
      if (M >= NumElements)
        M = -1;
    // Re-emit the shuffle; it will come back through this function with the
    // sanitized mask.
    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  }

  // Check for illegal shuffle mask element index values.
  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
  (void)MaskUpperLimit; // only used in the assert below
  assert(llvm::all_of(OrigMask,((void)0)
                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&((void)0)
         "Out of bounds shuffle index")((void)0);

  // We actually see shuffles that are entirely re-arrangements of a set of
  // zero inputs. This mostly happens while decomposing complex shuffles into
  // simple ones. Directly lower these as a buildvector of zeros.
  APInt KnownUndef, KnownZero;
  computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);

  APInt Zeroable = KnownUndef | KnownZero;
  if (Zeroable.isAllOnesValue())
    return getZeroVector(VT, Subtarget, DAG, DL);

  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());

  // Try to collapse shuffles into using a vector type with fewer elements but
  // wider element types. We cap this to not form integers or floating point
  // elements wider than 64 bits. It does not seem beneficial to form i128
  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
  SmallVector<int, 16> WidenedMask;
  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
      canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
    // Shuffle mask widening should not interfere with a broadcast opportunity
    // by obfuscating the operands with bitcasts.
    // TODO: Avoid lowering directly from this top-level function: make this
    // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
                                                    Subtarget, DAG))
      return Broadcast;

    MVT NewEltVT = VT.isFloatingPoint()
                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
    int NewNumElts = NumElements / 2;
    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      if (V2IsZero) {
        // Modify the new Mask to take all zeros from the all-zero vector.
        // Choose indices that are blend-friendly.
        bool UsedZeroVector = false;
        assert(is_contained(WidenedMask, SM_SentinelZero) &&((void)0)
               "V2's non-undef elements are used?!")((void)0);
        for (int i = 0; i != NewNumElts; ++i)
          if (WidenedMask[i] == SM_SentinelZero) {
            WidenedMask[i] = i + NewNumElts;
            UsedZeroVector = true;
          }
        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
        // some elements to be undef.
        if (UsedZeroVector)
          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
      }
      V1 = DAG.getBitcast(NewVT, V1);
      V2 = DAG.getBitcast(NewVT, V2);
      return DAG.getBitcast(
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
    }
  }

  // Commute the shuffle if it will improve canonicalization.
  SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
  if (canonicalizeShuffleMaskWithCommute(Mask)) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(V1, V2);
  }

  // For each vector width, delegate to a specialized lowering routine.
  if (VT.is128BitVector())
    return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  if (VT.is256BitVector())
    return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  if (VT.is512BitVector())
    return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  if (Is1BitVector)
    return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  llvm_unreachable("Unimplemented!")__builtin_unreachable();
}
18569
18570/// Try to lower a VSELECT instruction to a vector shuffle.
18571static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18572 const X86Subtarget &Subtarget,
18573 SelectionDAG &DAG) {
18574 SDValue Cond = Op.getOperand(0);
18575 SDValue LHS = Op.getOperand(1);
18576 SDValue RHS = Op.getOperand(2);
18577 MVT VT = Op.getSimpleValueType();
18578
18579 // Only non-legal VSELECTs reach this lowering, convert those into generic
18580 // shuffles and re-use the shuffle lowering path for blends.
18581 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18582 SmallVector<int, 32> Mask;
18583 if (createShuffleMaskFromVSELECT(Mask, Cond))
18584 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18585 }
18586
18587 return SDValue();
18588}
18589
// Lower an ISD::VSELECT node. Tries, in order: leave constant-everything
// selects for the generic expander, convert constant-condition selects to
// shuffles/blends, keep i1-mask selects legal as-is, and finally decide per
// subtarget/type whether a variable blend is legal, needs a bitcast, or must
// be expanded (returning a null SDValue).
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);

  // A vselect where all conditions and data are constants can be optimized into
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
    return SDValue();

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // If this VSELECT has a vector if i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  MVT CondVT = Cond.getSimpleValueType();
  unsigned CondEltSize = Cond.getScalarValueSizeInBits();
  if (CondEltSize == 1)
    return Op;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  unsigned EltSize = VT.getScalarSizeInBits();
  unsigned NumElts = VT.getVectorNumElements();

  // Expand v32i16/v64i8 without BWI.
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    return SDValue();

  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // instructions.
  if (VT.getSizeInBits() == 512) {
    // Build a mask by testing the condition against zero.
    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
                                DAG.getConstant(0, dl, CondVT),
                                ISD::SETNE);
    // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, LHS, RHS);
  }

  // SEXT/TRUNC cases where the mask doesn't match the destination size.
  if (CondEltSize != EltSize) {
    // If we don't have a sign splat, rely on the expansion.
    if (CondEltSize != DAG.ComputeNumSignBits(Cond))
      return SDValue();

    // Resize the condition elements to match the data elements and retry.
    MVT NewCondSVT = MVT::getIntegerVT(EltSize);
    MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
    Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
    return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, and but if we need to expand, return
  // a null value.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16: {
    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
    MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
    Cond = DAG.getBitcast(CastVT, Cond);
    LHS = DAG.getBitcast(CastVT, LHS);
    RHS = DAG.getBitcast(CastVT, RHS);
    SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
    return DAG.getBitcast(VT, Select);
  }
  }
}
18679
// Lower EXTRACT_VECTOR_ELT with a constant index on SSE4.1+, for 128-bit
// source vectors only. Handles i8 (PEXTRB or a cheap i32 move for index 0),
// f32 (EXTRACTPS-style extract, only when the single use profits from it),
// and i32/i64 (already legal). Returns a null SDValue to fall back to the
// generic path.
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  assert(isa<ConstantSDNode>(Idx) && "Constant index expected")((void)0);
  SDLoc dl(Op);

  if (!Vec.getSimpleValueType().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
    // we're going to zero extend the register or fold the store.
    if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
        !MayFoldIntoStore(Op))
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
                                  DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Vec), Idx);
    return DAG.getBitcast(MVT::f32, Extract);
  }

  if (VT == MVT::i32 || VT == MVT::i64)
    return Op;

  return SDValue();
}
18728
/// Extract one bit from mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
///
/// Variable indices are handled by sign-extending the mask to a SIMD vector
/// and extracting there. A constant index 0 is already legal; any other
/// constant index is reduced to index 0 with a KSHIFTR (widening the mask
/// first when the KSHIFT is not natively available at that width).
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Vec);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
  MVT EltVT = Op.getSimpleValueType();

  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&((void)0)
         "Unexpected vector type in ExtractBitFromMaskVector")((void)0);

  // variable index can't be handled in mask registers,
  // extend vector to VR512/128
  if (!IdxC) {
    unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512-bit get better performance on KNL
    // than extending to 128/256bit.
    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  }

  unsigned IdxVal = IdxC->getZExtValue();
  if (IdxVal == 0) // the operation is legal
    return Op;

  // Extend to natively supported kshift.
  // KSHIFT needs v8i1 (with DQI) or v16i1; insert narrower masks into an
  // undef wide vector first.
  unsigned NumElems = VecVT.getVectorNumElements();
  MVT WideVecVT = VecVT;
  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
                      DAG.getUNDEF(WideVecVT), Vec,
                      DAG.getIntPtrConstant(0, dl));
  }

  // Use kshiftr instruction to move to the lower element.
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
}
18777
// Lower EXTRACT_VECTOR_ELT. i1 masks delegate to ExtractBitFromMaskVector;
// variable indices go through memory; wide (256/512-bit) sources are reduced
// to a 128-bit extract; then size-specific 128-bit strategies (PEXTRW, SSE4.1
// helpers, sub-dword extraction via i32/i16, SHUFPS/UNPCKHPD + movs[sd]).
// Returns a null SDValue when the generic expansion should be used.
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);

  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

  if (!IdxC) {
    // Its more profitable to go through memory (1 cycles throughput)
    // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
    // IACA tool was used to get performance estimation
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // example : extractelement <16 x i8> %a, i32 %i
    //
    // Block Throughput: 3.00 Cycles
    // Throughput Bottleneck: Port5
    //
    // | Num Of |   Ports pressure in cycles  |    |
    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
    // ---------------------------------------------
    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
    // |   2    |    1.0    | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
    // Total Num Of Uops: 4
    //
    //
    // Block Throughput: 1.00 Cycles
    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
    //
    // |    |  Ports pressure in cycles   |  |
    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
    // ---------------------------------------------------------
    // |2^  |   |  0.5   |  0.5   |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4

    // Returning a null SDValue makes the caller use the default (stack)
    // expansion for a variable index.
    return SDValue();
  }

  unsigned IdxVal = IdxC->getZExtValue();

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2")((void)0);

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
    IdxVal &= ElemsPerChunk - 1;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getIntPtrConstant(IdxVal, dl));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length")((void)0);

  MVT VT = Op.getSimpleValueType();

  if (VT.getSizeInBits() == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
                                  DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling to
  // stack.
  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
    // Extract either the lowest i32 or any i16, and extract the sub-byte.
    int DWordIdx = IdxVal / 4;
    if (DWordIdx == 0) {
      // Extract dword 0, then shift/truncate out the wanted byte.
      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                DAG.getBitcast(MVT::v4i32, Vec),
                                DAG.getIntPtrConstant(DWordIdx, dl));
      int ShiftVal = (IdxVal % 4) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i8));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }

    // Otherwise extract the containing word and shift out the wanted byte.
    int WordIdx = IdxVal / 2;
    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                              DAG.getBitcast(MVT::v8i16, Vec),
                              DAG.getIntPtrConstant(WordIdx, dl));
    int ShiftVal = (IdxVal % 2) * 8;
    if (ShiftVal != 0)
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i8));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
  }

  if (VT.getSizeInBits() == 32) {
    if (IdxVal == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}
18922
18923/// Insert one bit to mask vector, like v16i1 or v8i1.
18924/// AVX-512 feature.
18925static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18926 const X86Subtarget &Subtarget) {
18927 SDLoc dl(Op);
18928 SDValue Vec = Op.getOperand(0);
18929 SDValue Elt = Op.getOperand(1);
18930 SDValue Idx = Op.getOperand(2);
18931 MVT VecVT = Vec.getSimpleValueType();
18932
18933 if (!isa<ConstantSDNode>(Idx)) {
18934 // Non constant index. Extend source and destination,
18935 // insert element and then truncate the result.
18936 unsigned NumElts = VecVT.getVectorNumElements();
18937 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18938 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18939 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18940 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18941 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18942 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18943 }
18944
18945 // Copy into a k-register, extract to v1i1 and insert_subvector.
18946 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18947 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18948}
18949
18950SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18951 SelectionDAG &DAG) const {
18952 MVT VT = Op.getSimpleValueType();
18953 MVT EltVT = VT.getVectorElementType();
18954 unsigned NumElts = VT.getVectorNumElements();
18955 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18956
18957 if (EltVT == MVT::i1)
18958 return InsertBitToMaskVector(Op, DAG, Subtarget);
18959
18960 SDLoc dl(Op);
18961 SDValue N0 = Op.getOperand(0);
18962 SDValue N1 = Op.getOperand(1);
18963 SDValue N2 = Op.getOperand(2);
18964 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18965
18966 if (!N2C) {
18967 // Variable insertion indices, usually we're better off spilling to stack,
18968 // but AVX512 can use a variable compare+select by comparing against all
18969 // possible vector indices, and FP insertion has less gpr->simd traffic.
18970 if (!(Subtarget.hasBWI() ||
18971 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18972 (Subtarget.hasSSE41() && VT.isFloatingPoint())))
18973 return SDValue();
18974
18975 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18976 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18977 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18978 return SDValue();
18979
18980 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18981 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18982 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18983
18984 SmallVector<SDValue, 16> RawIndices;
18985 for (unsigned I = 0; I != NumElts; ++I)
18986 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18987 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18988
18989 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18990 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18991 ISD::CondCode::SETEQ);
18992 }
18993
18994 if (N2C->getAPIntValue().uge(NumElts))
18995 return SDValue();
18996 uint64_t IdxVal = N2C->getZExtValue();
18997
18998 bool IsZeroElt = X86::isZeroNode(N1);
18999 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19000
19001 // If we are inserting a element, see if we can do this more efficiently with
19002 // a blend shuffle with a rematerializable vector than a costly integer
19003 // insertion.
19004 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
19005 (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
19006 SmallVector<int, 8> BlendMask;
19007 for (unsigned i = 0; i != NumElts; ++i)
19008 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19009 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19010 : getOnesVector(VT, DAG, dl);
19011 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19012 }
19013
19014 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19015 // into that, and then insert the subvector back into the result.
19016 if (VT.is256BitVector() || VT.is512BitVector()) {
19017 // With a 256-bit vector, we can insert into the zero element efficiently
19018 // using a blend if we have AVX or AVX2 and the right data type.
19019 if (VT.is256BitVector() && IdxVal == 0) {
19020 // TODO: It is worthwhile to cast integer to floating point and back
19021 // and incur a domain crossing penalty if that's what we'll end up
19022 // doing anyway after extracting to a 128-bit vector.
19023 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19024 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19025 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19026 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19027 DAG.getTargetConstant(1, dl, MVT::i8));
19028 }
19029 }
19030
19031 // Get the desired 128-bit vector chunk.
19032 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19033
19034 // Insert the element into the desired chunk.
19035 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19036 assert(isPowerOf2_32(NumEltsIn128))((void)0);
19037 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19038 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19039
19040 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19041 DAG.getIntPtrConstant(IdxIn128, dl));
19042
19043 // Insert the changed part back into the bigger vector
19044 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19045 }
19046 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!")((void)0);
19047
19048 // This will be just movd/movq/movss/movsd.
19049 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19050 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19051 EltVT == MVT::i64) {
19052 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19053 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19054 }
19055
19056 // We can't directly insert an i8 or i16 into a vector, so zero extend
19057 // it to i32 first.
19058 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19059 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19060 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19061 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19062 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19063 return DAG.getBitcast(VT, N1);
19064 }
19065 }
19066
19067 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
19068 // argument. SSE41 required for pinsrb.
19069 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19070 unsigned Opc;
19071 if (VT == MVT::v8i16) {
19072 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW")((void)0);
19073 Opc = X86ISD::PINSRW;
19074 } else {
19075 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector")((void)0);
19076 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB")((void)0);
19077 Opc = X86ISD::PINSRB;
19078 }
19079
19080 assert(N1.getValueType() != MVT::i32 && "Unexpected VT")((void)0);
19081 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19082 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19083 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19084 }
19085
19086 if (Subtarget.hasSSE41()) {
19087 if (EltVT == MVT::f32) {
19088 // Bits [7:6] of the constant are the source select. This will always be
19089 // zero here. The DAG Combiner may combine an extract_elt index into
19090 // these bits. For example (insert (extract, 3), 2) could be matched by
19091 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19092 // Bits [5:4] of the constant are the destination select. This is the
19093 // value of the incoming immediate.
19094 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19095 // combine either bitwise AND or insert of float 0.0 to set these bits.
19096
19097 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19098 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
19099 // If this is an insertion of 32-bits into the low 32-bits of
19100 // a vector, we prefer to generate a blend with immediate rather
19101 // than an insertps. Blends are simpler operations in hardware and so
19102 // will always have equal or better performance than insertps.
19103 // But if optimizing for size and there's a load folding opportunity,
19104 // generate insertps because blendps does not have a 32-bit memory
19105 // operand form.
19106 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19107 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19108 DAG.getTargetConstant(1, dl, MVT::i8));
19109 }
19110 // Create this as a scalar to vector..
19111 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19112 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19113 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19114 }
19115
19116 // PINSR* works with constant index.
19117 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19118 return Op;
19119 }
19120
19121 return SDValue();
19122}
19123
19124static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19125 SelectionDAG &DAG) {
19126 SDLoc dl(Op);
19127 MVT OpVT = Op.getSimpleValueType();
19128
19129 // It's always cheaper to replace a xor+movd with xorps and simplifies further
19130 // combines.
19131 if (X86::isZeroNode(Op.getOperand(0)))
19132 return getZeroVector(OpVT, Subtarget, DAG, dl);
19133
19134 // If this is a 256-bit vector result, first insert into a 128-bit
19135 // vector and then insert into the 256-bit vector.
19136 if (!OpVT.is128BitVector()) {
19137 // Insert into a 128-bit vector.
19138 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19139 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19140 OpVT.getVectorNumElements() / SizeFactor);
19141
19142 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19143
19144 // Insert the 128-bit vector.
19145 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19146 }
19147 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&((void)0)
19148 "Expected an SSE type!")((void)0);
19149
19150 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
19151 if (OpVT == MVT::v4i32)
19152 return Op;
19153
19154 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19155 return DAG.getBitcast(
19156 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19157}
19158
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  // Only vXi1 mask vectors reach this custom lowering; all other insertions
  // are legal or handled generically.
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}
19168
19169static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19170 SelectionDAG &DAG) {
19171 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&((void)0)
19172 "Only vXi1 extract_subvectors need custom lowering")((void)0);
19173
19174 SDLoc dl(Op);
19175 SDValue Vec = Op.getOperand(0);
19176 uint64_t IdxVal = Op.getConstantOperandVal(1);
19177
19178 if (IdxVal == 0) // the operation is legal
19179 return Op;
19180
19181 MVT VecVT = Vec.getSimpleValueType();
19182 unsigned NumElems = VecVT.getVectorNumElements();
19183
19184 // Extend to natively supported kshift.
19185 MVT WideVecVT = VecVT;
19186 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19187 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19188 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19189 DAG.getUNDEF(WideVecVT), Vec,
19190 DAG.getIntPtrConstant(0, dl));
19191 }
19192
19193 // Shift to the LSB.
19194 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19195 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19196
19197 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19198 DAG.getIntPtrConstant(0, dl));
19199}
19200
19201// Returns the appropriate wrapper opcode for a global reference.
19202unsigned X86TargetLowering::getGlobalWrapperKind(
19203 const GlobalValue *GV, const unsigned char OpFlags) const {
19204 // References to absolute symbols are never PC-relative.
19205 if (GV && GV->isAbsoluteSymbolRef())
19206 return X86ISD::Wrapper;
19207
19208 CodeModel::Model M = getTargetMachine().getCodeModel();
19209 if (Subtarget.isPICStyleRIPRel() &&
19210 (M == CodeModel::Small || M == CodeModel::Kernel))
19211 return X86ISD::WrapperRIP;
19212
19213 // GOTPCREL references must always use RIP.
19214 if (OpFlags == X86II::MO_GOTPCREL)
19215 return X86ISD::WrapperRIP;
19216
19217 return X86ISD::Wrapper;
19218}
19219
19220// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19221// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19222// one of the above mentioned nodes. It has to be wrapped because otherwise
19223// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19224// be used to form addressing mode. These wrapped nodes will be selected
19225// into MOV32ri.
19226SDValue
19227X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19228 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19229
19230 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19231 // global base reg.
19232 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19233
19234 auto PtrVT = getPointerTy(DAG.getDataLayout());
19235 SDValue Result = DAG.getTargetConstantPool(
19236 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19237 SDLoc DL(CP);
19238 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19239 // With PIC, the address is actually $g + Offset.
19240 if (OpFlag) {
19241 Result =
19242 DAG.getNode(ISD::ADD, DL, PtrVT,
19243 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19244 }
19245
19246 return Result;
19247}
19248
19249SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19250 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19251
19252 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19253 // global base reg.
19254 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19255
19256 auto PtrVT = getPointerTy(DAG.getDataLayout());
19257 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19258 SDLoc DL(JT);
19259 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19260
19261 // With PIC, the address is actually $g + Offset.
19262 if (OpFlag)
19263 Result =
19264 DAG.getNode(ISD::ADD, DL, PtrVT,
19265 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19266
19267 return Result;
19268}
19269
// External symbols share the common global/external address-materialization
// path; this is not a call site, hence ForCall=false.
SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
                                               SelectionDAG &DAG) const {
  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
19274
19275SDValue
19276X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19277 // Create the TargetBlockAddressAddress node.
19278 unsigned char OpFlags =
19279 Subtarget.classifyBlockAddressReference();
19280 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19281 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19282 SDLoc dl(Op);
19283 auto PtrVT = getPointerTy(DAG.getDataLayout());
19284 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19285 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19286
19287 // With PIC, the address is actually $g + Offset.
19288 if (isGlobalRelativeToPICBase(OpFlags)) {
19289 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19290 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19291 }
19292
19293 return Result;
19294}
19295
/// Creates target global address or external symbol nodes for calls or
/// other uses.
///
/// \param Op      a GlobalAddressSDNode or an ExternalSymbolSDNode.
/// \param ForCall true when \p Op is a call target; changes how the
///                reference is classified and allows skipping the wrapper
///                node for direct calls.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                                 bool ForCall) const {
  // Unpack the global address or external symbol.
  const SDLoc &dl = SDLoc(Op);
  const GlobalValue *GV = nullptr;
  int64_t Offset = 0;
  const char *ExternalSym = nullptr;
  if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
    GV = G->getGlobal();
    Offset = G->getOffset();
  } else {
    const auto *ES = cast<ExternalSymbolSDNode>(Op);
    ExternalSym = ES->getSymbol();
  }

  // Calculate some flags for address lowering.
  const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
  unsigned char OpFlags;
  if (ForCall)
    OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
  else
    OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
  bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
  bool NeedsLoad = isGlobalStubReference(OpFlags);

  CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;

  if (GV) {
    // Create a target global address if this is a global. If possible, fold the
    // offset into the global address reference. Otherwise, ADD it on later.
    // Suppress the folding if Offset is negative: movl foo-1, %eax is not
    // allowed because if the address of foo is 0, the ELF R_X86_64_32
    // relocation will compute to a negative value, which is invalid.
    int64_t GlobalOffset = 0;
    if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
        X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
      // Fold the entire offset into the node, leaving Offset == 0 so the
      // explicit ADD below is skipped.
      std::swap(GlobalOffset, Offset);
    }
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
  } else {
    // If this is not a global address, this must be an external symbol.
    Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
  }

  // If this is a direct call, avoid the wrapper if we don't need to do any
  // loads or adds. This allows SDAG ISel to match direct calls.
  if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
    return Result;

  Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (HasPICReg) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (NeedsLoad)
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}
19371
// Global addresses share the common global/external address-materialization
// path; this is not a call site, hence ForCall=false.
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}
19376
/// Emit an X86ISD::TLSADDR / TLSBASEADDR pseudo (codegen'ed as a call to the
/// TLS runtime) and copy the resulting address out of the conventional
/// return register.
///
/// \param Chain        incoming chain.
/// \param GA           the TLS global being addressed.
/// \param InFlag       optional glue input (32-bit lowerings glue the EBX
///                     setup to the call); may be null.
/// \param ReturnReg    register the runtime returns the address in
///                     (EAX or RAX, depending on the ABI).
/// \param OperandFlags target flag (e.g. MO_TLSGD / MO_TLSLD / MO_TLSLDM)
///                     selecting the relocation emitted on the global.
/// \param LocalDynamic when true emit TLSBASEADDR (local-dynamic module
///                     base) instead of TLSADDR.
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);

  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  // Thread the optional glue operand through the pseudo call node.
  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  // Glue the CopyFromReg to the call so nothing clobbers ReturnReg between
  // the call and the copy.
  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}
19407
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
  // The 32-bit GD sequence needs the GOT pointer in EBX; materialize the
  // global base register there and glue the copy to the TLSADDR call.
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  // The variable's address comes back in EAX.
  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
19421
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  // No GOT-register setup (and hence no glue) is needed on x86-64; the
  // address is returned in RAX.
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}
19429
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
static SDValue
LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                 const EVT PtrVT) {
  // Same as the LP64 variant, but with 32-bit pointers the address is
  // returned in EAX.
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::EAX, X86II::MO_TLSGD);
}
19437
/// Lower ISD::GlobalTLSAddress using the "local dynamic" model: compute the
/// module's TLS block base once (TLSBASEADDR) and add the variable's
/// x@dtpoff offset to it.
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG, const EVT PtrVT,
                                           bool Is64Bit, bool Is64BitLP64) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
                                   .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (Is64Bit) {
    // 64-bit: no GOT-register setup; result register depends on LP64 vs ILP32.
    unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    // 32-bit: the call needs the GOT pointer in EBX, glued to the call.
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}
19476
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  // Address spaces 256/257 are the x86 %gs/%fs segment-override spaces.
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    // Initial-exec offsets live in the GOT: add the PIC base first (32-bit
    // PIC only), then load the tp-relative offset from the GOT entry.
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
19531
/// Lower ISD::GlobalTLSAddress by dispatching to the per-OS, per-TLS-model
/// lowering: emulated TLS, the four ELF models, Darwin's TLVP scheme, or the
/// Windows implicit-TLS sequence.
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  // Emulated TLS lowers to a call into the compiler runtime.
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    // ELF: pick the lowering from the TLS model chosen for this global.
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
      case TLSModel::GeneralDynamic:
        if (Subtarget.is64Bit()) {
          if (Subtarget.isTarget64BitLP64())
            return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
          return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
        }
        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
      case TLSModel::LocalDynamic:
        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
                                           Subtarget.isTarget64BitLP64());
      case TLSModel::InitialExec:
      case TLSModel::LocalExec:
        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                   PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                               DAG.getIntPtrConstant(0, DL, true),
                               Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isOSWindows()) {
    // Just use the implicit TLS architecture
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov ecx, dword [rel _tls_index]: Load index (from C runtime)
    //   mov rcx, qword [rdx+rcx*8]
    //   mov eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                            ? Type::getInt8PtrTy(*DAG.getContext(),
                                                                 256)
                                            : Type::getInt32PtrTy(*DAG.getContext(),
                                                                  257));

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      // Local-exec: the variable lives in this module's TLS block, directly
      // addressable from the thread pointer.
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      // Scale the index by the pointer size to index the TLS pointer array.
      const DataLayout &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of start of .tls section
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}
19679
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  SDValue Lo, Hi;
  // Reuse the target-independent expansion, then repackage the two halves as
  // this node's pair of results.
  DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}
19688
/// Lower ISD::FSHL / ISD::FSHR. Vector types map onto the VBMI2 VSHLD/VSHRD
/// (or variable VSHLDV/VSHRDV) instructions; scalars map onto SHLD/SHRD via
/// X86ISD::FSHL/FSHR, with small/slow cases expanded manually.
static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
         "Unexpected funnel shift opcode!");

  SDLoc DL(Op);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);

  bool IsFSHR = Op.getOpcode() == ISD::FSHR;

  if (VT.isVector()) {
    // Vector funnel shifts only reach here when VBMI2 is available.
    assert(Subtarget.hasVBMI2() && "Expected VBMI2");

    // VSHRD's operand order is the reverse of FSHR's.
    if (IsFSHR)
      std::swap(Op0, Op1);

    // With AVX512, but not VLX we need to widen to get a 512-bit result type.
    if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
      Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
      Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
    }

    SDValue Funnel;
    APInt APIntShiftAmt;
    MVT ResultVT = Op0.getSimpleValueType();
    if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
      // Uniform constant amount: use the immediate form. The amount is
      // implicitly modulo the element size.
      uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
      Funnel =
          DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
                      Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
    } else {
      // Variable amounts use the per-element VSHLDV/VSHRDV form; the amount
      // vector must be widened to match the widened operands.
      if (!Subtarget.hasVLX() && !VT.is512BitVector())
        Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
      Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
                           ResultVT, Op0, Op1, Amt);
    }
    // Undo any widening done above.
    if (!Subtarget.hasVLX() && !VT.is512BitVector())
      Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
    return Funnel;
  }
  assert(
      (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
      "Unexpected funnel shift type!");

  // Expand slow SHLD/SHRD cases if we are not optimizing for size.
  bool OptForSize = DAG.shouldOptForSize();
  bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();

  // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
  // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
  if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
      !isa<ConstantSDNode>(Amt)) {
    unsigned EltSizeInBits = VT.getScalarSizeInBits();
    SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
    SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
    Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
    Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
    // Concatenate the halves into the low bits of an i32, shift, and pull the
    // result back out of the appropriate position.
    SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
    Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
    if (IsFSHR) {
      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
    } else {
      Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
      Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
    }
    return DAG.getZExtOrTrunc(Res, DL, VT);
  }

  // i8 (constant amount) and remaining slow-SHLD cases fall back to the
  // generic expansion.
  if (VT == MVT::i8 || ExpandFunnel)
    return SDValue();

  // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
  if (VT == MVT::i16) {
    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
                      DAG.getConstant(15, DL, Amt.getValueType()));
    unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
    return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
  }

  // i32/i64 map directly onto SHLD/SHRD during instruction selection.
  return Op;
}
19774
19775// Try to use a packed vector operation to handle i64 on 32-bit targets when
19776// AVX512DQ is enabled.
19777static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
19778 const X86Subtarget &Subtarget) {
19779 assert((Op.getOpcode() == ISD::SINT_TO_FP ||((void)0)
19780 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||((void)0)
19781 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||((void)0)
19782 Op.getOpcode() == ISD::UINT_TO_FP) &&((void)0)
19783 "Unexpected opcode!")((void)0);
19784 bool IsStrict = Op->isStrictFPOpcode();
19785 unsigned OpNo = IsStrict ? 1 : 0;
19786 SDValue Src = Op.getOperand(OpNo);
19787 MVT SrcVT = Src.getSimpleValueType();
19788 MVT VT = Op.getSimpleValueType();
19789
19790 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19791 (VT != MVT::f32 && VT != MVT::f64))
19792 return SDValue();
19793
19794 // Pack the i64 into a vector, do the operation and extract.
19795
19796 // Using 256-bit to ensure result is 128-bits for f32 case.
19797 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19798 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19799 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19800
19801 SDLoc dl(Op);
19802 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19803 if (IsStrict) {
19804 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19805 {Op.getOperand(0), InVec});
19806 SDValue Chain = CvtVec.getValue(1);
19807 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19808 DAG.getIntPtrConstant(0, dl));
19809 return DAG.getMergeValues({Value, Chain}, dl);
19810 }
19811
19812 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19813
19814 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19815 DAG.getIntPtrConstant(0, dl));
19816}
19817
19818static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19819 const X86Subtarget &Subtarget) {
19820 switch (Opcode) {
19821 case ISD::SINT_TO_FP:
19822 // TODO: Handle wider types with AVX/AVX512.
19823 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19824 return false;
19825 // CVTDQ2PS or (V)CVTDQ2PD
19826 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19827
19828 case ISD::UINT_TO_FP:
19829 // TODO: Handle wider types and i64 elements.
19830 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19831 return false;
19832 // VCVTUDQ2PS or VCVTUDQ2PD
19833 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19834
19835 default:
19836 return false;
19837 }
19838}
19839
/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // TODO: This could be enhanced to handle smaller integer types by peeking
  // through an extend.
  SDValue Extract = Cast.getOperand(0);
  MVT DestVT = Cast.getSimpleValueType();
  // Only handle extracts with a constant lane index.
  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isa<ConstantSDNode>(Extract.getOperand(1)))
    return SDValue();

  // See if we have a 128-bit vector cast op for this type of cast.
  SDValue VecOp = Extract.getOperand(0);
  MVT FromVT = VecOp.getSimpleValueType();
  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
  MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
    return SDValue();

  // If we are extracting from a non-zero element, first shuffle the source
  // vector to allow extracting from element zero.
  SDLoc DL(Cast);
  if (!isNullConstant(Extract.getOperand(1))) {
    // Shuffle the wanted element to lane 0; all other lanes are don't-care.
    SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
    Mask[0] = Extract.getConstantOperandVal(1);
    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
  }
  // If the source vector is wider than 128-bits, extract the low part. Do not
  // create an unnecessarily wide vector cast op.
  if (FromVT != Vec128VT)
    VecOp = extract128BitVector(VecOp, 0, DAG, DL);

  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
                     DAG.getIntPtrConstant(0, DL));
}
19881
19882/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19883/// try to vectorize the cast ops. This will avoid an expensive round-trip
19884/// between XMM and GPR.
// Matches: sint_to_fp (fp_to_sint X) with scalar f32/f64 source and result and
// an i32 intermediate; rewrites it as a pair of 128-bit vector casts plus an
// extract of element 0. Returns empty SDValue() if the pattern doesn't match.
19885static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
19886 const X86Subtarget &Subtarget) {
19887 // TODO: Allow FP_TO_UINT.
19888 SDValue CastToInt = CastToFP.getOperand(0);
19889 MVT VT = CastToFP.getSimpleValueType();
19890 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19891 return SDValue();
19892
19893 MVT IntVT = CastToInt.getSimpleValueType();
19894 SDValue X = CastToInt.getOperand(0);
19895 MVT SrcVT = X.getSimpleValueType();
19896 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19897 return SDValue();
19898
19899 // See if we have 128-bit vector cast instructions for this type of cast.
19900 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19901 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19902 IntVT != MVT::i32)
19903 return SDValue();
19904
 // Derive the 128-bit vector type for each scalar stage.
19905 unsigned SrcSize = SrcVT.getSizeInBits();
19906 unsigned IntSize = IntVT.getSizeInBits();
19907 unsigned VTSize = VT.getSizeInBits();
19908 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19909 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19910 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19911
19912 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
 // Generic ISD nodes require matching element counts, so mixed-width stages
 // use the X86-specific packed-conversion nodes instead.
19913 unsigned ToIntOpcode =
19914 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19915 unsigned ToFPOpcode =
19916 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19917
19918 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19919 //
19920 // We are not defining the high elements (for example, zero them) because
19921 // that could nullify any performance advantage that we hoped to gain from
19922 // this vector op hack. We do not expect any adverse effects (like denorm
19923 // penalties) with cast ops.
19924 SDLoc DL(CastToFP);
19925 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19926 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19927 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19928 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19929 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19930}
19931
// Custom-lower (strict or non-strict) [S|U]INT_TO_FP from v2i64/v4i64.
// With AVX512DQ (no VLX) the op is widened to 512 bits where it is legal.
// Otherwise only unsigned v4i64 -> v4f32 is handled, via a halve-convert-
// double sequence for inputs with the top bit set; other cases return an
// empty SDValue() for the caller to handle.
19932static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
19933 const X86Subtarget &Subtarget) {
19934 SDLoc DL(Op);
19935 bool IsStrict = Op->isStrictFPOpcode();
19936 MVT VT = Op->getSimpleValueType(0);
 // For strict ops, operand 0 is the chain and operand 1 the value.
19937 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19938
19939 if (Subtarget.hasDQI()) {
19940 assert(!Subtarget.hasVLX() && "Unexpected features")((void)0);
19941
19942 assert((Src.getSimpleValueType() == MVT::v2i64 ||((void)0)
19943 Src.getSimpleValueType() == MVT::v4i64) &&((void)0)
19944 "Unsupported custom type")((void)0);
19945
19946 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19947 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&((void)0)
19948 "Unexpected VT!")((void)0);
19949 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19950
19951 // Need to concat with zero vector for strict fp to avoid spurious
19952 // exceptions.
19953 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19954 : DAG.getUNDEF(MVT::v8i64);
19955 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19956 DAG.getIntPtrConstant(0, DL));
19957 SDValue Res, Chain;
19958 if (IsStrict) {
19959 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19960 {Op->getOperand(0), Src});
19961 Chain = Res.getValue(1);
19962 } else {
19963 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19964 }
19965
 // Narrow the widened result back to the requested type.
19966 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19967 DAG.getIntPtrConstant(0, DL));
19968
19969 if (IsStrict)
19970 return DAG.getMergeValues({Res, Chain}, DL);
19971 return Res;
19972 }
19973
19974 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19975 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19976 if (VT != MVT::v4f32 || IsSigned)
19977 return SDValue();
19978
 // Unsigned v4i64 -> v4f32: for negative-as-signed inputs (top bit set),
 // compute (Src >> 1) | (Src & 1) so the value halves while keeping the low
 // bit for correct rounding; convert as signed; then double with FADD.
19979 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19980 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19981 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19982 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19983 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19984 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19985 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19986 SmallVector<SDValue, 4> SignCvts(4);
19987 SmallVector<SDValue, 4> Chains(4);
 // Convert each lane as a scalar i64 -> f32 signed conversion.
19988 for (int i = 0; i != 4; ++i) {
19989 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19990 DAG.getIntPtrConstant(i, DL));
19991 if (IsStrict) {
19992 SignCvts[i] =
19993 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19994 {Op.getOperand(0), Elt});
19995 Chains[i] = SignCvts[i].getValue(1);
19996 } else {
19997 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19998 }
19999 }
20000 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20001
 // "Slow" path result: doubled converted value for lanes that were halved.
20002 SDValue Slow, Chain;
20003 if (IsStrict) {
20004 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20005 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20006 {Chain, SignCvt, SignCvt});
20007 Chain = Slow.getValue(1);
20008 } else {
20009 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20010 }
20011
 // Select per lane: doubled value where the top bit was set, plain otherwise.
20012 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20013 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20014
20015 if (IsStrict)
20016 return DAG.getMergeValues({Cvt, Chain}, DL);
20017
20018 return Cvt;
20019}
20020
// Custom lowering for (STRICT_)SINT_TO_FP. Tries vector-cast rewrites first,
// then handles the vector source cases, then the scalar cases — returning Op
// unchanged for combinations that are actually legal, and falling back to a
// stack store + x87 FILD sequence otherwise.
20021SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20022 SelectionDAG &DAG) const {
20023 bool IsStrict = Op->isStrictFPOpcode();
 // Strict ops carry the chain as operand 0 and the value as operand 1.
20024 unsigned OpNo = IsStrict ? 1 : 0;
20025 SDValue Src = Op.getOperand(OpNo);
20026 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20027 MVT SrcVT = Src.getSimpleValueType();
20028 MVT VT = Op.getSimpleValueType();
20029 SDLoc dl(Op);
20030
20031 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20032 return Extract;
20033
20034 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20035 return R;
20036
20037 if (SrcVT.isVector()) {
20038 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20039 // Note: Since v2f64 is a legal type. We don't need to zero extend the
20040 // source for strict FP.
20041 if (IsStrict)
20042 return DAG.getNode(
20043 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20044 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20045 DAG.getUNDEF(SrcVT))});
20046 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20047 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20048 DAG.getUNDEF(SrcVT)));
20049 }
20050 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20051 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20052
20053 return SDValue();
20054 }
20055
20056 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&((void)0)
20057 "Unknown SINT_TO_FP to lower!")((void)0);
20058
20059 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20060
20061 // These are really Legal; return the operand so the caller accepts it as
20062 // Legal.
20063 if (SrcVT == MVT::i32 && UseSSEReg)
20064 return Op;
20065 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20066 return Op;
20067
20068 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20069 return V;
20070
20071 // SSE doesn't have an i16 conversion so we need to promote.
20072 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20073 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20074 if (IsStrict)
20075 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20076 {Chain, Ext});
20077
20078 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20079 }
20080
 // f128 results are handled elsewhere (libcall); nothing to lower here.
20081 if (VT == MVT::f128)
20082 return SDValue();
20083
20084 SDValue ValueToStore = Src;
20085 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20086 // Bitcasting to f64 here allows us to do a single 64-bit store from
20087 // an SSE register, avoiding the store forwarding penalty that would come
20088 // with two 32-bit stores.
20089 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20090
 // Spill the integer to a stack slot and convert with x87 FILD.
20091 unsigned Size = SrcVT.getStoreSize();
20092 Align Alignment(Size);
20093 MachineFunction &MF = DAG.getMachineFunction();
20094 auto PtrVT = getPointerTy(MF.getDataLayout());
20095 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20096 MachinePointerInfo MPI =
20097 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20098 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20099 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20100 std::pair<SDValue, SDValue> Tmp =
20101 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20102
20103 if (IsStrict)
20104 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20105
20106 return Tmp.first;
20107}
20108
// Build an x87 FILD that loads an SrcVT integer from memory at Pointer and
// converts it to DstVT. If DstVT lives in an SSE register, the conversion is
// done at f80 precision and rounded via an FST to a stack slot followed by a
// reload. Returns {result value, output chain}.
20109std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20110 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20111 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20112 // Build the FILD
20113 SDVTList Tys;
20114 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20115 if (useSSE)
 // FILD produces f80 when we must round through memory for SSE below.
20116 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20117 else
20118 Tys = DAG.getVTList(DstVT, MVT::Other);
20119
20120 SDValue FILDOps[] = {Chain, Pointer};
20121 SDValue Result =
20122 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20123 Alignment, MachineMemOperand::MOLoad);
20124 Chain = Result.getValue(1);
20125
20126 if (useSSE) {
 // Round to DstVT by storing from the x87 stack and reloading into SSE.
20127 MachineFunction &MF = DAG.getMachineFunction();
20128 unsigned SSFISize = DstVT.getStoreSize();
20129 int SSFI =
20130 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20131 auto PtrVT = getPointerTy(MF.getDataLayout());
20132 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20133 Tys = DAG.getVTList(MVT::Other);
20134 SDValue FSTOps[] = {Chain, Result, StackSlot};
20135 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20136 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20137 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20138
20139 Chain =
20140 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20141 Result = DAG.getLoad(
20142 DstVT, DL, Chain, StackSlot,
20143 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20144 Chain = Result.getValue(1);
20145 }
20146
20147 return { Result, Chain };
20148}
20149
20150/// Horizontal vector math instructions may be slower than normal math with
20151/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20152/// implementation, and likely shuffle complexity of the alternate sequence.
20153static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20154 const X86Subtarget &Subtarget) {
20155 bool IsOptimizingSize = DAG.shouldOptForSize();
20156 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20157 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20158}
20159
20160/// 64-bit unsigned integer to double expansion.
// Splits the u64 into two halves biased by 2^52 and 2^84, subtracts the
// biases as doubles, and sums the two lanes (horizontally when profitable).
20161static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20162 const X86Subtarget &Subtarget) {
20163 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20164 // when converting 0 when rounding toward negative infinity. Caller will
20165 // fall back to Expand for when i64 or is legal or use FILD in 32-bit mode.
20166 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!")((void)0);
20167 // This algorithm is not obvious. Here it is what we're trying to output:
20168 /*
20169 movq %rax, %xmm0
20170 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20171 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20172 #ifdef __SSE3__
20173 haddpd %xmm0, %xmm0
20174 #else
20175 pshufd $0x4e, %xmm0, %xmm1
20176 addpd %xmm1, %xmm0
20177 #endif
20178 */
20179
20180 SDLoc dl(Op);
20181 LLVMContext *Context = DAG.getContext();
20182
20183 // Build some magic constants.
 // CV0 holds the double-exponent words 2^52 and 2^84 to interleave above the
 // low/high 32-bit halves of the input.
20184 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20185 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20186 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20187 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20188
 // CV1 holds the matching bias doubles (0x1.0p52, 0x1.0p84) to subtract.
20189 SmallVector<Constant*,2> CV1;
20190 CV1.push_back(
20191 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20192 APInt(64, 0x4330000000000000ULL))));
20193 CV1.push_back(
20194 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20195 APInt(64, 0x4530000000000000ULL))));
20196 Constant *C1 = ConstantVector::get(CV1);
20197 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20198
20199 // Load the 64-bit value into an XMM register.
20200 SDValue XR1 =
20201 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20202 SDValue CLod0 = DAG.getLoad(
20203 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20204 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
 // Interleave input halves with the exponent words (the punpckldq above).
20205 SDValue Unpck1 =
20206 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20207
20208 SDValue CLod1 = DAG.getLoad(
20209 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20210 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20211 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20212 // TODO: Are there any fast-math-flags to propagate here?
20213 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20214 SDValue Result;
20215
 // Sum the two lanes: haddpd when SSE3 h-ops are a win, else shuffle+addpd.
20216 if (Subtarget.hasSSE3() &&
20217 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20218 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20219 } else {
20220 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20221 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20222 }
20223 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20224 DAG.getIntPtrConstant(0, dl));
20225 return Result;
20226}
20227
20228/// 32-bit unsigned integer to float expansion.
// OR the u32 into the mantissa of the double 2^52, giving 2^52 + x exactly,
// then subtract 2^52 and round the f64 result to the destination type.
// Handles both strict and non-strict opcodes.
20229static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20230 const X86Subtarget &Subtarget) {
 // Strict ops carry the chain as operand 0 and the value as operand 1.
20231 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20232 SDLoc dl(Op);
20233 // FP constant to bias correct the final result.
20234 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20235 MVT::f64);
20236
20237 // Load the 32-bit value into an XMM register.
20238 SDValue Load =
20239 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20240
20241 // Zero out the upper parts of the register.
20242 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20243
20244 // Or the load with the bias.
20245 SDValue Or = DAG.getNode(
20246 ISD::OR, dl, MVT::v2i64,
20247 DAG.getBitcast(MVT::v2i64, Load),
20248 DAG.getBitcast(MVT::v2i64,
20249 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20250 Or =
20251 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20252 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20253
20254 if (Op.getNode()->isStrictFPOpcode()) {
20255 // Subtract the bias.
20256 // TODO: Are there any fast-math-flags to propagate here?
20257 SDValue Chain = Op.getOperand(0);
20258 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20259 {Chain, Or, Bias});
20260
20261 if (Op.getValueType() == Sub.getValueType())
20262 return Sub;
20263
20264 // Handle final rounding.
20265 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20266 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20267
20268 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20269 }
20270
20271 // Subtract the bias.
20272 // TODO: Are there any fast-math-flags to propagate here?
20273 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20274
20275 // Handle final rounding.
20276 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20277}
20278
// Custom-lower (strict or non-strict) UINT_TO_FP from v2i32 to v2f64.
// AVX512+VLX uses VCVTUDQ2PD directly; AVX512 without VLX widens the strict
// form; otherwise the 2^52 mantissa-bias trick is used. Returns an empty
// SDValue() when generic legalization should take over.
20279static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20280 const X86Subtarget &Subtarget,
20281 const SDLoc &DL) {
20282 if (Op.getSimpleValueType() != MVT::v2f64)
20283 return SDValue();
20284
20285 bool IsStrict = Op->isStrictFPOpcode();
20286
 // Strict ops carry the chain as operand 0 and the value as operand 1.
20287 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20288 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type")((void)0);
20289
20290 if (Subtarget.hasAVX512()) {
20291 if (!Subtarget.hasVLX()) {
20292 // Let generic type legalization widen this.
20293 if (!IsStrict)
20294 return SDValue();
20295 // Otherwise pad the integer input with 0s and widen the operation.
 // Zero padding (not undef) avoids spurious FP exceptions under strict fp.
20296 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20297 DAG.getConstant(0, DL, MVT::v2i32));
20298 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20299 {Op.getOperand(0), N0});
20300 SDValue Chain = Res.getValue(1);
20301 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20302 DAG.getIntPtrConstant(0, DL));
20303 return DAG.getMergeValues({Res, Chain}, DL);
20304 }
20305
20306 // Legalize to v4i32 type.
20307 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20308 DAG.getUNDEF(MVT::v2i32));
20309 if (IsStrict)
20310 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20311 {Op.getOperand(0), N0});
20312 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20313 }
20314
20315 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20316 // This gives us the floating point equivalent of 2^52 + the i32 integer
20317 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20318 // point leaving just our i32 integers in double format.
20319 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20320 SDValue VBias =
20321 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20322 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20323 DAG.getBitcast(MVT::v2i64, VBias));
20324 Or = DAG.getBitcast(MVT::v2f64, Or);
20325
20326 if (IsStrict)
20327 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20328 {Op.getOperand(0), Or, VBias});
20329 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20330}
20331
// Custom-lower (strict or non-strict) UINT_TO_FP from v4i32/v8i32.
// AVX512 (no VLX) widens to 512 bits where the op is legal; AVX handles
// v4i32->v4f64 via the 2^52 bias trick; otherwise the input is split into
// 16-bit halves, each biased into a float, and recombined with an FSUB+FADD.
20332static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20333 const X86Subtarget &Subtarget) {
20334 SDLoc DL(Op);
20335 bool IsStrict = Op->isStrictFPOpcode();
 // Strict ops carry the chain as operand 0 and the value as operand 1.
20336 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20337 MVT VecIntVT = V.getSimpleValueType();
20338 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&((void)0)
20339 "Unsupported custom type")((void)0);
20340
20341 if (Subtarget.hasAVX512()) {
20342 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20343 assert(!Subtarget.hasVLX() && "Unexpected features")((void)0);
20344 MVT VT = Op->getSimpleValueType(0);
20345
20346 // v8i32->v8f64 is legal with AVX512 so just return it.
20347 if (VT == MVT::v8f64)
20348 return Op;
20349
20350 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&((void)0)
20351 "Unexpected VT!")((void)0);
20352 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20353 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20354 // Need to concat with zero vector for strict fp to avoid spurious
20355 // exceptions.
20356 SDValue Tmp =
20357 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20358 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20359 DAG.getIntPtrConstant(0, DL));
20360 SDValue Res, Chain;
20361 if (IsStrict) {
20362 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20363 {Op->getOperand(0), V});
20364 Chain = Res.getValue(1);
20365 } else {
20366 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20367 }
20368
 // Narrow the widened result back to the requested type.
20369 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20370 DAG.getIntPtrConstant(0, DL));
20371
20372 if (IsStrict)
20373 return DAG.getMergeValues({Res, Chain}, DL);
20374 return Res;
20375 }
20376
 // AVX v4i32->v4f64: zero-extend to i64 lanes, OR in the 2^52 bias double
 // (broadcast from the constant pool), then subtract the bias.
20377 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20378 Op->getSimpleValueType(0) == MVT::v4f64) {
20379 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20380 Constant *Bias = ConstantFP::get(
20381 *DAG.getContext(),
20382 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20383 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20384 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20385 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20386 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20387 SDValue VBias = DAG.getMemIntrinsicNode(
20388 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20389 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20390 MachineMemOperand::MOLoad);
20391
20392 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20393 DAG.getBitcast(MVT::v4i64, VBias));
20394 Or = DAG.getBitcast(MVT::v4f64, Or);
20395
20396 if (IsStrict)
20397 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20398 {Op.getOperand(0), Or, VBias});
20399 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20400 }
20401
20402 // The algorithm is the following:
20403 // #ifdef __SSE4_1__
20404 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20405 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20406 // (uint4) 0x53000000, 0xaa);
20407 // #else
20408 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20409 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20410 // #endif
20411 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20412 // return (float4) lo + fhi;
20413
20414 bool Is128 = VecIntVT == MVT::v4i32;
20415 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20416 // If we convert to something else than the supported type, e.g., to v4f64,
20417 // abort early.
20418 if (VecFloatVT != Op->getSimpleValueType(0))
20419 return SDValue();
20420
20421 // In the #idef/#else code, we have in common:
20422 // - The vector of constants:
20423 // -- 0x4b000000
20424 // -- 0x53000000
20425 // - A shift:
20426 // -- v >> 16
20427
20428 // Create the splat vector for 0x4b000000.
20429 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20430 // Create the splat vector for 0x53000000.
20431 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20432
20433 // Create the right shift.
20434 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20435 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20436
20437 SDValue Low, High;
20438 if (Subtarget.hasSSE41()) {
 // SSE4.1: use PBLENDW on i16 lanes to merge the exponent constants into
 // the upper 16 bits of each 32-bit element (mask 0xaa = odd i16 lanes).
20439 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20440 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20441 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20442 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20443 // Low will be bitcasted right away, so do not bother bitcasting back to its
20444 // original type.
20445 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20446 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20447 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20448 // (uint4) 0x53000000, 0xaa);
20449 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20450 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20451 // High will be bitcasted right away, so do not bother bitcasting back to
20452 // its original type.
20453 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20454 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20455 } else {
20456 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20457 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20458 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20459 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20460
20461 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20462 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20463 }
20464
20465 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20466 SDValue VecCstFSub = DAG.getConstantFP(
20467 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20468
20469 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20470 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20471 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20472 // enabled. See PR24512.
20473 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20474 // TODO: Are there any fast-math-flags to propagate here?
20475 // (float4) lo;
20476 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20477 // return (float4) lo + fhi;
20478 if (IsStrict) {
20479 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20480 {Op.getOperand(0), HighBitcast, VecCstFSub});
20481 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20482 {FHigh.getValue(1), LowBitcast, FHigh});
20483 }
20484
20485 SDValue FHigh =
20486 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20487 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20488}
20489
20490static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20491 const X86Subtarget &Subtarget) {
20492 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20493 SDValue N0 = Op.getOperand(OpNo);
20494 MVT SrcVT = N0.getSimpleValueType();
20495 SDLoc dl(Op);
20496
20497 switch (SrcVT.SimpleTy) {
20498 default:
20499 llvm_unreachable("Custom UINT_TO_FP is not supported!")__builtin_unreachable();
20500 case MVT::v2i32:
20501 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20502 case MVT::v4i32:
20503 case MVT::v8i32:
20504 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20505 case MVT::v2i64:
20506 case MVT::v4i64:
20507 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20508 }
20509}
20510
// Custom lowering for (STRICT_)UINT_TO_FP. Dispatches vector sources, returns
// Op unchanged where the conversion is actually legal (AVX512), promotes i32
// to a signed i64 conversion on 64-bit targets, tries the bias-trick helpers,
// and finally falls back to an x87 FILD plus a sign-dependent fudge-factor
// addition at f80 precision.
20511SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20512 SelectionDAG &DAG) const {
20513 bool IsStrict = Op->isStrictFPOpcode();
 // Strict ops carry the chain as operand 0 and the value as operand 1.
20514 unsigned OpNo = IsStrict ? 1 : 0;
20515 SDValue Src = Op.getOperand(OpNo);
20516 SDLoc dl(Op);
20517 auto PtrVT = getPointerTy(DAG.getDataLayout());
20518 MVT SrcVT = Src.getSimpleValueType();
20519 MVT DstVT = Op->getSimpleValueType(0);
20520 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20521
 // f128 results are handled elsewhere (libcall); nothing to lower here.
20522 if (DstVT == MVT::f128)
20523 return SDValue();
20524
20525 if (DstVT.isVector())
20526 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20527
20528 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20529 return Extract;
20530
20531 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20532 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20533 // Conversions from unsigned i32 to f32/f64 are legal,
20534 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20535 return Op;
20536 }
20537
20538 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20539 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20540 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20541 if (IsStrict)
20542 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20543 {Chain, Src});
20544 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20545 }
20546
20547 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20548 return V;
20549
20550 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20551 // infinity. It produces -0.0, so disable under strictfp.
20552 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20553 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20554 if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20555 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20556 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20557 (DstVT == MVT::f32 || DstVT == MVT::f64))
20558 return SDValue();
20559
20560 // Make a 64-bit buffer, and use it to build an FILD.
20561 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20562 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20563 Align SlotAlign(8);
20564 MachinePointerInfo MPI =
20565 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20566 if (SrcVT == MVT::i32) {
 // Store the i32 in the low half and zero the high half, then convert the
 // slot as an unsigned-safe i64 (high half 0 means the value is positive).
20567 SDValue OffsetSlot =
20568 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20569 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20570 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20571 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20572 std::pair<SDValue, SDValue> Tmp =
20573 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20574 if (IsStrict)
20575 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20576
20577 return Tmp.first;
20578 }
20579
20580 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP")((void)0);
20581 SDValue ValueToStore = Src;
20582 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20583 // Bitcasting to f64 here allows us to do a single 64-bit store from
20584 // an SSE register, avoiding the store forwarding penalty that would come
20585 // with two 32-bit stores.
20586 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20587 }
20588 SDValue Store =
20589 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20590 // For i64 source, we need to add the appropriate power of 2 if the input
20591 // was negative. We must be careful to do the computation in x87 extended
20592 // precision, not in SSE.
20593 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20594 SDValue Ops[] = { Store, StackSlot };
20595 SDValue Fild =
20596 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20597 SlotAlign, MachineMemOperand::MOLoad);
20598 Chain = Fild.getValue(1);
20599
20600
20601 // Check whether the sign bit is set.
20602 SDValue SignSet = DAG.getSetCC(
20603 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20604 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20605
20606 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
 // 0x5F800000 as f32 is 2^64, the fudge value added when the sign bit is set.
20607 APInt FF(64, 0x5F80000000000000ULL);
20608 SDValue FudgePtr = DAG.getConstantPool(
20609 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20610 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20611
20612 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20613 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20614 SDValue Four = DAG.getIntPtrConstant(4, dl);
20615 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20616 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20617
20618 // Load the value out, extending it from f32 to f80.
20619 SDValue Fudge = DAG.getExtLoad(
20620 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20621 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20622 CPAlignment);
20623 Chain = Fudge.getValue(1);
20624 // Extend everything to 80 bits to force it to be done on x87.
20625 // TODO: Are there any fast-math-flags to propagate here?
20626 if (IsStrict) {
20627 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20628 {Chain, Fild, Fudge});
20629 // STRICT_FP_ROUND can't handle equal types.
20630 if (DstVT == MVT::f80)
20631 return Add;
20632 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20633 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
20634 }
20635 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
20636 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20637 DAG.getIntPtrConstant(0, dl));
20638}
20639
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence and return the
// result.
//
// On success, the integer result is produced through a stack temporary:
// an X86ISD::FP_TO_INT_IN_MEM (FIST) node writes it, a plain load reads it
// back, and \p Chain is updated to the chain of that final load.
SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, SDValue &Chain) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  // Strict nodes carry the chain as operand 0; the FP value follows it.
  SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
  EVT TheVT = Value.getValueType();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return SDValue();
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64. A FIST is always
  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;

  // FIXME: This does not generate an invalid exception if the input does not
  // fit in i32. PR44019
  if (!IsSigned && DstTy != MVT::i64) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    // The low 32 bits of the fist result will have the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getStoreSize();
  int SSFI =
      MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

  if (UnsignedFixup) {
    //
    // Conversion to unsigned i64 is implemented with a select,
    // depending on whether the source value fits in the range
    // of a signed i64. Let Thresh be the FP equivalent of
    // 0x8000000000000000ULL.
    //
    // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
    // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
    // FistSrc = (Value - FltOfs);
    // Fist-to-mem64 FistSrc
    // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
    // to XOR'ing the high 32 bits with Adjust.
    //
    // Being a power of 2, Thresh is exactly representable in all FP formats.
    // For X87 we'd like to use the smallest FP type for this constant, but
    // for DAG type consistency we have to match the FP operand type.

    // 0x5f000000 is the f32 bit pattern of 2^63; it is converted below to
    // the actual source FP type so DAG operand types stay consistent.
    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
                              &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);

    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");

    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

    EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
                                   *DAG.getContext(), TheVT);
    SDValue Cmp;
    if (IsStrict) {
      // Signaling compare so a NaN input raises an invalid exception.
      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
                         /*IsSignaling*/ true);
      Chain = Cmp.getValue(1);
    } else {
      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
    }

    // Our preferred lowering of
    //
    // (Value >= Thresh) ? 0x8000000000000000ULL : 0
    //
    // is
    //
    // (Value >= Thresh) << 63
    //
    // but since we can get here after LegalOperations, DAGCombine might do the
    // wrong thing if we create a select. So, directly create the preferred
    // version.
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
    SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
    Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);

    SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
                                   DAG.getConstantFP(0.0, DL, TheVT));

    if (IsStrict) {
      Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
                          { Chain, Value, FltOfs });
      Chain = Value.getValue(1);
    } else
      Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
  }

  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);

  // FIXME This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    // Spill the SSE value to the stack slot and reload it into an x87
    // register (X86ISD::FLD) so the FIST below can consume it.
    Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
    SDValue Ops[] = { Chain, StackSlot };

    unsigned FLDSize = TheVT.getStoreSize();
    assert(FLDSize <= MemSize && "Stack slot not big enough");
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
    Chain = Value.getValue(1);
  }

  // Build the FP_TO_INT*_IN_MEM
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
                                         DAG.getVTList(MVT::Other),
                                         Ops, DstTy, MMO);

  SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
  Chain = Res.getValue(1);

  // If we need an unsigned fixup, XOR the result with adjust.
  if (UnsignedFixup)
    Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);

  return Res;
}
20799
/// Lower vector ANY_EXTEND/ZERO_EXTEND on AVX targets.
/// With AVX2 (Int256) the node is natively legal and returned unchanged.
/// On AVX1 the result is assembled from two 128-bit halves: the low half
/// via an *_EXTEND_VECTOR_INREG node, the high half via an unpack-high
/// against zero (zext) or undef (anyext), then concatenated.
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
         "Unexpected extension opcode");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);

  // Without BWI a v32i16 result must be produced as two v16i16 halves.
  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
    assert(InVT == MVT::v32i8 && "Unexpected VT!");
    return splitVectorIntUnary(Op, DAG);
  }

  if (Subtarget.hasInt256())
    return Op;

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);

  // Short-circuit if we can determine that each 128-bit half is the same value.
  // Otherwise, this is difficult to match and optimize.
  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
    if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);

  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Opc == ISD::ZERO_EXTEND;
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  OpHi = DAG.getBitcast(HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
20861
20862// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20863static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20864 const SDLoc &dl, SelectionDAG &DAG) {
20865 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.")((void)0);
20866 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20867 DAG.getIntPtrConstant(0, dl));
20868 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20869 DAG.getIntPtrConstant(8, dl));
20870 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20871 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20872 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20873 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20874}
20875
/// Lower ZERO_EXTEND from a vXi1 mask vector.
/// Non-i8 destinations use sign_extend + logical shift right, which avoids
/// a constant-pool load. vXi8 destinations select between 1/0 constants,
/// first widening (or splitting) the operation as required by the available
/// AVX-512 features (BWI / DQ / VLX).
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  SDLoc DL(Op);
  unsigned NumElts = VT.getVectorNumElements();

  // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
  // avoids a constant pool load.
  if (VT.getVectorElementType() != MVT::i8) {
    SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
    // Shifting the all-ones/all-zeros lanes right by (bits-1) leaves 1/0.
    return DAG.getNode(ISD::SRL, DL, VT, Extend,
                       DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
  }

  // Extend VT if BWI is not supported.
  MVT ExtVT = VT;
  if (!Subtarget.hasBWI()) {
    // If v16i32 is to be avoided, we'll need to split and concatenate.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
      return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);

    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
  }

  // Widen to 512-bits if VLX is not supported.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    NumElts *= 512 / ExtVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, NumElts);
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
                     In, DAG.getIntPtrConstant(0, DL));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
                              NumElts);
  }

  SDValue One = DAG.getConstant(1, DL, WideVT);
  SDValue Zero = DAG.getConstant(0, DL, WideVT);

  // Each mask bit picks 1 (set) or 0 (clear) in the wide element type.
  SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);

  // Truncate if we had to extend above.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(MVT::i8, NumElts);
    SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
  }

  // Extract back to 128/256-bit if we widened.
  if (WideVT != VT)
    SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
                              DAG.getIntPtrConstant(0, DL));

  return SelectedVal;
}
20933
20934static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20935 SelectionDAG &DAG) {
20936 SDValue In = Op.getOperand(0);
20937 MVT SVT = In.getSimpleValueType();
20938
20939 if (SVT.getVectorElementType() == MVT::i1)
20940 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20941
20942 assert(Subtarget.hasAVX() && "Expected AVX support")((void)0);
20943 return LowerAVXExtend(Op, DAG, Subtarget);
20944}
20945
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
///
/// Returns SDValue() when the source/destination sizes are unsupported
/// (dst must be a multiple of 64 bits, src a multiple of 128 bits, with a
/// power-of-2 element count); callers fall back to other lowerings.
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
                                      const SDLoc &DL, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
         "Unexpected PACK opcode");
  assert(DstVT.isVector() && "VT not a vector?");

  // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;

  // We only support vector truncation to 64bits or greater from a
  // 128bits or greater source.
  unsigned DstSizeInBits = DstVT.getSizeInBits();
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
    return SDValue();

  unsigned NumElems = SrcVT.getVectorNumElements();
  if (!isPowerOf2_32(NumElems))
    return SDValue();

  LLVMContext &Ctx = *DAG.getContext();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

  // Scalar type after one halving step; used for the recursive stages below.
  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  // PACKUSDW needs SSE41; PACKSSDW is available with plain SSE2.
  EVT InVT = MVT::i16, OutVT = MVT::i8;
  if (SrcVT.getScalarSizeInBits() > 16 &&
      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
    InVT = MVT::i32;
    OutVT = MVT::i16;
  }

  // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
  if (SrcVT.is128BitVector()) {
    InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
    OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
    In = DAG.getBitcast(InVT, In);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
    Res = extractSubVector(Res, 0, DAG, DL, 64);
    return DAG.getBitcast(DstVT, Res);
  }

  // Split lower/upper subvectors.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = splitVector(In, DAG, DL);

  unsigned SubSizeInBits = SrcSizeInBits / 2;
  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }

  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
    SmallVector<int, 64> Mask;
    int Scale = 64 / OutVT.getScalarSizeInBits();
    narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
    Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);

    // If 512bit -> 128bit truncate another stage.
    EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
  Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

  PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
21054
/// Lower TRUNCATE of a vector to vXi1 (an AVX-512 mask register).
/// The lsb of each input element becomes the mask bit: the lsb is shifted
/// into the sign position (unless sign bits already reach it) and compared
/// against zero. Isel then matches the compare to VPMOVB2M/VPMOVW2M (BWI),
/// VPMOVD2M/VPMOVQ2M (DQI) or VPTESTMD/Q, after any widening/splitting the
/// available features require.
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {

  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
  if (InVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // legal, will go to VPMOVB2M, VPMOVW2M
      if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
        // We need to shift to get the lsb into sign position.
        // Shift packed bytes not supported natively, bitcast to word
        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
        In = DAG.getNode(ISD::SHL, DL, ExtVT,
                         DAG.getBitcast(ExtVT, In),
                         DAG.getConstant(ShiftInx, DL, ExtVT));
        In = DAG.getBitcast(InVT, In);
      }
      // 0 > In  <=>  sign bit of In is set  <=>  lsb was 1.
      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
                          In, ISD::SETGT);
    }
    // Use TESTD/Q, extended vector to packed dword/qword.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
    // We need to change to a wider element type that we have support for.
    // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
    // For 16 element vectors we extend to v16i32 unless we are explicitly
    // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
    // we need to split into two 8 element vectors which we can extend to v8i32,
    // truncate and concat the results. There's an additional complication if
    // the original type is v16i8. In that case we can't split the v16i8
    // directly, so we need to shuffle high elements to low and use
    // sign_extend_vector_inreg.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
      SDValue Lo, Hi;
      if (InVT == MVT::v16i8) {
        Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
        Hi = DAG.getVectorShuffle(
            InVT, DL, In, In,
            {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
        Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
      } else {
        assert(InVT == MVT::v16i16 && "Unexpected VT!");
        Lo = extract128BitVector(In, 0, DAG, DL);
        Hi = extract128BitVector(In, 8, DAG, DL);
      }
      // We're split now, just emit two truncates and a concat. The two
      // truncates will trigger legalization to come back to this function.
      Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
      Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
    }
    // We either have 8 elements or we're allowed to use 512-bit vectors.
    // If we have VLX, we want to use the narrowest vector that can get the
    // job done so we use vXi32.
    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }

  if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
    // We need to shift to get the lsb into sign position.
    In = DAG.getNode(ISD::SHL, DL, InVT, In,
                     DAG.getConstant(ShiftInx, DL, InVT));
  }
  // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
  if (Subtarget.hasDQI())
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
  return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
}
21135
/// Custom lowering for vector TRUNCATE.
/// Handles, in order: type-legalizer requests on illegal wide sources,
/// vXi1 mask destinations, AVX-512 truncating moves, PACKSS/PACKUS-based
/// truncation when enough sign/zero bits are known, and finally explicit
/// 256->128-bit shuffle sequences.
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  unsigned InNumEltBits = InVT.getScalarSizeInBits();

  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  // If we're called by the type legalizer, handle a few cases.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(InVT)) {
    if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
        VT.is128BitVector()) {
      assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
             "Unexpected subtarget!");
      // The default behavior is to truncate one step, concatenate, and then
      // truncate the remainder. We'd rather produce two 64-bit results and
      // concatenate those.
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(In, DL);

      EVT LoVT, HiVT;
      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

      Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
      Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
    }

    // Otherwise let default legalization handle it.
    return SDValue();
  }

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
      assert(VT == MVT::v32i8 && "Unexpected VT!");
      return splitVectorIntUnary(Op, DAG);
    }

    // word to byte only under BWI. Otherwise we have to promoted to v16i32
    // and then truncate that. But we should only do that if we haven't been
    // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
    // handled by isel patterns.
    if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
        Subtarget.canExtendTo512DQ())
      return Op;
  }

  unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

  // Truncate with PACKUS if we are truncating a vector with leading zero bits
  // that extend all the way to the packed/truncated value.
  // Pre-SSE41 we can only use PACKUSWB.
  KnownBits Known = DAG.computeKnownBits(In);
  if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
      return V;

  // Truncate with PACKSS if we are truncating a vector with sign-bits that
  // extend all the way to the packed/truncated value.
  if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
      return V;

  // Handle truncation of V256 to V128 using shuffles.
  assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");

  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    In = DAG.getBitcast(MVT::v8i32, In);

    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(4, DL));
    // Pick the even (low) dwords of each half.
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }

  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    In = DAG.getBitcast(MVT::v32i8, In);

    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      // The PSHUFB mask:
      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                      -1, -1, -1, -1, -1, -1, -1, -1,
                                      16, 17, 20, 21, 24, 25, 28, 29,
                                      -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
      In = DAG.getBitcast(MVT::v4i64, In);

      static const int ShufMask2[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
                         DAG.getBitcast(MVT::v16i16, In),
                         DAG.getIntPtrConstant(0, DL));
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
                               DAG.getIntPtrConstant(16, DL));

    // The PSHUFB mask:
    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                   -1, -1, -1, -1, -1, -1, -1, -1};

    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

    // The MOVLHPS Mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, res);
  }

  if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
    // Use an AND to zero uppper bits for PACKUS.
    In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));

    SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
                               DAG.getIntPtrConstant(8, DL));
    return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
  }

  llvm_unreachable("All 256->128 cases should have been handled above!");
}
21285
21286// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21287// behaves on out of range inputs to generate optimized conversions.
21288static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21289 SelectionDAG &DAG,
21290 const X86Subtarget &Subtarget) {
21291 MVT SrcVT = Src.getSimpleValueType();
21292 unsigned DstBits = VT.getScalarSizeInBits();
21293 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported")((void)0);
21294
21295 // Calculate the converted result for values in the range 0 to
21296 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21297 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21298 SDValue Big =
21299 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21300 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21301 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21302
21303 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21304 // and only if the value was out of range. So we can use that
21305 // as our indicator that we rather use "Big" instead of "Small".
21306 //
21307 // Use "Small" if "IsOverflown" has all bits cleared
21308 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21309
21310 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21311 // use the slightly slower blendv select instead.
21312 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21313 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21314 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21315 }
21316
21317 SDValue IsOverflown =
21318 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21319 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21320 return DAG.getNode(ISD::OR, dl, VT, Small,
21321 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21322}
21323
/// Custom-lower FP_TO_SINT/FP_TO_UINT and their STRICT_ variants for both
/// vector and scalar results. Vector cases are mapped to X86 CVTTP2SI/UI
/// nodes (widening to 512 bits where only non-VLX AVX-512 forms exist);
/// scalar cases use SSE cvtt instructions, i16->i32 / i32->i64 promotion,
/// f128 libcalls, or finally the X87 helper.
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  // Every opcode that is not a (strict) FP_TO_SINT is treated as unsigned.
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
                  Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
  MVT VT = Op->getSimpleValueType(0);
  // For strict nodes operand 0 is the chain; the FP source follows it.
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  MVT SrcVT = Src.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.isVector()) {
    if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
      // Convert via a wider integer type, then TRUNCATE + EXTRACT_SUBVECTOR
      // back down to v2i1.
      MVT ResVT = MVT::v4i32;
      MVT TruncVT = MVT::v4i1;
      unsigned Opc;
      if (IsStrict)
        Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
      else
        Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;

      if (!IsSigned && !Subtarget.hasVLX()) {
        assert(Subtarget.useAVX512Regs() && "Unexpected features!");
        // Widen to 512-bits.
        ResVT = MVT::v8i32;
        TruncVT = MVT::v8i1;
        Opc = Op.getOpcode();
        // Need to concat with zero vector for strict fp to avoid spurious
        // exceptions.
        // TODO: Should we just do this for non-strict as well?
        SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
                               : DAG.getUNDEF(MVT::v8f64);
        Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
                          DAG.getIntPtrConstant(0, dl));
      }
      SDValue Res, Chain;
      if (IsStrict) {
        Res =
            DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
        Chain = Res.getValue(1);
      } else {
        Res = DAG.getNode(Opc, dl, ResVT, Src);
      }

      Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
                        DAG.getIntPtrConstant(0, dl));
      if (IsStrict)
        return DAG.getMergeValues({Res, Chain}, dl);
      return Res;
    }

    // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
    if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
      assert(!IsSigned && "Expected unsigned conversion!");
      assert(Subtarget.useAVX512Regs() && "Requires avx512f");
      return Op;
    }

    // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
    if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
        (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
        Subtarget.useAVX512Regs()) {
      assert(!IsSigned && "Expected unsigned conversion!");
      assert(!Subtarget.hasVLX() && "Unexpected features!");
      MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
      MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
      // Need to concat with zero vector for strict fp to avoid spurious
      // exceptions.
      // TODO: Should we just do this for non-strict as well?
      SDValue Tmp =
          IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
                        DAG.getIntPtrConstant(0, dl));

      SDValue Res, Chain;
      if (IsStrict) {
        Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
                          {Op->getOperand(0), Src});
        Chain = Res.getValue(1);
      } else {
        Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
      }

      // Narrow the 512-bit result back to the requested width.
      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
                        DAG.getIntPtrConstant(0, dl));

      if (IsStrict)
        return DAG.getMergeValues({Res, Chain}, dl);
      return Res;
    }

    // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
    if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
        (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
        Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
      assert(!Subtarget.hasVLX() && "Unexpected features!");
      MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
      // Need to concat with zero vector for strict fp to avoid spurious
      // exceptions.
      // TODO: Should we just do this for non-strict as well?
      SDValue Tmp =
          IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
                        DAG.getIntPtrConstant(0, dl));

      SDValue Res, Chain;
      if (IsStrict) {
        Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
                          {Op->getOperand(0), Src});
        Chain = Res.getValue(1);
      } else {
        Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
      }

      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
                        DAG.getIntPtrConstant(0, dl));

      if (IsStrict)
        return DAG.getMergeValues({Res, Chain}, dl);
      return Res;
    }

    if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
      if (!Subtarget.hasVLX()) {
        // Non-strict nodes without VLX can we widened to v4f32->v4i64 by type
        // legalizer and then widened again by vector op legalization.
        if (!IsStrict)
          return SDValue();

        // Strict path: pad with zeros (not undef) so no spurious FP
        // exceptions are raised by the unused lanes.
        SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
        SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
                                  {Src, Zero, Zero, Zero});
        Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
                          {Op->getOperand(0), Tmp});
        SDValue Chain = Tmp.getValue(1);
        Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
                          DAG.getIntPtrConstant(0, dl));
        return DAG.getMergeValues({Tmp, Chain}, dl);
      }

      assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
      SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                DAG.getUNDEF(MVT::v2f32));
      if (IsStrict) {
        unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
                                : X86ISD::STRICT_CVTTP2UI;
        return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
      }
      unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
      return DAG.getNode(Opc, dl, VT, Tmp);
    }

    // Generate optimized instructions for pre AVX512 unsigned conversions from
    // vXf32 to vXi32.
    if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
        (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
        (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
      assert(!IsSigned && "Expected unsigned conversion!");
      return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
    }

    // Anything else falls back to generic vector legalization.
    return SDValue();
  }

  assert(!VT.isVector());

  bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);

  if (!IsSigned && UseSSEReg) {
    // Conversions from f32/f64 with AVX512 should be legal.
    if (Subtarget.hasAVX512())
      return Op;

    // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
    // behaves on out of range inputs to generate optimized conversions.
    if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
                      (VT == MVT::i64 && Subtarget.is64Bit()))) {
      unsigned DstBits = VT.getScalarSizeInBits();
      APInt UIntLimit = APInt::getSignMask(DstBits);
      SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
                                        DAG.getConstant(UIntLimit, dl, VT));
      MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());

      // Calculate the converted result for values in the range:
      // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
      // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
      SDValue Small =
          DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
      SDValue Big = DAG.getNode(
          X86ISD::CVTTS2SI, dl, VT,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
                      DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));

      // The "CVTTS2SI" instruction conveniently sets the sign bit if
      // and only if the value was out of range. So we can use that
      // as our indicator that we rather use "Big" instead of "Small".
      //
      // Use "Small" if "IsOverflown" has all bits cleared
      // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
      SDValue IsOverflown = DAG.getNode(
          ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
      return DAG.getNode(ISD::OR, dl, VT, Small,
                         DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
    }

    // Use default expansion for i64.
    if (VT == MVT::i64)
      return SDValue();

    assert(VT == MVT::i32 && "Unexpected VT!");

    // Promote i32 to i64 and use a signed operation on 64-bit targets.
    // FIXME: This does not generate an invalid exception if the input does not
    // fit in i32. PR44019
    if (Subtarget.is64Bit()) {
      SDValue Res, Chain;
      if (IsStrict) {
        Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
                          { Op.getOperand(0), Src });
        Chain = Res.getValue(1);
      } else
        Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);

      Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
      if (IsStrict)
        return DAG.getMergeValues({ Res, Chain }, dl);
      return Res;
    }

    // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
    // use fisttp which will be handled later.
    if (!Subtarget.hasSSE3())
      return SDValue();
  }

  // Promote i16 to i32 if we can use a SSE operation or the type is f128.
  // FIXME: This does not generate an invalid exception if the input does not
  // fit in i16. PR44019
  if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
    assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
    SDValue Res, Chain;
    if (IsStrict) {
      Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
                        { Op.getOperand(0), Src });
      Chain = Res.getValue(1);
    } else
      Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);

    Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    if (IsStrict)
      return DAG.getMergeValues({ Res, Chain }, dl);
    return Res;
  }

  // If this is a FP_TO_SINT using SSEReg we're done.
  if (UseSSEReg && IsSigned)
    return Op;

  // fp128 needs to use a libcall.
  if (SrcVT == MVT::f128) {
    RTLIB::Libcall LC;
    if (IsSigned)
      LC = RTLIB::getFPTOSINT(SrcVT, VT);
    else
      LC = RTLIB::getFPTOUINT(SrcVT, VT);

    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
    MakeLibCallOptions CallOptions;
    std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
                                                  SDLoc(Op), Chain);

    if (IsStrict)
      return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);

    return Tmp.first;
  }

  // Fall back to X87.
  SDValue Chain;
  if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
    if (IsStrict)
      return DAG.getMergeValues({V, Chain}, dl);
    return V;
  }

  llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
21611
21612SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21613 SelectionDAG &DAG) const {
21614 SDValue Src = Op.getOperand(0);
21615 MVT SrcVT = Src.getSimpleValueType();
21616
21617 // If the source is in an SSE register, the node is Legal.
21618 if (isScalarFPTypeInSSEReg(SrcVT))
21619 return Op;
21620
21621 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21622}
21623
/// Shared lowering for LRINT/LLRINT via the X87 FIST instruction: spill the
/// value to a stack slot (reloading SSE values through an f80 FLD first),
/// store it back as an integer with FIST, then reload the integer result.
SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
                                              SelectionDAG &DAG) const {
  EVT DstVT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return SDValue();
  }

  SDLoc DL(N);
  // The store/load sequence starts from the entry node rather than any
  // incoming chain (LRINT/LLRINT are not chained nodes).
  SDValue Chain = DAG.getEntryNode();

  bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);

  // If we're converting from SSE, the stack slot needs to hold both types.
  // Otherwise it only needs to hold the DstVT.
  EVT OtherVT = UseSSE ? SrcVT : DstVT;
  SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);

  if (UseSSE) {
    assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
    // SSE values must take a round trip through memory: store the SSE value,
    // then FLD it onto the X87 stack (as f80) so FIST can consume it.
    Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
    SDValue Ops[] = { Chain, StackPtr };

    Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
                                  /*Align*/ None, MachineMemOperand::MOLoad);
    Chain = Src.getValue(1);
  }

  // FIST stores the rounded integer into the stack slot; reload it to
  // produce the final result value.
  SDValue StoreOps[] = { Chain, Src, StackPtr };
  Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
                                  StoreOps, DstVT, MPI, /*Align*/ None,
                                  MachineMemOperand::MOStore);

  return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
}
21667
/// Lower FP_TO_SINT_SAT/FP_TO_UINT_SAT for scalar SSE float/double sources.
SDValue
X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
  // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
  // but making use of X86 specifics to produce better instruction sequences.
  SDNode *Node = Op.getNode();
  bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
  unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
  SDLoc dl(SDValue(Node, 0));
  SDValue Src = Node->getOperand(0);

  // There are three types involved here: SrcVT is the source floating point
  // type, DstVT is the type of the result, and TmpVT is the result of the
  // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
  // DstVT).
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Node->getValueType(0);
  EVT TmpVT = DstVT;

  // This code is only for floats and doubles. Fall back to generic code for
  // anything else.
  if (!isScalarFPTypeInSSEReg(SrcVT))
    return SDValue();

  // Operand 1 carries the saturation width as a VT.
  EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
  unsigned SatWidth = SatVT.getScalarSizeInBits();
  unsigned DstWidth = DstVT.getScalarSizeInBits();
  unsigned TmpWidth = TmpVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
         "Expected saturation width smaller than result width");

  // Promote result of FP_TO_*INT to at least 32 bits.
  if (TmpWidth < 32) {
    TmpVT = MVT::i32;
    TmpWidth = 32;
  }

  // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
  // us to use a native signed conversion instead.
  if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
    TmpVT = MVT::i64;
    TmpWidth = 64;
  }

  // If the saturation width is smaller than the size of the temporary result,
  // we can always use signed conversion, which is native.
  if (SatWidth < TmpWidth)
    FpToIntOpcode = ISD::FP_TO_SINT;

  // Determine minimum and maximum integer values and their corresponding
  // floating-point values.
  APInt MinInt, MaxInt;
  if (IsSigned) {
    MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
    MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
  } else {
    MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
    MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
  }

  APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
  APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));

  // Round toward zero so the FP bounds never lie outside the integer range.
  APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
      MinInt, IsSigned, APFloat::rmTowardZero);
  APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
      MaxInt, IsSigned, APFloat::rmTowardZero);
  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
                             && !(MaxStatus & APFloat::opStatus::opInexact);

  SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
  SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);

  // If the integer bounds are exactly representable as floats, emit a
  // min+max+fptoi sequence. Otherwise use comparisons and selects.
  if (AreExactFloatBounds) {
    if (DstVT != TmpVT) {
      // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
      SDValue MinClamped = DAG.getNode(
          X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
      // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
      SDValue BothClamped = DAG.getNode(
          X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
      // Convert clamped value to integer.
      SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);

      // NaN will become INDVAL, with the top bit set and the rest zero.
      // Truncation will discard the top bit, resulting in zero.
      return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
    }

    // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
    // (X86ISD::FMAX returns the second operand when the compare is unordered,
    // so the operand order here differs from the promoted case above.)
    SDValue MinClamped = DAG.getNode(
        X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
    // Clamp by MaxFloat from above. NaN cannot occur.
    SDValue BothClamped = DAG.getNode(
        X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
    // Convert clamped value to integer.
    SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);

    if (!IsSigned) {
      // In the unsigned case we're done, because we mapped NaN to MinFloat,
      // which is zero.
      return FpToInt;
    }

    // Otherwise, select zero if Src is NaN.
    SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
    return DAG.getSelectCC(
        dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
  }

  SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
  SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);

  // Result of direct conversion, which may be selected away.
  SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);

  if (DstVT != TmpVT) {
    // NaN will become INDVAL, with the top bit set and the rest zero.
    // Truncation will discard the top bit, resulting in zero.
    FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
  }

  SDValue Select = FpToInt;
  // For signed conversions where we saturate to the same size as the
  // result type of the fptoi instructions, INDVAL coincides with integer
  // minimum, so we don't need to explicitly check it.
  if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
    // If Src ULT MinFloat, select MinInt. In particular, this also selects
    // MinInt if Src is NaN.
    Select = DAG.getSelectCC(
        dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
  }

  // If Src OGT MaxFloat, select MaxInt.
  Select = DAG.getSelectCC(
      dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);

  // In the unsigned case we are done, because we mapped NaN to MinInt, which
  // is already zero. The promoted case was already handled above.
  if (!IsSigned || DstVT != TmpVT) {
    return Select;
  }

  // Otherwise, select 0 if Src is NaN.
  SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
  return DAG.getSelectCC(
      dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
}
21817
21818SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21819 bool IsStrict = Op->isStrictFPOpcode();
21820
21821 SDLoc DL(Op);
21822 MVT VT = Op.getSimpleValueType();
21823 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21824 MVT SVT = In.getSimpleValueType();
21825
21826 if (VT == MVT::f128)
21827 return SDValue();
21828
21829 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!")((void)0);
21830
21831 SDValue Res =
21832 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21833 if (IsStrict)
21834 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21835 {Op->getOperand(0), Res});
21836 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21837}
21838
21839SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21840 bool IsStrict = Op->isStrictFPOpcode();
21841 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21842 // It's legal except when f128 is involved
21843 if (In.getSimpleValueType() != MVT::f128)
21844 return Op;
21845
21846 return SDValue();
21847}
21848
21849static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21850 bool IsStrict = Op->isStrictFPOpcode();
21851 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21852 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&((void)0)
21853 "Unexpected VT!")((void)0);
21854
21855 SDLoc dl(Op);
21856 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21857 DAG.getConstant(0, dl, MVT::v8i16), Src,
21858 DAG.getIntPtrConstant(0, dl));
21859
21860 SDValue Chain;
21861 if (IsStrict) {
21862 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21863 {Op.getOperand(0), Res});
21864 Chain = Res.getValue(1);
21865 } else {
21866 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21867 }
21868
21869 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21870 DAG.getIntPtrConstant(0, dl));
21871
21872 if (IsStrict)
21873 return DAG.getMergeValues({Res, Chain}, dl);
21874
21875 return Res;
21876}
21877
21878static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21879 bool IsStrict = Op->isStrictFPOpcode();
21880 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21881 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&((void)0)
21882 "Unexpected VT!")((void)0);
21883
21884 SDLoc dl(Op);
21885 SDValue Res, Chain;
21886 if (IsStrict) {
21887 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21888 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21889 DAG.getIntPtrConstant(0, dl));
21890 Res = DAG.getNode(
21891 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21892 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21893 Chain = Res.getValue(1);
21894 } else {
21895 // FIXME: Should we use zeros for upper elements for non-strict?
21896 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21897 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21898 DAG.getTargetConstant(4, dl, MVT::i32));
21899 }
21900
21901 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21902 DAG.getIntPtrConstant(0, dl));
21903
21904 if (IsStrict)
21905 return DAG.getMergeValues({Res, Chain}, dl);
21906
21907 return Res;
21908}
21909
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
///
/// Pattern matched: (add/sub (extractelt X, i), (extractelt X, i+1)) with i
/// even, which maps to a horizontal add/sub of X with itself followed by a
/// single element extract.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  // If both operands have other uses, this is probably not profitable.
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  if (!LHS.hasOneUse() && !RHS.hasOneUse())
    return Op;

  // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
  bool IsFP = Op.getSimpleValueType().isFloatingPoint();
  if (IsFP && !Subtarget.hasSSE3())
    return Op;
  if (!IsFP && !Subtarget.hasSSSE3())
    return Op;

  // Extract from a common vector.
  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      LHS.getOperand(0) != RHS.getOperand(0) ||
      !isa<ConstantSDNode>(LHS.getOperand(1)) ||
      !isa<ConstantSDNode>(RHS.getOperand(1)) ||
      !shouldUseHorizontalOp(true, DAG, Subtarget))
    return Op;

  // Allow commuted 'hadd' ops.
  // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
  unsigned HOpcode;
  switch (Op.getOpcode()) {
  case ISD::ADD: HOpcode = X86ISD::HADD; break;
  case ISD::SUB: HOpcode = X86ISD::HSUB; break;
  case ISD::FADD: HOpcode = X86ISD::FHADD; break;
  case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
  default:
    llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
  }
  unsigned LExtIndex = LHS.getConstantOperandVal(1);
  unsigned RExtIndex = RHS.getConstantOperandVal(1);
  // Canonicalize commutative adds so the even index comes first; subs are
  // not commutative, so they keep their original operand order.
  if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
      (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
    std::swap(LExtIndex, RExtIndex);

  // Only adjacent even/odd element pairs match a horizontal-op lane.
  if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
    return Op;

  SDValue X = LHS.getOperand(0);
  EVT VecVT = X.getValueType();
  unsigned BitWidth = VecVT.getSizeInBits();
  unsigned NumLanes = BitWidth / 128;
  unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
  assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
         "Not expecting illegal vector widths here");

  // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
  // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
  SDLoc DL(Op);
  if (BitWidth == 256 || BitWidth == 512) {
    unsigned LaneIdx = LExtIndex / NumEltsPerLane;
    X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
    LExtIndex %= NumEltsPerLane;
  }

  // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
  // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
  // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
  // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
  SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
                     DAG.getIntPtrConstant(LExtIndex / 2, DL));
}
21981
21982/// Depending on uarch and/or optimizing for size, we might prefer to use a
21983/// vector operation in place of the typical scalar operation.
21984SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21985 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&((void)0)
21986 "Only expecting float/double")((void)0);
21987 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21988}
21989
21990/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21991/// This mode isn't supported in hardware on X86. But as long as we aren't
21992/// compiling with trapping math, we can emulate this with
21993/// floor(X + copysign(nextafter(0.5, 0.0), X)).
21994static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21995 SDValue N0 = Op.getOperand(0);
21996 SDLoc dl(Op);
21997 MVT VT = Op.getSimpleValueType();
21998
21999 // N0 += copysign(nextafter(0.5, 0.0), N0)
22000 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22001 bool Ignored;
22002 APFloat Point5Pred = APFloat(0.5f);
22003 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22004 Point5Pred.next(/*nextDown*/true);
22005
22006 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22007 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22008 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22009
22010 // Truncate the result to remove fraction.
22011 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22012}
22013
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
///
/// Both are lowered to a bitwise logic op (FAND/FXOR/FOR) against a sign-bit
/// mask constant; scalars are temporarily widened to a 128-bit vector so the
/// vector logic instructions (and their load-folding) can be used.
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");

  bool IsFABS = (Op.getOpcode() == ISD::FABS);

  // If this is a FABS and it has an FNEG user, bail out to fold the combination
  // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->uses())
      if (User->getOpcode() == ISD::FNEG)
        return Op;

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFABSorFNEG");

  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.

  // There are no scalar bitwise logical SSE/AVX instructions, so we
  // generate a 16-byte vector constant and logic op even for the scalar case.
  // Using a 16-byte mask allows folding the load of the mask with
  // the logic op, so it can save (~4 bytes) on code size.
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  unsigned EltBits = VT.getScalarSizeInBits();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
  APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
                           APInt::getSignMask(EltBits);
  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

  SDValue Op0 = Op.getOperand(0);
  // FNEG of an FABS folds to FNABS, which is an OR with the sign mask.
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  unsigned LogicOp = IsFABS ? X86ISD::FAND :
                     IsFNABS ? X86ISD::FOR :
                               X86ISD::FXOR;
  // For FNABS operate on the FABS input directly, skipping the inner node.
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

  if (VT.isVector() || IsF128)
    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

  // For the scalar case extend to a 128-bit vector, perform the logic op,
  // and extract the scalar result back out.
  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getIntPtrConstant(0, dl));
}
22075
/// Lower ISD::FCOPYSIGN: isolate the sign bit of the second operand and the
/// magnitude bits of the first with FAND masks, then recombine them with FOR.
/// Scalars are widened to a 128-bit vector for the logic ops, as SSE has no
/// scalar FP bitwise instructions.
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  SDLoc dl(Op);

  // If the sign operand is smaller, extend it first.
  MVT VT = Op.getSimpleValueType();
  if (Sign.getSimpleValueType().bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

  // And if it is bigger, shrink it first.
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign =
        DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");

  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);

  // Perform all scalar logic operations as 16-byte vectors because there are no
  // scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  // The mask constants are automatically splatted for vector types.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  // SignMask = 0x80...0, MagMask = 0x7f...f (per element).
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);

  // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
    // Constant magnitude: fold the sign-clearing at compile time.
    APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }

  // OR the magnitude value with the sign bit.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                          DAG.getIntPtrConstant(0, dl));
}
22143
22144static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22145 SDValue N0 = Op.getOperand(0);
22146 SDLoc dl(Op);
22147 MVT VT = Op.getSimpleValueType();
22148
22149 MVT OpVT = N0.getSimpleValueType();
22150 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&((void)0)
22151 "Unexpected type for FGETSIGN")((void)0);
22152
22153 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22154 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22155 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22156 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22157 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22158 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22159 return Res;
22160}
22161
22162/// Helper for creating a X86ISD::SETCC node.
22163static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22164 SelectionDAG &DAG) {
22165 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22166 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22167}
22168
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
/// style scalarized (associative) reduction patterns. Partial reductions
/// are supported when the pointer SrcMask is non-null: in that mode, the
/// per-source element masks are returned instead of requiring that every
/// element of every source is covered.
/// Returns true on a successful match; the distinct source vectors are
/// appended to SrcOps.
/// TODO - move this to SelectionDAG?
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
                                 SmallVectorImpl<SDValue> &SrcOps,
                                 SmallVectorImpl<APInt> *SrcMask = nullptr) {
  // Worklist of yet-to-be-visited operands of the reduction tree.
  SmallVector<SDValue, 8> Opnds;
  // Maps each distinct source vector to the set of element indices extracted
  // from it so far.
  DenseMap<SDValue, APInt> SrcOpMap;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  assert(Op.getOpcode() == unsigned(BinOp) &&
         "Unexpected bit reduction opcode");
  Opnds.push_back(Op.getOperand(0));
  Opnds.push_back(Op.getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all BinOp operands: inner BinOp nodes are expanded in
    // place rather than recursed into.
    if (I->getOpcode() == unsigned(BinOp)) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT leaf is found.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return false;

    // Quit if without a constant index.
    auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
    if (!Idx)
      return false;

    SDValue Src = I->getOperand(0);
    DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
    if (M == SrcOpMap.end()) {
      VT = Src.getValueType();
      // Quit if not the same type as the previously seen sources.
      if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
        return false;
      unsigned NumElts = VT.getVectorNumElements();
      APInt EltCount = APInt::getNullValue(NumElts);
      M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
      SrcOps.push_back(Src);
    }

    // Quit if element already used (each lane may feed the reduction once).
    unsigned CIdx = Idx->getZExtValue();
    if (M->second[CIdx])
      return false;
    M->second.setBit(CIdx);
  }

  if (SrcMask) {
    // Collect the source partial masks.
    for (SDValue &SrcOp : SrcOps)
      SrcMask->push_back(SrcOpMap[SrcOp]);
  } else {
    // Quit if not all elements are used.
    for (const auto &I : SrcOpMap)
      if (!I.second.isAllOnesValue())
        return false;
  }

  return true;
}
22240
// Helper function for comparing all bits of a vector against zero.
// Produces an EFLAGS-defining node (CMP or PTEST) and sets X86CC to the
// condition code (E for SETEQ, NE for SETNE) that tests the result.
// Mask selects which bits of each element participate; it must either be
// all-ones or match the element width. Returns SDValue() when no profitable
// lowering exists.
static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
                                  const APInt &Mask,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG, X86::CondCode &X86CC) {
  EVT VT = V.getValueType();
  unsigned ScalarSize = VT.getScalarSizeInBits();
  if (Mask.getBitWidth() != ScalarSize) {
    assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
    return SDValue();
  }

  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
  X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);

  // Apply the element mask to Src, unless the mask is trivially all-ones.
  auto MaskBits = [&](SDValue Src) {
    if (Mask.isAllOnesValue())
      return Src;
    EVT SrcVT = Src.getValueType();
    SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
    return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
  };

  // For sub-128-bit vector, cast to (legal) integer and compare with zero.
  if (VT.getSizeInBits() < 128) {
    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
    if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
      return SDValue();
    return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                       DAG.getBitcast(IntVT, MaskBits(V)),
                       DAG.getConstant(0, DL, IntVT));
  }

  // Quit if not splittable to 128/256-bit vector.
  if (!isPowerOf2_32(VT.getSizeInBits()))
    return SDValue();

  // Split down to 128/256-bit vector by OR-ing the halves together; the
  // result is zero iff the original vector was zero.
  unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
  while (VT.getSizeInBits() > TestSize) {
    auto Split = DAG.SplitVector(V, DL);
    VT = Split.first.getValueType();
    V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
  }

  bool UsePTEST = Subtarget.hasSSE41();
  if (UsePTEST) {
    MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
    V = DAG.getBitcast(TestVT, MaskBits(V));
    // PTEST V,V sets ZF iff V is all-zero.
    return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
  }

  // Without PTEST, a masked v2i64 or-reduction is not faster than
  // scalarization.
  if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
    return SDValue();

  // Fallback: compare every byte to zero and check that all 16 lanes matched
  // via MOVMSK == 0xFFFF.
  V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
  V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
                  getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
  return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
                     DAG.getConstant(0xFFFF, DL, MVT::i32));
}
22305
// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
// CMP(MOVMSK(PCMPEQB(X,0))).
// On success returns the EFLAGS-producing node and sets X86CC to the target
// condition-code constant to test it with; returns SDValue() otherwise.
static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
                                      const SDLoc &DL,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG, SDValue &X86CC) {
  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");

  if (!Subtarget.hasSSE2() || !Op->hasOneUse())
    return SDValue();

  // Check whether we're masking/truncating an OR-reduction result, in which
  // case track the masked bits.
  APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
  switch (Op.getOpcode()) {
  case ISD::TRUNCATE: {
    // A truncate only keeps the low result bits; model that as a mask.
    SDValue Src = Op.getOperand(0);
    Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
                                Op.getScalarValueSizeInBits());
    Op = Src;
    break;
  }
  case ISD::AND: {
    // An AND with a constant restricts which bits must be zero.
    if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      Mask = Cst->getAPIntValue();
      Op = Op.getOperand(0);
    }
    break;
  }
  }

  // Case 1: scalarized OR-reduction of vector extracts.
  SmallVector<SDValue, 8> VecIns;
  if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
    EVT VT = VecIns[0].getValueType();
    assert(llvm::all_of(VecIns,
                        [VT](SDValue V) { return VT == V.getValueType(); }) &&
           "Reduction source vector mismatch");

    // Quit if less than 128-bits or not splittable to 128/256-bit vector.
    if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
      return SDValue();

    // If more than one full vector is evaluated, OR them first before PTEST.
    for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
         Slot += 2, e += 1) {
      // Each iteration will OR 2 nodes and append the result until there is
      // only 1 node left, i.e. the final OR'd value of all vectors.
      SDValue LHS = VecIns[Slot];
      SDValue RHS = VecIns[Slot + 1];
      VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
    }

    X86::CondCode CCode;
    if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
                                       DAG, CCode)) {
      X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
      return V;
    }
  }

  // Case 2: an extract from an in-vector OR reduction tree.
  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    ISD::NodeType BinOp;
    if (SDValue Match =
            DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
      X86::CondCode CCode;
      if (SDValue V =
              LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
        X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
        return V;
      }
    }
  }

  return SDValue();
}
22381
22382/// return true if \c Op has a use that doesn't just read flags.
22383static bool hasNonFlagsUse(SDValue Op) {
22384 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22385 ++UI) {
22386 SDNode *User = *UI;
22387 unsigned UOpNo = UI.getOperandNo();
22388 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22389 // Look pass truncate.
22390 UOpNo = User->use_begin().getOperandNo();
22391 User = *User->use_begin();
22392 }
22393
22394 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22395 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22396 return true;
22397 }
22398 return false;
22399}
22400
22401// Transform to an x86-specific ALU node with flags if there is a chance of
22402// using an RMW op or only the flags are used. Otherwise, leave
22403// the node alone and emit a 'cmp' or 'test' instruction.
22404static bool isProfitableToUseFlagOp(SDValue Op) {
22405 for (SDNode *U : Op->uses())
22406 if (U->getOpcode() != ISD::CopyToReg &&
22407 U->getOpcode() != ISD::SETCC &&
22408 U->getOpcode() != ISD::STORE)
22409 return false;
22410
22411 return true;
22412}
22413
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent. Returns an EFLAGS-defining value (result #1 of an x86 ALU
/// node, or an X86ISD::CMP against zero) suitable for use by X86CC.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
                        SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL:
      if (Op.getNode()->getFlags().hasNoSignedWrap())
        break;
      LLVM_FALLTHROUGH;
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  SDValue ArithOp = Op;

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    if (!hasNonFlagsUse(Op))
      break;

    LLVM_FALLTHROUGH;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    if (!isProfitableToUseFlagOp(Op))
      break;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::ADD: Opcode = X86ISD::ADD; break;
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: Opcode = X86ISD::OR; break;
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // Already an x86 flag-producing node: just use its flags result.
    return SDValue(Op.getNode(), 1);
  case ISD::SSUBO:
  case ISD::USUBO: {
    // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
                       Op->getOperand(1)).getValue(1);
  }
  default:
    break;
  }

  if (Opcode == 0) {
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  // Rebuild the operation as the flag-producing x86 node and replace all
  // value uses of the original node so both results come from one node.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
  return SDValue(New.getNode(), 1);
}
22521
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent. Returns an EFLAGS-defining value (result #1 of X86ISD::SUB,
/// X86ISD::ADD, or the result of EmitTest for a compare against zero).
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                       const SDLoc &dl, SelectionDAG &DAG,
                       const X86Subtarget &Subtarget) {
  // A compare against zero is just a TEST.
  if (isNullConstant(Op1))
    return EmitTest(Op0, X86CC, dl, DAG, Subtarget);

  EVT CmpVT = Op0.getValueType();

  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");

  // Only promote the compare up to I32 if it is a 16 bit operation
  // with an immediate. 16 bit immediates are to be avoided.
  if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
      !DAG.getMachineFunction().getFunction().hasMinSize()) {
    ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
    ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
    // Don't do this if the immediate can fit in 8-bits.
    if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
        (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
      unsigned ExtendOp =
          isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
        // For equality comparisons try to use SIGN_EXTEND if the input was
        // truncate from something with enough sign bits.
        if (Op0.getOpcode() == ISD::TRUNCATE) {
          SDValue In = Op0.getOperand(0);
          unsigned EffBits =
              In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
          if (EffBits <= 16)
            ExtendOp = ISD::SIGN_EXTEND;
        } else if (Op1.getOpcode() == ISD::TRUNCATE) {
          SDValue In = Op1.getOperand(0);
          unsigned EffBits =
              In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
          if (EffBits <= 16)
            ExtendOp = ISD::SIGN_EXTEND;
        }
      }

      CmpVT = MVT::i32;
      Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
    }
  }

  // Try to shrink i64 compares if the input has enough zero bits.
  // FIXME: Do this for non-constant compares for constant on LHS?
  if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
      Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
      cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
      DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
    CmpVT = MVT::i32;
    Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
    Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
  }

  // 0-x == y --> x+y == 0
  // 0-x != y --> x+y != 0
  if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
      Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
    SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
    SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
    return Add.getValue(1);
  }

  // x == 0-y --> x+y == 0
  // x != 0-y --> x+y != 0
  if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
      Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
    SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
    SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
    return Add.getValue(1);
  }

  // Use SUB instead of CMP to enable CSE between SUB and CMP.
  SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
  SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
  return Sub.getValue(1);
}
22604
22605/// Check if replacement of SQRT with RSQRT should be disabled.
22606bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22607 EVT VT = Op.getValueType();
22608
22609 // We never want to use both SQRT and RSQRT instructions for the same input.
22610 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22611 return false;
22612
22613 if (VT.isVector())
22614 return Subtarget.hasFastVectorFSQRT();
22615 return Subtarget.hasFastScalarFSQRT();
22616}
22617
22618/// The minimum architected relative accuracy is 2^-12. We need one
22619/// Newton-Raphson step to have a good float result (24 bits of precision).
22620SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22621 SelectionDAG &DAG, int Enabled,
22622 int &RefinementSteps,
22623 bool &UseOneConstNR,
22624 bool Reciprocal) const {
22625 EVT VT = Op.getValueType();
22626
22627 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22628 // It is likely not profitable to do this for f64 because a double-precision
22629 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22630 // instructions: convert to single, rsqrtss, convert back to double, refine
22631 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22632 // along with FMA, this could be a throughput win.
22633 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22634 // after legalize types.
22635 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22636 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22637 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22638 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22639 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22640 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22641 RefinementSteps = 1;
22642
22643 UseOneConstNR = false;
22644 // There is no FSQRT for 512-bits, but there is RSQRT14.
22645 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22646 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22647 }
22648 return SDValue();
22649}
22650
22651/// The minimum architected relative accuracy is 2^-12. We need one
22652/// Newton-Raphson step to have a good float result (24 bits of precision).
22653SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22654 int Enabled,
22655 int &RefinementSteps) const {
22656 EVT VT = Op.getValueType();
22657
22658 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22659 // It is likely not profitable to do this for f64 because a double-precision
22660 // reciprocal estimate with refinement on x86 prior to FMA requires
22661 // 15 instructions: convert to single, rcpss, convert back to double, refine
22662 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22663 // along with FMA, this could be a throughput win.
22664
22665 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22666 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22667 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22668 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22669 // Enable estimate codegen with 1 refinement step for vector division.
22670 // Scalar division estimates are disabled because they break too much
22671 // real-world code. These defaults are intended to match GCC behavior.
22672 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22673 return SDValue();
22674
22675 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22676 RefinementSteps = 1;
22677
22678 // There is no FSQRT for 512-bits, but there is RCP14.
22679 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22680 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22681 }
22682 return SDValue();
22683}
22684
22685/// If we have at least two divisions that use the same divisor, convert to
22686/// multiplication by a reciprocal. This may need to be adjusted for a given
22687/// CPU if a division's cost is not at least twice the cost of a multiplication.
22688/// This is because we still need one division to calculate the reciprocal and
22689/// then we need two multiplies by that reciprocal as replacements for the
22690/// original divisions.
22691unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22692 return 2;
22693}
22694
/// Custom expansion of signed division by a power of two. Lowers
/// (sdiv X, 2^k) to an add/cmov/sra sequence (plus a final negate for a
/// negative divisor), avoiding the branch the generic expansion would use.
/// Returns SDValue() to fall back to the default expansion.
SDValue
X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const {
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isIntDivCheap(N->getValueType(0), Attr))
    return SDValue(N,0); // Lower SDIV as SDIV
  
  assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
         "Unexpected divisor!");

  // Only perform this transform if CMOV is supported otherwise the select
  // below will become a branch.
  if (!Subtarget.hasCMov())
    return SDValue();

  // fold (sdiv X, pow2)
  EVT VT = N->getValueType(0);
  // FIXME: Support i8.
  if (VT != MVT::i16 && VT != MVT::i32 &&
      !(Subtarget.is64Bit() && VT == MVT::i64))
    return SDValue();

  unsigned Lg2 = Divisor.countTrailingZeros();

  // If the divisor is 2 or -2, the default expansion is better.
  if (Lg2 == 1)
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  // Pow2 - 1: the bias added to negative dividends so SRA rounds toward zero.
  APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
  SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);

  // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
  SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
  SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);

  // Record the intermediate nodes so the caller doesn't revisit them.
  Created.push_back(Cmp.getNode());
  Created.push_back(Add.getNode());
  Created.push_back(CMov.getNode());

  // Divide by pow2.
  SDValue SRA =
      DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));

  // If we're dividing by a positive value, we're done. Otherwise, we must
  // negate the result.
  if (Divisor.isNonNegative())
    return SRA;

  Created.push_back(SRA.getNode());
  return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
}
22751
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
/// Matches either (and (shl 1, BitNo), Src) or (and (srl Src, BitNo), 1),
/// plus single-bit constant masks that don't encode well as TEST immediates.
/// Returns SDValue() when no pattern applies.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
                            const SDLoc &dl, SelectionDAG &DAG,
                            SDValue &X86CC) {
  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  // Look through truncates on either operand; validity is re-checked below.
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue Src, BitNo;
  // Canonicalize a SHL operand into Op0.
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    // Pattern: (and (shl 1, BitNo), Src).
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        KnownBits Known = DAG.computeKnownBits(Op0);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      Src = Op1;
      BitNo = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    // Pattern: (and (srl Src, BitNo), 1).
    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      Src = AndLHS.getOperand(0);
      BitNo = AndLHS.getOperand(1);
    } else {
      // Use BT if the immediate can't be encoded in a TEST instruction or we
      // are optimizing for size and the immedaite won't fit in a byte.
      bool OptForSize = DAG.shouldOptForSize();
      if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
          isPowerOf2_64(AndRHSVal)) {
        Src = AndLHS;
        BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
                                Src.getValueType());
      }
    }
  }

  // No patterns found, give up.
  if (!Src.getNode())
    return SDValue();

  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
  // instruction. Since the shift amount is in-range-or-undefined, we know
  // that doing a bittest on the i32 value is ok. We extend to i32 because
  // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reason.
  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

  // See if we can use the 32-bit instruction instead of the 64-bit one for a
  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
  // known to be zero.
  if (Src.getValueType() == MVT::i64 &&
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

  // If the operand types disagree, extend the shift amount to match. Since
  // BT ignores high bits (like shifts) we can use anyextend.
  if (Src.getValueType() != BitNo.getValueType())
    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

  // BT sets CF to the selected bit, so AE (CF==0) tests "bit clear" for
  // SETEQ and B (CF==1) tests "bit set" for SETNE.
  X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
                                dl, MVT::i8);
  return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
}
22832
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs. May swap Op0/Op1 to express GT/GE-style predicates via LT/LE, and
/// reports via IsAlwaysSignaling whether the predicate raises an invalid
/// exception on quiet NaNs regardless of the CMP's signaling variant.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                                   SDValue &Op1, bool &IsAlwaysSignaling) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  // NOTE(review): values 8 (SETUEQ) and 12 (SETONE) are beyond the 3-bit
  // SSE immediate range — presumably only reachable when the extended
  // AVX predicate encodings are available; confirm at the callers.
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ: SSECC = 8; break;
  case ISD::SETONE: SSECC = 12; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  // Equality/unordered-style predicates never signal on quiet NaNs; every
  // ordered-relational predicate does.
  switch (SetCCOpcode) {
  default:
    IsAlwaysSignaling = true;
    break;
  case ISD::SETEQ:
  case ISD::SETOEQ:
  case ISD::SETUEQ:
  case ISD::SETNE:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETO:
  case ISD::SETUO:
    IsAlwaysSignaling = false;
    break;
  }

  return SSECC;
}
22893
22894/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
22895/// concatenate the result back.
22896static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
22897 ISD::CondCode Cond, SelectionDAG &DAG,
22898 const SDLoc &dl) {
22899 assert(VT.isInteger() && VT == LHS.getValueType() &&((void)0)
22900 VT == RHS.getValueType() && "Unsupported VTs!")((void)0);
22901
22902 SDValue CC = DAG.getCondCode(Cond);
22903
22904 // Extract the LHS Lo/Hi vectors
22905 SDValue LHS1, LHS2;
22906 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
22907
22908 // Extract the RHS Lo/Hi vectors
22909 SDValue RHS1, RHS2;
22910 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
22911
22912 // Issue the operation on the smaller types and concatenate the result back
22913 EVT LoVT, HiVT;
22914 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22915 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22916 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22917 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22918}
22919
22920static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22921
22922 SDValue Op0 = Op.getOperand(0);
22923 SDValue Op1 = Op.getOperand(1);
22924 SDValue CC = Op.getOperand(2);
22925 MVT VT = Op.getSimpleValueType();
22926 SDLoc dl(Op);
22927
22928 assert(VT.getVectorElementType() == MVT::i1 &&((void)0)
22929 "Cannot set masked compare for this operation")((void)0);
22930
22931 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22932
22933 // Prefer SETGT over SETLT.
22934 if (SetCCOpcode == ISD::SETLT) {
22935 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22936 std::swap(Op0, Op1);
22937 }
22938
22939 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22940}
22941
22942/// Given a buildvector constant, return a new vector constant with each element
22943/// incremented or decremented. If incrementing or decrementing would result in
22944/// unsigned overflow or underflow or this is not a simple vector constant,
22945/// return an empty value.
22946static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
22947 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22948 if (!BV)
22949 return SDValue();
22950
22951 MVT VT = V.getSimpleValueType();
22952 MVT EltVT = VT.getVectorElementType();
22953 unsigned NumElts = VT.getVectorNumElements();
22954 SmallVector<SDValue, 8> NewVecC;
22955 SDLoc DL(V);
22956 for (unsigned i = 0; i < NumElts; ++i) {
22957 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22958 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22959 return SDValue();
22960
22961 // Avoid overflow/underflow.
22962 const APInt &EltC = Elt->getAPIntValue();
22963 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
22964 return SDValue();
22965
22966 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22967 }
22968
22969 return DAG.getBuildVector(VT, DL, NewVecC);
22970}
22971
22972/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22973/// Op0 u<= Op1:
22974/// t = psubus Op0, Op1
22975/// pcmpeq t, <0..0>
22976static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22977 ISD::CondCode Cond, const SDLoc &dl,
22978 const X86Subtarget &Subtarget,
22979 SelectionDAG &DAG) {
22980 if (!Subtarget.hasSSE2())
22981 return SDValue();
22982
22983 MVT VET = VT.getVectorElementType();
22984 if (VET != MVT::i8 && VET != MVT::i16)
22985 return SDValue();
22986
22987 switch (Cond) {
22988 default:
22989 return SDValue();
22990 case ISD::SETULT: {
22991 // If the comparison is against a constant we can turn this into a
22992 // setule. With psubus, setule does not require a swap. This is
22993 // beneficial because the constant in the register is no longer
22994 // destructed as the destination so it can be hoisted out of a loop.
22995 // Only do this pre-AVX since vpcmp* is no longer destructive.
22996 if (Subtarget.hasAVX())
22997 return SDValue();
22998 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
22999 if (!ULEOp1)
23000 return SDValue();
23001 Op1 = ULEOp1;
23002 break;
23003 }
23004 case ISD::SETUGT: {
23005 // If the comparison is against a constant, we can turn this into a setuge.
23006 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23007 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23008 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23009 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23010 if (!UGEOp1)
23011 return SDValue();
23012 Op1 = Op0;
23013 Op0 = UGEOp1;
23014 break;
23015 }
23016 // Psubus is better than flip-sign because it requires no inversion.
23017 case ISD::SETUGE:
23018 std::swap(Op0, Op1);
23019 break;
23020 case ISD::SETULE:
23021 break;
23022 }
23023
23024 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23025 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23026 DAG.getConstant(0, dl, VT));
23027}
23028
/// Lower a vector SETCC / STRICT_FSETCC / STRICT_FSETCCS to X86-specific
/// compare nodes. Handles FP compares (CMPP/CMPM), AVX-512 mask compares,
/// XOP compares, and the SSE2 emulations of the missing unsigned / 64-bit
/// integer compares.
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
                  Op.getOpcode() == ISD::STRICT_FSETCCS;
  // Strict nodes carry the chain in operand 0, shifting the others by one.
  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
  SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
  MVT VT = Op->getSimpleValueType(0);
  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();

    // If we have a strict compare with a vXi1 result and the input is 128/256
    // bits we can't use a masked compare unless we have VLX. If we use a wider
    // compare like we do for non-strict, we might trigger spurious exceptions
    // from the upper elements. Instead emit a AVX compare and convert to mask.
    unsigned Opc;
    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
        (!IsStrict || Subtarget.hasVLX() ||
         Op0.getSimpleValueType().is512BitVector())) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
    } else {
      Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
      // The SSE/AVX packed FP comparison nodes are defined with a
      // floating-point vector result that matches the operand type. This allows
      // them to work with an SSE1 target (integer vector types are not legal).
      VT = Op0.getSimpleValueType();
    }

    SDValue Cmp;
    bool IsAlwaysSignaling;
    unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
    if (!Subtarget.hasAVX()) {
      // TODO: We could use following steps to handle a quiet compare with
      // signaling encodings.
      // 1. Get ordered masks from a quiet ISD::SETO
      // 2. Use the masks to mask potential unordered elements in operand A, B
      // 3. Get the compare results of masked A, B
      // 4. Calculating final result using the mask and result from 3
      // But currently, we just fall back to scalar operations.
      if (IsStrict && IsAlwaysSignaling && !IsSignaling)
        return SDValue();

      // Insert an extra signaling instruction to raise exception.
      if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
        SDValue SignalCmp = DAG.getNode(
            Opc, dl, {VT, MVT::Other},
            {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
        // FIXME: It seems we need to update the flags of all new strict nodes.
        // Otherwise, mayRaiseFPException in MI will return false due to
        // NoFPExcept = false by default. However, I didn't find it in other
        // patches.
        SignalCmp->setFlags(Op->getFlags());
        Chain = SignalCmp.getValue(1);
      }

      // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
      // emit two comparisons and a logic op to tie them together.
      if (SSECC >= 8) {
        // LLVM predicate is SETUEQ or SETONE.
        unsigned CC0, CC1;
        unsigned CombineOpc;
        if (Cond == ISD::SETUEQ) {
          CC0 = 3; // UNORD
          CC1 = 0; // EQ
          CombineOpc = X86ISD::FOR;
        } else {
          assert(Cond == ISD::SETONE);
          CC0 = 7; // ORD
          CC1 = 4; // NEQ
          CombineOpc = X86ISD::FAND;
        }

        SDValue Cmp0, Cmp1;
        if (IsStrict) {
          // Both compares read the same chain; join their output chains with
          // a TokenFactor so neither is dropped.
          Cmp0 = DAG.getNode(
              Opc, dl, {VT, MVT::Other},
              {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
          Cmp1 = DAG.getNode(
              Opc, dl, {VT, MVT::Other},
              {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
                              Cmp1.getValue(1));
        } else {
          Cmp0 = DAG.getNode(
              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
          Cmp1 = DAG.getNode(
              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
        }
        Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
      } else {
        if (IsStrict) {
          Cmp = DAG.getNode(
              Opc, dl, {VT, MVT::Other},
              {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
          Chain = Cmp.getValue(1);
        } else
          Cmp = DAG.getNode(
              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
      }
    } else {
      // Handle all other FP comparisons here.
      if (IsStrict) {
        // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
        SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
        Cmp = DAG.getNode(
            Opc, dl, {VT, MVT::Other},
            {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
        Chain = Cmp.getValue(1);
      } else
        Cmp = DAG.getNode(
            Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
    }

    if (VT.getFixedSizeInBits() >
        Op.getSimpleValueType().getFixedSizeInBits()) {
      // We emitted a compare with an XMM/YMM result. Finish converting to a
      // mask register using a vptestm.
      EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
      Cmp = DAG.getBitcast(CastVT, Cmp);
      Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
                         DAG.getConstant(0, dl, CastVT), ISD::SETNE);
    } else {
      // If this is SSE/AVX CMPP, bitcast the result back to integer to match
      // the result type of SETCC. The bitcast is expected to be optimized
      // away during combining/isel.
      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
    }

    if (IsStrict)
      return DAG.getMergeValues({Cmp, Chain}, dl);

    return Cmp;
  }

  assert(!IsStrict && "Strict SETCC only handles FP operands.");

  MVT VTOp0 = Op0.getSimpleValueType();
  (void)VTOp0;
  assert(VTOp0 == Op1.getSimpleValueType() &&
         "Expected operands with same type!");
  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
         "Invalid number of packed elements for source and destination!");

  // The non-AVX512 code below works under the assumption that source and
  // destination types are the same.
  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
         "Value types for source and destination must be the same!");

  // The result is boolean, but operands are int/float
  if (VT.getVectorElementType() == MVT::i1) {
    // In AVX-512 architecture setcc returns mask with i1 elements,
    // But there is no compare instruction for i8 and i16 elements in KNL.
    assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
           "Unexpected operand type");
    return LowerIntVSETCC_AVX512(Op, DAG);
  }

  // Lower using XOP integer comparisons.
  if (VT.is128BitVector() && Subtarget.hasXOP()) {
    // Translate compare code to XOP PCOM compare mode.
    unsigned CmpMode = 0;
    switch (Cond) {
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETULT:
    case ISD::SETLT: CmpMode = 0x00; break;
    case ISD::SETULE:
    case ISD::SETLE: CmpMode = 0x01; break;
    case ISD::SETUGT:
    case ISD::SETGT: CmpMode = 0x02; break;
    case ISD::SETUGE:
    case ISD::SETGE: CmpMode = 0x03; break;
    case ISD::SETEQ: CmpMode = 0x04; break;
    case ISD::SETNE: CmpMode = 0x05; break;
    }

    // Are we comparing unsigned or signed integers?
    unsigned Opc =
        ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getTargetConstant(CmpMode, dl, MVT::i8));
  }

  // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
  // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
  if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
    SDValue BC0 = peekThroughBitcasts(Op0);
    if (BC0.getOpcode() == ISD::AND) {
      APInt UndefElts;
      SmallVector<APInt, 64> EltBits;
      if (getTargetConstantBitsFromNode(BC0.getOperand(1),
                                        VT.getScalarSizeInBits(), UndefElts,
                                        EltBits, false, false)) {
        if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
          Cond = ISD::SETEQ;
          Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
        }
      }
    }
  }

  // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
  if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
      Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
    ConstantSDNode *C1 = isConstOrConstSplat(Op1);
    if (C1 && C1->getAPIntValue().isPowerOf2()) {
      unsigned BitWidth = VT.getScalarSizeInBits();
      unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;

      // Move the tested bit to the sign position, then broadcast it with an
      // arithmetic shift right.
      SDValue Result = Op0.getOperand(0);
      Result = DAG.getNode(ISD::SHL, dl, VT, Result,
                           DAG.getConstant(ShiftAmt, dl, VT));
      Result = DAG.getNode(ISD::SRA, dl, VT, Result,
                           DAG.getConstant(BitWidth - 1, dl, VT));
      return Result;
    }
  }

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);

  if (VT == MVT::v32i16 || VT == MVT::v64i8) {
    assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
    return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
  }

  // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
  // not-of-PCMPEQ:
  // X != INT_MIN --> X >s INT_MIN
  // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
  // +X != 0 --> +X >s 0
  APInt ConstValue;
  if (Cond == ISD::SETNE &&
      ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
    if (ConstValue.isMinSignedValue())
      Cond = ISD::SETGT;
    else if (ConstValue.isMaxSignedValue())
      Cond = ISD::SETLT;
    else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
      Cond = ISD::SETGT;
  }

  // If both operands are known non-negative, then an unsigned compare is the
  // same as a signed compare and there's no need to flip signbits.
  // TODO: We could check for more general simplifications here since we're
  // computing known bits.
  bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
                   !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

  // Special case: Use min/max operations for unsigned compares.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (ISD::isUnsignedIntSetCC(Cond) &&
      (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
      TLI.isOperationLegal(ISD::UMIN, VT)) {
    // If we have a constant operand, increment/decrement it and change the
    // condition to avoid an invert.
    if (Cond == ISD::SETUGT) {
      // X > C --> X >= (C+1) --> X == umax(X, C+1)
      if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
        Op1 = UGTOp1;
        Cond = ISD::SETUGE;
      }
    }
    if (Cond == ISD::SETULT) {
      // X < C --> X <= (C-1) --> X == umin(X, C-1)
      if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
        Op1 = ULTOp1;
        Cond = ISD::SETULE;
      }
    }
    bool Invert = false;
    unsigned Opc;
    switch (Cond) {
    default: llvm_unreachable("Unexpected condition code");
    case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
    case ISD::SETULE: Opc = ISD::UMIN; break;
    case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
    case ISD::SETUGE: Opc = ISD::UMAX; break;
    }

    // X u<= Y  <=>  X == umin(X, Y); X u>= Y  <=>  X == umax(X, Y).
    SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

    // If the logical-not of the result is required, perform that now.
    if (Invert)
      Result = DAG.getNOT(dl, Result, VT);

    return Result;
  }

  // Try to use SUBUS and PCMPEQ.
  if (FlipSigns)
    if (SDValue V =
            LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
      return V;

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
                                                            : X86ISD::PCMPGT;
  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
              Cond == ISD::SETGE || Cond == ISD::SETUGE;
  bool Invert = Cond == ISD::SETNE ||
                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
      assert(Subtarget.hasSSE2() && "Don't know how to lower!");

      // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
      // the odd elements over the even elements.
      if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
        Op0 = DAG.getConstant(0, dl, MVT::v4i32);
        Op1 = DAG.getBitcast(MVT::v4i32, Op1);

        SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
        static const int MaskHi[] = { 1, 1, 3, 3 };
        SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

        return DAG.getBitcast(VT, Result);
      }

      if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
        Op0 = DAG.getBitcast(MVT::v4i32, Op0);
        Op1 = DAG.getConstant(-1, dl, MVT::v4i32);

        SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
        static const int MaskHi[] = { 1, 1, 3, 3 };
        SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

        return DAG.getBitcast(VT, Result);
      }

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
      } else {
        SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);

      // Cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
                                 VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}
23458
// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
// On success, returns the flag-producing node and sets \p X86CC to the
// condition-code constant the caller should test; otherwise returns SDValue().
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                              const SDLoc &dl, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget,
                              SDValue &X86CC) {
  // Only support equality comparisons.
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return SDValue();

  // Must be a bitcast from vXi1.
  if (Op0.getOpcode() != ISD::BITCAST)
    return SDValue();

  // Peek through the bitcast to the mask value itself.
  Op0 = Op0.getOperand(0);
  MVT VT = Op0.getSimpleValueType();
  // The mask width must match a mask-register compare the subtarget supports.
  if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
      !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
      !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
    return SDValue();

  X86::CondCode X86Cond;
  if (isNullConstant(Op1)) {
    // Comparing against all-zeros uses ZF.
    X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
  } else if (isAllOnesConstant(Op1)) {
    // C flag is set for all ones.
    X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
  } else
    return SDValue();

  // If the input is an AND, we can combine it's operands into the KTEST.
  bool KTestable = false;
  if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
    KTestable = true;
  if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
    KTestable = true;
  // KTEST only helps for the compare-against-zero form.
  if (!isNullConstant(Op1))
    KTestable = false;
  if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
    SDValue LHS = Op0.getOperand(0);
    SDValue RHS = Op0.getOperand(1);
    X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
    return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
  }

  // If the input is an OR, we can combine it's operands into the KORTEST.
  SDValue LHS = Op0;
  SDValue RHS = Op0;
  if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
    LHS = Op0.getOperand(0);
    RHS = Op0.getOperand(1);
  }

  X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
  return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
}
23514
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
/// NOTE(review): this may mutate the DAG (ReplaceAllUsesOfValueWith below),
/// so callers must not cache nodes across the call.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
                                             ISD::CondCode CC, const SDLoc &dl,
                                             SelectionDAG &DAG,
                                             SDValue &X86CC) const {
  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
      return BT;
  }

  // Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0.
  // TODO: We could do AND tree with all 1s as well by using the C flag.
  if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
    if (SDValue CmpZ =
            MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
      return CmpZ;

  // Try to lower using KORTEST or KTEST.
  if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
    return Test;

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
  // these.
  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // If the input is a setcc, then reuse the input setcc or use a new one with
    // the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);

      // Reuse the existing flags (operand 1 of the SETCC), flipping the
      // condition code when the polarity is inverted.
      X86CC = Op0.getOperand(0);
      if (Invert) {
        X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
        CCode = X86::GetOppositeBranchCondition(CCode);
        X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
      }

      return Op0.getOperand(1);
    }
  }

  // Try to use the carry flag from the add in place of an separate CMP for:
  // (seteq (add X, -1), -1). Similar for setne.
  if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
      Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (isProfitableToUseFlagOp(Op0)) {
      SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);

      // Re-emit the add as a flag-producing X86ISD::ADD and redirect all
      // users of the old add to it; the carry (result 1) is our flags.
      SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
                                Op0.getOperand(1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
      X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
      X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
      return SDValue(New.getNode(), 1);
    }
  }

  // Fall back to an explicit CMP/TEST.
  X86::CondCode CondCode =
      TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
  assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");

  SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
  X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
  return EFLAGS;
}
23586
/// Lower a scalar (i8-result) SETCC / STRICT_FSETCC / STRICT_FSETCCS.
/// Vector compares are forwarded to LowerVSETCC; f128 is softened to a
/// libcall first; everything else becomes flags + X86ISD::SETCC.
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
                  Op.getOpcode() == ISD::STRICT_FSETCCS;
  MVT VT = Op->getSimpleValueType(0);

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
  // Strict nodes carry the chain in operand 0, shifting the others by one.
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
  SDLoc dl(Op);
  ISD::CondCode CC =
      cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();

  // Handle f128 first, since one possible outcome is a normal integer
  // comparison which gets handled by emitFlagsForSetcc.
  if (Op0.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
                        Op.getOpcode() == ISD::STRICT_FSETCCS);

    // If softenSetCCOperands returned a scalar, use it.
    if (!Op1.getNode()) {
      assert(Op0.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      if (IsStrict)
        return DAG.getMergeValues({Op0, Chain}, dl);
      return Op0;
    }
  }

  if (Op0.getSimpleValueType().isInteger()) {
    // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
    // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
    // this may translate to less uops depending on uarch implementation. The
    // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
    // canonicalize to that CondCode.
    // NOTE: Only do this if incrementing the constant doesn't increase the bit
    // encoding size - so it must either already be a i8 or i32 immediate, or it
    // shrinks down to that. We don't do this for any i64's to avoid additional
    // constant materializations.
    // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
    if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
      const APInt &Op1Val = Op1C->getAPIntValue();
      if (!Op1Val.isNullValue()) {
        // Ensure the constant+1 doesn't overflow.
        if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
            (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
          APInt Op1ValPlusOne = Op1Val + 1;
          if (Op1ValPlusOne.isSignedIntN(32) &&
              (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
            Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
            CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
                                            : ISD::CondCode::SETUGE;
          }
        }
      }
    }

    SDValue X86CC;
    SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
    SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
  }

  // Handle floating point.
  X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
  if (CondCode == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS;
  if (IsStrict) {
    // Signaling compares (SETCCS) use the "S" node so quiet NaNs also raise.
    bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
    EFLAGS =
        DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
                    dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
    Chain = EFLAGS.getValue(1);
  } else {
    EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
  }

  SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
  SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
  return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
23673
23674SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23675 SDValue LHS = Op.getOperand(0);
23676 SDValue RHS = Op.getOperand(1);
23677 SDValue Carry = Op.getOperand(2);
23678 SDValue Cond = Op.getOperand(3);
23679 SDLoc DL(Op);
23680
23681 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.")((void)0);
23682 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23683
23684 // Recreate the carry if needed.
23685 EVT CarryVT = Carry.getValueType();
23686 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23687 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23688
23689 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23690 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23691 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23692}
23693
23694// This function returns three things: the arithmetic computation itself
23695// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23696// flag and the condition code define the case in which the arithmetic
23697// computation overflows.
23698static std::pair<SDValue, SDValue>
23699getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23700 assert(Op.getResNo() == 0 && "Unexpected result number!")((void)0);
23701 SDValue Value, Overflow;
23702 SDValue LHS = Op.getOperand(0);
23703 SDValue RHS = Op.getOperand(1);
23704 unsigned BaseOp = 0;
23705 SDLoc DL(Op);
23706 switch (Op.getOpcode()) {
23707 default: llvm_unreachable("Unknown ovf instruction!")__builtin_unreachable();
23708 case ISD::SADDO:
23709 BaseOp = X86ISD::ADD;
23710 Cond = X86::COND_O;
23711 break;
23712 case ISD::UADDO:
23713 BaseOp = X86ISD::ADD;
23714 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23715 break;
23716 case ISD::SSUBO:
23717 BaseOp = X86ISD::SUB;
23718 Cond = X86::COND_O;
23719 break;
23720 case ISD::USUBO:
23721 BaseOp = X86ISD::SUB;
23722 Cond = X86::COND_B;
23723 break;
23724 case ISD::SMULO:
23725 BaseOp = X86ISD::SMUL;
23726 Cond = X86::COND_O;
23727 break;
23728 case ISD::UMULO:
23729 BaseOp = X86ISD::UMUL;
23730 Cond = X86::COND_O;
23731 break;
23732 }
23733
23734 if (BaseOp) {
23735 // Also sets EFLAGS.
23736 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23737 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23738 Overflow = Value.getValue(1);
23739 }
23740
23741 return std::make_pair(Value, Overflow);
23742}
23743
23744static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23745 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23746 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23747 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23748 // has only one use.
23749 SDLoc DL(Op);
23750 X86::CondCode Cond;
23751 SDValue Value, Overflow;
23752 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23753
23754 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23755 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!")((void)0);
23756 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23757}
23758
23759/// Return true if opcode is a X86 logical comparison.
23760static bool isX86LogicalCmp(SDValue Op) {
23761 unsigned Opc = Op.getOpcode();
23762 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23763 Opc == X86ISD::FCMP)
23764 return true;
23765 if (Op.getResNo() == 1 &&
23766 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23767 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23768 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23769 return true;
23770
23771 return false;
23772}
23773
23774static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23775 if (V.getOpcode() != ISD::TRUNCATE)
23776 return false;
23777
23778 SDValue VOp0 = V.getOperand(0);
23779 unsigned InBits = VOp0.getValueSizeInBits();
23780 unsigned Bits = V.getValueSizeInBits();
23781 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23782}
23783
/// Lower an ISD::SELECT node for X86. Scalar FP selects become SSE/AVX
/// compare+blend sequences (or AVX-512 masked moves); integer selects are
/// matched against several special patterns (sbb-based all-ones/zero selects,
/// ffs-minus-one, or/xor masking) and otherwise lowered to X86ISD::CMOV with
/// an appropriate flags-producing condition.
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  // AddTest tracks whether we still need to synthesize a TEST to produce
  // EFLAGS for the CMOV; the pattern-matching below clears it when the
  // condition already produces usable flags.
  bool AddTest = true;
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;

  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
  // are available or VBLENDV if AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
  if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    bool IsAlwaysSignaling;
    unsigned SSECC =
        translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
                           CondOp0, CondOp1, IsAlwaysSignaling);

    if (Subtarget.hasAVX512()) {
      // AVX-512: compare into a mask register and do a masked scalar select.
      SDValue Cmp =
          DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
                      DAG.getTargetConstant(SSECC, DL, MVT::i8));
      assert(!VT.isVector() && "Not a scalar type?");
      return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
    }

    // SSECC >= 8 encodes AVX-only compare predicates.
    if (SSECC < 8 || Subtarget.hasAVX()) {
      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                                DAG.getTargetConstant(SSECC, DL, MVT::i8));

      // If we have AVX, we can use a variable vector select (VBLENDV) instead
      // of 3 logic instructions for size savings and potentially speed.
      // Unfortunately, there is no scalar form of VBLENDV.

      // If either operand is a +0.0 constant, don't try this. We can expect to
      // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.

      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
      // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      // don't bother.
      if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
          !isNullFPConstant(Op2)) {
        // Convert to vectors, do a VSELECT, and convert back to scalar.
        // All of the conversions should be optimized away.
        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
        VCmp = DAG.getBitcast(VCmpVT, VCmp);

        SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                           VSel, DAG.getIntPtrConstant(0, DL));
      }
      // No AVX: select = (cmp & op1) | (~cmp & op2).
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

  // AVX512 fallback is to lower selects of scalar floats to masked moves.
  if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
      Cond = NewCond;
      // If the condition was updated, it's possible that the operands of the
      // select were also updated (for example, EmitTest has a RAUW). Refresh
      // the local references to the select operands in case they got stale.
      Op1 = Op.getOperand(1);
      Op2 = Op.getOperand(2);
    }
  }

  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);
    SDValue CmpOp0 = Cmp.getOperand(0);
    unsigned CondCode = Cond.getConstantOperandVal(0);

    // Special handling for __builtin_ffs(X) - 1 pattern which looks like
    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
    // handle to keep the CMP with 0. This should be removed by
    // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
    // cttz_zero_undef.
    // NOTE: the lambda parameters deliberately shadow the outer Op1/Op2 so it
    // can be called with the operands in either order.
    auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
      return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
              Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
    };
    if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
        ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
         (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
      // Keep Cmp.
    } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
               (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      // One arm is all-ones: materialize it from the carry of x-1 via SBB.
      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;

      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
      SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);

      // Apply further optimizations for special cases
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (isNullConstant(Y) &&
          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
        SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
        Zero = DAG.getConstant(0, DL, Op.getValueType());
        return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
      }

      Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));

      SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
      SDValue Res =   // Res = 0 or -1.
          DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));

      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
               Cmp.getOperand(0).getOpcode() == ISD::AND &&
               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
      // No CMOV available: turn a single-bit test select into mask math.
      SDValue Src1, Src2;
      // true if Op2 is XOR or OR operator and one of its operands
      // is equal to Op1
      // ( a , a op b) || ( b , a op b)
      auto isOrXorPattern = [&]() {
        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
          Src1 =
              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
          Src2 = Op1;
          return true;
        }
        return false;
      };

      if (isOrXorPattern()) {
        SDValue Neg;
        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // we need mask of all zeros or ones with same size of the other
        // operands.
        if (CmpSz > VT.getSizeInBits())
          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
        else if (CmpSz < VT.getSizeInBits())
          Neg = DAG.getNode(ISD::AND, DL, VT,
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
              DAG.getConstant(1, DL, VT));
        else
          Neg = CmpOp0;
        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Neg); // -(and (x, 0x1))
        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
      }
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov())  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Cmp.getOpcode() == X86ISD::BT) { // FIXME
      Cond = Cmp;
      AddTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
    // Overflow conditions produce their own flags; reuse them directly.
    SDValue Value;
    X86::CondCode X86Cond;
    std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);

    CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
    AddTest = false;
  }

  if (AddTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue BTCC;
      if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
        CC = BTCC;
        Cond = BT;
        AddTest = false;
      }
    }
  }

  if (AddTest) {
    CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
  }

  // a < b ? -1 : 0 -> RES = ~setcc_carry
  // a < b ? 0 : -1 -> RES = setcc_carry
  // a >= b ? -1 : 0 -> RES = setcc_carry
  // a >= b ? 0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    // NOTE(review): the static analyzer reports a possible null CC here, but
    // every path that reaches this point has assigned CC (the SETCC /
    // SETCC_CARRY branch, the XALUO branch, or the AddTest block above) —
    // this appears to be a false positive.
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (isNullConstant(Op1) || isNullConstant(Op2))) {
      SDValue Res =
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
  // widen the cmov and push the truncate through. This avoids introducing a new
  // branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Exclude CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
                                 CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // Or finally, promote i8 cmovs if we have CMOV,
  // or i16 cmovs if it won't prevent folding a load.
  // FIXME: we should not limit promotion of i8 case to only when the CMOV is
  // legal, but EmitLoweredSelect() can not deal with these extensions
  // being inserted between two CMOV's. (in i16 case too TBN)
  // https://bugs.llvm.org/show_bug.cgi?id=40974
  if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
      (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
       !MayFoldLoad(Op2))) {
    Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
    Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
    SDValue Ops[] = { Op2, Op1, CC, Cond };
    SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}
24074
/// Lower a sign/zero extension whose source is a vXi1 mask vector. The mask
/// is extended via a wide AVX-512 operation (possibly after widening the
/// element type and/or the vector to 512 bits), then truncated/extracted back
/// to the requested type.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  MVT VTElt = VT.getVectorElementType();
  SDLoc dl(Op);

  // NOTE: NumElts is updated below if the operation is widened to 512 bits.
  unsigned NumElts = VT.getVectorNumElements();

  // Extend VT if the scalar type is i8/i16 and BWI is not supported.
  MVT ExtVT = VT;
  if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
    // If v16i32 is to be avoided, we'll need to split and concatenate.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
      return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);

    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
  }

  // Widen to 512-bits if VLX is not supported.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    NumElts *= 512 / ExtVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, NumElts);
    // Place the original mask in the low elements of a wider undef mask.
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
                     In, DAG.getIntPtrConstant(0, dl));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
  }

  SDValue V;
  MVT WideEltVT = WideVT.getVectorElementType();
  if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
      (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
    // The target has a direct mask-to-vector extend for this element size.
    V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
  } else {
    // Otherwise select between all-ones and zero per mask lane.
    SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
    SDValue Zero = DAG.getConstant(0, dl, WideVT);
    V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
  }

  // Truncate if we had to extend i16/i8 above.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(VTElt, NumElts);
    V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
  }

  // Extract back to 128/256-bit if we widened.
  if (WideVT != VT)
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
                    DAG.getIntPtrConstant(0, dl));

  return V;
}
24131
24132static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24133 SelectionDAG &DAG) {
24134 SDValue In = Op->getOperand(0);
24135 MVT InVT = In.getSimpleValueType();
24136
24137 if (InVT.getVectorElementType() == MVT::i1)
24138 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24139
24140 assert(Subtarget.hasAVX() && "Expected AVX support")((void)0);
24141 return LowerAVXExtend(Op, DAG, Subtarget);
24142}
24143
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SDValue In = Op->getOperand(0);
  MVT VT = Op->getSimpleValueType(0);
  MVT InVT = In.getSimpleValueType();

  MVT SVT = VT.getVectorElementType();
  MVT InSVT = InVT.getVectorElementType();
  assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());

  // Bail out (returning SDValue() defers to default expansion) for element
  // type or subtarget combinations this custom lowering does not cover.
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();
  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
      !(VT.is256BitVector() && Subtarget.hasAVX()) &&
      !(VT.is512BitVector() && Subtarget.hasAVX512()))
    return SDValue();

  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();
  unsigned NumElts = VT.getVectorNumElements();

  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
  // For 512-bit vectors, we need 128-bits or 256-bits.
  if (InVT.getSizeInBits() > 128) {
    // Input needs to be at least the same number of elements as output, and
    // at least 128-bits.
    int InSize = InSVT.getSizeInBits() * NumElts;
    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
    InVT = In.getSimpleValueType();
  }

  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
  // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
  // need to be handled here for 256/512-bit results.
  if (Subtarget.hasInt256()) {
    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");

    if (InVT.getVectorNumElements() != NumElts)
      return DAG.getNode(Op.getOpcode(), dl, VT, In);

    // FIXME: Apparently we create inreg operations that could be regular
    // extends.
    unsigned ExtOpc =
        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
                                             : ISD::ZERO_EXTEND;
    return DAG.getNode(ExtOpc, dl, VT, In);
  }

  // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
  if (Subtarget.hasAVX()) {
    assert(VT.is256BitVector() && "256-bit vector expected");
    MVT HalfVT = VT.getHalfNumVectorElementsVT();
    int HalfNumElts = HalfVT.getVectorNumElements();

    // Shuffle the upper source elements down into the low positions so the
    // same in-reg extend can produce the high half.
    unsigned NumSrcElts = InVT.getVectorNumElements();
    SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
    for (int i = 0; i != HalfNumElts; ++i)
      HiMask[i] = HalfNumElts + i;

    SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
    SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
    Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
  }

  // We should only get here for sign extend.
  assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
  assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");

  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
  SDValue Curr = In;
  SDValue SignExt = Curr;

  // As SRAI is only available on i16/i32 types, we expand only up to i32
  // and handle i64 separately.
  if (InVT != MVT::v4i32) {
    MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;

    unsigned DestWidth = DestVT.getScalarSizeInBits();
    unsigned Scale = DestWidth / InSVT.getSizeInBits();

    unsigned InNumElts = InVT.getVectorNumElements();
    unsigned DestElts = DestVT.getVectorNumElements();

    // Build a shuffle mask that takes each input element and places it in the
    // MSBs of the new element size.
    SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
    for (unsigned i = 0; i != DestElts; ++i)
      Mask[i * Scale + (Scale - 1)] = i;

    Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
    Curr = DAG.getBitcast(DestVT, Curr);

    // Arithmetic-shift the value back down, replicating the sign bit.
    unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
    SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
                          DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
  }

  if (VT == MVT::v2i64) {
    assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
    // Compute the sign words (0 > x) and interleave them with the low words
    // to form the 64-bit sign-extended elements.
    SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
    SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
    SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
    SignExt = DAG.getBitcast(VT, SignExt);
  }

  return SignExt;
}
24259
/// Lower a vector ISD::SIGN_EXTEND. Mask (vXi1) sources are routed to
/// LowerSIGN_EXTEND_Mask; AVX2+ handles the node natively; otherwise the
/// input is split in half and each half is extended with
/// SIGN_EXTEND_VECTOR_INREG before being concatenated back together.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
    assert(InVT == MVT::v32i8 && "Unexpected VT!");
    return splitVectorIntUnary(Op, DAG);
  }

  // AVX2 and above can extend the full vector in one operation.
  if (Subtarget.hasInt256())
    return Op;

  // Optimize vectors in AVX mode
  // Sign extend v8i16 to v8i32 and
  // v4i32 to v4i64
  //
  // Divide input vector into two parts
  // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
  // concat the vectors to original VT
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);

  // Move the upper input elements into the low positions for the second
  // in-reg extend.
  unsigned NumElems = InVT.getVectorNumElements();
  SmallVector<int,8> ShufMask(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask[i] = i + NumElems/2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
  OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
24311
24312/// Change a vector store into a pair of half-size vector stores.
24313static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24314 SDValue StoredVal = Store->getValue();
24315 assert((StoredVal.getValueType().is256BitVector() ||((void)0)
24316 StoredVal.getValueType().is512BitVector()) &&((void)0)
24317 "Expecting 256/512-bit op")((void)0);
24318
24319 // Splitting volatile memory ops is not allowed unless the operation was not
24320 // legal to begin with. Assume the input store is legal (this transform is
24321 // only used for targets with AVX). Note: It is possible that we have an
24322 // illegal type like v2i128, and so we could allow splitting a volatile store
24323 // in that case if that is important.
24324 if (!Store->isSimple())
24325 return SDValue();
24326
24327 SDLoc DL(Store);
24328 SDValue Value0, Value1;
24329 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24330 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24331 SDValue Ptr0 = Store->getBasePtr();
24332 SDValue Ptr1 =
24333 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24334 SDValue Ch0 =
24335 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24336 Store->getOriginalAlign(),
24337 Store->getMemOperand()->getFlags());
24338 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24339 Store->getPointerInfo().getWithOffset(HalfOffset),
24340 Store->getOriginalAlign(),
24341 Store->getMemOperand()->getFlags());
24342 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24343}
24344
24345/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24346/// type.
24347static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24348 SelectionDAG &DAG) {
24349 SDValue StoredVal = Store->getValue();
24350 assert(StoreVT.is128BitVector() &&((void)0)
24351 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op")((void)0);
24352 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24353
24354 // Splitting volatile memory ops is not allowed unless the operation was not
24355 // legal to begin with. We are assuming the input op is legal (this transform
24356 // is only used for targets with AVX).
24357 if (!Store->isSimple())
24358 return SDValue();
24359
24360 MVT StoreSVT = StoreVT.getScalarType();
24361 unsigned NumElems = StoreVT.getVectorNumElements();
24362 unsigned ScalarSize = StoreSVT.getStoreSize();
24363
24364 SDLoc DL(Store);
24365 SmallVector<SDValue, 4> Stores;
24366 for (unsigned i = 0; i != NumElems; ++i) {
24367 unsigned Offset = i * ScalarSize;
24368 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24369 TypeSize::Fixed(Offset), DL);
24370 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24371 DAG.getIntPtrConstant(i, DL));
24372 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24373 Store->getPointerInfo().getWithOffset(Offset),
24374 Store->getOriginalAlign(),
24375 Store->getMemOperand()->getFlags());
24376 Stores.push_back(Ch);
24377 }
24378 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24379}
24380
/// Custom-lower a vector store. Handles three cases: small vXi1 mask stores
/// without AVX512DQ (stored via a scalar integer), 256-bit (or BWI-less
/// v32i16/v64i8) stores of concatenations (split into halves), and 64-bit
/// vectors that type legalization would widen (stored as a single scalar or
/// via VEXTRACT_STORE). Returns SDValue() to use default lowering.
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
  SDLoc dl(St);
  SDValue StoredVal = St->getValue();

  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
  if (StoredVal.getValueType().isVector() &&
      StoredVal.getValueType().getVectorElementType() == MVT::i1) {
    unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
    assert(NumElts <= 8 && "Unexpected VT");
    assert(!St->isTruncatingStore() && "Expected non-truncating store");
    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
           "Expected AVX512F without AVX512DQI");

    // We must pad with zeros to ensure we store zeroes to any unused bits.
    StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
                            DAG.getUNDEF(MVT::v16i1), StoredVal,
                            DAG.getIntPtrConstant(0, dl));
    StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
    StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
    // Make sure we store zeros in the extra bits.
    if (NumElts < 8)
      StoredVal = DAG.getZeroExtendInReg(
          StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));

    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  // Truncating stores are handled elsewhere; fall back to default lowering.
  if (St->isTruncatingStore())
    return SDValue();

  // If this is a 256-bit store of concatenated ops, we are better off splitting
  // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
  // and each half can execute independently. Some cores would split the op into
  // halves anyway, so the concat (vinsertf128) is purely an extra op.
  MVT StoreVT = StoredVal.getSimpleValueType();
  if (StoreVT.is256BitVector() ||
      ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
       !Subtarget.hasBWI())) {
    SmallVector<SDValue, 4> CatOps;
    if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
      return splitVectorStore(St, DAG);
    return SDValue();
  }

  // Remaining case: a 64-bit vector that legalization widens. Store only the
  // original 64 bits, not the widened vector.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
         "Unexpected VT");
  assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
             TargetLowering::TypeWidenVector && "Unexpected type action!");

  EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
  StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
                          DAG.getUNDEF(StoreVT));

  if (Subtarget.hasSSE2()) {
    // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
    // and store it.
    MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
    MVT CastVT = MVT::getVectorVT(StVT, 2);
    StoredVal = DAG.getBitcast(CastVT, StoredVal);
    StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
                            DAG.getIntPtrConstant(0, dl));

    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }
  // SSE1 only: store the low 64 bits with a target memory intrinsic.
  assert(Subtarget.hasSSE1() && "Expected SSE");
  SDVTList Tys = DAG.getVTList(MVT::Other);
  SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
  return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
                                 St->getMemOperand());
}
24458
24459// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24460// may emit an illegal shuffle but the expansion is still better than scalar
24461// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24462// we'll emit a shuffle and a arithmetic shift.
24463// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24464// TODO: It is possible to support ZExt by zeroing the undef values during
24465// the shuffle phase or after the shuffle.
24466static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24467 SelectionDAG &DAG) {
24468 MVT RegVT = Op.getSimpleValueType();
24469 assert(RegVT.isVector() && "We only custom lower vector loads.")((void)0);
24470 assert(RegVT.isInteger() &&((void)0)
24471 "We only custom lower integer vector loads.")((void)0);
24472
24473 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24474 SDLoc dl(Ld);
24475
24476 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24477 if (RegVT.getVectorElementType() == MVT::i1) {
24478 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load")((void)0);
24479 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT")((void)0);
24480 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&((void)0)
24481 "Expected AVX512F without AVX512DQI")((void)0);
24482
  // At most 8 mask bits, so the whole vector fits in one i8 scalar load.
24483 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24484 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24485 Ld->getMemOperand()->getFlags());
24486
24487 // Replace chain users with the new chain.
24488 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!")((void)0);
24489
  // Widen the loaded byte to i16, reinterpret as v16i1, and extract the low
  // lanes to rebuild the original vXi1 vector type.
24490 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24491 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24492 DAG.getBitcast(MVT::v16i1, Val),
24493 DAG.getIntPtrConstant(0, dl));
  // Return both the value and the new chain so chain users are updated.
24494 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24495 }
24496
  // Returning an empty SDValue signals "not custom lowered here".
24497 return SDValue();
24498}
24499
24500/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24501/// each of which has no other use apart from the AND / OR.
24502static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24503 Opc = Op.getOpcode();
24504 if (Opc != ISD::OR && Opc != ISD::AND)
24505 return false;
24506 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24507 Op.getOperand(0).hasOneUse() &&
24508 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24509 Op.getOperand(1).hasOneUse());
24510}
24511
/// Lower ISD::BRCOND into an X86ISD::BRCOND driven by an explicit X86
/// condition code plus a flags-producing operand (EFLAGS, overflow result,
/// or an FCMP node).
24512SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24513 SDValue Chain = Op.getOperand(0);
24514 SDValue Cond = Op.getOperand(1);
24515 SDValue Dest = Op.getOperand(2);
24516 SDLoc dl(Op);
24517
  // Lower a SETCC condition directly to a flags-setting compare; f128
  // compares are deliberately excluded here.
24518 if (Cond.getOpcode() == ISD::SETCC &&
24519 Cond.getOperand(0).getValueType() != MVT::f128) {
24520 SDValue LHS = Cond.getOperand(0);
24521 SDValue RHS = Cond.getOperand(1);
24522 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24523
24524 // Special case for
24525 // setcc([su]{add,sub,mul}o == 0)
24526 // setcc([su]{add,sub,mul}o != 1)
24527 if (ISD::isOverflowIntrOpRes(LHS) &&
24528 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24529 (isNullConstant(RHS) || isOneConstant(RHS))) {
24530 SDValue Value, Overflow;
24531 X86::CondCode X86Cond;
24532 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24533
   // "o == 0" and "o != 1" both branch on NO overflow, so invert.
24534 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24535 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24536
24537 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24538 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24539 Overflow);
24540 }
24541
  // Integer compares: emit the flags producer + condition code pair.
24542 if (LHS.getSimpleValueType().isInteger()) {
24543 SDValue CCVal;
24544 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24545 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24546 EFLAGS);
24547 }
24548
24549 if (CC == ISD::SETOEQ) {
24550 // For FCMP_OEQ, we can emit
24551 // two branches instead of an explicit AND instruction with a
24552 // separate test. However, we only do this if this block doesn't
24553 // have a fall-through edge, because this requires an explicit
24554 // jmp when the condition is false.
24555 if (Op.getNode()->hasOneUse()) {
24556 SDNode *User = *Op.getNode()->use_begin();
24557 // Look for an unconditional branch following this conditional branch.
24558 // We need this because we need to reverse the successors in order
24559 // to implement FCMP_OEQ.
24560 if (User->getOpcode() == ISD::BR) {
24561 SDValue FalseBB = User->getOperand(1);
24562 SDNode *NewBR =
24563 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24564 assert(NewBR == User)((void)0);
24565 (void)NewBR;
    // Retarget this branch at the old false block; the following
    // unconditional branch now jumps to the old true block.
24566 Dest = FalseBB;
24567
    // OEQ is false when NE or when unordered (parity set), so branch
    // away on either condition.
24568 SDValue Cmp =
24569 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24570 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24571 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24572 CCVal, Cmp);
24573 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24574 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24575 Cmp);
24576 }
24577 }
24578 } else if (CC == ISD::SETUNE) {
24579 // For FCMP_UNE, we can emit
24580 // two branches instead of an explicit OR instruction with a
24581 // separate test.
24582 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24583 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24584 Chain =
24585 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24586 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24587 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24588 Cmp);
24589 } else {
  // Remaining FP compares map to a single X86 condition code.
24590 X86::CondCode X86Cond =
24591 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24592 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24593 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24594 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24595 Cmp);
24596 }
24597 }
24598
  // Branch directly on the overflow flag of an overflow-intrinsic result.
24599 if (ISD::isOverflowIntrOpRes(Cond)) {
24600 SDValue Value, Overflow;
24601 X86::CondCode X86Cond;
24602 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24603
24604 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24605 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24606 Overflow);
24607 }
24608
24609 // Look past the truncate if the high bits are known zero.
24610 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24611 Cond = Cond.getOperand(0);
24612
24613 EVT CondVT = Cond.getValueType();
24614
24615 // Add an AND with 1 if we don't already have one.
24616 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24617 Cond =
24618 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24619
  // Generic fallback: branch on (Cond & 1) != 0.
24620 SDValue LHS = Cond;
24621 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24622
24623 SDValue CCVal;
24624 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24625 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24626 EFLAGS);
24627}
24628
24629// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24630// Calls to _alloca are needed to probe the stack when allocating more than 4k
24631// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24632// that the guard pages used by the OS virtual memory manager are allocated in
24633// correct sequence.
24634SDValue
24635X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24636 SelectionDAG &DAG) const {
24637 MachineFunction &MF = DAG.getMachineFunction();
24638 bool SplitStack = MF.shouldSplitStack();
24639 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
  // A probing/allocation helper is required on Windows (non-MachO), for
  // segmented stacks, or whenever an explicit stack-probe call is configured.
24640 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24641 SplitStack || EmitStackProbeCall;
24642 SDLoc dl(Op);
24643
24644 // Get the inputs.
24645 SDNode *Node = Op.getNode();
24646 SDValue Chain = Op.getOperand(0);
24647 SDValue Size = Op.getOperand(1);
24648 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24649 EVT VT = Node->getValueType(0);
24650
24651 // Chain the dynamic stack allocation so that it doesn't modify the stack
24652 // pointer when other instructions are using the stack.
24653 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24654
24655 bool Is64Bit = Subtarget.is64Bit();
24656 MVT SPTy = getPointerTy(DAG.getDataLayout());
24657
24658 SDValue Result;
  // Simple path: adjust SP inline (optionally via an inline probe loop).
24659 if (!Lower) {
24660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24661 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24662 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"((void)0)
24663 " not tell us which reg is the stack pointer!")((void)0);
24664
24665 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24666 const Align StackAlign = TFI.getStackAlign();
24667 if (hasInlineStackProbe(MF)) {
24668 MachineRegisterInfo &MRI = MF.getRegInfo();
24669
   // PROBED_ALLOCA expands later into a probing loop; the size lives in
   // a fresh virtual register.
24670 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24671 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24672 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24673 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24674 DAG.getRegister(Vreg, SPTy));
24675 } else {
   // The stack grows down, so allocating is SP - Size.
24676 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24677 Chain = SP.getValue(1);
24678 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24679 }
  // Over-align the result only when the request exceeds the stack's
  // natural alignment; mask off the low bits.
24680 if (Alignment && *Alignment > StackAlign)
24681 Result =
24682 DAG.getNode(ISD::AND, dl, VT, Result,
24683 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24684 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24685 } else if (SplitStack) {
24686 MachineRegisterInfo &MRI = MF.getRegInfo();
24687
24688 if (Is64Bit) {
24689 // The 64 bit implementation of segmented stacks needs to clobber both r10
24690 // r11. This makes it impossible to use it along with nested parameters.
24691 const Function &F = MF.getFunction();
24692 for (const auto &A : F.args()) {
24693 if (A.hasNestAttr())
24694 report_fatal_error("Cannot use segmented stacks with functions that "
24695 "have nested arguments.");
24696 }
24697 }
24698
  // SEG_ALLOCA handles the segmented-stack allocation; size goes through
  // a virtual register as above.
24699 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24700 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24701 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24702 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24703 DAG.getRegister(Vreg, SPTy));
24704 } else {
  // Windows path: emit WIN_ALLOCA (probe helper), then re-read SP to get
  // the allocated address.
24705 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24706 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
24707 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
24708
24709 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24710 Register SPReg = RegInfo->getStackRegister();
24711 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24712 Chain = SP.getValue(1);
24713
24714 if (Alignment) {
24715 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24716 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24717 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24718 }
24719
24720 Result = SP;
24721 }
24722
24723 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
24724 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
24725
  // DYNAMIC_STACKALLOC produces both the allocated address and a chain.
24726 SDValue Ops[2] = {Result, Chain};
24727 return DAG.getMergeValues(Ops, dl);
24728}
24729
/// Lower VASTART: initialize the SysV x86-64 four-field va_list, or store a
/// single frame pointer for 32-bit / Win64 where va_list is just a pointer.
24730SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24731 MachineFunction &MF = DAG.getMachineFunction();
24732 auto PtrVT = getPointerTy(MF.getDataLayout());
24733 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24734
24735 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24736 SDLoc DL(Op);
24737
24738 if (!Subtarget.is64Bit() ||
24739 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24740 // vastart just stores the address of the VarArgsFrameIndex slot into the
24741 // memory location argument.
24742 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24743 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24744 MachinePointerInfo(SV));
24745 }
24746
24747 // __va_list_tag:
24748 // gp_offset (0 - 6 * 8)
24749 // fp_offset (48 - 48 + 8 * 16)
24750 // overflow_arg_area (point to parameters coming in memory).
24751 // reg_save_area
24752 SmallVector<SDValue, 8> MemOps;
24753 SDValue FIN = Op.getOperand(1);
24754 // Store gp_offset
24755 SDValue Store = DAG.getStore(
24756 Op.getOperand(0), DL,
24757 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24758 MachinePointerInfo(SV));
24759 MemOps.push_back(Store);
24760
24761 // Store fp_offset
24762 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24763 Store = DAG.getStore(
24764 Op.getOperand(0), DL,
24765 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24766 MachinePointerInfo(SV, 4));
24767 MemOps.push_back(Store);
24768
24769 // Store ptr to overflow_arg_area
24770 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24771 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24772 Store =
24773 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24774 MemOps.push_back(Store);
24775
24776 // Store ptr to reg_save_area.
  // Pointer fields are 8 bytes on LP64 but 4 on X32, hence the varying step.
24777 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24778 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24779 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24780 Store = DAG.getStore(
24781 Op.getOperand(0), DL, RSFIN, FIN,
24782 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24783 MemOps.push_back(Store);
  // Tie the four independent stores into a single chain result.
24784 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24785}
24786
/// Lower VAARG for the 64-bit SysV ABI: emit an X86ISD::VAARG_64/VAARG_X32
/// node that computes the next argument's address, then load from it.
24787SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24788 assert(Subtarget.is64Bit() &&((void)0)
24789 "LowerVAARG only handles 64-bit va_arg!")((void)0);
24790 assert(Op.getNumOperands() == 4)((void)0);
24791
24792 MachineFunction &MF = DAG.getMachineFunction();
24793 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24794 // The Win64 ABI uses char* instead of a structure.
24795 return DAG.expandVAArg(Op.getNode());
24796
24797 SDValue Chain = Op.getOperand(0);
24798 SDValue SrcPtr = Op.getOperand(1);
24799 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24800 unsigned Align = Op.getConstantOperandVal(3);
24801 SDLoc dl(Op);
24802
24803 EVT ArgVT = Op.getNode()->getValueType(0);
24804 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24805 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24806 uint8_t ArgMode;
24807
24808 // Decide which area this value should be read from.
24809 // TODO: Implement the AMD64 ABI in its entirety. This simple
24810 // selection mechanism works only for the basic types.
24811 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented")((void)0);
24812 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24813 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
24814 } else {
24815 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&((void)0)
24816 "Unhandled argument type in LowerVAARG")((void)0);
24817 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
24818 }
24819
24820 if (ArgMode == 2) {
24821 // Sanity Check: Make sure using fp_offset makes sense.
24822 assert(!Subtarget.useSoftFloat() &&((void)0)
24823 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&((void)0)
24824 Subtarget.hasSSE1())((void)0);
24825 }
24826
24827 // Insert VAARG node into the DAG
24828 // VAARG returns two values: Variable Argument Address, Chain
  // The node both reads and writes the va_list (it advances the offsets),
  // hence MOLoad | MOStore below.
24829 SDValue InstOps[] = {Chain, SrcPtr,
24830 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24831 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24832 DAG.getTargetConstant(Align, dl, MVT::i32)};
24833 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24834 SDValue VAARG = DAG.getMemIntrinsicNode(
24835 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
24836 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24837 /*Alignment=*/None,
24838 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24839 Chain = VAARG.getValue(1);
24840
24841 // Load the next argument and return it
24842 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24843}
24844
24845static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24846 SelectionDAG &DAG) {
24847 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24848 // where a va_list is still an i8*.
24849 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!")((void)0);
24850 if (Subtarget.isCallingConvWin64(
24851 DAG.getMachineFunction().getFunction().getCallingConv()))
24852 // Probably a Win64 va_copy.
24853 return DAG.expandVACopy(Op.getNode());
24854
24855 SDValue Chain = Op.getOperand(0);
24856 SDValue DstPtr = Op.getOperand(1);
24857 SDValue SrcPtr = Op.getOperand(2);
24858 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24859 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24860 SDLoc DL(Op);
24861
  // Copy the whole va_list struct: 24 bytes on LP64, 16 on X32.
24862 return DAG.getMemcpy(
24863 Chain, DL, DstPtr, SrcPtr,
24864 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
24865 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
24866 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24867}
24868
24869// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
24870static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24871 switch (Opc) {
24872 case ISD::SHL:
24873 case X86ISD::VSHL:
24874 case X86ISD::VSHLI:
24875 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24876 case ISD::SRL:
24877 case X86ISD::VSRL:
24878 case X86ISD::VSRLI:
24879 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24880 case ISD::SRA:
24881 case X86ISD::VSRA:
24882 case X86ISD::VSRAI:
24883 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24884 }
24885 llvm_unreachable("Unknown target vector shift node")__builtin_unreachable();
24886}
24887
24888/// Handle vector element shifts where the shift amount is a constant.
24889/// Takes immediate version of shift as input.
24890static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24891 SDValue SrcOp, uint64_t ShiftAmt,
24892 SelectionDAG &DAG) {
24893 MVT ElementType = VT.getVectorElementType();
24894
24895 // Bitcast the source vector to the output type, this is mainly necessary for
24896 // vXi8/vXi64 shifts.
24897 if (VT != SrcOp.getSimpleValueType())
24898 SrcOp = DAG.getBitcast(VT, SrcOp);
24899
24900 // Fold this packed shift into its first operand if ShiftAmt is 0.
24901 if (ShiftAmt == 0)
24902 return SrcOp;
24903
24904 // Check for ShiftAmt >= element width
24905 if (ShiftAmt >= ElementType.getSizeInBits()) {
24906 if (Opc == X86ISD::VSRAI)
24907 ShiftAmt = ElementType.getSizeInBits() - 1;
24908 else
24909 return DAG.getConstant(0, dl, VT);
24910 }
24911
24912 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)((void)0)
24913 && "Unknown target vector shift-by-constant node")((void)0);
24914
24915 // Fold this packed vector shift into a build vector if SrcOp is a
24916 // vector of Constants or UNDEFs.
24917 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24918 SmallVector<SDValue, 8> Elts;
24919 unsigned NumElts = SrcOp->getNumOperands();
24920
24921 switch (Opc) {
24922 default: llvm_unreachable("Unknown opcode!")__builtin_unreachable();
24923 case X86ISD::VSHLI:
24924 for (unsigned i = 0; i != NumElts; ++i) {
24925 SDValue CurrentOp = SrcOp->getOperand(i);
24926 if (CurrentOp->isUndef()) {
24927 // Must produce 0s in the correct bits.
24928 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24929 continue;
24930 }
24931 auto *ND = cast<ConstantSDNode>(CurrentOp);
24932 const APInt &C = ND->getAPIntValue();
24933 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
24934 }
24935 break;
24936 case X86ISD::VSRLI:
24937 for (unsigned i = 0; i != NumElts; ++i) {
24938 SDValue CurrentOp = SrcOp->getOperand(i);
24939 if (CurrentOp->isUndef()) {
24940 // Must produce 0s in the correct bits.
24941 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24942 continue;
24943 }
24944 auto *ND = cast<ConstantSDNode>(CurrentOp);
24945 const APInt &C = ND->getAPIntValue();
24946 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
24947 }
24948 break;
24949 case X86ISD::VSRAI:
24950 for (unsigned i = 0; i != NumElts; ++i) {
24951 SDValue CurrentOp = SrcOp->getOperand(i);
24952 if (CurrentOp->isUndef()) {
24953 // All shifted in bits must be the same so use 0.
24954 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24955 continue;
24956 }
24957 auto *ND = cast<ConstantSDNode>(CurrentOp);
24958 const APInt &C = ND->getAPIntValue();
24959 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
24960 }
24961 break;
24962 }
24963
24964 return DAG.getBuildVector(VT, dl, Elts);
24965 }
24966
24967 return DAG.getNode(Opc, dl, VT, SrcOp,
24968 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24969}
24970
24971/// Handle vector element shifts where the shift amount may or may not be a
24972/// constant. Takes immediate version of shift as input.
24973static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24974 SDValue SrcOp, SDValue ShAmt,
24975 const X86Subtarget &Subtarget,
24976 SelectionDAG &DAG) {
24977 MVT SVT = ShAmt.getSimpleValueType();
24978 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!")((void)0);
24979
24980 // Catch shift-by-constant.
24981 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
24982 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
24983 CShAmt->getZExtValue(), DAG);
24984
24985 // Change opcode to non-immediate version.
24986 Opc = getTargetVShiftUniformOpcode(Opc, true);
24987
24988 // Need to build a vector containing shift amount.
24989 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
24990 // +====================+============+=======================================+
24991 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
24992 // +====================+============+=======================================+
24993 // | i64 | Yes, No | Use ShAmt as lowest elt |
24994 // | i32 | Yes | zero-extend in-reg |
24995 // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
24996 // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
24997 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
24998 // +====================+============+=======================================+
24999
25000 if (SVT == MVT::i64)
25001 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
25002 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
25003 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25004 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
25005 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
  // Peek through the zext of an extracted element and re-vectorize the
  // narrow element type directly.
25006 ShAmt = ShAmt.getOperand(0);
25007 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
25008 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
25009 if (Subtarget.hasSSE41())
25010 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25011 MVT::v2i64, ShAmt);
25012 else {
   // No SSE4.1: zero the upper lanes by shifting the whole register
   // left then right by (128 - elt size) bits, one byte at a time.
25013 SDValue ByteShift = DAG.getTargetConstant(
25014 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25015 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25016 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25017 ByteShift);
25018 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25019 ByteShift);
25020 }
25021 } else if (Subtarget.hasSSE41() &&
25022 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25023 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25024 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25025 MVT::v2i64, ShAmt);
25026 } else {
  // Fallback: materialize the amount as the low element of a v4i32
  // build_vector with the adjacent element zeroed.
25027 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
25028 DAG.getUNDEF(SVT)};
25029 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
25030 }
25031
25032 // The return type has to be a 128-bit type with the same element
25033 // type as the input type.
25034 MVT EltVT = VT.getVectorElementType();
25035 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25036
25037 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25038 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25039}
25040
25041/// Return Mask with the necessary casting or extending
25042/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25043static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25044 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25045 const SDLoc &dl) {
25046
  // Constant masks lower directly to constant vectors.
25047 if (isAllOnesConstant(Mask))
25048 return DAG.getConstant(1, dl, MaskVT);
25049 if (X86::isZeroNode(Mask))
25050 return DAG.getConstant(0, dl, MaskVT);
25051
25052 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!")((void)0);
25053
25054 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25055 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!")((void)0);
25056 assert(Subtarget.hasBWI() && "Expected AVX512BW target!")((void)0);
25057 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
25058 SDValue Lo, Hi;
25059 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25060 DAG.getConstant(0, dl, MVT::i32));
25061 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25062 DAG.getConstant(1, dl, MVT::i32));
25063
  // Reassemble the two i32 halves as two v32i1 halves of the v64i1 mask.
25064 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25065 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25066
25067 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25068 } else {
  // Bitcast the integer mask to a vXi1 vector of the same total width,
  // then take the low MaskVT lanes.
25069 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25070 Mask.getSimpleValueType().getSizeInBits());
25071 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
25072 // are extracted by EXTRACT_SUBVECTOR.
25073 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25074 DAG.getBitcast(BitcastVT, Mask),
25075 DAG.getIntPtrConstant(0, dl));
25076 }
25077}
25078
25079/// Return (and \p Op, \p Mask) for compare instructions or
25080/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25081/// necessary casting or extending for \p Mask when lowering masking intrinsics
25082static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25083 SDValue PreservedSrc,
25084 const X86Subtarget &Subtarget,
25085 SelectionDAG &DAG) {
25086 MVT VT = Op.getSimpleValueType();
25087 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25088 unsigned OpcodeSelect = ISD::VSELECT;
25089 SDLoc dl(Op);
25090
25091 if (isAllOnesConstant(Mask))
25092 return Op;
25093
25094 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25095
25096 if (PreservedSrc.isUndef())
25097 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25098 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25099}
25100
25101/// Creates an SDNode for a predicated scalar operation.
25102/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25103/// The mask is coming as MVT::i8 and it should be transformed
25104/// to MVT::v1i1 while lowering masking intrinsics.
25105/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25106/// "X86select" instead of "vselect". We just can't create the "vselect" node
25107/// for a scalar instruction.
25108static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25109 SDValue PreservedSrc,
25110 const X86Subtarget &Subtarget,
25111 SelectionDAG &DAG) {
25112
  // If bit 0 of a constant mask is set, the single lane is active and the
  // operation is effectively unmasked.
25113 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25114 if (MaskConst->getZExtValue() & 0x1)
25115 return Op;
25116
25117 MVT VT = Op.getSimpleValueType();
25118 SDLoc dl(Op);
25119
25120 assert(Mask.getValueType() == MVT::i8 && "Unexpect type")((void)0);
  // Reinterpret the i8 mask as v8i1 and keep only the lowest bit as v1i1.
25121 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25122 DAG.getBitcast(MVT::v8i1, Mask),
25123 DAG.getIntPtrConstant(0, dl));
  // FSETCCM / FSETCCM_SAE / VFPCLASSS results are combined with the mask via
  // an AND rather than a select.
25124 if (Op.getOpcode() == X86ISD::FSETCCM ||
25125 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25126 Op.getOpcode() == X86ISD::VFPCLASSS)
25127 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25128
  // Masked-off result comes from PreservedSrc; undef means zeroing.
25129 if (PreservedSrc.isUndef())
25130 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25131 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25132}
25133
25134static int getSEHRegistrationNodeSize(const Function *Fn) {
25135 if (!Fn->hasPersonalityFn())
25136 report_fatal_error(
25137 "querying registration node size for function without personality");
25138 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25139 // WinEHStatePass for the full struct definition.
25140 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25141 case EHPersonality::MSVC_X86SEH: return 24;
25142 case EHPersonality::MSVC_CXX: return 16;
25143 default: break;
25144 }
25145 report_fatal_error(
25146 "can only recover FP for 32-bit MSVC EH personality functions");
25147}
25148
25149/// When the MSVC runtime transfers control to us, either to an outlined
25150/// function or when returning to a parent frame after catching an exception, we
25151/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25152/// Here's the math:
25153/// RegNodeBase = EntryEBP - RegNodeSize
25154/// ParentFP = RegNodeBase - ParentFrameOffset
25155/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25156/// subtracting the offset (negative on x86) takes us back to the parent FP.
25157static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25158 SDValue EntryEBP) {
25159 MachineFunction &MF = DAG.getMachineFunction();
  // Default-constructed SDLoc: this code is synthesized, not tied to a
  // particular user source location.
25160 SDLoc dl;
25161
25162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25163 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25164
25165 // It's possible that the parent function no longer has a personality function
25166 // if the exceptional code was optimized away, in which case we just return
25167 // the incoming EBP.
25168 if (!Fn->hasPersonalityFn())
25169 return EntryEBP;
25170
25171 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25172 // registration, or the .set_setframe offset.
25173 MCSymbol *OffsetSym =
25174 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25175 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25176 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25177 SDValue ParentFrameOffset =
25178 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25179
25180 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25181 // prologue to RBP in the parent function.
25182 const X86Subtarget &Subtarget =
25183 static_cast<const X86Subtarget &>(DAG.getSubtarget());
25184 if (Subtarget.is64Bit())
25185 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25186
  // 32-bit path: apply the arithmetic described in the function comment.
25187 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25188 // RegNodeBase = EntryEBP - RegNodeSize
25189 // ParentFP = RegNodeBase - ParentFrameOffset
25190 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25191 DAG.getConstant(RegNodeSize, dl, PtrVT));
25192 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25193}
25194
25195SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25196 SelectionDAG &DAG) const {
25197 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25198 auto isRoundModeCurDirection = [](SDValue Rnd) {
25199 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25200 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25201
25202 return false;
25203 };
25204 auto isRoundModeSAE = [](SDValue Rnd) {
25205 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25206 unsigned RC = C->getZExtValue();
25207 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25208 // Clear the NO_EXC bit and check remaining bits.
25209 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25210 // As a convenience we allow no other bits or explicitly
25211 // current direction.
25212 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25213 }
25214 }
25215
25216 return false;
25217 };
25218 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25219 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25220 RC = C->getZExtValue();
25221 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25222 // Clear the NO_EXC bit and check remaining bits.
25223 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25224 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25225 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25226 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25227 RC == X86::STATIC_ROUNDING::TO_ZERO;
25228 }
25229 }
25230
25231 return false;
25232 };
25233
25234 SDLoc dl(Op);
25235 unsigned IntNo = Op.getConstantOperandVal(0);
25236 MVT VT = Op.getSimpleValueType();
25237 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25238
25239 // Propagate flags from original node to transformed node(s).
25240 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25241
25242 if (IntrData) {
25243 switch(IntrData->Type) {
25244 case INTR_TYPE_1OP: {
25245 // We specify 2 possible opcodes for intrinsics with rounding modes.
25246 // First, we check if the intrinsic may have non-default rounding mode,
25247 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25248 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25249 if (IntrWithRoundingModeOpcode != 0) {
25250 SDValue Rnd = Op.getOperand(2);
25251 unsigned RC = 0;
25252 if (isRoundModeSAEToX(Rnd, RC))
25253 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25254 Op.getOperand(1),
25255 DAG.getTargetConstant(RC, dl, MVT::i32));
25256 if (!isRoundModeCurDirection(Rnd))
25257 return SDValue();
25258 }
25259 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25260 Op.getOperand(1));
25261 }
25262 case INTR_TYPE_1OP_SAE: {
25263 SDValue Sae = Op.getOperand(2);
25264
25265 unsigned Opc;
25266 if (isRoundModeCurDirection(Sae))
25267 Opc = IntrData->Opc0;
25268 else if (isRoundModeSAE(Sae))
25269 Opc = IntrData->Opc1;
25270 else
25271 return SDValue();
25272
25273 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25274 }
25275 case INTR_TYPE_2OP: {
25276 SDValue Src2 = Op.getOperand(2);
25277
25278 // We specify 2 possible opcodes for intrinsics with rounding modes.
25279 // First, we check if the intrinsic may have non-default rounding mode,
25280 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25281 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25282 if (IntrWithRoundingModeOpcode != 0) {
25283 SDValue Rnd = Op.getOperand(3);
25284 unsigned RC = 0;
25285 if (isRoundModeSAEToX(Rnd, RC))
25286 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25287 Op.getOperand(1), Src2,
25288 DAG.getTargetConstant(RC, dl, MVT::i32));
25289 if (!isRoundModeCurDirection(Rnd))
25290 return SDValue();
25291 }
25292
25293 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25294 Op.getOperand(1), Src2);
25295 }
25296 case INTR_TYPE_2OP_SAE: {
25297 SDValue Sae = Op.getOperand(3);
25298
25299 unsigned Opc;
25300 if (isRoundModeCurDirection(Sae))
25301 Opc = IntrData->Opc0;
25302 else if (isRoundModeSAE(Sae))
25303 Opc = IntrData->Opc1;
25304 else
25305 return SDValue();
25306
25307 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25308 Op.getOperand(2));
25309 }
25310 case INTR_TYPE_3OP:
25311 case INTR_TYPE_3OP_IMM8: {
25312 SDValue Src1 = Op.getOperand(1);
25313 SDValue Src2 = Op.getOperand(2);
25314 SDValue Src3 = Op.getOperand(3);
25315
25316 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25317 Src3.getValueType() != MVT::i8) {
25318 Src3 = DAG.getTargetConstant(
25319 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25320 }
25321
25322 // We specify 2 possible opcodes for intrinsics with rounding modes.
25323 // First, we check if the intrinsic may have non-default rounding mode,
25324 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25325 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25326 if (IntrWithRoundingModeOpcode != 0) {
25327 SDValue Rnd = Op.getOperand(4);
25328 unsigned RC = 0;
25329 if (isRoundModeSAEToX(Rnd, RC))
25330 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25331 Src1, Src2, Src3,
25332 DAG.getTargetConstant(RC, dl, MVT::i32));
25333 if (!isRoundModeCurDirection(Rnd))
25334 return SDValue();
25335 }
25336
25337 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25338 {Src1, Src2, Src3});
25339 }
25340 case INTR_TYPE_4OP_IMM8: {
25341 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant)((void)0);
25342 SDValue Src4 = Op.getOperand(4);
25343 if (Src4.getValueType() != MVT::i8) {
25344 Src4 = DAG.getTargetConstant(
25345 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25346 }
25347
25348 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25349 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25350 Src4);
25351 }
25352 case INTR_TYPE_1OP_MASK: {
25353 SDValue Src = Op.getOperand(1);
25354 SDValue PassThru = Op.getOperand(2);
25355 SDValue Mask = Op.getOperand(3);
25356 // We add rounding mode to the Node when
25357 // - RC Opcode is specified and
25358 // - RC is not "current direction".
25359 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25360 if (IntrWithRoundingModeOpcode != 0) {
25361 SDValue Rnd = Op.getOperand(4);
25362 unsigned RC = 0;
25363 if (isRoundModeSAEToX(Rnd, RC))
25364 return getVectorMaskingNode(
25365 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25366 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25367 Mask, PassThru, Subtarget, DAG);
25368 if (!isRoundModeCurDirection(Rnd))
25369 return SDValue();
25370 }
25371 return getVectorMaskingNode(
25372 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25373 Subtarget, DAG);
25374 }
25375 case INTR_TYPE_1OP_MASK_SAE: {
25376 SDValue Src = Op.getOperand(1);
25377 SDValue PassThru = Op.getOperand(2);
25378 SDValue Mask = Op.getOperand(3);
25379 SDValue Rnd = Op.getOperand(4);
25380
25381 unsigned Opc;
25382 if (isRoundModeCurDirection(Rnd))
25383 Opc = IntrData->Opc0;
25384 else if (isRoundModeSAE(Rnd))
25385 Opc = IntrData->Opc1;
25386 else
25387 return SDValue();
25388
25389 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25390 Subtarget, DAG);
25391 }
25392 case INTR_TYPE_SCALAR_MASK: {
25393 SDValue Src1 = Op.getOperand(1);
25394 SDValue Src2 = Op.getOperand(2);
25395 SDValue passThru = Op.getOperand(3);
25396 SDValue Mask = Op.getOperand(4);
25397 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25398 // There are 2 kinds of intrinsics in this group:
25399 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
25400 // (2) With rounding mode and sae - 7 operands.
25401 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25402 if (Op.getNumOperands() == (5U + HasRounding)) {
25403 if (HasRounding) {
25404 SDValue Rnd = Op.getOperand(5);
25405 unsigned RC = 0;
25406 if (isRoundModeSAEToX(Rnd, RC))
25407 return getScalarMaskingNode(
25408 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25409 DAG.getTargetConstant(RC, dl, MVT::i32)),
25410 Mask, passThru, Subtarget, DAG);
25411 if (!isRoundModeCurDirection(Rnd))
25412 return SDValue();
25413 }
25414 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25415 Src2),
25416 Mask, passThru, Subtarget, DAG);
25417 }
25418
25419 assert(Op.getNumOperands() == (6U + HasRounding) &&((void)0)
25420 "Unexpected intrinsic form")((void)0);
25421 SDValue RoundingMode = Op.getOperand(5);
25422 unsigned Opc = IntrData->Opc0;
25423 if (HasRounding) {
25424 SDValue Sae = Op.getOperand(6);
25425 if (isRoundModeSAE(Sae))
25426 Opc = IntrWithRoundingModeOpcode;
25427 else if (!isRoundModeCurDirection(Sae))
25428 return SDValue();
25429 }
25430 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25431 Src2, RoundingMode),
25432 Mask, passThru, Subtarget, DAG);
25433 }
25434 case INTR_TYPE_SCALAR_MASK_RND: {
25435 SDValue Src1 = Op.getOperand(1);
25436 SDValue Src2 = Op.getOperand(2);
25437 SDValue passThru = Op.getOperand(3);
25438 SDValue Mask = Op.getOperand(4);
25439 SDValue Rnd = Op.getOperand(5);
25440
25441 SDValue NewOp;
25442 unsigned RC = 0;
25443 if (isRoundModeCurDirection(Rnd))
25444 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25445 else if (isRoundModeSAEToX(Rnd, RC))
25446 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25447 DAG.getTargetConstant(RC, dl, MVT::i32));
25448 else
25449 return SDValue();
25450
25451 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25452 }
25453 case INTR_TYPE_SCALAR_MASK_SAE: {
25454 SDValue Src1 = Op.getOperand(1);
25455 SDValue Src2 = Op.getOperand(2);
25456 SDValue passThru = Op.getOperand(3);
25457 SDValue Mask = Op.getOperand(4);
25458 SDValue Sae = Op.getOperand(5);
25459 unsigned Opc;
25460 if (isRoundModeCurDirection(Sae))
25461 Opc = IntrData->Opc0;
25462 else if (isRoundModeSAE(Sae))
25463 Opc = IntrData->Opc1;
25464 else
25465 return SDValue();
25466
25467 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25468 Mask, passThru, Subtarget, DAG);
25469 }
25470 case INTR_TYPE_2OP_MASK: {
25471 SDValue Src1 = Op.getOperand(1);
25472 SDValue Src2 = Op.getOperand(2);
25473 SDValue PassThru = Op.getOperand(3);
25474 SDValue Mask = Op.getOperand(4);
25475 SDValue NewOp;
25476 if (IntrData->Opc1 != 0) {
25477 SDValue Rnd = Op.getOperand(5);
25478 unsigned RC = 0;
25479 if (isRoundModeSAEToX(Rnd, RC))
25480 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25481 DAG.getTargetConstant(RC, dl, MVT::i32));
25482 else if (!isRoundModeCurDirection(Rnd))
25483 return SDValue();
25484 }
25485 if (!NewOp)
25486 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25487 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25488 }
25489 case INTR_TYPE_2OP_MASK_SAE: {
25490 SDValue Src1 = Op.getOperand(1);
25491 SDValue Src2 = Op.getOperand(2);
25492 SDValue PassThru = Op.getOperand(3);
25493 SDValue Mask = Op.getOperand(4);
25494
25495 unsigned Opc = IntrData->Opc0;
25496 if (IntrData->Opc1 != 0) {
25497 SDValue Sae = Op.getOperand(5);
25498 if (isRoundModeSAE(Sae))
25499 Opc = IntrData->Opc1;
25500 else if (!isRoundModeCurDirection(Sae))
25501 return SDValue();
25502 }
25503
25504 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25505 Mask, PassThru, Subtarget, DAG);
25506 }
25507 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25508 SDValue Src1 = Op.getOperand(1);
25509 SDValue Src2 = Op.getOperand(2);
25510 SDValue Src3 = Op.getOperand(3);
25511 SDValue PassThru = Op.getOperand(4);
25512 SDValue Mask = Op.getOperand(5);
25513 SDValue Sae = Op.getOperand(6);
25514 unsigned Opc;
25515 if (isRoundModeCurDirection(Sae))
25516 Opc = IntrData->Opc0;
25517 else if (isRoundModeSAE(Sae))
25518 Opc = IntrData->Opc1;
25519 else
25520 return SDValue();
25521
25522 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25523 Mask, PassThru, Subtarget, DAG);
25524 }
25525 case INTR_TYPE_3OP_MASK_SAE: {
25526 SDValue Src1 = Op.getOperand(1);
25527 SDValue Src2 = Op.getOperand(2);
25528 SDValue Src3 = Op.getOperand(3);
25529 SDValue PassThru = Op.getOperand(4);
25530 SDValue Mask = Op.getOperand(5);
25531
25532 unsigned Opc = IntrData->Opc0;
25533 if (IntrData->Opc1 != 0) {
25534 SDValue Sae = Op.getOperand(6);
25535 if (isRoundModeSAE(Sae))
25536 Opc = IntrData->Opc1;
25537 else if (!isRoundModeCurDirection(Sae))
25538 return SDValue();
25539 }
25540 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25541 Mask, PassThru, Subtarget, DAG);
25542 }
25543 case BLENDV: {
25544 SDValue Src1 = Op.getOperand(1);
25545 SDValue Src2 = Op.getOperand(2);
25546 SDValue Src3 = Op.getOperand(3);
25547
25548 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25549 Src3 = DAG.getBitcast(MaskVT, Src3);
25550
25551 // Reverse the operands to match VSELECT order.
25552 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25553 }
25554 case VPERM_2OP : {
25555 SDValue Src1 = Op.getOperand(1);
25556 SDValue Src2 = Op.getOperand(2);
25557
25558 // Swap Src1 and Src2 in the node creation
25559 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25560 }
25561 case IFMA_OP:
25562 // NOTE: We need to swizzle the operands to pass the multiply operands
25563 // first.
25564 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25565 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25566 case FPCLASSS: {
25567 SDValue Src1 = Op.getOperand(1);
25568 SDValue Imm = Op.getOperand(2);
25569 SDValue Mask = Op.getOperand(3);
25570 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25571 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25572 Subtarget, DAG);
25573 // Need to fill with zeros to ensure the bitcast will produce zeroes
25574 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25575 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25576 DAG.getConstant(0, dl, MVT::v8i1),
25577 FPclassMask, DAG.getIntPtrConstant(0, dl));
25578 return DAG.getBitcast(MVT::i8, Ins);
25579 }
25580
25581 case CMP_MASK_CC: {
25582 MVT MaskVT = Op.getSimpleValueType();
25583 SDValue CC = Op.getOperand(3);
25584 SDValue Mask = Op.getOperand(4);
25585 // We specify 2 possible opcodes for intrinsics with rounding modes.
25586 // First, we check if the intrinsic may have non-default rounding mode,
25587 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25588 if (IntrData->Opc1 != 0) {
25589 SDValue Sae = Op.getOperand(5);
25590 if (isRoundModeSAE(Sae))
25591 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25592 Op.getOperand(2), CC, Mask, Sae);
25593 if (!isRoundModeCurDirection(Sae))
25594 return SDValue();
25595 }
25596 //default rounding mode
25597 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25598 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25599 }
25600 case CMP_MASK_SCALAR_CC: {
25601 SDValue Src1 = Op.getOperand(1);
25602 SDValue Src2 = Op.getOperand(2);
25603 SDValue CC = Op.getOperand(3);
25604 SDValue Mask = Op.getOperand(4);
25605
25606 SDValue Cmp;
25607 if (IntrData->Opc1 != 0) {
25608 SDValue Sae = Op.getOperand(5);
25609 if (isRoundModeSAE(Sae))
25610 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25611 else if (!isRoundModeCurDirection(Sae))
25612 return SDValue();
25613 }
25614 //default rounding mode
25615 if (!Cmp.getNode())
25616 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25617
25618 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25619 Subtarget, DAG);
25620 // Need to fill with zeros to ensure the bitcast will produce zeroes
25621 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25622 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25623 DAG.getConstant(0, dl, MVT::v8i1),
25624 CmpMask, DAG.getIntPtrConstant(0, dl));
25625 return DAG.getBitcast(MVT::i8, Ins);
25626 }
25627 case COMI: { // Comparison intrinsics
25628 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25629 SDValue LHS = Op.getOperand(1);
25630 SDValue RHS = Op.getOperand(2);
25631 // Some conditions require the operands to be swapped.
25632 if (CC == ISD::SETLT || CC == ISD::SETLE)
25633 std::swap(LHS, RHS);
25634
25635 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25636 SDValue SetCC;
25637 switch (CC) {
25638 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25639 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25640 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25641 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25642 break;
25643 }
25644 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25645 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25646 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25647 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25648 break;
25649 }
25650 case ISD::SETGT: // (CF = 0 and ZF = 0)
25651 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25652 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25653 break;
25654 }
25655 case ISD::SETGE: // CF = 0
25656 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25657 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25658 break;
25659 default:
25660 llvm_unreachable("Unexpected illegal condition!")__builtin_unreachable();
25661 }
25662 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25663 }
25664 case COMI_RM: { // Comparison intrinsics with Sae
25665 SDValue LHS = Op.getOperand(1);
25666 SDValue RHS = Op.getOperand(2);
25667 unsigned CondVal = Op.getConstantOperandVal(3);
25668 SDValue Sae = Op.getOperand(4);
25669
25670 SDValue FCmp;
25671 if (isRoundModeCurDirection(Sae))
25672 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25673 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25674 else if (isRoundModeSAE(Sae))
25675 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25676 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25677 else
25678 return SDValue();
25679 // Need to fill with zeros to ensure the bitcast will produce zeroes
25680 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25681 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25682 DAG.getConstant(0, dl, MVT::v16i1),
25683 FCmp, DAG.getIntPtrConstant(0, dl));
25684 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25685 DAG.getBitcast(MVT::i16, Ins));
25686 }
25687 case VSHIFT:
25688 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25689 Op.getOperand(1), Op.getOperand(2), Subtarget,
25690 DAG);
25691 case COMPRESS_EXPAND_IN_REG: {
25692 SDValue Mask = Op.getOperand(3);
25693 SDValue DataToCompress = Op.getOperand(1);
25694 SDValue PassThru = Op.getOperand(2);
25695 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25696 return Op.getOperand(1);
25697
25698 // Avoid false dependency.
25699 if (PassThru.isUndef())
25700 PassThru = DAG.getConstant(0, dl, VT);
25701
25702 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25703 Mask);
25704 }
25705 case FIXUPIMM:
25706 case FIXUPIMM_MASKZ: {
25707 SDValue Src1 = Op.getOperand(1);
25708 SDValue Src2 = Op.getOperand(2);
25709 SDValue Src3 = Op.getOperand(3);
25710 SDValue Imm = Op.getOperand(4);
25711 SDValue Mask = Op.getOperand(5);
25712 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25713 ? Src1
25714 : getZeroVector(VT, Subtarget, DAG, dl);
25715
25716 unsigned Opc = IntrData->Opc0;
25717 if (IntrData->Opc1 != 0) {
25718 SDValue Sae = Op.getOperand(6);
25719 if (isRoundModeSAE(Sae))
25720 Opc = IntrData->Opc1;
25721 else if (!isRoundModeCurDirection(Sae))
25722 return SDValue();
25723 }
25724
25725 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25726
25727 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25728 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25729
25730 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25731 }
25732 case ROUNDP: {
25733 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode")((void)0);
25734 // Clear the upper bits of the rounding immediate so that the legacy
25735 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25736 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25737 SDValue RoundingMode =
25738 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25739 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25740 Op.getOperand(1), RoundingMode);
25741 }
25742 case ROUNDS: {
25743 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode")((void)0);
25744 // Clear the upper bits of the rounding immediate so that the legacy
25745 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25746 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25747 SDValue RoundingMode =
25748 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25749 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25750 Op.getOperand(1), Op.getOperand(2), RoundingMode);
25751 }
25752 case BEXTRI: {
25753 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode")((void)0);
25754
25755 uint64_t Imm = Op.getConstantOperandVal(2);
25756 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25757 Op.getValueType());
25758 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25759 Op.getOperand(1), Control);
25760 }
25761 // ADC/ADCX/SBB
25762 case ADX: {
25763 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25764 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25765
25766 SDValue Res;
25767 // If the carry in is zero, then we should just use ADD/SUB instead of
25768 // ADC/SBB.
25769 if (isNullConstant(Op.getOperand(1))) {
25770 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25771 Op.getOperand(3));
25772 } else {
25773 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25774 DAG.getConstant(-1, dl, MVT::i8));
25775 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25776 Op.getOperand(3), GenCF.getValue(1));
25777 }
25778 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25779 SDValue Results[] = { SetCC, Res };
25780 return DAG.getMergeValues(Results, dl);
25781 }
25782 case CVTPD2PS_MASK:
25783 case CVTPD2DQ_MASK:
25784 case CVTQQ2PS_MASK:
25785 case TRUNCATE_TO_REG: {
25786 SDValue Src = Op.getOperand(1);
25787 SDValue PassThru = Op.getOperand(2);
25788 SDValue Mask = Op.getOperand(3);
25789
25790 if (isAllOnesConstant(Mask))
25791 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25792
25793 MVT SrcVT = Src.getSimpleValueType();
25794 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25795 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25796 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25797 {Src, PassThru, Mask});
25798 }
25799 case CVTPS2PH_MASK: {
25800 SDValue Src = Op.getOperand(1);
25801 SDValue Rnd = Op.getOperand(2);
25802 SDValue PassThru = Op.getOperand(3);
25803 SDValue Mask = Op.getOperand(4);
25804
25805 if (isAllOnesConstant(Mask))
25806 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
25807
25808 MVT SrcVT = Src.getSimpleValueType();
25809 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25810 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25811 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
25812 PassThru, Mask);
25813
25814 }
25815 case CVTNEPS2BF16_MASK: {
25816 SDValue Src = Op.getOperand(1);
25817 SDValue PassThru = Op.getOperand(2);
25818 SDValue Mask = Op.getOperand(3);
25819
25820 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25821 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25822
25823 // Break false dependency.
25824 if (PassThru.isUndef())
25825 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25826
25827 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25828 Mask);
25829 }
25830 default:
25831 break;
25832 }
25833 }
25834
25835 switch (IntNo) {
25836 default: return SDValue(); // Don't custom lower most intrinsics.
25837
25838 // ptest and testp intrinsics. The intrinsic these come from are designed to
25839 // return an integer value, not just an instruction so lower it to the ptest
25840 // or testp pattern and a setcc for the result.
25841 case Intrinsic::x86_avx512_ktestc_b:
25842 case Intrinsic::x86_avx512_ktestc_w:
25843 case Intrinsic::x86_avx512_ktestc_d:
25844 case Intrinsic::x86_avx512_ktestc_q:
25845 case Intrinsic::x86_avx512_ktestz_b:
25846 case Intrinsic::x86_avx512_ktestz_w:
25847 case Intrinsic::x86_avx512_ktestz_d:
25848 case Intrinsic::x86_avx512_ktestz_q:
25849 case Intrinsic::x86_sse41_ptestz:
25850 case Intrinsic::x86_sse41_ptestc:
25851 case Intrinsic::x86_sse41_ptestnzc:
25852 case Intrinsic::x86_avx_ptestz_256:
25853 case Intrinsic::x86_avx_ptestc_256:
25854 case Intrinsic::x86_avx_ptestnzc_256:
25855 case Intrinsic::x86_avx_vtestz_ps:
25856 case Intrinsic::x86_avx_vtestc_ps:
25857 case Intrinsic::x86_avx_vtestnzc_ps:
25858 case Intrinsic::x86_avx_vtestz_pd:
25859 case Intrinsic::x86_avx_vtestc_pd:
25860 case Intrinsic::x86_avx_vtestnzc_pd:
25861 case Intrinsic::x86_avx_vtestz_ps_256:
25862 case Intrinsic::x86_avx_vtestc_ps_256:
25863 case Intrinsic::x86_avx_vtestnzc_ps_256:
25864 case Intrinsic::x86_avx_vtestz_pd_256:
25865 case Intrinsic::x86_avx_vtestc_pd_256:
25866 case Intrinsic::x86_avx_vtestnzc_pd_256: {
25867 unsigned TestOpc = X86ISD::PTEST;
25868 X86::CondCode X86CC;
25869 switch (IntNo) {
25870 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")__builtin_unreachable();
25871 case Intrinsic::x86_avx512_ktestc_b:
25872 case Intrinsic::x86_avx512_ktestc_w:
25873 case Intrinsic::x86_avx512_ktestc_d:
25874 case Intrinsic::x86_avx512_ktestc_q:
25875 // CF = 1
25876 TestOpc = X86ISD::KTEST;
25877 X86CC = X86::COND_B;
25878 break;
25879 case Intrinsic::x86_avx512_ktestz_b:
25880 case Intrinsic::x86_avx512_ktestz_w:
25881 case Intrinsic::x86_avx512_ktestz_d:
25882 case Intrinsic::x86_avx512_ktestz_q:
25883 TestOpc = X86ISD::KTEST;
25884 X86CC = X86::COND_E;
25885 break;
25886 case Intrinsic::x86_avx_vtestz_ps:
25887 case Intrinsic::x86_avx_vtestz_pd:
25888 case Intrinsic::x86_avx_vtestz_ps_256:
25889 case Intrinsic::x86_avx_vtestz_pd_256:
25890 TestOpc = X86ISD::TESTP;
25891 LLVM_FALLTHROUGH[[gnu::fallthrough]];
25892 case Intrinsic::x86_sse41_ptestz:
25893 case Intrinsic::x86_avx_ptestz_256:
25894 // ZF = 1
25895 X86CC = X86::COND_E;
25896 break;
25897 case Intrinsic::x86_avx_vtestc_ps:
25898 case Intrinsic::x86_avx_vtestc_pd:
25899 case Intrinsic::x86_avx_vtestc_ps_256:
25900 case Intrinsic::x86_avx_vtestc_pd_256:
25901 TestOpc = X86ISD::TESTP;
25902 LLVM_FALLTHROUGH[[gnu::fallthrough]];
25903 case Intrinsic::x86_sse41_ptestc:
25904 case Intrinsic::x86_avx_ptestc_256:
25905 // CF = 1
25906 X86CC = X86::COND_B;
25907 break;
25908 case Intrinsic::x86_avx_vtestnzc_ps:
25909 case Intrinsic::x86_avx_vtestnzc_pd:
25910 case Intrinsic::x86_avx_vtestnzc_ps_256:
25911 case Intrinsic::x86_avx_vtestnzc_pd_256:
25912 TestOpc = X86ISD::TESTP;
25913 LLVM_FALLTHROUGH[[gnu::fallthrough]];
25914 case Intrinsic::x86_sse41_ptestnzc:
25915 case Intrinsic::x86_avx_ptestnzc_256:
25916 // ZF and CF = 0
25917 X86CC = X86::COND_A;
25918 break;
25919 }
25920
25921 SDValue LHS = Op.getOperand(1);
25922 SDValue RHS = Op.getOperand(2);
25923 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25924 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25925 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25926 }
25927
25928 case Intrinsic::x86_sse42_pcmpistria128:
25929 case Intrinsic::x86_sse42_pcmpestria128:
25930 case Intrinsic::x86_sse42_pcmpistric128:
25931 case Intrinsic::x86_sse42_pcmpestric128:
25932 case Intrinsic::x86_sse42_pcmpistrio128:
25933 case Intrinsic::x86_sse42_pcmpestrio128:
25934 case Intrinsic::x86_sse42_pcmpistris128:
25935 case Intrinsic::x86_sse42_pcmpestris128:
25936 case Intrinsic::x86_sse42_pcmpistriz128:
25937 case Intrinsic::x86_sse42_pcmpestriz128: {
25938 unsigned Opcode;
25939 X86::CondCode X86CC;
25940 switch (IntNo) {
25941 default: llvm_unreachable("Impossible intrinsic")__builtin_unreachable(); // Can't reach here.
25942 case Intrinsic::x86_sse42_pcmpistria128:
25943 Opcode = X86ISD::PCMPISTR;
25944 X86CC = X86::COND_A;
25945 break;
25946 case Intrinsic::x86_sse42_pcmpestria128:
25947 Opcode = X86ISD::PCMPESTR;
25948 X86CC = X86::COND_A;
25949 break;
25950 case Intrinsic::x86_sse42_pcmpistric128:
25951 Opcode = X86ISD::PCMPISTR;
25952 X86CC = X86::COND_B;
25953 break;
25954 case Intrinsic::x86_sse42_pcmpestric128:
25955 Opcode = X86ISD::PCMPESTR;
25956 X86CC = X86::COND_B;
25957 break;
25958 case Intrinsic::x86_sse42_pcmpistrio128:
25959 Opcode = X86ISD::PCMPISTR;
25960 X86CC = X86::COND_O;
25961 break;
25962 case Intrinsic::x86_sse42_pcmpestrio128:
25963 Opcode = X86ISD::PCMPESTR;
25964 X86CC = X86::COND_O;
25965 break;
25966 case Intrinsic::x86_sse42_pcmpistris128:
25967 Opcode = X86ISD::PCMPISTR;
25968 X86CC = X86::COND_S;
25969 break;
25970 case Intrinsic::x86_sse42_pcmpestris128:
25971 Opcode = X86ISD::PCMPESTR;
25972 X86CC = X86::COND_S;
25973 break;
25974 case Intrinsic::x86_sse42_pcmpistriz128:
25975 Opcode = X86ISD::PCMPISTR;
25976 X86CC = X86::COND_E;
25977 break;
25978 case Intrinsic::x86_sse42_pcmpestriz128:
25979 Opcode = X86ISD::PCMPESTR;
25980 X86CC = X86::COND_E;
25981 break;
25982 }
25983 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25984 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25985 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25986 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25987 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25988 }
25989
25990 case Intrinsic::x86_sse42_pcmpistri128:
25991 case Intrinsic::x86_sse42_pcmpestri128: {
25992 unsigned Opcode;
25993 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
25994 Opcode = X86ISD::PCMPISTR;
25995 else
25996 Opcode = X86ISD::PCMPESTR;
25997
25998 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25999 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26000 return DAG.getNode(Opcode, dl, VTs, NewOps);
26001 }
26002
26003 case Intrinsic::x86_sse42_pcmpistrm128:
26004 case Intrinsic::x86_sse42_pcmpestrm128: {
26005 unsigned Opcode;
26006 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26007 Opcode = X86ISD::PCMPISTR;
26008 else
26009 Opcode = X86ISD::PCMPESTR;
26010
26011 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26012 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26013 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26014 }
26015
26016 case Intrinsic::eh_sjlj_lsda: {
26017 MachineFunction &MF = DAG.getMachineFunction();
26018 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26019 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26020 auto &Context = MF.getMMI().getContext();
26021 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26022 Twine(MF.getFunctionNumber()));
26023 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26024 DAG.getMCSymbol(S, PtrVT));
26025 }
26026
26027 case Intrinsic::x86_seh_lsda: {
26028 // Compute the symbol for the LSDA. We know it'll get emitted later.
26029 MachineFunction &MF = DAG.getMachineFunction();
26030 SDValue Op1 = Op.getOperand(1);
26031 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26032 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26033 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26034
26035 // Generate a simple absolute symbol reference. This intrinsic is only
26036 // supported on 32-bit Windows, which isn't PIC.
26037 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26038 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26039 }
26040
26041 case Intrinsic::eh_recoverfp: {
26042 SDValue FnOp = Op.getOperand(1);
26043 SDValue IncomingFPOp = Op.getOperand(2);
26044 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26045 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26046 if (!Fn)
26047 report_fatal_error(
26048 "llvm.eh.recoverfp must take a function as the first argument");
26049 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26050 }
26051
26052 case Intrinsic::localaddress: {
26053 // Returns one of the stack, base, or frame pointer registers, depending on
26054 // which is used to reference local variables.
26055 MachineFunction &MF = DAG.getMachineFunction();
26056 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26057 unsigned Reg;
26058 if (RegInfo->hasBasePointer(MF))
26059 Reg = RegInfo->getBaseRegister();
26060 else { // Handles the SP or FP case.
26061 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26062 if (CantUseFP)
26063 Reg = RegInfo->getPtrSizedStackRegister(MF);
26064 else
26065 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26066 }
26067 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26068 }
26069 case Intrinsic::swift_async_context_addr: {
26070 auto &MF = DAG.getMachineFunction();
26071 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26072 if (Subtarget.is64Bit()) {
26073 MF.getFrameInfo().setFrameAddressIsTaken(true);
26074 X86FI->setHasSwiftAsyncContext(true);
26075 return SDValue(
26076 DAG.getMachineNode(
26077 X86::SUB64ri8, dl, MVT::i64,
26078 DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
26079 DAG.getTargetConstant(8, dl, MVT::i32)),
26080 0);
26081 } else {
26082 // 32-bit so no special extended frame, create or reuse an existing stack
26083 // slot.
26084 if (!X86FI->getSwiftAsyncContextFrameIdx())
26085 X86FI->setSwiftAsyncContextFrameIdx(
26086 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26087 return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26088 }
26089 }
26090 case Intrinsic::x86_avx512_vp2intersect_q_512:
26091 case Intrinsic::x86_avx512_vp2intersect_q_256:
26092 case Intrinsic::x86_avx512_vp2intersect_q_128:
26093 case Intrinsic::x86_avx512_vp2intersect_d_512:
26094 case Intrinsic::x86_avx512_vp2intersect_d_256:
26095 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26096 MVT MaskVT = Op.getSimpleValueType();
26097
26098 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26099 SDLoc DL(Op);
26100
26101 SDValue Operation =
26102 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26103 Op->getOperand(1), Op->getOperand(2));
26104
26105 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26106 MaskVT, Operation);
26107 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26108 MaskVT, Operation);
26109 return DAG.getMergeValues({Result0, Result1}, DL);
26110 }
26111 case Intrinsic::x86_mmx_pslli_w:
26112 case Intrinsic::x86_mmx_pslli_d:
26113 case Intrinsic::x86_mmx_pslli_q:
26114 case Intrinsic::x86_mmx_psrli_w:
26115 case Intrinsic::x86_mmx_psrli_d:
26116 case Intrinsic::x86_mmx_psrli_q:
26117 case Intrinsic::x86_mmx_psrai_w:
26118 case Intrinsic::x86_mmx_psrai_d: {
26119 SDLoc DL(Op);
26120 SDValue ShAmt = Op.getOperand(2);
26121 // If the argument is a constant, convert it to a target constant.
26122 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26123 // Clamp out of bounds shift amounts since they will otherwise be masked
26124 // to 8-bits which may make it no longer out of bounds.
26125 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26126 if (ShiftAmount == 0)
26127 return Op.getOperand(1);
26128
26129 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26130 Op.getOperand(0), Op.getOperand(1),
26131 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26132 }
26133
26134 unsigned NewIntrinsic;
26135 switch (IntNo) {
26136 default: llvm_unreachable("Impossible intrinsic")__builtin_unreachable(); // Can't reach here.
26137 case Intrinsic::x86_mmx_pslli_w:
26138 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26139 break;
26140 case Intrinsic::x86_mmx_pslli_d:
26141 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26142 break;
26143 case Intrinsic::x86_mmx_pslli_q:
26144 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26145 break;
26146 case Intrinsic::x86_mmx_psrli_w:
26147 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26148 break;
26149 case Intrinsic::x86_mmx_psrli_d:
26150 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26151 break;
26152 case Intrinsic::x86_mmx_psrli_q:
26153 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26154 break;
26155 case Intrinsic::x86_mmx_psrai_w:
26156 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26157 break;
26158 case Intrinsic::x86_mmx_psrai_d:
26159 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26160 break;
26161 }
26162
26163 // The vector shift intrinsics with scalars uses 32b shift amounts but
26164 // the sse2/mmx shift instructions reads 64 bits. Copy the 32 bits to an
26165 // MMX register.
26166 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26167 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26168 DAG.getTargetConstant(NewIntrinsic, DL,
26169 getPointerTy(DAG.getDataLayout())),
26170 Op.getOperand(1), ShAmt);
26171 }
26172 }
26173}
26174
26175static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26176 SDValue Src, SDValue Mask, SDValue Base,
26177 SDValue Index, SDValue ScaleOp, SDValue Chain,
26178 const X86Subtarget &Subtarget) {
26179 SDLoc dl(Op);
26180 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26181 // Scale must be constant.
26182 if (!C)
26183 return SDValue();
26184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26185 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26186 TLI.getPointerTy(DAG.getDataLayout()));
26187 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26188 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26189 // If source is undef or we know it won't be used, use a zero vector
26190 // to break register dependency.
26191 // TODO: use undef instead and let BreakFalseDeps deal with it?
26192 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26193 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26194
26195 // Cast mask to an integer type.
26196 Mask = DAG.getBitcast(MaskVT, Mask);
26197
26198 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26199
26200 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26201 SDValue Res =
26202 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26203 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26204 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26205}
26206
26207static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26208 SDValue Src, SDValue Mask, SDValue Base,
26209 SDValue Index, SDValue ScaleOp, SDValue Chain,
26210 const X86Subtarget &Subtarget) {
26211 MVT VT = Op.getSimpleValueType();
26212 SDLoc dl(Op);
26213 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26214 // Scale must be constant.
26215 if (!C)
26216 return SDValue();
26217 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26218 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26219 TLI.getPointerTy(DAG.getDataLayout()));
26220 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26221 VT.getVectorNumElements());
26222 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26223
26224 // We support two versions of the gather intrinsics. One with scalar mask and
26225 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26226 if (Mask.getValueType() != MaskVT)
26227 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26228
26229 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26230 // If source is undef or we know it won't be used, use a zero vector
26231 // to break register dependency.
26232 // TODO: use undef instead and let BreakFalseDeps deal with it?
26233 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26234 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26235
26236 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26237
26238 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26239 SDValue Res =
26240 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26241 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26242 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26243}
26244
26245static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26246 SDValue Src, SDValue Mask, SDValue Base,
26247 SDValue Index, SDValue ScaleOp, SDValue Chain,
26248 const X86Subtarget &Subtarget) {
26249 SDLoc dl(Op);
26250 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26251 // Scale must be constant.
26252 if (!C)
26253 return SDValue();
26254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26255 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26256 TLI.getPointerTy(DAG.getDataLayout()));
26257 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26258 Src.getSimpleValueType().getVectorNumElements());
26259 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26260
26261 // We support two versions of the scatter intrinsics. One with scalar mask and
26262 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26263 if (Mask.getValueType() != MaskVT)
26264 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26265
26266 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26267
26268 SDVTList VTs = DAG.getVTList(MVT::Other);
26269 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26270 SDValue Res =
26271 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26272 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26273 return Res;
26274}
26275
26276static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26277 SDValue Mask, SDValue Base, SDValue Index,
26278 SDValue ScaleOp, SDValue Chain,
26279 const X86Subtarget &Subtarget) {
26280 SDLoc dl(Op);
26281 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26282 // Scale must be constant.
26283 if (!C)
26284 return SDValue();
26285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26286 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26287 TLI.getPointerTy(DAG.getDataLayout()));
26288 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26289 SDValue Segment = DAG.getRegister(0, MVT::i32);
26290 MVT MaskVT =
26291 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26292 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26293 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26294 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26295 return SDValue(Res, 0);
26296}
26297
26298/// Handles the lowering of builtin intrinsics with chain that return their
26299/// value into registers EDX:EAX.
26300/// If operand ScrReg is a valid register identifier, then operand 2 of N is
26301/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26302/// TargetOpcode.
26303/// Returns a Glue value which can be used to add extra copy-from-reg if the
26304/// expanded intrinsics implicitly defines extra registers (i.e. not just
26305/// EDX:EAX).
26306static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26307 SelectionDAG &DAG,
26308 unsigned TargetOpcode,
26309 unsigned SrcReg,
26310 const X86Subtarget &Subtarget,
26311 SmallVectorImpl<SDValue> &Results) {
26312 SDValue Chain = N->getOperand(0);
26313 SDValue Glue;
26314
26315 if (SrcReg) {
26316 assert(N->getNumOperands() == 3 && "Unexpected number of operands!")((void)0);
26317 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26318 Glue = Chain.getValue(1);
26319 }
26320
26321 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26322 SDValue N1Ops[] = {Chain, Glue};
26323 SDNode *N1 = DAG.getMachineNode(
26324 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26325 Chain = SDValue(N1, 0);
26326
26327 // Reads the content of XCR and returns it in registers EDX:EAX.
26328 SDValue LO, HI;
26329 if (Subtarget.is64Bit()) {
26330 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26331 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26332 LO.getValue(2));
26333 } else {
26334 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26335 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26336 LO.getValue(2));
26337 }
26338 Chain = HI.getValue(1);
26339 Glue = HI.getValue(2);
26340
26341 if (Subtarget.is64Bit()) {
26342 // Merge the two 32-bit values into a 64-bit one.
26343 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26344 DAG.getConstant(32, DL, MVT::i8));
26345 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26346 Results.push_back(Chain);
26347 return Glue;
26348 }
26349
26350 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26351 SDValue Ops[] = { LO, HI };
26352 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26353 Results.push_back(Pair);
26354 Results.push_back(Chain);
26355 return Glue;
26356}
26357
26358/// Handles the lowering of builtin intrinsics that read the time stamp counter
26359/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26360/// READCYCLECOUNTER nodes.
26361static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26362 SelectionDAG &DAG,
26363 const X86Subtarget &Subtarget,
26364 SmallVectorImpl<SDValue> &Results) {
26365 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26366 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26367 // and the EAX register is loaded with the low-order 32 bits.
26368 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26369 /* NoRegister */0, Subtarget,
26370 Results);
26371 if (Opcode != X86::RDTSCP)
26372 return;
26373
26374 SDValue Chain = Results[1];
26375 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
26376 // the ECX register. Add 'ecx' explicitly to the chain.
26377 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26378 Results[1] = ecx;
26379 Results.push_back(ecx.getValue(1));
26380}
26381
26382static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26383 SelectionDAG &DAG) {
26384 SmallVector<SDValue, 3> Results;
26385 SDLoc DL(Op);
26386 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26387 Results);
26388 return DAG.getMergeValues(Results, DL);
26389}
26390
26391static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26392 MachineFunction &MF = DAG.getMachineFunction();
26393 SDValue Chain = Op.getOperand(0);
26394 SDValue RegNode = Op.getOperand(2);
26395 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26396 if (!EHInfo)
26397 report_fatal_error("EH registrations only live in functions using WinEH");
26398
26399 // Cast the operand to an alloca, and remember the frame index.
26400 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26401 if (!FINode)
26402 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26403 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26404
26405 // Return the chain operand without making any DAG nodes.
26406 return Chain;
26407}
26408
26409static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26410 MachineFunction &MF = DAG.getMachineFunction();
26411 SDValue Chain = Op.getOperand(0);
26412 SDValue EHGuard = Op.getOperand(2);
26413 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26414 if (!EHInfo)
26415 report_fatal_error("EHGuard only live in functions using WinEH");
26416
26417 // Cast the operand to an alloca, and remember the frame index.
26418 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26419 if (!FINode)
26420 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26421 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26422
26423 // Return the chain operand without making any DAG nodes.
26424 return Chain;
26425}
26426
26427/// Emit Truncating Store with signed or unsigned saturation.
26428static SDValue
26429EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
26430 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26431 SelectionDAG &DAG) {
26432 SDVTList VTs = DAG.getVTList(MVT::Other);
26433 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26434 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26435 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26436 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26437}
26438
26439/// Emit Masked Truncating Store with signed or unsigned saturation.
26440static SDValue
26441EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
26442 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26443 MachineMemOperand *MMO, SelectionDAG &DAG) {
26444 SDVTList VTs = DAG.getVTList(MVT::Other);
26445 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26446 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26447 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26448}
26449
// Lower an ISD::INTRINSIC_W_CHAIN node for an X86 intrinsic that carries a
// chain. Intrinsics with an IntrinsicData table entry are dispatched on
// IntrData->Type below; the rest are lowered case-by-case in the first switch.
26450 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26451 SelectionDAG &DAG) {
26452 unsigned IntNo = Op.getConstantOperandVal(1);
26453 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
// Intrinsics without a table entry are hand-lowered here.
26454 if (!IntrData) {
26455 switch (IntNo) {
26456 case llvm::Intrinsic::x86_seh_ehregnode:
26457 return MarkEHRegistrationNode(Op, DAG);
26458 case llvm::Intrinsic::x86_seh_ehguard:
26459 return MarkEHGuard(Op, DAG);
26460 case llvm::Intrinsic::x86_rdpkru: {
26461 SDLoc dl(Op);
26462 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26463 // Create a RDPKRU node and pass 0 to the ECX parameter.
26464 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26465 DAG.getConstant(0, dl, MVT::i32));
26466 }
26467 case llvm::Intrinsic::x86_wrpkru: {
26468 SDLoc dl(Op);
26469 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26470 // to the EDX and ECX parameters.
26471 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26472 Op.getOperand(0), Op.getOperand(2),
26473 DAG.getConstant(0, dl, MVT::i32),
26474 DAG.getConstant(0, dl, MVT::i32));
26475 }
26476 case llvm::Intrinsic::x86_flags_read_u32:
26477 case llvm::Intrinsic::x86_flags_read_u64:
26478 case llvm::Intrinsic::x86_flags_write_u32:
26479 case llvm::Intrinsic::x86_flags_write_u64: {
26480 // We need a frame pointer because this will get lowered to a PUSH/POP
26481 // sequence.
26482 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26483 MFI.setHasCopyImplyingStackAdjustment(true);
26484 // Don't do anything here, we will expand these intrinsics out later
26485 // during FinalizeISel in EmitInstrWithCustomInserter.
26486 return Op;
26487 }
26488 case Intrinsic::x86_lwpins32:
26489 case Intrinsic::x86_lwpins64:
26490 case Intrinsic::x86_umwait:
26491 case Intrinsic::x86_tpause: {
26492 SDLoc dl(Op);
26493 SDValue Chain = Op->getOperand(0);
26494 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26495 unsigned Opcode;
26496
26497 switch (IntNo) {
26498 default: llvm_unreachable("Impossible intrinsic")__builtin_unreachable();
26499 case Intrinsic::x86_umwait:
26500 Opcode = X86ISD::UMWAIT;
26501 break;
26502 case Intrinsic::x86_tpause:
26503 Opcode = X86ISD::TPAUSE;
26504 break;
26505 case Intrinsic::x86_lwpins32:
26506 case Intrinsic::x86_lwpins64:
26507 Opcode = X86ISD::LWPINS;
26508 break;
26509 }
26510
// These instructions report their result in CF; expose it as a setcc.
26511 SDValue Operation =
26512 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26513 Op->getOperand(3), Op->getOperand(4));
26514 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26515 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26516 Operation.getValue(1));
26517 }
26518 case Intrinsic::x86_enqcmd:
26519 case Intrinsic::x86_enqcmds: {
26520 SDLoc dl(Op);
26521 SDValue Chain = Op.getOperand(0);
26522 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26523 unsigned Opcode;
26524 switch (IntNo) {
26525 default: llvm_unreachable("Impossible intrinsic!")__builtin_unreachable();
26526 case Intrinsic::x86_enqcmd:
26527 Opcode = X86ISD::ENQCMD;
26528 break;
26529 case Intrinsic::x86_enqcmds:
26530 Opcode = X86ISD::ENQCMDS;
26531 break;
26532 }
// ENQCMD/ENQCMDS report success in ZF; expose it as a setcc.
26533 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26534 Op.getOperand(3));
26535 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26536 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26537 Operation.getValue(1));
26538 }
26539 case Intrinsic::x86_aesenc128kl:
26540 case Intrinsic::x86_aesdec128kl:
26541 case Intrinsic::x86_aesenc256kl:
26542 case Intrinsic::x86_aesdec256kl: {
26543 SDLoc DL(Op);
26544 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26545 SDValue Chain = Op.getOperand(0);
26546 unsigned Opcode;
26547
26548 switch (IntNo) {
26549 default: llvm_unreachable("Impossible intrinsic")__builtin_unreachable();
26550 case Intrinsic::x86_aesenc128kl:
26551 Opcode = X86ISD::AESENC128KL;
26552 break;
26553 case Intrinsic::x86_aesdec128kl:
26554 Opcode = X86ISD::AESDEC128KL;
26555 break;
26556 case Intrinsic::x86_aesenc256kl:
26557 Opcode = X86ISD::AESENC256KL;
26558 break;
26559 case Intrinsic::x86_aesdec256kl:
26560 Opcode = X86ISD::AESDEC256KL;
26561 break;
26562 }
26563
// Key Locker ops are memory intrinsics; thread the memory operand through
// and surface the ZF success flag as the first result.
26564 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26565 MachineMemOperand *MMO = MemIntr->getMemOperand();
26566 EVT MemVT = MemIntr->getMemoryVT();
26567 SDValue Operation = DAG.getMemIntrinsicNode(
26568 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26569 MMO);
26570 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26571
26572 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26573 {ZF, Operation.getValue(0), Operation.getValue(2)});
26574 }
26575 case Intrinsic::x86_aesencwide128kl:
26576 case Intrinsic::x86_aesdecwide128kl:
26577 case Intrinsic::x86_aesencwide256kl:
26578 case Intrinsic::x86_aesdecwide256kl: {
26579 SDLoc DL(Op);
26580 SDVTList VTs = DAG.getVTList(
26581 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26582 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26583 SDValue Chain = Op.getOperand(0);
26584 unsigned Opcode;
26585
26586 switch (IntNo) {
26587 default: llvm_unreachable("Impossible intrinsic")__builtin_unreachable();
26588 case Intrinsic::x86_aesencwide128kl:
26589 Opcode = X86ISD::AESENCWIDE128KL;
26590 break;
26591 case Intrinsic::x86_aesdecwide128kl:
26592 Opcode = X86ISD::AESDECWIDE128KL;
26593 break;
26594 case Intrinsic::x86_aesencwide256kl:
26595 Opcode = X86ISD::AESENCWIDE256KL;
26596 break;
26597 case Intrinsic::x86_aesdecwide256kl:
26598 Opcode = X86ISD::AESDECWIDE256KL;
26599 break;
26600 }
26601
// Wide Key Locker ops process eight 128-bit blocks at once (operands 3-10).
26602 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26603 MachineMemOperand *MMO = MemIntr->getMemOperand();
26604 EVT MemVT = MemIntr->getMemoryVT();
26605 SDValue Operation = DAG.getMemIntrinsicNode(
26606 Opcode, DL, VTs,
26607 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26608 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26609 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26610 MemVT, MMO);
26611 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26612
26613 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26614 {ZF, Operation.getValue(1), Operation.getValue(2),
26615 Operation.getValue(3), Operation.getValue(4),
26616 Operation.getValue(5), Operation.getValue(6),
26617 Operation.getValue(7), Operation.getValue(8),
26618 Operation.getValue(9)});
26619 }
26620 case Intrinsic::x86_testui: {
26621 SDLoc dl(Op);
26622 SDValue Chain = Op.getOperand(0);
26623 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26624 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26625 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26626 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26627 Operation.getValue(1));
26628 }
26629 }
// Unknown table-less intrinsic: leave it for generic handling.
26630 return SDValue();
26631 }
26632
// Table-driven lowering: dispatch on the IntrinsicData kind.
26633 SDLoc dl(Op);
26634 switch(IntrData->Type) {
26635 default: llvm_unreachable("Unknown Intrinsic Type")__builtin_unreachable();
26636 case RDSEED:
26637 case RDRAND: {
26638 // Emit the node with the right value type.
26639 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26640 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26641
26642 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
26643 // Otherwise return the value from Rand, which is always 0, casted to i32.
26644 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26645 DAG.getConstant(1, dl, Op->getValueType(1)),
26646 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26647 SDValue(Result.getNode(), 1)};
26648 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26649
26650 // Return { result, isValid, chain }.
26651 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26652 SDValue(Result.getNode(), 2));
26653 }
26654 case GATHER_AVX2: {
26655 SDValue Chain = Op.getOperand(0);
26656 SDValue Src = Op.getOperand(2);
26657 SDValue Base = Op.getOperand(3);
26658 SDValue Index = Op.getOperand(4);
26659 SDValue Mask = Op.getOperand(5);
26660 SDValue Scale = Op.getOperand(6);
26661 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26662 Scale, Chain, Subtarget);
26663 }
26664 case GATHER: {
26665 //gather(v1, mask, index, base, scale);
26666 SDValue Chain = Op.getOperand(0);
26667 SDValue Src = Op.getOperand(2);
26668 SDValue Base = Op.getOperand(3);
26669 SDValue Index = Op.getOperand(4);
26670 SDValue Mask = Op.getOperand(5);
26671 SDValue Scale = Op.getOperand(6);
26672 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26673 Chain, Subtarget);
26674 }
26675 case SCATTER: {
26676 //scatter(base, mask, index, v1, scale);
26677 SDValue Chain = Op.getOperand(0);
26678 SDValue Base = Op.getOperand(2);
26679 SDValue Mask = Op.getOperand(3);
26680 SDValue Index = Op.getOperand(4);
26681 SDValue Src = Op.getOperand(5);
26682 SDValue Scale = Op.getOperand(6);
26683 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26684 Scale, Chain, Subtarget);
26685 }
26686 case PREFETCH: {
// Hint selects T0 vs T1 prefetch; the table holds one opcode for each.
26687 const APInt &HintVal = Op.getConstantOperandAPInt(6);
26688 assert((HintVal == 2 || HintVal == 3) &&((void)0)
26689 "Wrong prefetch hint in intrinsic: should be 2 or 3")((void)0);
26690 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26691 SDValue Chain = Op.getOperand(0);
26692 SDValue Mask = Op.getOperand(2);
26693 SDValue Index = Op.getOperand(3);
26694 SDValue Base = Op.getOperand(4);
26695 SDValue Scale = Op.getOperand(5);
26696 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26697 Subtarget);
26698 }
26699 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26700 case RDTSC: {
26701 SmallVector<SDValue, 2> Results;
26702 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26703 Results);
26704 return DAG.getMergeValues(Results, dl);
26705 }
26706 // Read Performance Monitoring Counters.
26707 case RDPMC:
26708 // GetExtended Control Register.
26709 case XGETBV: {
26710 SmallVector<SDValue, 2> Results;
26711
26712 // RDPMC uses ECX to select the index of the performance counter to read.
26713 // XGETBV uses ECX to select the index of the XCR register to return.
26714 // The result is stored into registers EDX:EAX.
26715 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26716 Subtarget, Results);
26717 return DAG.getMergeValues(Results, dl);
26718 }
26719 // XTEST intrinsics.
26720 case XTEST: {
26721 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26722 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26723
26724 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26725 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26726 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26727 Ret, SDValue(InTrans.getNode(), 1));
26728 }
26729 case TRUNCATE_TO_MEM_VI8:
26730 case TRUNCATE_TO_MEM_VI16:
26731 case TRUNCATE_TO_MEM_VI32: {
26732 SDValue Mask = Op.getOperand(4);
26733 SDValue DataToTruncate = Op.getOperand(3);
26734 SDValue Addr = Op.getOperand(2);
26735 SDValue Chain = Op.getOperand(0);
26736
26737 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26738 assert(MemIntr && "Expected MemIntrinsicSDNode!")((void)0);
26739
26740 EVT MemVT = MemIntr->getMemoryVT();
26741
26742 uint16_t TruncationOp = IntrData->Opc0;
26743 switch (TruncationOp) {
26744 case X86ISD::VTRUNC: {
26745 if (isAllOnesConstant(Mask)) // return just a truncate store
26746 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26747 MemIntr->getMemOperand());
26748
26749 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26750 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26751 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26752
26753 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26754 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26755 true /* truncating */);
26756 }
26757 case X86ISD::VTRUNCUS:
26758 case X86ISD::VTRUNCS: {
// Saturating truncating stores; masked variant when the mask isn't all-ones.
26759 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26760 if (isAllOnesConstant(Mask))
26761 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26762 MemIntr->getMemOperand(), DAG);
26763
26764 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26765 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26766
26767 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26768 VMask, MemVT, MemIntr->getMemOperand(), DAG);
26769 }
26770 default:
26771 llvm_unreachable("Unsupported truncstore intrinsic")__builtin_unreachable();
26772 }
26773 }
26774 }
26775}
26776
26777SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26778 SelectionDAG &DAG) const {
26779 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26780 MFI.setReturnAddressIsTaken(true);
26781
26782 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26783 return SDValue();
26784
26785 unsigned Depth = Op.getConstantOperandVal(0);
26786 SDLoc dl(Op);
26787 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26788
26789 if (Depth > 0) {
26790 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26791 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26792 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26793 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26794 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26795 MachinePointerInfo());
26796 }
26797
26798 // Just load the return address.
26799 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26800 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26801 MachinePointerInfo());
26802}
26803
26804SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26805 SelectionDAG &DAG) const {
26806 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26807 return getReturnAddressFrameIndex(DAG);
26808}
26809
/// Lower llvm.frameaddress: produce the address of the current stack frame,
/// or of an ancestor frame when a non-zero depth is requested.
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  // Record that the frame address is observed so frame lowering keeps a
  // frame pointer around.
  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op);  // FIXME probably not meaningful
  // Operand 0 is the requested depth: 0 = this frame, N = chase N saved
  // frame pointers up the stack.
  // NOTE(review): the static analyzer reports a possibly-null SDNode
  // dereference (SelectionDAGNodes.h:1122) reached through this function;
  // the FRAMEADDR node always carries operand 0, so this looks like a
  // false positive — confirm against the full analyzer trace.
  unsigned Depth = Op.getConstantOperandVal(0);
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  // Start from the current frame pointer; each saved frame pointer lives at
  // [FP], so a load per level walks up the chain.
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}
26847
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
/// Resolve a named register for a global register variable. Only the stack
/// and frame pointer registers are accepted; anything else is a fatal error.
Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

  // Only esp/rsp/ebp/rbp may be pinned by name; everything else maps to 0
  // and is rejected at the bottom.
  Register Reg = StringSwitch<unsigned>(RegName)
                     .Case("esp", X86::ESP)
                     .Case("rsp", X86::RSP)
                     .Case("ebp", X86::EBP)
                     .Case("rbp", X86::RBP)
                     .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    // EBP/RBP is only reserved when the function actually has a frame
    // pointer; otherwise the register would still be allocatable.
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}
26880
26881SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
26882 SelectionDAG &DAG) const {
26883 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26884 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
26885}
26886
26887Register X86TargetLowering::getExceptionPointerRegister(
26888 const Constant *PersonalityFn) const {
26889 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
26890 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26891
26892 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
26893}
26894
26895Register X86TargetLowering::getExceptionSelectorRegister(
26896 const Constant *PersonalityFn) const {
26897 // Funclet personalities don't use selectors (the runtime does the selection).
26898 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
26899 return X86::NoRegister;
26900 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26901}
26902
26903bool X86TargetLowering::needsFixedCatchObjects() const {
26904 return Subtarget.isTargetWin64();
26905}
26906
/// Lower ISD::EH_RETURN (llvm.eh.return): overwrite the return-address slot
/// with Handler so the function "returns" into the exception handler after
/// the unwinder's stack adjustment of Offset bytes.
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);   // Stack adjustment requested by the unwinder.
  SDValue Handler = Op.getOperand(2);  // Address execution should resume at.
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  // RCX/ECX carries the computed store address into the EH_RETURN pseudo.
  Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  // StoreAddr = FP + SlotSize + Offset: the return-address slot as it will
  // be located after the Offset-byte adjustment.
  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  // Replace the saved return address with the handler address.
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}
26932
26933SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
26934 SelectionDAG &DAG) const {
26935 SDLoc DL(Op);
26936 // If the subtarget is not 64bit, we may need the global base reg
26937 // after isel expand pseudo, i.e., after CGBR pass ran.
26938 // Therefore, ask for the GlobalBaseReg now, so that the pass
26939 // inserts the code for us in case we need it.
26940 // Otherwise, we will end up in a situation where we will
26941 // reference a virtual register that is not defined!
26942 if (!Subtarget.is64Bit()) {
26943 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26944 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
26945 }
26946 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
26947 DAG.getVTList(MVT::i32, MVT::Other),
26948 Op.getOperand(0), Op.getOperand(1));
26949}
26950
26951SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
26952 SelectionDAG &DAG) const {
26953 SDLoc DL(Op);
26954 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
26955 Op.getOperand(0), Op.getOperand(1));
26956}
26957
26958SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
26959 SelectionDAG &DAG) const {
26960 SDLoc DL(Op);
26961 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
26962 Op.getOperand(0));
26963}
26964
26965static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
26966 return Op.getOperand(0);
26967}
26968
/// Lower llvm.init.trampoline: write executable machine code into the
/// caller-supplied trampoline block so that calling the trampoline invokes
/// FPtr with Nest pre-loaded into the platform's 'nest' register.
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    // Low 3 bits of the register encodings, merged into opcode/ModRM bytes.
    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    // Bytes 0-1: "movabsq ..., %r11"; bytes 2-9: the 64-bit FPtr immediate.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2), Align(2));

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    // Bytes 10-11: "movabsq ..., %r10"; bytes 12-19: the 64-bit Nest value.
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12), Align(2));

    // Jump to the nested function.
    // Bytes 20-21: REX prefix + FF opcode; byte 22: ModRM selecting /4 r11.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    // Tie all component stores together into one chain result.
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        // Idx walks parameter attribute indices; presumably 1-based here
        // (index 0 being the return value) — confirm against AttributeList.
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
            const DataLayout &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
    case CallingConv::Tail:
    case CallingConv::SwiftTail:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    // Disp is the PC-relative displacement for the final jmp: the target
    // address minus the end of the 10-byte trampoline.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                     Trmp, MachinePointerInfo(TrmpAddr));

    // Bytes 1-4: the 32-bit Nest immediate of the mov.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1), Align(1));

    // Byte 5: the jmp opcode.
    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] =
        DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
                     MachinePointerInfo(TrmpAddr, 5), Align(1));

    // Bytes 6-9: the 32-bit jmp displacement.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6), Align(1));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
27113
/// Lower ISD::FLT_ROUNDS_: read the x87 control word and translate its
/// rounding-mode field into the C FLT_ROUNDS encoding.
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we use a packed lookup table of the four 2-bit
  values that we can index by FPSP[11:10]
    0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]

    (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
  */

  MachineFunction &MF = DAG.getMachineFunction();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);

  // fnstcw writes the 16-bit control word through the stack-slot pointer.
  SDValue Chain = Op.getOperand(0);
  SDValue Ops[] = {Chain, StackSlot};
  Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
                                  Align(2), MachineMemOperand::MOStore);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
  Chain = CWD.getValue(1);

  // Mask and turn the control bits into a shift for the lookup table.
  // Shifting right by 9 (not 10) leaves the RM field doubled, so it indexes
  // the 2-bit entries packed into the 0x2d constant.
  SDValue Shift =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
                  DAG.getConstant(9, DL, MVT::i8));
  Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);

  // Index the packed lookup table and keep the selected 2-bit entry.
  SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i32,
                  DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
                  DAG.getConstant(3, DL, MVT::i32));

  RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);

  // Return the value together with the updated chain.
  return DAG.getMergeValues({RetVal, Chain}, DL);
}
27177
/// Lower ISD::SET_ROUNDING: change the x87 FP control word rounding field
/// (and, when SSE is available, the matching MXCSR field) to the requested
/// rounding mode.
SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc DL(Op);
  SDValue Chain = Op.getNode()->getOperand(0);

  // FP control word may be set only from data in memory. So we need to allocate
  // stack space to save/load FP control word.
  int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
  SDValue StackSlot =
      DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));

  // Store FP control word into memory.
  SDValue Ops[] = {Chain, StackSlot};
  Chain = DAG.getMemIntrinsicNode(
      X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot and clear RM field (bits 11:10).
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
  Chain = CWD.getValue(1);
  CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
                    DAG.getConstant(0xf3ff, DL, MVT::i16));

  // Calculate new rounding mode.
  SDValue NewRM = Op.getNode()->getOperand(1);
  SDValue RMBits;
  if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
    // Constant rounding mode: translate directly to the x87 field encoding.
    uint64_t RM = CVal->getZExtValue();
    int FieldVal;
    switch (static_cast<RoundingMode>(RM)) {
    case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
    case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
    case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
    case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
    default:
      llvm_unreachable("rounding mode is not supported by X86 hardware");
    }
    RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
  } else {
    // Need to convert argument into bits of control word:
    //    0 Round to 0       -> 11
    //    1 Round to nearest -> 00
    //    2 Round to +inf    -> 10
    //    3 Round to -inf    -> 01
    // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
    // To make the conversion, put all these values into a value 0xc9 and shift
    // it left depending on the rounding mode:
    //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
    //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
    //    ...
    // (0xc9 << (2 * NewRM + 4)) & 0xc00
    SDValue ShiftValue =
        DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                    DAG.getNode(ISD::ADD, DL, MVT::i32,
                                DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
                                            DAG.getConstant(1, DL, MVT::i8)),
                                DAG.getConstant(4, DL, MVT::i32)));
    SDValue Shifted =
        DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
                    ShiftValue);
    RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
                         DAG.getConstant(0xc00, DL, MVT::i16));
  }

  // Update rounding mode bits and store the new FP Control Word into stack.
  CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
  Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);

  // Load FP control word from the slot.
  SDValue OpsLD[] = {Chain, StackSlot};
  MachineMemOperand *MMOL =
      MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
  Chain = DAG.getMemIntrinsicNode(
      X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);

  // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
  // same way but in bits 14:13.
  if (Subtarget.hasSSE1()) {
    // Store MXCSR into memory.
    Chain = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
        DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
        StackSlot);

    // Load MXCSR from stack slot and clear RM field (bits 14:13).
    SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
    Chain = CWD.getValue(1);
    CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
                      DAG.getConstant(0xffff9fff, DL, MVT::i32));

    // Shift X87 RM bits from 11:10 to 14:13.
    RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
    RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
                         DAG.getConstant(3, DL, MVT::i8));

    // Update rounding mode bits and store the new FP Control Word into stack.
    CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
    Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);

    // Load MXCSR from the slot.
    Chain = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
        DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
        StackSlot);
  }

  return Chain;
}
27289
/// Lower a vector CTLZ using native supported vector CTLZ instruction.
//
// i8/i16 vector implemented using dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform operation on it's Lo a Hi part and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  // Split vector, it's Lo and Hi parts will be handled in next iteration.
  if (NumElems > 16 ||
      (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
    return splitVectorIntUnary(Op, DAG);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use native supported vector instruction vplzcntd.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  // The i32 count includes (32 - EltBits) extra leading zeros introduced by
  // the zero-extension; subtract them back out.
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
27324
// Lower CTLZ using a PSHUFB lookup table implementation.
// The element count is computed per-nibble via the LUT, then pairs of
// results are merged into wider and wider elements until the original
// element width is reached.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

  // Replicate the 16-entry LUT across every byte of the working vector.
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Begin by bitcasting the input to byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = DAG.getConstant(0, DL, CurrVT);

  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = Op0;
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ;
  if (CurrVT.is512BitVector()) {
    // 512-bit compares produce a vXi1 mask; sign-extend it back to a
    // full-width all-ones/all-zeros byte mask.
    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
  } else {
    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
  }

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

  // Merge result back from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    if (CurrVT.is512BitVector()) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
    } else {
      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    }
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
    CurrVT = NextVT;
  }

  return Res;
}
27407
/// Dispatch vector CTLZ lowering: prefer the AVX-512 CDI path when
/// available, split illegal widths, and otherwise fall back to the PSHUFB
/// lookup-table expansion.
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI() &&
      // vXi8 vectors need to be promoted to 512-bits for vXi32.
      (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
    return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return splitVectorIntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
27429
27430static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27431 SelectionDAG &DAG) {
27432 MVT VT = Op.getSimpleValueType();
27433 MVT OpVT = VT;
27434 unsigned NumBits = VT.getSizeInBits();
27435 SDLoc dl(Op);
27436 unsigned Opc = Op.getOpcode();
27437
27438 if (VT.isVector())
27439 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27440
27441 Op = Op.getOperand(0);
27442 if (VT == MVT::i8) {
27443 // Zero extend to i32 since there is not an i8 bsr.
27444 OpVT = MVT::i32;
27445 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27446 }
27447
27448 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27449 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27450 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27451
27452 if (Opc == ISD::CTLZ) {
27453 // If src is zero (i.e. bsr sets ZF), returns NumBits.
27454 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27455 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27456 Op.getValue(1)};
27457 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
27458 }
27459
27460 // Finally xor with NumBits-1.
27461 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27462 DAG.getConstant(NumBits - 1, dl, OpVT));
27463
27464 if (VT == MVT::i8)
27465 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27466 return Op;
27467}
27468
/// Lower scalar CTTZ via the bsf instruction, patching in NumBits for a
/// zero input with a CMOV on the ZF result.
static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getScalarSizeInBits();
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);

  assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
                   DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
                   Op.getValue(1)};
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
27489
/// Custom ADD/SUB lowering: scalar i16/i32 goes to the horizontal-op
/// helper, and oversized vector types are split into legal halves.
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  if (VT == MVT::i16 || VT == MVT::i32)
    return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);

  // 512-bit byte/word vectors only reach here when they must be split.
  if (VT == MVT::v32i16 || VT == MVT::v64i8)
    return splitVectorIntBinary(Op, DAG);

  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return splitVectorIntBinary(Op, DAG);
}
27504
/// Lower saturating add/sub: split vector types that aren't legal at full
/// width, and custom-expand USUBSAT when unsigned max isn't available.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  SDLoc DL(Op);

  if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
      (VT.is256BitVector() && !Subtarget.hasInt256())) {
    assert(Op.getSimpleValueType().isInteger() &&
           "Only handle AVX vector integer operation");
    return splitVectorIntBinary(Op, DAG);
  }

  // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetCCResultType =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
    // usubsat X, Y --> (X >u Y) ? X - Y : 0
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
    SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
    // TODO: Move this to DAGCombiner?
    // If the setcc already produces an all-ones/all-zeros mask of the same
    // type, an AND is cheaper than a select.
    if (SetCCResultType == VT &&
        DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
      return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
    return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
  }

  // Use default expansion.
  return SDValue();
}
27538
/// Custom-lower ISD::ABS: scalars via NEG+CMOV, vXi64 via BLENDV, wide
/// vectors by splitting; returns an empty SDValue to request the generic
/// expansion otherwise.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
    // Since X86 does not have CMOV for 8-bit integer, we don't convert
    // 8-bit integer abs to NEG and CMOV.
    SDLoc DL(Op);
    SDValue N0 = Op.getOperand(0);
    // Compute 0 - X; the second result is the EFLAGS value consumed below.
    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                              DAG.getConstant(0, DL, VT), N0);
    // Select between X and -X based on the GE condition of the subtraction.
    SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
                     SDValue(Neg.getNode(), 1)};
    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
  }

  // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
  if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);
    SDValue Sub =
        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
    // The last operand (Src) supplies the per-element selection condition.
    return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
  }

  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    assert(VT.isInteger() &&
           "Only handle AVX 256-bit vector integer operation");
    return splitVectorIntUnary(Op, DAG);
  }

  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    return splitVectorIntUnary(Op, DAG);

  // Default to expand.
  return SDValue();
}
27575
27576static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
27577 MVT VT = Op.getSimpleValueType();
27578
27579 // For AVX1 cases, split to use legal ops (everything but v4i64).
27580 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
27581 return splitVectorIntBinary(Op, DAG);
27582
27583 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27584 return splitVectorIntBinary(Op, DAG);
27585
27586 // Default to expand.
27587 return SDValue();
27588}
27589
// Custom-lower vector ISD::MUL for types x86 cannot multiply directly:
// vXi8 (no byte multiply instruction), v4i32 pre-SSE41 (no pmulld), and
// vXi64 pre-AVX512DQ (no pmullq). Wide vectors without the needed subtarget
// features are first split into half-width ops.
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Decompose 256-bit ops into 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntBinary(Op, DAG);

  // 512-bit i16/i8 multiplies need AVX512BW; otherwise split into halves.
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    return splitVectorIntBinary(Op, DAG);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    unsigned NumElts = VT.getVectorNumElements();

    // If a legal type exists at twice the element width, extend the whole
    // vector at once, multiply at i16, and truncate back down.
    if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
        (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      return DAG.getNode(
          ISD::TRUNCATE, dl, VT,
          DAG.getNode(ISD::MUL, dl, ExVT,
                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
                      DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
    }

    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

    // Extract the lo/hi parts to any extend to i16.
    // We're going to mask off the low byte of each result element of the
    // pmullw, so it doesn't matter what's in the high byte of each 16-bit
    // element.
    SDValue Undef = DAG.getUNDEF(VT);
    SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
    SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));

    SDValue BLo, BHi;
    if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
      // If the RHS is a constant, manually unpackl/unpackh.
      SmallVector<SDValue, 16> LoOps, HiOps;
      // Unpack interleaves within each 128-bit lane, so walk B a lane
      // (16 bytes) at a time to mirror the shuffle's element order.
      for (unsigned i = 0; i != NumElts; i += 16) {
        for (unsigned j = 0; j != 8; ++j) {
          LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
                                               MVT::i16));
          HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
                                               MVT::i16));
        }
      }

      BLo = DAG.getBuildVector(ExVT, dl, LoOps);
      BHi = DAG.getBuildVector(ExVT, dl, HiOps);
    } else {
      BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
      BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
    }

    // Multiply, mask the lower 8bits of the lo/hi results and pack.
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  }

  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&((void)0)
           "Should not custom lower when pmulld is available!")((void)0);

    // Extract the odd parts.
    static const int UnpackMask[] = { 1, -1, 3, -1 };
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

    // Multiply the even parts. PMULUDQ reads the even i32 lanes of each
    // v2i64 operand, so no shuffle is needed for the even products.
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                                DAG.getBitcast(MVT::v2i64, A),
                                DAG.getBitcast(MVT::v2i64, B));
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                               DAG.getBitcast(MVT::v2i64, Aodds),
                               DAG.getBitcast(MVT::v2i64, Bodds));

    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }

  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&((void)0)
         "Only know how to lower V2I64/V4I64/V8I64 multiply")((void)0);
  assert(!Subtarget.hasDQI() && "DQI should use MULLQ")((void)0);

  // Emulate a 64-bit multiply from three 32x32->64 unsigned multiplies:
  // Ahi = psrlqi(a, 32);
  // Bhi = psrlqi(b, 32);
  //
  // AloBlo = pmuludq(a, b);
  // AloBhi = pmuludq(a, Bhi);
  // AhiBlo = pmuludq(Ahi, b);
  //
  // Hi = psllqi(AloBhi + AhiBlo, 32);
  // return AloBlo + Hi;
  // (The Ahi*Bhi term contributes only to bits >= 64 and is dropped.)
  KnownBits AKnown = DAG.computeKnownBits(A);
  KnownBits BKnown = DAG.computeKnownBits(B);

  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
  bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);

  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
  bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);

  SDValue Zero = DAG.getConstant(0, dl, VT);

  // Only multiply lo/hi halves that aren't known to be zero.
  SDValue AloBlo = Zero;
  if (!ALoIsZero && !BLoIsZero)
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);

  SDValue AloBhi = Zero;
  if (!ALoIsZero && !BHiIsZero) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!AHiIsZero && !BLoIsZero) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
  }

  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
27734
// Emulate a vXi8 multiply by unpacking each 128-bit lane into two vXi16
// halves, multiplying at i16, and repacking. The return value holds the high
// byte of each 16-bit product; if \p Low is non-null it additionally receives
// the low byte of each product (so MULO lowering gets both halves from one
// expansion).
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
                                     MVT VT, bool IsSigned,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG,
                                     SDValue *Low = nullptr) {
  unsigned NumElts = VT.getVectorNumElements();

  // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
  // to a vXi16 type. Do the multiplies, shift the results and pack the half
  // lane results back together.

  // We'll take different approaches for signed and unsigned.
  // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
  // and use pmullw to calculate the full 16-bit product.
  // For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
  // shift them left into the upper byte of each word. This allows us to use
  // pmulhw to calculate the full 16-bit product. This trick means we don't
  // need to sign extend the bytes to use pmullw.

  MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
  SDValue Zero = DAG.getConstant(0, dl, VT);

  SDValue ALo, AHi;
  if (IsSigned) {
    // Unpacking with Zero as the first operand places each byte of A into
    // the high byte of a word (A << 8), setting up the pmulhw trick above.
    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
  } else {
    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
  }

  SDValue BLo, BHi;
  if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
    // If the RHS is a constant, manually unpackl/unpackh and extend.
    SmallVector<SDValue, 16> LoOps, HiOps;
    // Walk one 128-bit lane (16 bytes) at a time to match unpack order.
    for (unsigned i = 0; i != NumElts; i += 16) {
      for (unsigned j = 0; j != 8; ++j) {
        SDValue LoOp = B.getOperand(i + j);
        SDValue HiOp = B.getOperand(i + j + 8);

        if (IsSigned) {
          // Mirror the unpack-with-zero layout: value goes in the top byte.
          LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
          HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
          LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
                             DAG.getConstant(8, dl, MVT::i16));
          HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
                             DAG.getConstant(8, dl, MVT::i16));
        } else {
          LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
          HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
        }

        LoOps.push_back(LoOp);
        HiOps.push_back(HiOp);
      }
    }

    BLo = DAG.getBuildVector(ExVT, dl, LoOps);
    BHi = DAG.getBuildVector(ExVT, dl, HiOps);
  } else if (IsSigned) {
    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
  } else {
    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
  }

  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
  // pack back to vXi8. MULHS of the <<8 operands recovers the full 16-bit
  // signed product of the original bytes.
  unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
  SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
  SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);

  if (Low) {
    // Mask the lower bits and pack the results to rejoin the halves.
    SDValue Mask = DAG.getConstant(255, dl, ExVT);
    SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
    SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
    *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
  }

  RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
  RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);

  // Bitcast back to VT and then pack all the even elements from Lo and Hi.
  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
27822
// Custom-lower ISD::MULHS/MULHU (high half of a widening multiply) for vector
// types without a native pmulh instruction: vXi32 via a pair of PMUL(U)DQ ops
// plus shuffles, vXi8 via widening to i16.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  bool IsSigned = Op->getOpcode() == ISD::MULHS;
  unsigned NumElts = VT.getVectorNumElements();
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Decompose 256-bit ops into 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntBinary(Op, DAG);

  // 512-bit i16/i8 ops need AVX512BW; otherwise split into halves.
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    return splitVectorIntBinary(Op, DAG);

  if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
    assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||((void)0)
           (VT == MVT::v8i32 && Subtarget.hasInt256()) ||((void)0)
           (VT == MVT::v16i32 && Subtarget.hasAVX512()))((void)0);

    // PMULxD operations multiply each even value (starting at 0) of LHS with
    // the related value of RHS and produce a widen result.
    // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
    // => <2 x i64> <ae|cg>
    //
    // In other word, to have all the results, we need to perform two PMULxD:
    // 1. one with the even values.
    // 2. one with the odd values.
    // To achieve #2, with need to place the odd values at an even position.
    //
    // Place the odd value at an even position (basically, shift all values 1
    // step to the left):
    const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
                        9, -1, 11, -1, 13, -1, 15, -1};
    // <a|b|c|d> => <b|undef|d|undef>
    SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
                                        makeArrayRef(&Mask[0], NumElts));
    // <e|f|g|h> => <f|undef|h|undef>
    SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
                                        makeArrayRef(&Mask[0], NumElts));

    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
    // ints.
    MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
    // Signed PMULDQ requires SSE41; without it we multiply unsigned and fix
    // up the result below.
    unsigned Opcode =
        (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
    // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
    // => <2 x i64> <ae|cg>
    SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
                                                  DAG.getBitcast(MulVT, A),
                                                  DAG.getBitcast(MulVT, B)));
    // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
    // => <2 x i64> <bf|dh>
    SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
                                                  DAG.getBitcast(MulVT, Odd0),
                                                  DAG.getBitcast(MulVT, Odd1)));

    // Shuffle it back into the right order. Pick the high i32 of each i64
    // product, interleaving the even-element and odd-element results.
    SmallVector<int, 16> ShufMask(NumElts);
    for (int i = 0; i != (int)NumElts; ++i)
      ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;

    SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);

    // If we have a signed multiply but no PMULDQ fix up the result of an
    // unsigned multiply.
    if (IsSigned && !Subtarget.hasSSE41()) {
      SDValue Zero = DAG.getConstant(0, dl, VT);
      SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                               DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
      SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                               DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);

      SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
    }

    return Res;
  }

  // Only i8 vectors should need custom lowering after this.
  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||((void)0)
          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&((void)0)
         "Unsupported vector type")((void)0);

  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
  // logical shift down the upper half and pack back to i8.

  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
  // and then ashr/lshr the upper bits down to the lower bits before multiply.

  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
    unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
    Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
  }

  return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
}
27928
// Custom lowering for SMULO/UMULO: produce the low half of the product plus
// an overflow mask. Overflow is detected by comparing the high half of a
// widened multiply against the expected extension of the low half.
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Scalars defer to LowerXALUO.
  if (!VT.isVector())
    return LowerXALUO(Op, DAG);

  SDLoc dl(Op);
  bool IsSigned = Op->getOpcode() == ISD::SMULO;
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);
  // Second result of the MULO node: the overflow vector type.
  EVT OvfVT = Op->getValueType(1);

  // Split vXi8 types too wide for the subtarget; both the data result and
  // the overflow result are split and re-concatenated.
  if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
      (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
    // Extract the LHS Lo/Hi vectors
    SDValue LHSLo, LHSHi;
    std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);

    // Extract the RHS Lo/Hi vectors
    SDValue RHSLo, RHSHi;
    std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);

    EVT LoOvfVT, HiOvfVT;
    std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
    SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
    SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);

    // Issue the split operations.
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);

    // Join the separate data results and the overflow results.
    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
    SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
                              Hi.getValue(1));

    return DAG.getMergeValues({Res, Ovf}, dl);
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetccVT =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // If a type twice as wide is legal, extend to i16, multiply there, and
  // derive low half + overflow from the one wide product.
  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
    unsigned NumElts = VT.getVectorNumElements();
    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
    unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);

    SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);

    SDValue Ovf;
    if (IsSigned) {
      // Signed overflow: high byte != sign-extension of low byte.
      SDValue High, LowSign;
      if (OvfVT.getVectorElementType() == MVT::i1 &&
          (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
        // Rather the truncating try to do the compare on vXi16 or vXi32.
        // Shift the high down filling with sign bits.
        High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
        // Fill all 16 bits with the sign bit from the low.
        LowSign =
            getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
        LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
                                             15, DAG);
        SetccVT = OvfVT;
        if (!Subtarget.hasBWI()) {
          // We can't do a vXi16 compare so sign extend to v16i32.
          High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
          LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
        }
      } else {
        // Otherwise do the compare at vXi8.
        High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
        High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
        LowSign =
            DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
      }

      Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
    } else {
      // Unsigned overflow: high byte of the product is non-zero.
      SDValue High =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
      if (OvfVT.getVectorElementType() == MVT::i1 &&
          (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
        // Rather the truncating try to do the compare on vXi16 or vXi32.
        SetccVT = OvfVT;
        if (!Subtarget.hasBWI()) {
          // We can't do a vXi16 compare so sign extend to v16i32.
          High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
        }
      } else {
        // Otherwise do the compare at vXi8.
        High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
      }

      Ovf =
          DAG.getSetCC(dl, SetccVT, High,
                       DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
    }

    Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);

    return DAG.getMergeValues({Low, Ovf}, dl);
  }

  // Fallback: use the unpack-based i8 multiply, which hands back both the
  // low and high bytes of each product.
  SDValue Low;
  SDValue High =
      LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);

  SDValue Ovf;
  if (IsSigned) {
    // SMULO overflows if the high bits don't match the sign of the low.
    SDValue LowSign =
        DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
    Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
  } else {
    // UMULO overflows if the high bits are non-zero.
    Ovf =
        DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
  }

  Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);

  return DAG.getMergeValues({Low, Ovf}, dl);
}
28060
// Lower 128-bit SDIV/UDIV/SREM/UREM on Win64 as a compiler-rt libcall.
// Each i128 operand is spilled to a 16-byte-aligned stack slot and passed by
// pointer; the libcall result is typed as v2i64 (marked in-register) and
// bitcast back to the original 128-bit integer type.
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target")((void)0);
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&((void)0)
         "Unexpected return type for lowering")((void)0);

  // Pick the runtime routine and its signedness (controls result extension).
  RTLIB::Libcall LC;
  bool isSigned;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Unexpected request for libcall!")__builtin_unreachable();
  case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
  case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
  case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
  case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
  }

  SDLoc dl(Op);
  SDValue InChain = DAG.getEntryNode();

  // Spill each operand to the stack and pass its address instead, chaining
  // the stores so they precede the call.
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
    EVT ArgVT = Op->getOperand(i).getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&((void)0)
           "Unexpected argument type for lowering")((void)0);
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
    int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
    MachinePointerInfo MPI =
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
    Entry.Node = StackPtr;
    InChain =
        DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy,0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setLibCallee(
          getLibcallCallingConv(LC),
          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
          std::move(Args))
      .setInRegister()
      .setSExtResult(isSigned)
      .setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallInfo.first);
}
28117
28118// Return true if the required (according to Opcode) shift-imm form is natively
28119// supported by the Subtarget
28120static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
28121 unsigned Opcode) {
28122 if (VT.getScalarSizeInBits() < 16)
28123 return false;
28124
28125 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
28126 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
28127 return true;
28128
28129 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
28130 (VT.is256BitVector() && Subtarget.hasInt256());
28131
28132 bool AShift = LShift && (Subtarget.hasAVX512() ||
28133 (VT != MVT::v2i64 && VT != MVT::v4i64));
28134 return (Opcode == ISD::SRA) ? AShift : LShift;
28135}
28136
28137// The shift amount is a variable, but it is the same for all vector lanes.
28138// These instructions are defined together with shift-immediate.
28139static
28140bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
28141 unsigned Opcode) {
28142 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
28143}
28144
28145// Return true if the required (according to Opcode) variable-shift form is
28146// natively supported by the Subtarget
28147static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
28148 unsigned Opcode) {
28149
28150 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
28151 return false;
28152
28153 // vXi16 supported only on AVX-512, BWI
28154 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
28155 return false;
28156
28157 if (Subtarget.hasAVX512())
28158 return true;
28159
28160 bool LShift = VT.is128BitVector() || VT.is256BitVector();
28161 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
28162 return (Opcode == ISD::SRA) ? AShift : LShift;
28163}
28164
// Lower a vector shift whose amount is a constant splat, using native
// shift-by-immediate instructions where available and custom expansions for
// the forms x86 lacks (i64 arithmetic shifts, vXi8 shifts). Returns an empty
// SDValue when this path does not apply.
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);

  // Emulate vXi64 SRA (no native instruction here) by operating on the
  // vector reinterpreted as twice as many i32 elements.
  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type")((void)0);
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
    SDValue Ex = DAG.getBitcast(ExVT, R);

    // ashr(R, 63) === cmp_slt(R, 0)
    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&((void)0)
             "Unsupported PCMPGT op")((void)0);
      return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
    }

    if (ShiftAmt >= 32) {
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
      SDValue Upper =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt - 32, DAG);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {9, 1, 11, 3, 13, 5, 15, 7});
    } else {
      // SRA upper i32, SRL whole i64 and select lower i32.
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt, DAG);
      SDValue Lower =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
      Lower = DAG.getBitcast(ExVT, Lower);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {8, 1, 10, 3, 12, 5, 14, 7});
    }
    return DAG.getBitcast(VT, Ex);
  };

  // Optimize shl/srl/sra with constant shift amount.
  APInt APIntShiftAmt;
  if (!X86::isConstantSplat(Amt, APIntShiftAmt))
    return SDValue();

  // If the shift amount is out of range, return undef.
  if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
    return DAG.getUNDEF(VT);

  uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();

  if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
    return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

  // i64 SRA needs to be performed as partial shifts.
  if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
       (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
      Op.getOpcode() == ISD::SRA)
    return ArithmeticShiftRight64(ShiftAmt);

  // vXi8: perform the shift at i16 width (the narrowest with a native
  // immediate shift) and mask away the bits that leaked across byte lanes.
  if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
      (Subtarget.hasBWI() && VT == MVT::v64i8)) {
    unsigned NumElts = VT.getVectorNumElements();
    MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

    // Simple i8 add case
    if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
      return DAG.getNode(ISD::ADD, dl, VT, R, R);

    // ashr(R, 7) === cmp_slt(R, 0)
    if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
      SDValue Zeros = DAG.getConstant(0, dl, VT);
      if (VT.is512BitVector()) {
        assert(VT == MVT::v64i8 && "Unexpected element type!")((void)0);
        SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
        return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
      }
      return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
    }

    // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
    if (VT == MVT::v16i8 && Subtarget.hasXOP())
      return SDValue();

    if (Op.getOpcode() == ISD::SHL) {
      // Make a large shift.
      SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
                                               ShiftAmt, DAG);
      SHL = DAG.getBitcast(VT, SHL);
      // Zero out the rightmost bits.
      APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
      return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
    }
    if (Op.getOpcode() == ISD::SRL) {
      // Make a large shift.
      SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
                                               ShiftAmt, DAG);
      SRL = DAG.getBitcast(VT, SRL);
      // Zero out the leftmost bits.
      APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
      return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
    }
    if (Op.getOpcode() == ISD::SRA) {
      // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
      // where Mask is the shifted-down sign bit; xor/sub sign-extend it.
      SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

      SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
      Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
      return Res;
    }
    llvm_unreachable("Unknown shift opcode.")__builtin_unreachable();
  }

  return SDValue();
}
28288
// Lower a vector shift whose (variable) amount is uniform across all lanes,
// using the hardware's shift-by-scalar forms; vXi8 is emulated at i16 width
// with masking. Returns an empty SDValue when this path does not apply.
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  // Immediate-form and vector-form target opcodes for this shift kind.
  unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
  unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);

  if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
      // Normalize the scalar amount to i32 or i64 for getTargetVShiftNode.
      MVT EltVT = VT.getVectorElementType();
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!")((void)0);
      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
      else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
    }

    // vXi8 shifts - shift as v8i16 + mask result.
    if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
         (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
         VT == MVT::v64i8) &&
        !Subtarget.hasXOP()) {
      unsigned NumElts = VT.getVectorNumElements();
      MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
      if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
        // SRA is handled below as SRL + sign fixup, so the i16 shift itself
        // is always a logical one.
        unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
        unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

        // Create the mask using vXi16 shifts. For shift-rights we need to move
        // the upper byte down before splatting the vXi8 mask.
        SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
        BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
                                      BaseShAmt, Subtarget, DAG);
        if (Opcode != ISD::SHL)
          BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
                                               8, DAG);
        BitMask = DAG.getBitcast(VT, BitMask);
        BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
                                       SmallVector<int, 64>(NumElts, 0));

        SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
                                          DAG.getBitcast(ExtVT, R), BaseShAmt,
                                          Subtarget, DAG);
        Res = DAG.getBitcast(VT, Res);
        Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);

        if (Opcode == ISD::SRA) {
          // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
          // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
          SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
          SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
                                         BaseShAmt, Subtarget, DAG);
          SignMask = DAG.getBitcast(VT, SignMask);
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
        }
        return Res;
      }
    }
  }

  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
    std::vector<SDValue> Vals(Ratio);
    for (unsigned i = 0; i != Ratio; ++i)
      Vals[i] = Amt.getOperand(i);
    // Verify every i64 lane is built from the same group of narrow elements,
    // i.e. the amount really is uniform.
    for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
      for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }

    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
  }
  return SDValue();
}
28375
// Convert a shift/rotate left amount to a multiplication scale factor,
// i.e. build a vector of (1 << Amt[i]) so SHL can be lowered as a MUL.
// Returns an empty SDValue for unsupported types.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Amt.getSimpleValueType();
  if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
        (Subtarget.hasInt256() && VT == MVT::v16i16) ||
        (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
        (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
    return SDValue();

  // Constant amounts: compute each power of two at compile time.
  if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
    SmallVector<SDValue, 8> Elts;
    MVT SVT = VT.getVectorElementType();
    unsigned SVTBits = SVT.getSizeInBits();
    APInt One(SVTBits, 1);
    unsigned NumElems = VT.getVectorNumElements();

    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Op = Amt->getOperand(i);
      if (Op->isUndef()) {
        Elts.push_back(Op);
        continue;
      }

      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
      APInt C(SVTBits, ND->getZExtValue());
      uint64_t ShAmt = C.getZExtValue();
      // Out-of-range shifts produce undef lanes.
      if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
    }
    return DAG.getBuildVector(VT, dl, Elts);
  }

  // If the target doesn't support variable shifts, use either FP conversion
  // or integer multiplication to avoid shifting each element individually.
  if (VT == MVT::v4i32) {
    // Place Amt in the float exponent field (bias 127 == 0x3f800000 is 1.0f),
    // giving 2^Amt after conversion back to integer.
    Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
                      DAG.getConstant(0x3f800000U, dl, VT));
    Amt = DAG.getBitcast(MVT::v4f32, Amt);
    return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
  }

  // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
  if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
    // Recurse on each unpacked v4i32 half, then pack the low words back.
    SDValue Z = DAG.getConstant(0, dl, VT);
    SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
    SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
    Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
    Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
    if (Subtarget.hasSSE41())
      return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);

    return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
                                DAG.getBitcast(VT, Hi),
                                {0, 2, 4, 6, 8, 10, 12, 14});
  }

  return SDValue();
}
28440
/// Custom lower vector shifts (ISD::SHL/SRL/SRA). Strategies are attempted
/// in decreasing order of preference: uniform immediate/scalar shifts,
/// native per-element variable shifts, XOP shifts, v2i64 per-lane
/// decomposition, two-immediate-shifts + blend, multiply-based scaling,
/// widening to a larger element type, bit-serial VSELECT sequences, and
/// finally splitting into narrower vectors.
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);   // Value being shifted.
  SDValue Amt = Op.getOperand(1); // Per-element shift amounts.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

  unsigned Opc = Op.getOpcode();
  // Target opcodes for the uniform variable (X86OpcV) and uniform immediate
  // (X86OpcI) forms of this generic shift opcode.
  unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
  unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);

  assert(VT.isVector() && "Custom lowering only for vector shifts!")((void)0);
  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!")((void)0);

  // Splat-immediate amounts handled directly.
  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
    return V;

  // Uniform (but non-constant) amounts handled via the scalar-amount forms.
  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
    return V;

  // Native per-element variable shifts (e.g. AVX2 VPSLLV*) - leave as-is.
  if (SupportedVectorVarShift(VT, Subtarget, Opc))
    return Op;

  // XOP has 128-bit variable logical/arithmetic shifts.
  // +ve/-ve Amt = shift left/right.
  if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
                             VT == MVT::v8i16 || VT == MVT::v16i8)) {
    if (Opc == ISD::SRL || Opc == ISD::SRA) {
      // Right shifts are expressed as negative amounts on XOP.
      SDValue Zero = DAG.getConstant(0, dl, VT);
      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
    }
    if (Opc == ISD::SHL || Opc == ISD::SRL)
      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
    if (Opc == ISD::SRA)
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
  }

  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
  // shifts per-lane and then shuffle the partial results back together.
  if (VT == MVT::v2i64 && Opc != ISD::SRA) {
    // Splat the shift amounts so the scalar shifts above will catch it.
    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
  }

  // i64 vector arithmetic shift can be emulated with the transform:
  // M = lshr(SIGN_MASK, Amt)
  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
      Opc == ISD::SRA) {
    SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
    return R;
  }

  // If possible, lower this shift as a sequence of two shifts by
  // constant plus a BLENDing shuffle instead of scalarizing it.
  // Example:
  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
  //
  // Could be rewritten as:
  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  //
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes in parallel before blending.
  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
                      (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
    // Collect (at most) two distinct shift amounts and the per-element
    // shuffle mask selecting between the two shifted results.
    SDValue Amt1, Amt2;
    unsigned NumElts = VT.getVectorNumElements();
    SmallVector<int, 8> ShuffleMask;
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue A = Amt->getOperand(i);
      if (A.isUndef()) {
        ShuffleMask.push_back(SM_SentinelUndef);
        continue;
      }
      if (!Amt1 || Amt1 == A) {
        ShuffleMask.push_back(i);
        Amt1 = A;
        continue;
      }
      if (!Amt2 || Amt2 == A) {
        ShuffleMask.push_back(i + NumElts);
        Amt2 = A;
        continue;
      }
      // More than two distinct amounts - give up (mask stays short).
      break;
    }

    // Only perform this blend if we can perform it without loading a mask.
    if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
        (VT != MVT::v16i16 ||
         is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
        (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
         canWidenShuffleElements(ShuffleMask))) {
      auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
      auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
      if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
          Cst2->getAPIntValue().ult(EltSizeInBits)) {
        SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
                                                    Cst1->getZExtValue(), DAG);
        SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
                                                    Cst2->getZExtValue(), DAG);
        return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
      }
    }
  }

  // If possible, lower this packed shift into a vector multiply instead of
  // expanding it into a sequence of scalar shifts.
  if (Opc == ISD::SHL)
    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);

  // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
  // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
  if (Opc == ISD::SRL && ConstantAmt &&
      (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
    SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
    SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
    if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
      // MULHU is wrong for a zero amount (would shift by EltSizeInBits);
      // select the original value for those lanes.
      SDValue Zero = DAG.getConstant(0, dl, VT);
      SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
      SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
      return DAG.getSelect(dl, VT, ZAmt, R, Res);
    }
  }

  // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
  // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
  // TODO: Special case handling for shift by 0/1, really we can afford either
  // of these cases in pre-SSE41/XOP/AVX512 but not both.
  if (Opc == ISD::SRA && ConstantAmt &&
      (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
      ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
        !Subtarget.hasAVX512()) ||
       DAG.isKnownNeverZero(Amt))) {
    SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
    SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
    if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
      // MULHS is wrong for amounts of 0 and 1 - patch those lanes with
      // selects of R and (R >>s 1) respectively.
      SDValue Amt0 =
          DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
      SDValue Amt1 =
          DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
      SDValue Sra1 =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
      SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
      Res = DAG.getSelect(dl, VT, Amt0, R, Res);
      return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
    }
  }

  // v4i32 Non Uniform Shifts.
  // If the shift amount is constant we can shift each lane using the SSE2
  // immediate shifts, else we need to zero-extend each lane to the lower i64
  // and shift using the SSE2 variable shifts.
  // The separate results can then be blended together.
  if (VT == MVT::v4i32) {
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      // Splat each amount so the per-lane shifts fold to immediates.
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. On AVX we're better off
      // just zero-extending, but for SSE just duplicating the top 16-bits is
      // cheaper and has the same effect for out of range values.
      if (Subtarget.hasAVX()) {
        SDValue Z = DAG.getConstant(0, dl, VT);
        Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
        Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
        Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
        Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
      } else {
        SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
        SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
                                             {4, 5, 6, 7, -1, -1, -1, -1});
        Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
                                    {0, 1, 1, 1, -1, -1, -1, -1});
        Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
                                    {2, 3, 3, 3, -1, -1, -1, -1});
        Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
                                    {0, 1, 1, 1, -1, -1, -1, -1});
        Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
                                    {2, 3, 3, 3, -1, -1, -1, -1});
      }
    }

    unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
    SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
    SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
    SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
    SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));

    // Merge the shifted lane results optimally with/without PBLENDW.
    // TODO - ideally shuffle combining would handle this.
    if (Subtarget.hasSSE41()) {
      SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
      SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
      return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
    }
    SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
    SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
  }

  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
  // make the existing SSE solution better.
  // NOTE: We honor prefered vector width before promoting to 512-bits.
  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
      (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
      (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
      (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
      (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
    assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&((void)0)
           "Unexpected vector type")((void)0);
    MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
    // SRA needs sign-extension of the value; the amount is always zext.
    unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
    Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Opc, dl, ExtVT, R, Amt));
  }

  // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
  // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
  if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
      (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
       (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
      !Subtarget.hasXOP()) {
    int NumElts = VT.getVectorNumElements();
    SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);

    // Extend constant shift amount to vXi16 (it doesn't matter if the type
    // isn't legal).
    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
    Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
    // Build per-element multipliers 2^(8 - Amt) (constant folded).
    Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
    Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&((void)0)
           "Constant build vector expected")((void)0);

    if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
      // Direct widen, multiply, shift the product down, narrow back.
      R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
                          : DAG.getZExtOrTrunc(R, dl, ExVT);
      R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
      R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
      return DAG.getZExtOrTrunc(R, dl, VT);
    }

    // Otherwise split amounts into the lo/hi halves of each 128-bit lane to
    // match the unpacked value layout below.
    SmallVector<SDValue, 16> LoAmt, HiAmt;
    for (int i = 0; i != NumElts; i += 16) {
      for (int j = 0; j != 8; ++j) {
        LoAmt.push_back(Amt.getOperand(i + j));
        HiAmt.push_back(Amt.getOperand(i + j + 8));
      }
    }

    MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
    SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
    SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);

    // Unpack each byte into the high byte of an i16 (sign/zero data in the
    // top), shift into place, multiply, shift product down and re-pack.
    SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
    SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
    LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
    HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
    LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
    HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
    LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
    HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
  }

  if (VT == MVT::v16i8 ||
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);

    // Select V0/V1 per byte depending on the sign bit of Sel.
    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (VT.is512BitVector()) {
        // On AVX512BW targets we make use of the fact that VSELECT lowers
        // to a masked blend which selects bytes based just on the sign bit
        // extracted to a mask.
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
                           ISD::SETGT);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      } else if (Subtarget.hasSSE41()) {
        // On SSE41 targets we can use PBLENDVB which selects bytes based just
        // on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT,
                              DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = DAG.getConstant(0, dl, SelVT);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getSelect(dl, SelVT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
    Amt = DAG.getBitcast(VT, Amt);

    if (Opc == ISD::SHL || Opc == ISD::SRL) {
      // r = VSELECT(r, shift(r, 4), a);
      SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // r = VSELECT(r, shift(r, 2), a);
      M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // return VSELECT(r, shift(r, 1), a);
      M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }

    if (Opc == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of a i16 vector
      // so we can correctly sign extend. We don't care what happens to the
      // lower byte.
      SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
      SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
      ALo = DAG.getBitcast(ExtVT, ALo);
      AHi = DAG.getBitcast(ExtVT, AHi);
      RLo = DAG.getBitcast(ExtVT, RLo);
      RHi = DAG.getBitcast(ExtVT, RHi);

      // r = VSELECT(r, shift(r, 4), a);
      SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
      SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 2), a);
      MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
      MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 1), a);
      MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
      MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // Logical shift the result back to the lower byte, leaving a zero upper
      // byte meaning that we can safely pack with PACKUSWB.
      RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
      RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }

  // v16i16 on AVX2: unpack each i16 into the high half of an i32, shift in
  // i32 (the low garbage half shifts out), shift results back down and pack.
  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
    MVT ExtVT = MVT::v8i32;
    SDValue Z = DAG.getConstant(0, dl, VT);
    SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
    SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
    SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
    SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
    ALo = DAG.getBitcast(ExtVT, ALo);
    AHi = DAG.getBitcast(ExtVT, AHi);
    RLo = DAG.getBitcast(ExtVT, RLo);
    RHi = DAG.getBitcast(ExtVT, RHi);
    SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
    SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
    Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
    Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  if (VT == MVT::v8i16) {
    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
    bool UseSSE41 = Subtarget.hasSSE41() &&
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we can use PBLENDVB which selects bytes based just on
      // the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
        V0 = DAG.getBitcast(ExtVT, V0);
        V1 = DAG.getBitcast(ExtVT, V1);
        Sel = DAG.getBitcast(ExtVT, Sel);
        return DAG.getBitcast(
            VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
      // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
      return DAG.getSelect(dl, VT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
    if (UseSSE41) {
      // On SSE41 targets we need to replicate the shift mask in both
      // bytes for PBLENDVB.
      Amt = DAG.getNode(
          ISD::OR, dl, VT,
          getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
          getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
    } else {
      Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
    }

    // r = VSELECT(r, shift(r, 8), a);
    SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 4), a);
    M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 2), a);
    M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // return VSELECT(r, shift(r, 1), a);
    M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
    R = SignBitSelect(Amt, M, R);
    return R;
  }

  // Decompose 256-bit shifts into 128-bit shifts.
  if (VT.is256BitVector())
    return splitVectorIntBinary(Op, DAG);

  // Likewise decompose 512-bit types the generic code can't handle.
  if (VT == MVT::v32i16 || VT == MVT::v64i8)
    return splitVectorIntBinary(Op, DAG);

  return SDValue();
}
28926
/// Custom lower vector rotates (ISD::ROTL/ROTR). Handles constant-splat
/// amounts via immediate rotate instructions where available (AVX512/XOP),
/// funnel shifts on VBMI2, a bit-serial select sequence for vXi8,
/// shl/srl/or expansion for variable amounts, and a multiply-based
/// expansion otherwise.
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.isVector() && "Custom lowering only for vector rotates!")((void)0);

  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);   // Value being rotated.
  SDValue Amt = Op.getOperand(1); // Per-element rotate amounts.
  unsigned Opcode = Op.getOpcode();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  int NumElts = VT.getVectorNumElements();

  // Check for constant splat rotation amount.
  APInt CstSplatValue;
  bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);

  // Check for splat rotate by zero.
  if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
    return R;

  // AVX512 implicitly uses modulo rotation amounts.
  if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
    // Attempt to rotate by immediate.
    if (IsCstSplat) {
      unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
      uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
      return DAG.getNode(RotOpc, DL, VT, R,
                         DAG.getTargetConstant(RotAmt, DL, MVT::i8));
    }

    // Else, fall-back on VPROLV/VPRORV.
    return Op;
  }

  // AVX512 VBMI2 vXi16 - lower to funnel shifts.
  if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
    unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
    return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
  }

  // Everything below only handles ROTL (ROTR is canonicalized earlier).
  assert((Opcode == ISD::ROTL) && "Only ROTL supported")((void)0);

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
  // XOP implicitly uses modulo rotation amounts.
  if (Subtarget.hasXOP()) {
    if (VT.is256BitVector())
      return splitVectorIntBinary(Op, DAG);
    assert(VT.is128BitVector() && "Only rotate 128-bit vectors!")((void)0);

    // Attempt to rotate by immediate.
    if (IsCstSplat) {
      uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
                         DAG.getTargetConstant(RotAmt, DL, MVT::i8));
    }

    // Use general rotate by variable (per-element).
    return Op;
  }

  // Split 256-bit integers on pre-AVX2 targets.
  if (VT.is256BitVector() && !Subtarget.hasAVX2())
    return splitVectorIntBinary(Op, DAG);

  assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||((void)0)
          ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||((void)0)
            VT == MVT::v32i16) &&((void)0)
           Subtarget.hasAVX2())) &&((void)0)
         "Only vXi32/vXi16/vXi8 vector rotates supported")((void)0);

  // Rotate by an uniform constant - expand back to shifts.
  if (IsCstSplat)
    return SDValue();

  bool IsSplatAmt = DAG.isSplatValue(Amt);

  // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
  // the amount bit.
  if (EltSizeInBits == 8 && !IsSplatAmt) {
    // Constant vXi8 amounts are better served by the fallback paths below.
    if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
      return SDValue();

    // We don't need ModuloAmt here as we just peek at individual bits.
    MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

    // Select V0/V1 per byte depending on the sign bit of Sel.
    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (Subtarget.hasSSE41()) {
        // On SSE41 targets we can use PBLENDVB which selects bytes based just
        // on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT,
                              DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = DAG.getConstant(0, DL, SelVT);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
      return DAG.getSelect(DL, SelVT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
    Amt = DAG.getBitcast(VT, Amt);

    // r = VSELECT(r, rot(r, 4), a);
    SDValue M;
    M = DAG.getNode(
        ISD::OR, DL, VT,
        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
    R = SignBitSelect(VT, Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

    // r = VSELECT(r, rot(r, 2), a);
    M = DAG.getNode(
        ISD::OR, DL, VT,
        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
    R = SignBitSelect(VT, Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

    // return VSELECT(r, rot(r, 1), a);
    M = DAG.getNode(
        ISD::OR, DL, VT,
        DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
        DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
    return SignBitSelect(VT, Amt, M, R);
  }

  // ISD::ROT* uses modulo rotate amounts.
  if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) {
    // If the amount is a splat, perform the modulo BEFORE the splat,
    // this helps LowerScalarVariableShift to remove the splat later.
    Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt);
    Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
                      DAG.getConstant(EltSizeInBits - 1, DL, VT));
    Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT),
                               SmallVector<int>(NumElts, 0));
  } else {
    Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
                      DAG.getConstant(EltSizeInBits - 1, DL, VT));
  }

  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
  bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
                        SupportedVectorVarShift(VT, Subtarget, ISD::SRL);

  // Fallback for splats + all supported variable shifts.
  // Fallback for non-constants AVX2 vXi16 as well.
  if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
    // rotl(R, Amt) == (R << Amt) | (R >> (EltSizeInBits - Amt)).
    SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
    AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
    SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
    SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
    return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
  }

  // As with shifts, convert the rotation amount to a multiplication factor.
  SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
  assert(Scale && "Failed to convert ROTL amount to scale")((void)0);

  // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
  if (EltSizeInBits == 16) {
    SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
    SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
    return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
  }

  // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
  // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
  // that can then be OR'd with the lower 32-bits.
  assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected")((void)0);
  static const int OddMask[] = {1, -1, 3, -1};
  SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
  SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);

  SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
                              DAG.getBitcast(MVT::v2i64, R),
                              DAG.getBitcast(MVT::v2i64, Scale));
  SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
                              DAG.getBitcast(MVT::v2i64, R13),
                              DAG.getBitcast(MVT::v2i64, Scale13));
  Res02 = DAG.getBitcast(VT, Res02);
  Res13 = DAG.getBitcast(VT, Res13);

  // Interleave the low halves (rotated-left bits) with the high halves
  // (wrapped-around bits) and OR them together.
  return DAG.getNode(ISD::OR, DL, VT,
                     DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
                     DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
}
29127
29128/// Returns true if the operand type is exactly twice the native width, and
29129/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
29130/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
29131/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
29132bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
29133 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
29134
29135 if (OpWidth == 64)
29136 return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
29137 if (OpWidth == 128)
29138 return Subtarget.hasCmpxchg16b();
29139
29140 return false;
29141}
29142
29143bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29144 Type *MemType = SI->getValueOperand()->getType();
29145
29146 bool NoImplicitFloatOps =
29147 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29148 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29149 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29150 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29151 return false;
29152
29153 return needsCmpXchgNb(MemType);
29154}
29155
29156// Note: this turns large loads into lock cmpxchg8b/16b.
29157// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
29158TargetLowering::AtomicExpansionKind
29159X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29160 Type *MemType = LI->getType();
29161
29162 // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
29163 // can use movq to do the load. If we have X87 we can load into an 80-bit
29164 // X87 register and store it to a stack temporary.
29165 bool NoImplicitFloatOps =
29166 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29167 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29168 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29169 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29170 return AtomicExpansionKind::None;
29171
29172 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29173 : AtomicExpansionKind::None;
29174}
29175
/// Decide how AtomicExpand should lower an atomicrmw: leave it for ISel
/// (None) or rewrite it as a cmpxchg loop (CmpXChg) when x86 has no single
/// LOCK-prefixed instruction for the operation.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  // Native LOCK-prefixed RMW instructions only exist up to the pointer width.
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                   : AtomicExpansionKind::None;
  }

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation")__builtin_unreachable();
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
                            : AtomicExpansionKind::None;
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
    // These always require a non-trivial set of data operations on x86. We must
    // use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
  }
}
29216
/// Try to replace an "idempotent" atomicrmw (e.g. `or x, 0`) whose result is
/// used with an mfence followed by a plain atomic load, which is cheaper than
/// a LOCK'ed RMW. Returns the replacement load (the original atomicrmw is
/// erased from the function) or nullptr to use the default expansion.
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
  // lowering available in lowerAtomicArith.
  // TODO: push more cases through this path.
  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
        AI->use_empty())
      return nullptr;

  IRBuilder<> Builder(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SSID = AI->getSyncScopeID();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  // x.store(1, relaxed);
  // r1 = y.fetch_add(0, release);
  // Thread 1:
  // y.fetch_add(42, acquire);
  // r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SSID == SyncScope::SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  // Emit: mfence; load atomic.
  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(
      AI->getType(), AI->getPointerOperand(), AI->getAlign());
  Loaded->setAtomic(Order, SSID);
  // Replace all uses and delete the RMW; callers expect it gone on success.
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}
29282
29283bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
29284 if (!SI.isUnordered())
29285 return false;
29286 return ExperimentalUnorderedISEL;
29287}
29288bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
29289 if (!LI.isUnordered())
29290 return false;
29291 return ExperimentalUnorderedISEL;
29292}
29293
29294
29295/// Emit a locked operation on a stack location which does not change any
29296/// memory location, but does involve a lock prefix. Location is chosen to be
29297/// a) very likely accessed only by a single thread to minimize cache traffic,
29298/// and b) definitely dereferenceable. Returns the new Chain result.
29299static SDValue emitLockedStackOp(SelectionDAG &DAG,
29300 const X86Subtarget &Subtarget, SDValue Chain,
29301 const SDLoc &DL) {
29302 // Implementation notes:
29303 // 1) LOCK prefix creates a full read/write reordering barrier for memory
29304 // operations issued by the current processor. As such, the location
29305 // referenced is not relevant for the ordering properties of the instruction.
29306 // See: Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
29307 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
29308 // 2) Using an immediate operand appears to be the best encoding choice
29309 // here since it doesn't require an extra register.
29310 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
29311 // is small enough it might just be measurement noise.)
29312 // 4) When choosing offsets, there are several contributing factors:
29313 // a) If there's no redzone, we default to TOS. (We could allocate a cache
29314 // line aligned stack object to improve this case.)
29315 // b) To minimize our chances of introducing a false dependence, we prefer
29316 // to offset the stack usage from TOS slightly.
29317 // c) To minimize concerns about cross thread stack usage - in particular,
29318 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
29319 // captures state in the TOS frame and accesses it from many threads -
29320 // we want to use an offset such that the offset is in a distinct cache
29321 // line from the TOS frame.
29322 //
29323 // For a general discussion of the tradeoffs and benchmark results, see:
29324 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
29325
29326 auto &MF = DAG.getMachineFunction();
29327 auto &TFL = *Subtarget.getFrameLowering();
29328 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
29329
29330 if (Subtarget.is64Bit()) {
29331 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29332 SDValue Ops[] = {
29333 DAG.getRegister(X86::RSP, MVT::i64), // Base
29334 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29335 DAG.getRegister(0, MVT::i64), // Index
29336 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29337 DAG.getRegister(0, MVT::i16), // Segment.
29338 Zero,
29339 Chain};
29340 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29341 MVT::Other, Ops);
29342 return SDValue(Res, 1);
29343 }
29344
29345 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29346 SDValue Ops[] = {
29347 DAG.getRegister(X86::ESP, MVT::i32), // Base
29348 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29349 DAG.getRegister(0, MVT::i32), // Index
29350 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29351 DAG.getRegister(0, MVT::i16), // Segment.
29352 Zero,
29353 Chain
29354 };
29355 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29356 MVT::Other, Ops);
29357 return SDValue(Res, 1);
29358}
29359
29360static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
29361 SelectionDAG &DAG) {
29362 SDLoc dl(Op);
29363 AtomicOrdering FenceOrdering =
29364 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
29365 SyncScope::ID FenceSSID =
29366 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
29367
29368 // The only fence that needs an instruction is a sequentially-consistent
29369 // cross-thread fence.
29370 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
29371 FenceSSID == SyncScope::System) {
29372 if (Subtarget.hasMFence())
29373 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
29374
29375 SDValue Chain = Op.getOperand(0);
29376 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
29377 }
29378
29379 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29380 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
29381}
29382
/// Lower an atomic compare-and-swap to X86ISD::LCMPXCHG_DAG. The expected
/// value is moved into the width-matched A-register (AL/AX/EAX/RAX), the
/// locked compare-exchange runs, and the old value plus a success flag
/// (ZF via SETE) are merged into the node's results.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;  // Accumulator register implicitly used by CMPXCHG.
  unsigned size = 0; // Operand size in bytes, passed as an immediate below.
  switch(T.SimpleTy) {
  default: llvm_unreachable("Invalid value type!")__builtin_unreachable();
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    // i64 cmpxchg is only legal on 64-bit targets (32-bit uses cmpxchg8b).
    assert(Subtarget.is64Bit() && "Node not type legal!")((void)0);
    Reg = X86::RAX; size = 8;
    break;
  }
  // Copy the expected value (operand 2) into the accumulator register.
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  // Operands: glued chain, pointer (operand 1), new value (operand 3),
  // size immediate, and the glue from the CopyToReg.
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  // Read the old memory value back out of the accumulator, then EFLAGS;
  // the glue chain keeps the flag read adjacent to the cmpxchg.
  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  // Success == ZF set (comparison matched and the swap happened).
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
                     cpOut, Success, EFLAGS.getValue(1));
}
29420
29421// Create MOVMSKB, taking into account whether we need to split for AVX1.
29422static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
29423 const X86Subtarget &Subtarget) {
29424 MVT InVT = V.getSimpleValueType();
29425
29426 if (InVT == MVT::v64i8) {
29427 SDValue Lo, Hi;
29428 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29429 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
29430 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
29431 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
29432 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
29433 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
29434 DAG.getConstant(32, DL, MVT::i8));
29435 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
29436 }
29437 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29438 SDValue Lo, Hi;
29439 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29440 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29441 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29442 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29443 DAG.getConstant(16, DL, MVT::i8));
29444 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29445 }
29446
29447 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29448}
29449
/// Custom lowering for ISD::BITCAST in cases the generic legalizer handles
/// poorly: i64 -> v64i1 on 32-bit BWI targets, vXi1 -> scalar via MOVMSK,
/// and casts involving f64 / x86mmx from small vectors or i64.
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
  // half to v32i1 and concatenating the result.
  if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
    assert(!Subtarget.is64Bit() && "Expected 32-bit mode")((void)0);
    assert(Subtarget.hasBWI() && "Expected BWI target")((void)0);
    SDLoc dl(Op);
    // EXTRACT_ELEMENT with index 0/1 yields the low/high 32-bit half.
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
                             DAG.getIntPtrConstant(0, dl));
    Lo = DAG.getBitcast(MVT::v32i1, Lo);
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
                             DAG.getIntPtrConstant(1, dl));
    Hi = DAG.getBitcast(MVT::v32i1, Hi);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
  }

  // Use MOVMSK for vector to scalar conversion to prevent scalarization.
  if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
    assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512")((void)0);
    // Sign-extend each i1 to a byte so MOVMSK can collect the sign bits.
    MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
    SDLoc DL(Op);
    SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
    V = getPMOVMSKB(DL, V, DAG, Subtarget);
    return DAG.getZExtOrTrunc(V, DL, DstVT);
  }

  assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||((void)0)
          SrcVT == MVT::i64) && "Unexpected VT!")((void)0);

  assert(Subtarget.hasSSE2() && "Requires at least SSE2!")((void)0);
  // Only i64 -> f64 and small-vector -> x86mmx are handled below.
  if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
      !(DstVT == MVT::x86mmx && SrcVT.isVector()))
    // This conversion needs to be expanded.
    return SDValue();

  SDLoc dl(Op);
  if (SrcVT.isVector()) {
    // Widen the vector in input in the case of MVT::v2i32.
    // Example: from MVT::v2i32 to MVT::v4i32.
    MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
                                 SrcVT.getVectorNumElements() * 2);
    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
                      DAG.getUNDEF(SrcVT));
  } else {
    assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&((void)0)
           "Unexpected source type in LowerBITCAST")((void)0);
    Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
  }

  // Work in a 128-bit 2x64 type, then extract the low element / low half.
  MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
  Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);

  if (DstVT == MVT::x86mmx)
    return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
                     DAG.getIntPtrConstant(0, dl));
}
29513
29514/// Compute the horizontal sum of bytes in V for the elements of VT.
29515///
29516/// Requires V to be a byte vector and VT to be an integer vector type with
29517/// wider elements than V's type. The width of the elements of VT determines
29518/// how many bytes of V are summed horizontally to produce each element of the
29519/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&((void)0)
         "Expected value to have byte element type.")((void)0);
  assert(EltVT != MVT::i8 &&((void)0)
         "Horizontal byte sum only makes sense for wider elements!")((void)0);
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!")((void)0);

  // PSADBW instruction horizontally add all bytes and leave the result in i64
  // chunks, thus directly computes the pop count for v2i64 and v4i64.
  if (EltVT == MVT::i64) {
    // PSADBW against zero is the plain horizontal byte sum of each 64-bit
    // chunk (sum of absolute differences with 0).
    SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }

  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = DAG.getConstant(0, DL, VT);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
    SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = DAG.getConstant(0, DL, ByteVecVT);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }

  // The only element type left is i16.
  assert(EltVT == MVT::i16 && "Unknown how to handle type")((void)0);

  // To obtain pop count for each i16 element starting from the pop count for
  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
  // right by 8. It is important to shift as i16s as i8 vector shift isn't
  // directly supported.
  SDValue ShifterV = DAG.getConstant(8, DL, VT);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
                  DAG.getBitcast(ByteVecVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
29583
29584static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
29585 const X86Subtarget &Subtarget,
29586 SelectionDAG &DAG) {
29587 MVT VT = Op.getSimpleValueType();
29588 MVT EltVT = VT.getVectorElementType();
29589 int NumElts = VT.getVectorNumElements();
29590 (void)EltVT;
29591 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.")((void)0);
29592
29593 // Implement a lookup table in register by using an algorithm based on:
29594 // http://wm.ite.pl/articles/sse-popcount.html
29595 //
29596 // The general idea is that every lower byte nibble in the input vector is an
29597 // index into a in-register pre-computed pop count table. We then split up the
29598 // input vector in two new ones: (1) a vector with only the shifted-right
29599 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
29600 // masked out higher ones) for each byte. PSHUFB is used separately with both
29601 // to index the in-register table. Next, both are added and the result is a
29602 // i8 vector where each element contains the pop count for input byte.
29603 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
29604 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
29605 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
29606 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
29607
29608 SmallVector<SDValue, 64> LUTVec;
29609 for (int i = 0; i < NumElts; ++i)
29610 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29611 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
29612 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
29613
29614 // High nibbles
29615 SDValue FourV = DAG.getConstant(4, DL, VT);
29616 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
29617
29618 // Low nibbles
29619 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
29620
29621 // The input vector is used as the shuffle mask that index elements into the
29622 // LUT. After counting low and high nibbles, add the vector to obtain the
29623 // final pop count per i8 element.
29624 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
29625 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
29626 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
29627}
29628
29629// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
29630// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
/// Lower vector CTPOP, choosing between native VPOPCNT, splitting into
/// narrower ops, a vXi8 pop count plus horizontal byte sum, or the PSHUFB
/// LUT approach.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&((void)0)
         "Unknown CTPOP type to handle")((void)0);
  SDLoc DL(Op.getNode());
  SDValue Op0 = Op.getOperand(0);

  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
  if (Subtarget.hasVPOPCNTDQ()) {
    unsigned NumElems = VT.getVectorNumElements();
    assert((VT.getVectorElementType() == MVT::i8 ||((void)0)
            VT.getVectorElementType() == MVT::i16) && "Unexpected type")((void)0);
    // NumElems == 16 widens to v16i32, which needs 512-bit vector support.
    if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
    }
  }

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return splitVectorIntUnary(Op, DAG);

  // For element types greater than i8, do vXi8 pop counts and a bytesum.
  if (VT.getScalarType() != MVT::i8) {
    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
    SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
    SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
    return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
  }

  // We can't use the fast LUT approach, so fall back on LegalizeDAG.
  if (!Subtarget.hasSSSE3())
    return SDValue();

  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
29674
/// Entry point for custom CTPOP lowering; only vector types are custom
/// lowered here (scalar CTPOP is handled elsewhere).
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().isVector() &&((void)0)
         "We only do custom lowering for vector population count.")((void)0);
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}
29681
/// Lower BITREVERSE on XOP targets using VPPERM, which can permute bytes and
/// bit-reverse each byte in a single instruction.
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, its still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    // Pad the scalar out to a full 128-bit vector, reverse, extract lane 0.
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return splitVectorIntUnary(Op, DAG);

  assert(VT.is128BitVector() &&((void)0)
         "Only 128-bit vector bitreverse lowering supported.")((void)0);

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // Its best to shuffle using the second operand as this will implicitly allow
  // memory folding for multiple vectors.
  SmallVector<SDValue, 16> MaskElts;
  for (int i = 0; i != NumElts; ++i) {
    // Visit the bytes of each element high-to-low: this byte-swaps the
    // element while VPPERM's op-2 selector bit-reverses each byte.
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j; // 16+: second source
      int PermuteByte = SourceByte | (2 << 5);           // op 2 = reverse bits
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}
29726
/// Lower BITREVERSE for byte vectors, using XOP's VPPERM, GFNI's
/// GF2P8AFFINEQB, or a pair of PSHUFB nibble-LUT lookups.
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // XOP targets get the dedicated VPPERM path (no 512-bit XOP ops exist).
  if (Subtarget.hasXOP() && !VT.is512BitVector())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE")((void)0);

  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  assert(VT.getScalarType() == MVT::i8 &&((void)0)
         "Only byte vector BITREVERSE supported")((void)0);

  // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
  if (VT == MVT::v64i8 && !Subtarget.hasBWI())
    return splitVectorIntUnary(Op, DAG);

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT == MVT::v32i8 && !Subtarget.hasInt256())
    return splitVectorIntUnary(Op, DAG);

  unsigned NumElts = VT.getVectorNumElements();

  // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
  if (Subtarget.hasGFNI()) {
    // 0x8040201008040201 is the affine matrix that permutes each byte's bits
    // into reverse order.
    MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
    SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
    Matrix = DAG.getBitcast(VT, Matrix);
    return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
                       DAG.getTargetConstant(0, DL, MVT::i8));
  }

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  // LoLUT[n] = bitreverse(n) positioned in the high nibble;
  // HiLUT[n] = bitreverse(n) positioned in the low nibble.
  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

  // Broadcast each LUT across every 16-byte lane.
  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  // OR the two reversed nibbles back into the reversed byte.
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
29791
/// Lower ISD::PARITY by XOR-folding the input down to a single byte and then
/// reading the parity flag (PF) with SETNP. x86's PF only reflects the low
/// 8 bits of a result, hence the folding.
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);
  MVT VT = Op.getSimpleValueType();

  // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
  if (VT == MVT::i8 ||
      DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
    X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
    SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
                                DAG.getConstant(0, DL, MVT::i8));
    // Copy the inverse of the parity flag into a register with setcc.
    SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
    // Extend to the original type.
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
  }

  if (VT == MVT::i64) {
    // Xor the high and low 32-bit halves together using a 32-bit operation.
    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
                             DAG.getNode(ISD::SRL, DL, MVT::i64, X,
                                         DAG.getConstant(32, DL, MVT::i8)));
    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
  }

  if (VT != MVT::i16) {
    // Xor the high and low 16-bits together using a 32-bit operation.
    SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
                               DAG.getConstant(16, DL, MVT::i8));
    X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
  } else {
    // If the input is 16-bits, we need to extend to use an i32 shift below.
    X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
  }

  // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
  // This should allow an h-reg to be used to save a shift.
  SDValue Hi = DAG.getNode(
      ISD::TRUNCATE, DL, MVT::i8,
      DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
  // Use the flag-producing X86ISD::XOR and take result 1 (EFLAGS).
  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);

  // Copy the inverse of the parity flag into a register with setcc.
  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
  // Extend to the original type.
  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
}
29843
// Translate an ISD::ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node into the matching
// X86-specific LOCK-prefixed node (X86ISD::L{ADD,SUB,OR,XOR,AND}).  The new
// node produces an i32 result plus a chain; callers only use the chain (the
// arithmetic result value is expected to be unused).
29844 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
29845 const X86Subtarget &Subtarget) {
29846 unsigned NewOpc = 0;
// Select the LOCK-prefixed X86 opcode corresponding to the generic atomic op.
29847 switch (N->getOpcode()) {
29848 case ISD::ATOMIC_LOAD_ADD:
29849 NewOpc = X86ISD::LADD;
29850 break;
29851 case ISD::ATOMIC_LOAD_SUB:
29852 NewOpc = X86ISD::LSUB;
29853 break;
29854 case ISD::ATOMIC_LOAD_OR:
29855 NewOpc = X86ISD::LOR;
29856 break;
29857 case ISD::ATOMIC_LOAD_XOR:
29858 NewOpc = X86ISD::LXOR;
29859 break;
29860 case ISD::ATOMIC_LOAD_AND:
29861 NewOpc = X86ISD::LAND;
29862 break;
29863 default:
29864 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode")__builtin_unreachable();
29865 }
29866
// Carry the original memory operand over so ordering/alias info is preserved.
29867 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
29868
// Operands are (chain, pointer, value); MemVT is the node's scalar result type.
29869 return DAG.getMemIntrinsicNode(
29870 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
29871 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
29872 /*MemVT=*/N->getSimpleValueType(0), MMO);
29873}
29874
29875 /// Lower atomic_load_ops into LOCK-prefixed operations.
/// Three cases are handled:
///  - result used:   only ATOMIC_LOAD_ADD (LXADD) can produce a value;
///    ATOMIC_LOAD_SUB is rewritten as an add of the negation, anything else
///    should already have been expanded to a cmpxchg loop by AtomicExpand.
///  - idempotent op (or p, 0): only the ordering matters, so it is lowered to
///    either a locked stack op (seq_cst, system scope) or a compiler-only
///    MEMBARRIER.
///  - otherwise: a LOCK-prefixed RMW whose value result is dead.
29876 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
29877 const X86Subtarget &Subtarget) {
29878 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
29879 SDValue Chain = N->getOperand(0);
29880 SDValue LHS = N->getOperand(1);
29881 SDValue RHS = N->getOperand(2);
29882 unsigned Opc = N->getOpcode();
29883 MVT VT = N->getSimpleValueType(0);
29884 SDLoc DL(N);
29885
29886 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
29887 // can only be lowered when the result is unused. They should have already
29888 // been transformed into a cmpxchg loop in AtomicExpand.
29889 if (N->hasAnyUseOfValue(0)) {
29890 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
29891 // select LXADD if LOCK_SUB can't be selected.
29892 if (Opc == ISD::ATOMIC_LOAD_SUB) {
// Negate RHS as (0 - RHS) and re-emit the node as an atomic add.
29893 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
29894 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
29895 RHS, AN->getMemOperand());
29896 }
29897 assert(Opc == ISD::ATOMIC_LOAD_ADD &&((void)0)
29898 "Used AtomicRMW ops other than Add should have been expanded!")((void)0);
// ATOMIC_LOAD_ADD with a used result is legal as-is (LXADD); keep the node.
29899 return N;
29900 }
29901
29902 // Specialized lowering for the canonical form of an idemptotent atomicrmw.
29903 // The core idea here is that since the memory location isn't actually
29904 // changing, all we need is a lowering for the *ordering* impacts of the
29905 // atomicrmw. As such, we can chose a different operation and memory
29906 // location to minimize impact on other code.
29907 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
29908 // On X86, the only ordering which actually requires an instruction is
29909 // seq_cst which isn't SingleThread, everything just needs to be preserved
29910 // during codegen and then dropped. Note that we expect (but don't assume),
29911 // that orderings other than seq_cst and acq_rel have been canonicalized to
29912 // a store or load.
29913 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
29914 AN->getSyncScopeID() == SyncScope::System) {
29915 // Prefer a locked operation against a stack location to minimize cache
29916 // traffic. This assumes that stack locations are very likely to be
29917 // accessed only by the owning thread.
29918 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
29919 assert(!N->hasAnyUseOfValue(0))((void)0);
29920 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29921 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29922 DAG.getUNDEF(VT), NewChain);
29923 }
29924 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29925 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
29926 assert(!N->hasAnyUseOfValue(0))((void)0);
29927 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29928 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29929 DAG.getUNDEF(VT), NewChain);
29930 }
29931
// Result is dead: emit the LOCK-prefixed RMW and forward only its chain.
29932 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
29933 // RAUW the chain, but don't worry about the result, as it's unused.
29934 assert(!N->hasAnyUseOfValue(0))((void)0);
29935 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29936 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29937 DAG.getUNDEF(VT), LockOp.getValue(1));
29938}
29939
// Lower an atomic store.  Non-seq_cst stores of legal types are kept as-is.
// Illegal i64 stores (i.e. on 32-bit targets) are emitted through an SSE
// register (VEXTRACT_STORE) or, failing that, an X87 FILD/FIST pair, both of
// which store 64 bits in one instruction.  Everything else falls back to an
// atomic exchange (XCHG), which is implicitly seq_cst.
29940 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
29941 const X86Subtarget &Subtarget) {
29942 auto *Node = cast<AtomicSDNode>(Op.getNode());
29943 SDLoc dl(Node);
29944 EVT VT = Node->getMemoryVT();
29945
29946 bool IsSeqCst =
29947 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
29948 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
29949
29950 // If this store is not sequentially consistent and the type is legal
29951 // we can just keep it.
29952 if (!IsSeqCst && IsTypeLegal)
29953 return Op;
29954
29955 if (VT == MVT::i64 && !IsTypeLegal) {
29956 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
29957 // is enabled.
29958 bool NoImplicitFloatOps =
29959 DAG.getMachineFunction().getFunction().hasFnAttribute(
29960 Attribute::NoImplicitFloat);
29961 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
// Chain stays null if neither the SSE nor the X87 branch fires below;
// the "if (Chain)" guard then routes to the XCHG fallback.
29962 SDValue Chain;
29963 if (Subtarget.hasSSE1()) {
// Move the i64 value into the low element of a vector register and use
// a 64-bit vector-extract store (MOVQ/MOVLPS).
29964 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
29965 Node->getOperand(2));
// Without SSE2 there is no v2i64; bitcast to v4f32 for SSE1-only targets.
29966 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29967 SclToVec = DAG.getBitcast(StVT, SclToVec);
29968 SDVTList Tys = DAG.getVTList(MVT::Other);
29969 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
29970 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
29971 MVT::i64, Node->getMemOperand());
29972 } else if (Subtarget.hasX87()) {
29973 // First load this into an 80-bit X87 register using a stack temporary.
29974 // This will put the whole integer into the significand.
29975 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29976 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29977 MachinePointerInfo MPI =
29978 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
// Spill the i64 to the stack slot, then FILD it into an f80 register.
29979 Chain =
29980 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
29981 MPI, MaybeAlign(), MachineMemOperand::MOStore);
29982 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29983 SDValue LdOps[] = {Chain, StackPtr};
29984 SDValue Value =
29985 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
29986 /*Align*/ None, MachineMemOperand::MOLoad);
29987 Chain = Value.getValue(1);
29988
29989 // Now use an FIST to do the atomic store.
29990 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
29991 Chain =
29992 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
29993 StoreOps, MVT::i64, Node->getMemOperand());
29994 }
29995
29996 if (Chain) {
29997 // If this is a sequentially consistent store, also emit an appropriate
29998 // barrier.
29999 if (IsSeqCst)
30000 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
30001
30002 return Chain;
30003 }
30004 }
30005 }
30006
30007 // Convert seq_cst store -> xchg
30008 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
30009 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
30010 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
30011 Node->getMemoryVT(),
30012 Node->getOperand(0),
30013 Node->getOperand(1), Node->getOperand(2),
30014 Node->getMemOperand());
// Only the chain result of the swap is needed; its loaded value is dropped.
30015 return Swap.getValue(1);
30016}
30017
// Lower ISD::ADDCARRY/SUBCARRY and their signed-overflow variants
// (SADDO_CARRY/SSUBO_CARRY) to X86ISD::ADC/SBB.  The incoming carry (which
// may be any integer type) is materialized into EFLAGS by adding -1 to it,
// and the outgoing carry/overflow is read back with a SETCC.
30018 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
30019 SDNode *N = Op.getNode();
30020 MVT VT = N->getSimpleValueType(0);
30021 unsigned Opc = Op.getOpcode();
30022
30023 // Let legalize expand this if it isn't a legal type yet.
30024 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30025 return SDValue();
30026
30027 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30028 SDLoc DL(N);
30029
30030 // Set the carry flag.
// Carry + (-1) produces a hardware carry-out exactly when Carry != 0, so
// this ADD moves the boolean carry-in into the flags result (value 1).
30031 SDValue Carry = Op.getOperand(2);
30032 EVT CarryVT = Carry.getValueType();
30033 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
30034 Carry, DAG.getAllOnesConstant(DL, CarryVT));
30035
30036 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
30037 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
30038 Op.getOperand(0), Op.getOperand(1),
30039 Carry.getValue(1));
30040
// Signed variants report overflow (COND_O); unsigned report carry (COND_B).
30041 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
30042 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
30043 Sum.getValue(1), DL, DAG);
// The node's second result may be i1; truncate the i8 setcc to match.
30044 if (N->getValueType(1) == MVT::i1)
30045 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
30046
30047 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
30048}
30049
// Lower FSINCOS on 64-bit Darwin by calling the __sincos_stret runtime entry
// point, which returns both results in registers: { double, double } in
// XMM0/XMM1 for f64, or both floats packed into XMM0 for f32 (extracted here
// as vector elements 0 and 1).
30050 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
30051 SelectionDAG &DAG) {
30052 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit())((void)0);
30053
30054 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
30055 // which returns the values as { float, float } (in XMM0) or
30056 // { double, double } (which is returned in XMM0, XMM1).
30057 SDLoc dl(Op);
30058 SDValue Arg = Op.getOperand(0);
30059 EVT ArgVT = Arg.getValueType();
30060 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30061
// Build the single-argument list for the libcall.
30062 TargetLowering::ArgListTy Args;
30063 TargetLowering::ArgListEntry Entry;
30064
30065 Entry.Node = Arg;
30066 Entry.Ty = ArgTy;
30067 Entry.IsSExt = false;
30068 Entry.IsZExt = false;
30069 Args.push_back(Entry);
30070
30071 bool isF64 = ArgVT == MVT::f64;
30072 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
30073 // the small struct {f32, f32} is returned in (eax, edx). For f64,
30074 // the results are returned via SRet in memory.
30075 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30076 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
30077 const char *LibcallName = TLI.getLibcallName(LC);
30078 SDValue Callee =
30079 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
30080
// Model the return as {f64,f64} or as a 4 x f32 vector (both floats live in
// the low two lanes of XMM0).
30081 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
30082 : (Type *)FixedVectorType::get(ArgTy, 4);
30083
30084 TargetLowering::CallLoweringInfo CLI(DAG);
30085 CLI.setDebugLoc(dl)
30086 .setChain(DAG.getEntryNode())
30087 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
30088
30089 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
30090
30091 if (isF64)
30092 // Returned in xmm0 and xmm1.
30093 return CallResult.first;
30094
30095 // Returned in bits 0:31 and 32:64 xmm0.
30096 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30097 CallResult.first, DAG.getIntPtrConstant(0, dl));
30098 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30099 CallResult.first, DAG.getIntPtrConstant(1, dl));
30100 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
30101 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
30102}
30103
30104 /// Widen a vector input to a vector of NVT. The
30105 /// input vector must have the same element type as NVT.
/// The widened lanes are filled with zeroes when FillWithZeroes is set,
/// otherwise with undef.  Constant build vectors are widened element-wise;
/// everything else is placed into lane 0 of a fill vector via
/// INSERT_SUBVECTOR.
30106 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
30107 bool FillWithZeroes = false) {
30108 // Check if InOp already has the right width.
30109 MVT InVT = InOp.getSimpleValueType();
30110 if (InVT == NVT)
30111 return InOp;
30112
30113 if (InOp.isUndef())
30114 return DAG.getUNDEF(NVT);
30115
30116 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&((void)0)
30117 "input and widen element type must match")((void)0);
30118
30119 unsigned InNumElts = InVT.getVectorNumElements();
30120 unsigned WidenNumElts = NVT.getVectorNumElements();
30121 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&((void)0)
30122 "Unexpected request for vector widening")((void)0);
30123
30124 SDLoc dl(InOp);
// Peel off a concat whose upper half is already undef/zero matching the
// requested fill; widening its lower half directly gives the same result.
30125 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
30126 InOp.getNumOperands() == 2) {
30127 SDValue N1 = InOp.getOperand(1);
30128 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
30129 N1.isUndef()) {
30130 InOp = InOp.getOperand(0);
30131 InVT = InOp.getSimpleValueType();
30132 InNumElts = InVT.getVectorNumElements();
30133 }
30134 }
// All-constant inputs: rebuild as a wider BUILD_VECTOR, appending the fill
// value element by element.
30135 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
30136 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
30137 SmallVector<SDValue, 16> Ops;
30138 for (unsigned i = 0; i < InNumElts; ++i)
30139 Ops.push_back(InOp.getOperand(i));
30140
30141 EVT EltVT = InOp.getOperand(0).getValueType();
30142
30143 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
30144 DAG.getUNDEF(EltVT);
30145 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
30146 Ops.push_back(FillVal);
30147 return DAG.getBuildVector(NVT, dl, Ops);
30148 }
// General case: insert the narrow vector at index 0 of a zero/undef vector.
30149 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
30150 DAG.getUNDEF(NVT);
30151 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
30152 InOp, DAG.getIntPtrConstant(0, dl));
30153}
30154
// Lower a masked scatter to X86ISD::MSCATTER.  Handles the v2f32/v2i32
// special case (widened via CONCAT_VECTORS when VLX + v2i64 index allows an
// xmm form) and, without VLX, widens data/index/mask until one of them is
// 512 bits so the AVX-512 instruction patterns apply.
30155 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
30156 SelectionDAG &DAG) {
30157 assert(Subtarget.hasAVX512() &&((void)0)
30158 "MGATHER/MSCATTER are supported on AVX-512 arch only")((void)0);
30159
30160 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
30161 SDValue Src = N->getValue();
30162 MVT VT = Src.getSimpleValueType();
30163 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op")((void)0);
30164 SDLoc dl(Op);
30165
30166 SDValue Scale = N->getScale();
30167 SDValue Index = N->getIndex();
30168 SDValue Mask = N->getMask();
30169 SDValue Chain = N->getChain();
30170 SDValue BasePtr = N->getBasePtr();
30171
30172 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
30173 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type")((void)0);
30174 // If the index is v2i64 and we have VLX we can use xmm for data and index.
30175 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
30176 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30177 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
// Pad the two source elements with undef to reach the transformed width.
30178 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
30179 SDVTList VTs = DAG.getVTList(MVT::Other);
30180 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30181 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30182 N->getMemoryVT(), N->getMemOperand());
30183 }
// No xmm form available; defer to generic handling.
30184 return SDValue();
30185 }
30186
30187 MVT IndexVT = Index.getSimpleValueType();
30188
30189 // If the index is v2i32, we're being called by type legalization and we
30190 // should just let the default handling take care of it.
30191 if (IndexVT == MVT::v2i32)
30192 return SDValue();
30193
30194 // If we don't have VLX and neither the passthru or index is 512-bits, we
30195 // need to widen until one is.
30196 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
30197 !Index.getSimpleValueType().is512BitVector()) {
30198 // Determine how much we need to widen by to get a 512-bit type.
30199 unsigned Factor = std::min(512/VT.getSizeInBits(),
30200 512/IndexVT.getSizeInBits());
30201 unsigned NumElts = VT.getVectorNumElements() * Factor;
30202
30203 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30204 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30205 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30206
// Widen data and index with undef; the mask is zero-filled so the new
// lanes never store.
30207 Src = ExtendToType(Src, VT, DAG);
30208 Index = ExtendToType(Index, IndexVT, DAG);
30209 Mask = ExtendToType(Mask, MaskVT, DAG, true);
30210 }
30211
30212 SDVTList VTs = DAG.getVTList(MVT::Other);
30213 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30214 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30215 N->getMemoryVT(), N->getMemOperand());
30216}
30217
// Lower a masked load.  AVX (non-i1 mask) loads only support a zero
// passthru, so other passthrus are handled by loading with a zero passthru
// and blending afterwards.  AVX-512-without-VLX loads of sub-512-bit vectors
// are widened to 512 bits (mask zero-filled) and the result extracted back.
30218 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
30219 SelectionDAG &DAG) {
30220
30221 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
30222 MVT VT = Op.getSimpleValueType();
30223 MVT ScalarVT = VT.getScalarType();
30224 SDValue Mask = N->getMask();
30225 MVT MaskVT = Mask.getSimpleValueType();
30226 SDValue PassThru = N->getPassThru();
30227 SDLoc dl(Op);
30228
30229 // Handle AVX masked loads which don't support passthru other than 0.
30230 if (MaskVT.getVectorElementType() != MVT::i1) {
30231 // We also allow undef in the isel pattern.
30232 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
30233 return Op;
30234
// Re-issue the load with a zero passthru (the only form AVX supports)...
30235 SDValue NewLoad = DAG.getMaskedLoad(
30236 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30237 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
30238 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
30239 N->isExpandingLoad());
30240 // Emit a blend.
// ...then select the original passthru into the masked-off lanes.
30241 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
30242 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
30243 }
30244
30245 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&((void)0)
30246 "Expanding masked load is supported on AVX-512 target only!")((void)0);
30247
30248 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&((void)0)
30249 "Expanding masked load is supported for 32 and 64-bit types only!")((void)0);
30250
30251 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&((void)0)
30252 "Cannot lower masked load op.")((void)0);
30253
30254 assert((ScalarVT.getSizeInBits() >= 32 ||((void)0)
30255 (Subtarget.hasBWI() &&((void)0)
30256 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&((void)0)
30257 "Unsupported masked load op.")((void)0);
30258
30259 // This operation is legal for targets with VLX, but without
30260 // VLX the vector should be widened to 512 bit
30261 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
30262 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30263 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
30264
30265 // Mask element has to be i1.
30266 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&((void)0)
30267 "Unexpected mask type")((void)0);
30268
30269 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30270
// Zero-fill the widened mask lanes so the extra elements never load.
30271 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30272 SDValue NewLoad = DAG.getMaskedLoad(
30273 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30274 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
30275 N->getExtensionType(), N->isExpandingLoad());
30276
// Extract the original-width result from the low lanes of the wide load.
30277 SDValue Extract =
30278 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
30279 DAG.getIntPtrConstant(0, dl));
30280 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
30281 return DAG.getMergeValues(RetOps, dl);
30282}
30283
30284static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
30285 SelectionDAG &DAG) {
30286 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
30287 SDValue DataToStore = N->getValue();
30288 MVT VT = DataToStore.getSimpleValueType();
30289 MVT ScalarVT = VT.getScalarType();
30290 SDValue Mask = N->getMask();
30291 SDLoc dl(Op);
30292
30293 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&((void)0)
30294 "Expanding masked load is supported on AVX-512 target only!")((void)0);
30295