Bug Summary

File:    src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 16362, column 31
Division by zero
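
For orientation only: the flagged expression at line 16362 is outside the excerpt below. The core.DivideZero checker reports "Division by zero" whenever it can construct a path on which a divisor evaluates to zero. A minimal hypothetical sketch of how such a path commonly survives in an NDEBUG build like this one (function and variable names are invented, not taken from X86ISelLowering.cpp):

    #include <cassert>

    // With -D NDEBUG (used in this build, see the command line below) the
    // assert is compiled out, so the analyzer keeps the NumLanes == 0 path
    // alive and reports the division as "Division by zero".
    unsigned eltsPerLane(unsigned NumElts, unsigned NumLanes) {
      assert(NumLanes != 0 && "expected at least one lane");
      return NumElts / NumLanes; // <-- the warning would point here
    }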

Annotated Source Code

clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -D PIC -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -D_RET_PROTECTOR -ret-protector -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/Instructions.h"
52#include "llvm/IR/Intrinsics.h"
53#include "llvm/IR/IRBuilder.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
58#include "llvm/Support/CommandLine.h"
59#include "llvm/Support/Debug.h"
60#include "llvm/Support/ErrorHandling.h"
61#include "llvm/Support/KnownBits.h"
62#include "llvm/Support/MathExtras.h"
63#include "llvm/Target/TargetOptions.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
72STATISTIC(NumTailCalls, "Number of tail calls");
73
74static cl::opt<int> ExperimentalPrefLoopAlignment(
75 "x86-experimental-pref-loop-alignment", cl::init(4),
76 cl::desc(
77 "Sets the preferable loop alignment for experiments (as log2 bytes)"
78 "(the last x86-experimental-pref-loop-alignment bits"
79 " of the loop header PC will be 0)."),
80 cl::Hidden);
81
82static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
83 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
84 cl::desc(
85 "Sets the preferable loop alignment for experiments (as log2 bytes) "
86 "for innermost loops only. If specified, this option overrides "
87 "alignment set by x86-experimental-pref-loop-alignment."),
88 cl::Hidden);
89
90static cl::opt<bool> MulConstantOptimization(
91 "mul-constant-optimization", cl::init(true),
92 cl::desc("Replace 'mul x, Const' with more effective instructions like "
93 "SHIFT, LEA, etc."),
94 cl::Hidden);
95
96static cl::opt<bool> ExperimentalUnorderedISEL(
97 "x86-experimental-unordered-atomic-isel", cl::init(false),
98 cl::desc("Use LoadSDNode and StoreSDNode instead of "
99 "AtomicSDNode for unordered atomic loads and "
100 "stores respectively."),
101 cl::Hidden);
102
103/// Call this when the user attempts to do something unsupported, like
104/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
105/// report_fatal_error, so calling code should attempt to recover without
106/// crashing.
107static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
108 const char *Msg) {
109 MachineFunction &MF = DAG.getMachineFunction();
110 DAG.getContext()->diagnose(
111 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
112}
113
114X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
115 const X86Subtarget &STI)
116 : TargetLowering(TM), Subtarget(STI) {
117 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
118 X86ScalarSSEf64 = Subtarget.hasSSE2();
119 X86ScalarSSEf32 = Subtarget.hasSSE1();
120 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
121
122 // Set up the TargetLowering object.
123
124 // X86 is weird. It always uses i8 for shift amounts and setcc results.
125 setBooleanContents(ZeroOrOneBooleanContent);
126 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
127 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
128
129 // For 64-bit, since we have so many registers, use the ILP scheduler.
130 // For 32-bit, use the register pressure specific scheduling.
131 // For Atom, always use ILP scheduling.
132 if (Subtarget.isAtom())
133 setSchedulingPreference(Sched::ILP);
134 else if (Subtarget.is64Bit())
135 setSchedulingPreference(Sched::ILP);
136 else
137 setSchedulingPreference(Sched::RegPressure);
138 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
139 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
140
141 // Bypass expensive divides and use cheaper ones.
142 if (TM.getOptLevel() >= CodeGenOpt::Default) {
143 if (Subtarget.hasSlowDivide32())
144 addBypassSlowDiv(32, 8);
145 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
146 addBypassSlowDiv(64, 32);
147 }
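// [Editor's note, not part of X86ISelLowering.cpp] addBypassSlowDiv(32, 8)
// asks the BypassSlowDivision transform to guard 32-bit divisions with a
// runtime check and use the much cheaper 8-bit divide when both operands
// happen to fit in 8 bits; conceptually:
//   if (((a | b) & ~0xFFu) == 0) q = (uint8_t)a / (uint8_t)b; else q = a / b;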
148
149 // Setup Windows compiler runtime calls.
150 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
151 static const struct {
152 const RTLIB::Libcall Op;
153 const char * const Name;
154 const CallingConv::ID CC;
155 } LibraryCalls[] = {
156 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
157 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
158 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
159 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
160 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
161 };
162
163 for (const auto &LC : LibraryCalls) {
164 setLibcallName(LC.Op, LC.Name);
165 setLibcallCallingConv(LC.Op, LC.CC);
166 }
167 }
168
169 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
170 // MSVCRT doesn't have powi; fall back to pow
171 setLibcallName(RTLIB::POWI_F32, nullptr);
172 setLibcallName(RTLIB::POWI_F64, nullptr);
173 }
174
175 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
176 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
177 // FIXME: Should we be limiting the atomic size on other configs? Default is
178 // 1024.
179 if (!Subtarget.hasCmpxchg8b())
180 setMaxAtomicSizeInBitsSupported(32);
181
182 // Set up the register classes.
183 addRegisterClass(MVT::i8, &X86::GR8RegClass);
184 addRegisterClass(MVT::i16, &X86::GR16RegClass);
185 addRegisterClass(MVT::i32, &X86::GR32RegClass);
186 if (Subtarget.is64Bit())
187 addRegisterClass(MVT::i64, &X86::GR64RegClass);
188
189 for (MVT VT : MVT::integer_valuetypes())
190 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
191
192 // We don't accept any truncstore of integer registers.
193 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
194 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
195 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
196 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
197 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
198 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
199
200 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
201
202 // SETOEQ and SETUNE require checking two conditions.
203 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204 setCondCodeAction(ISD::SETOEQ, VT, Expand);
205 setCondCodeAction(ISD::SETUNE, VT, Expand);
206 }
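// [Editor's note, not part of the original file] "Checking two conditions"
// refers to the x86 flag encoding after ucomiss/ucomisd/fucomi: ordered-equal
// is ZF set *and* PF clear, so SETOEQ is expanded to roughly
//   sete %al ; setnp %cl ; andb %cl, %al
// and SETUNE to the complementary setne/setp combined with an or.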
207
208 // Integer absolute.
209 if (Subtarget.hasCMov()) {
210 setOperationAction(ISD::ABS , MVT::i16 , Custom);
211 setOperationAction(ISD::ABS , MVT::i32 , Custom);
212 if (Subtarget.is64Bit())
213 setOperationAction(ISD::ABS , MVT::i64 , Custom);
214 }
215
216 // Funnel shifts.
217 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
218 // For slow shld targets we only lower for code size.
219 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
220
221 setOperationAction(ShiftOp , MVT::i8 , Custom);
222 setOperationAction(ShiftOp , MVT::i16 , Custom);
223 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
224 if (Subtarget.is64Bit())
225 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
226 }
227
228 if (!Subtarget.useSoftFloat()) {
229 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
230 // operation.
231 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
232 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
233 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
234 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
235 // We have an algorithm for SSE2, and we turn this into a 64-bit
236 // FILD or VCVTUSI2SS/SD for other targets.
237 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
238 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
239 // We have an algorithm for SSE2->double, and we turn this into a
240 // 64-bit FILD followed by conditional FADD for other targets.
241 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
242 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
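// [Editor's note, not part of the original file] Promoting an unsigned
// conversion to a wider signed one is safe because zero-extension keeps the
// value non-negative in the wider type; e.g. a u32 -> double conversion can
// be done exactly as:
//   double f(uint32_t x) { return (double)(int64_t)x; }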
243
244 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
245 // this operation.
246 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
247 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
248 // SSE has no i16 to fp conversion, only i32. We promote in the handler
249 // to allow f80 to use i16 and f64 to use i16 with sse1 only
250 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
251 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
252 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
253 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
254 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
255 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
256 // are Legal, f80 is custom lowered.
257 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
258 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
259
260 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
261 // this operation.
262 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
263 // FIXME: This doesn't generate invalid exception when it should. PR44019.
264 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
265 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
266 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
267 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
268 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
269 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
270 // are Legal, f80 is custom lowered.
271 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
272 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
273
274 // Handle FP_TO_UINT by promoting the destination to a larger signed
275 // conversion.
276 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
277 // FIXME: This doesn't generate invalid exception when it should. PR44019.
278 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
279 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
280 // FIXME: This doesn't generate invalid exception when it should. PR44019.
281 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
282 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
283 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
284 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
285 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
286
287 setOperationAction(ISD::LRINT, MVT::f32, Custom);
288 setOperationAction(ISD::LRINT, MVT::f64, Custom);
289 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
290 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
291
292 if (!Subtarget.is64Bit()) {
293 setOperationAction(ISD::LRINT, MVT::i64, Custom);
294 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
295 }
296 }
297
298 if (Subtarget.hasSSE2()) {
299 // Custom lowering for saturating float to int conversions.
300 // We handle promotion to larger result types manually.
301 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
302 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
303 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
304 }
305 if (Subtarget.is64Bit()) {
306 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
307 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
308 }
309 }
310
311 // Handle address space casts between mixed sized pointers.
312 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
313 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
314
315 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
316 if (!X86ScalarSSEf64) {
317 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
318 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
319 if (Subtarget.is64Bit()) {
320 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
321 // Without SSE, i64->f64 goes through memory.
322 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
323 }
324 } else if (!Subtarget.is64Bit())
325 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
326
327 // Scalar integer divide and remainder are lowered to use operations that
328 // produce two results, to match the available instructions. This exposes
329 // the two-result form to trivial CSE, which is able to combine x/y and x%y
330 // into a single instruction.
331 //
332 // Scalar integer multiply-high is also lowered to use two-result
333 // operations, to match the available instructions. However, plain multiply
334 // (low) operations are left as Legal, as there are single-result
335 // instructions for this in x86. Using the two-result multiply instructions
336 // when both high and low results are needed must be arranged by dagcombine.
337 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
338 setOperationAction(ISD::MULHS, VT, Expand);
339 setOperationAction(ISD::MULHU, VT, Expand);
340 setOperationAction(ISD::SDIV, VT, Expand);
341 setOperationAction(ISD::UDIV, VT, Expand);
342 setOperationAction(ISD::SREM, VT, Expand);
343 setOperationAction(ISD::UREM, VT, Expand);
344 }
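// [Editor's note, not part of the original file] The two-result form lets
// CSE fold a quotient and a remainder of the same operands into one DIV/IDIV,
// which produces the quotient in EAX and the remainder in EDX, e.g.:
//   void divmod(unsigned x, unsigned y, unsigned &q, unsigned &r) {
//     q = x / y;   // both of these lower to a single 'div' instruction
//     r = x % y;
//   }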
345
346 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
347 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
348 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
349 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
350 setOperationAction(ISD::BR_CC, VT, Expand);
351 setOperationAction(ISD::SELECT_CC, VT, Expand);
352 }
353 if (Subtarget.is64Bit())
354 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
355 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
356 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
357 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
358
359 setOperationAction(ISD::FREM , MVT::f32 , Expand);
360 setOperationAction(ISD::FREM , MVT::f64 , Expand);
361 setOperationAction(ISD::FREM , MVT::f80 , Expand);
362 setOperationAction(ISD::FREM , MVT::f128 , Expand);
363
364 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
365 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
366 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
367 }
368
369 // Promote the i8 variants and force them on up to i32 which has a shorter
370 // encoding.
371 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
372 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
373
374 if (Subtarget.hasBMI()) {
375 // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
376 // is enabled.
377 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
378 } else {
379 setOperationAction(ISD::CTTZ, MVT::i16, Custom);
380 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
381 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
382 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
383 if (Subtarget.is64Bit()) {
384 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
385 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
386 }
387 }
388
389 if (Subtarget.hasLZCNT()) {
390 // When promoting the i8 variants, force them to i32 for a shorter
391 // encoding.
392 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
393 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
394 } else {
395 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
396 if (VT == MVT::i64 && !Subtarget.is64Bit())
397 continue;
398 setOperationAction(ISD::CTLZ , VT, Custom);
399 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
400 }
401 }
402
403 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
404 ISD::STRICT_FP_TO_FP16}) {
405 // Special handling for half-precision floating point conversions.
406 // If we don't have F16C support, then lower half float conversions
407 // into library calls.
408 setOperationAction(
409 Op, MVT::f32,
410 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
411 // There's never any support for operations beyond MVT::f32.
412 setOperationAction(Op, MVT::f64, Expand);
413 setOperationAction(Op, MVT::f80, Expand);
414 setOperationAction(Op, MVT::f128, Expand);
415 }
416
417 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
418 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
419 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
420 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
421 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
422 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
423 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
424 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
425
426 setOperationAction(ISD::PARITY, MVT::i8, Custom);
427 if (Subtarget.hasPOPCNT()) {
428 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
429 } else {
430 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
431 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
432 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
433 if (Subtarget.is64Bit())
434 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
435 else
436 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
437
438 setOperationAction(ISD::PARITY, MVT::i16, Custom);
439 setOperationAction(ISD::PARITY, MVT::i32, Custom);
440 if (Subtarget.is64Bit())
441 setOperationAction(ISD::PARITY, MVT::i64, Custom);
442 }
443
444 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
445
446 if (!Subtarget.hasMOVBE())
447 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
448
449 // X86 wants to expand cmov itself.
450 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
451 setOperationAction(ISD::SELECT, VT, Custom);
452 setOperationAction(ISD::SETCC, VT, Custom);
453 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
454 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
455 }
456 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::SELECT, VT, Custom);
460 setOperationAction(ISD::SETCC, VT, Custom);
461 }
462
463 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
464 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
465 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
466
467 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
468 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
469 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
470 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
471 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
472 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
473 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
474 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
475
476 // Darwin ABI issue.
477 for (auto VT : { MVT::i32, MVT::i64 }) {
478 if (VT == MVT::i64 && !Subtarget.is64Bit())
479 continue;
480 setOperationAction(ISD::ConstantPool , VT, Custom);
481 setOperationAction(ISD::JumpTable , VT, Custom);
482 setOperationAction(ISD::GlobalAddress , VT, Custom);
483 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
484 setOperationAction(ISD::ExternalSymbol , VT, Custom);
485 setOperationAction(ISD::BlockAddress , VT, Custom);
486 }
487
488 // 64-bit shl, sra, srl (iff 32-bit x86)
489 for (auto VT : { MVT::i32, MVT::i64 }) {
490 if (VT == MVT::i64 && !Subtarget.is64Bit())
491 continue;
492 setOperationAction(ISD::SHL_PARTS, VT, Custom);
493 setOperationAction(ISD::SRA_PARTS, VT, Custom);
494 setOperationAction(ISD::SRL_PARTS, VT, Custom);
495 }
496
497 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
498 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
499
500 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
501
502 // Expand certain atomics
503 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
504 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
505 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
506 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
507 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
508 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
509 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
510 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
511 }
512
513 if (!Subtarget.is64Bit())
514 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
515
516 if (Subtarget.hasCmpxchg16b()) {
517 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
518 }
519
520 // FIXME - use subtarget debug flags
521 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
522 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
523 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
524 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
525 }
526
527 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
528 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
529
530 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
531 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
532
533 setOperationAction(ISD::TRAP, MVT::Other, Legal);
534 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
535 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
536
537 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
538 setOperationAction(ISD::VASTART , MVT::Other, Custom);
539 setOperationAction(ISD::VAEND , MVT::Other, Expand);
540 bool Is64Bit = Subtarget.is64Bit();
541 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
542 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
543
544 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
545 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
546
547 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
548
549 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
550 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
551 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
552
553 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
554 // f32 and f64 use SSE.
555 // Set up the FP register classes.
556 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
557 : &X86::FR32RegClass);
558 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
559 : &X86::FR64RegClass);
560
561 // Disable f32->f64 extload as we can only generate this in one instruction
562 // under optsize. So it's easier to pattern match (fpext (load)) for that
563 // case instead of needing to emit 2 instructions for extload in the
564 // non-optsize case.
565 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
566
567 for (auto VT : { MVT::f32, MVT::f64 }) {
568 // Use ANDPD to simulate FABS.
569 setOperationAction(ISD::FABS, VT, Custom);
570
571 // Use XORP to simulate FNEG.
572 setOperationAction(ISD::FNEG, VT, Custom);
573
574 // Use ANDPD and ORPD to simulate FCOPYSIGN.
575 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
576
577 // These might be better off as horizontal vector ops.
578 setOperationAction(ISD::FADD, VT, Custom);
579 setOperationAction(ISD::FSUB, VT, Custom);
580
581 // We don't support sin/cos/fmod
582 setOperationAction(ISD::FSIN , VT, Expand);
583 setOperationAction(ISD::FCOS , VT, Expand);
584 setOperationAction(ISD::FSINCOS, VT, Expand);
585 }
586
587 // Lower this to MOVMSK plus an AND.
588 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
589 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
590
591 } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
592 (UseX87 || Is64Bit)) {
593 // Use SSE for f32, x87 for f64.
594 // Set up the FP register classes.
595 addRegisterClass(MVT::f32, &X86::FR32RegClass);
596 if (UseX87)
597 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
598
599 // Use ANDPS to simulate FABS.
600 setOperationAction(ISD::FABS , MVT::f32, Custom);
601
602 // Use XORP to simulate FNEG.
603 setOperationAction(ISD::FNEG , MVT::f32, Custom);
604
605 if (UseX87)
606 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
607
608 // Use ANDPS and ORPS to simulate FCOPYSIGN.
609 if (UseX87)
610 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
611 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
612
613 // We don't support sin/cos/fmod
614 setOperationAction(ISD::FSIN , MVT::f32, Expand);
615 setOperationAction(ISD::FCOS , MVT::f32, Expand);
616 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
617
618 if (UseX87) {
619 // Always expand sin/cos functions even though x87 has an instruction.
620 setOperationAction(ISD::FSIN, MVT::f64, Expand);
621 setOperationAction(ISD::FCOS, MVT::f64, Expand);
622 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
623 }
624 } else if (UseX87) {
625 // f32 and f64 in x87.
626 // Set up the FP register classes.
627 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
628 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
629
630 for (auto VT : { MVT::f32, MVT::f64 }) {
631 setOperationAction(ISD::UNDEF, VT, Expand);
632 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
633
634 // Always expand sin/cos functions even though x87 has an instruction.
635 setOperationAction(ISD::FSIN , VT, Expand);
636 setOperationAction(ISD::FCOS , VT, Expand);
637 setOperationAction(ISD::FSINCOS, VT, Expand);
638 }
639 }
640
641 // Expand FP32 immediates into loads from the stack, save special cases.
642 if (isTypeLegal(MVT::f32)) {
643 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
644 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
645 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
646 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
647 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
648 } else // SSE immediates.
649 addLegalFPImmediate(APFloat(+0.0f)); // xorps
650 }
651 // Expand FP64 immediates into loads from the stack, save special cases.
652 if (isTypeLegal(MVT::f64)) {
653 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
654 addLegalFPImmediate(APFloat(+0.0)); // FLD0
655 addLegalFPImmediate(APFloat(+1.0)); // FLD1
656 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
657 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
658 } else // SSE immediates.
659 addLegalFPImmediate(APFloat(+0.0)); // xorpd
660 }
661 // Handle constrained floating-point operations of scalar.
662 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
663 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
664 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
665 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
666 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
667 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
668 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
669 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
670 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
671 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
672 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
673 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
674 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
675
676 // We don't support FMA.
677 setOperationAction(ISD::FMA, MVT::f64, Expand);
678 setOperationAction(ISD::FMA, MVT::f32, Expand);
679
680 // f80 always uses X87.
681 if (UseX87) {
682 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
683 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
684 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
685 {
686 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
687 addLegalFPImmediate(TmpFlt); // FLD0
688 TmpFlt.changeSign();
689 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
690
691 bool ignored;
692 APFloat TmpFlt2(+1.0);
693 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
694 &ignored);
695 addLegalFPImmediate(TmpFlt2); // FLD1
696 TmpFlt2.changeSign();
697 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
698 }
699
700 // Always expand sin/cos functions even though x87 has an instruction.
701 setOperationAction(ISD::FSIN , MVT::f80, Expand);
702 setOperationAction(ISD::FCOS , MVT::f80, Expand);
703 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
704
705 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
706 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
707 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
708 setOperationAction(ISD::FRINT, MVT::f80, Expand);
709 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
710 setOperationAction(ISD::FMA, MVT::f80, Expand);
711 setOperationAction(ISD::LROUND, MVT::f80, Expand);
712 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
713 setOperationAction(ISD::LRINT, MVT::f80, Custom);
714 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
715
716 // Handle constrained floating-point operations of scalar.
717 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
718 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
719 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
720 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
721 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
722 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
723 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
724 // as Custom.
725 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
726 }
727
728 // f128 uses xmm registers, but most operations require libcalls.
729 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
730 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
731 : &X86::VR128RegClass);
732
733 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
734
735 setOperationAction(ISD::FADD, MVT::f128, LibCall);
736 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
737 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
738 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
739 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
740 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
741 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
742 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
743 setOperationAction(ISD::FMA, MVT::f128, LibCall);
744 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
745
746 setOperationAction(ISD::FABS, MVT::f128, Custom);
747 setOperationAction(ISD::FNEG, MVT::f128, Custom);
748 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
749
750 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
751 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
752 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
753 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
754 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
755 // No STRICT_FSINCOS
756 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
757 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
758
759 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
760 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
761 // We need to custom handle any FP_ROUND with an f128 input, but
762 // LegalizeDAG uses the result type to know when to run a custom handler.
763 // So we have to list all legal floating point result types here.
764 if (isTypeLegal(MVT::f32)) {
765 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
766 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
767 }
768 if (isTypeLegal(MVT::f64)) {
769 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
770 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
771 }
772 if (isTypeLegal(MVT::f80)) {
773 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
774 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
775 }
776
777 setOperationAction(ISD::SETCC, MVT::f128, Custom);
778
779 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
780 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
781 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
782 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
783 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
784 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
785 }
786
787 // Always use a library call for pow.
788 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
789 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
790 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
791 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
792
793 setOperationAction(ISD::FLOG, MVT::f80, Expand);
794 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
795 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
796 setOperationAction(ISD::FEXP, MVT::f80, Expand);
797 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
798 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
799 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
800
801 // Some FP actions are always expanded for vector types.
802 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
803 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
804 setOperationAction(ISD::FSIN, VT, Expand);
805 setOperationAction(ISD::FSINCOS, VT, Expand);
806 setOperationAction(ISD::FCOS, VT, Expand);
807 setOperationAction(ISD::FREM, VT, Expand);
808 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
809 setOperationAction(ISD::FPOW, VT, Expand);
810 setOperationAction(ISD::FLOG, VT, Expand);
811 setOperationAction(ISD::FLOG2, VT, Expand);
812 setOperationAction(ISD::FLOG10, VT, Expand);
813 setOperationAction(ISD::FEXP, VT, Expand);
814 setOperationAction(ISD::FEXP2, VT, Expand);
815 }
816
817 // First set operation action for all vector types to either promote
818 // (for widening) or expand (for scalarization). Then we will selectively
819 // turn on ones that can be effectively codegen'd.
820 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
821 setOperationAction(ISD::SDIV, VT, Expand);
822 setOperationAction(ISD::UDIV, VT, Expand);
823 setOperationAction(ISD::SREM, VT, Expand);
824 setOperationAction(ISD::UREM, VT, Expand);
825 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
826 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
827 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
828 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
829 setOperationAction(ISD::FMA, VT, Expand);
830 setOperationAction(ISD::FFLOOR, VT, Expand);
831 setOperationAction(ISD::FCEIL, VT, Expand);
832 setOperationAction(ISD::FTRUNC, VT, Expand);
833 setOperationAction(ISD::FRINT, VT, Expand);
834 setOperationAction(ISD::FNEARBYINT, VT, Expand);
835 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
836 setOperationAction(ISD::MULHS, VT, Expand);
837 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
838 setOperationAction(ISD::MULHU, VT, Expand);
839 setOperationAction(ISD::SDIVREM, VT, Expand);
840 setOperationAction(ISD::UDIVREM, VT, Expand);
841 setOperationAction(ISD::CTPOP, VT, Expand);
842 setOperationAction(ISD::CTTZ, VT, Expand);
843 setOperationAction(ISD::CTLZ, VT, Expand);
844 setOperationAction(ISD::ROTL, VT, Expand);
845 setOperationAction(ISD::ROTR, VT, Expand);
846 setOperationAction(ISD::BSWAP, VT, Expand);
847 setOperationAction(ISD::SETCC, VT, Expand);
848 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
849 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
850 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
851 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
852 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
853 setOperationAction(ISD::TRUNCATE, VT, Expand);
854 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
855 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
856 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
857 setOperationAction(ISD::SELECT_CC, VT, Expand);
858 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
859 setTruncStoreAction(InnerVT, VT, Expand);
860
861 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
862 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
863
864 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
865 // types, we have to deal with them whether we ask for Expansion or not.
866 // Setting Expand causes its own optimisation problems though, so leave
867 // them legal.
868 if (VT.getVectorElementType() == MVT::i1)
869 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
870
871 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
872 // split/scalarized right now.
873 if (VT.getVectorElementType() == MVT::f16)
874 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
875 }
876 }
877
878 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
879 // with -msoft-float, disable use of MMX as well.
880 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
881 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
882 // No operations on x86mmx supported, everything uses intrinsics.
883 }
884
885 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
886 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
887 : &X86::VR128RegClass);
888
889 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
890 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
891 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
892 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
893 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
894 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
895 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
896 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
897
898 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
899 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
900
901 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
902 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
903 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
904 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
905 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
906 }
907
908 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
909 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
910 : &X86::VR128RegClass);
911
912 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
913 // registers cannot be used even for integer operations.
914 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
915 : &X86::VR128RegClass);
916 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
917 : &X86::VR128RegClass);
918 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
919 : &X86::VR128RegClass);
920 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
921 : &X86::VR128RegClass);
922
923 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
924 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
925 setOperationAction(ISD::SDIV, VT, Custom);
926 setOperationAction(ISD::SREM, VT, Custom);
927 setOperationAction(ISD::UDIV, VT, Custom);
928 setOperationAction(ISD::UREM, VT, Custom);
929 }
930
931 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
932 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
933 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
934
935 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
936 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
937 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
938 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
939 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
940 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
941 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
942 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
943 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
944 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
945
946 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
947 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
948
949 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
950 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
951 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
952
953 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
954 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
955 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
956 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
957 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
958 }
959
960 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
961 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
962 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
963 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
964 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
965 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
966 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
967 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
968 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
969 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
970
971 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
972 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
973 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
974
975 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
976 setOperationAction(ISD::SETCC, VT, Custom);
977 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
978 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
979 setOperationAction(ISD::CTPOP, VT, Custom);
980 setOperationAction(ISD::ABS, VT, Custom);
981
982 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
983 // setcc all the way to isel and prefer SETGT in some isel patterns.
984 setCondCodeAction(ISD::SETLT, VT, Custom);
985 setCondCodeAction(ISD::SETLE, VT, Custom);
986 }
987
988 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
989 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
990 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
991 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
992 setOperationAction(ISD::VSELECT, VT, Custom);
993 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
994 }
995
996 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
997 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
998 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
999 setOperationAction(ISD::VSELECT, VT, Custom);
1000
1001 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1002 continue;
1003
1004 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1005 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1006 }
1007
1008 // Custom lower v2i64 and v2f64 selects.
1009 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1010 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1011 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1013 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1014
1015 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1016 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1017 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1018 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1019 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
1020 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1021
1022 // Custom legalize these to avoid over promotion or custom promotion.
1023 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1024 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1025 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1026 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1027 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1028 }
1029
1030 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1031 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1032 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1033 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1034
1035 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1036 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1037
1038 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1039 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1040
1041 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1042 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1043 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1044 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1045 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1046
1047 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1048 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1049 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1050 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1051
1052 // We want to legalize this to an f64 load rather than an i64 load on
1053 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1054 // store.
1055 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1056 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1057 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1058 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1059 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1060 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1061
1062 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1063 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1064 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1065 if (!Subtarget.hasAVX512())
1066 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1067
1068 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1069 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1070 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1071
1072 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1073
1074 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1075 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1076 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1077 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1078 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1079 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1080
1081 // In the customized shift lowering, the legal v4i32/v2i64 cases
1082 // in AVX2 will be recognized.
1083 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1084 setOperationAction(ISD::SRL, VT, Custom);
1085 setOperationAction(ISD::SHL, VT, Custom);
1086 setOperationAction(ISD::SRA, VT, Custom);
1087 }
1088
1089 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1090 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1091
1092 // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1093 // shifts) is better.
1094 if (!Subtarget.useAVX512Regs() &&
1095 !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1096 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1097
1098 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1099 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1100 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1101 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1102 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1103 }
1104
1105 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1106 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1107 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1108 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1109 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1110 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1111 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1112 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1113 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1114
1115 // These might be better off as horizontal vector ops.
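// (For example, an i32 add of two elements extracted from the same v4i32
// vector may be matched to a PHADDD-style horizontal op by the custom
// lowering; this is an illustrative sketch, not a guarantee.)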
1116 setOperationAction(ISD::ADD, MVT::i16, Custom);
1117 setOperationAction(ISD::ADD, MVT::i32, Custom);
1118 setOperationAction(ISD::SUB, MVT::i16, Custom);
1119 setOperationAction(ISD::SUB, MVT::i32, Custom);
1120 }
1121
1122 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1123 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1124 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1125 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1126 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1127 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1128 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1129 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1130 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1131 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1132 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1133 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1134 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1135 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1136
1137 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1138 }
1139
1140 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1141 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1142 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1143 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1144 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1145 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1146 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1147 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1148
1149 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1150
1151 // FIXME: Do we need to handle scalar-to-vector here?
1152 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1153
1154 // We directly match byte blends in the backend as they match the VSELECT
1155 // condition form.
1156 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1157
1158 // SSE41 brings specific instructions for doing vector sign extend even in
1159 // cases where we don't have SRA.
1160 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1161 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1162 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1163 }
1164
1165 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
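// (For instance, a sign-extending load from v4i8 to v4i32 can typically be
// selected as PMOVSXBD, and the zero-extending form as PMOVZXBD.)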
1166 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1167 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1168 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1169 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1170 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1171 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1172 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1173 }
1174
1175 // i8 vectors are custom because the source register and source
1176 // memory operand types are not the same width.
1177 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1178
1179 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1180 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1181 // do the pre and post work in the vector domain.
1182 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1183 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1184 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1185 // so that DAG combine doesn't try to turn it into uint_to_fp.
1186 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1187 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1188 }
1189 }
1190
1191 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1192 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1193 }
1194
1195 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1196 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1197 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1198 setOperationAction(ISD::ROTL, VT, Custom);
1199
1200 // XOP can efficiently perform BITREVERSE with VPPERM.
1201 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1202 setOperationAction(ISD::BITREVERSE, VT, Custom);
1203
1204 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1205 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1206 setOperationAction(ISD::BITREVERSE, VT, Custom);
1207 }
1208
1209 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1210 bool HasInt256 = Subtarget.hasInt256();
1211
1212 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1213 : &X86::VR256RegClass);
1214 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1215 : &X86::VR256RegClass);
1216 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1217 : &X86::VR256RegClass);
1218 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1219 : &X86::VR256RegClass);
1220 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1221 : &X86::VR256RegClass);
1222 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1223 : &X86::VR256RegClass);
1224
1225 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1226 setOperationAction(ISD::FFLOOR, VT, Legal);
1227 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1228 setOperationAction(ISD::FCEIL, VT, Legal);
1229 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1230 setOperationAction(ISD::FTRUNC, VT, Legal);
1231 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1232 setOperationAction(ISD::FRINT, VT, Legal);
1233 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1234 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1235 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1236 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1237 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1238
1239 setOperationAction(ISD::FROUND, VT, Custom);
1240
1241 setOperationAction(ISD::FNEG, VT, Custom);
1242 setOperationAction(ISD::FABS, VT, Custom);
1243 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1244 }
1245
1246 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1247 // even though v8i16 is a legal type.
1248 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1249 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1250 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1251 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1252 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1253 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1254 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1255
1256 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1257 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1258
1259 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1260 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1261 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1262 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1263 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1264 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1265 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1266 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1267 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1268 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1269 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1270 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1271
1272 if (!Subtarget.hasAVX512())
1273 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1274
1275 // In the customized shift lowering, the legal v8i32/v4i64 cases
1276 // in AVX2 will be recognized.
1277 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1278 setOperationAction(ISD::SRL, VT, Custom);
1279 setOperationAction(ISD::SHL, VT, Custom);
1280 setOperationAction(ISD::SRA, VT, Custom);
1281 }
1282
1283 // These types need custom splitting if their input is a 128-bit vector.
1284 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1285 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1286 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1287 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1288
1289 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1290 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1291
1292 // With BWI, expanding (and promoting the shifts) is better.
1293 if (!Subtarget.useBWIRegs())
1294 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1295
1296 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1297 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1298 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1299 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1300 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1301 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1302
1303 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1304 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1305 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1306 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1307 }
1308
1309 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1310 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1311 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1312 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1313
1314 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1315 setOperationAction(ISD::SETCC, VT, Custom);
1316 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1317 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1318 setOperationAction(ISD::CTPOP, VT, Custom);
1319 setOperationAction(ISD::CTLZ, VT, Custom);
1320
1321 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1322 // setcc all the way to isel and prefer SETGT in some isel patterns.
1323 setCondCodeAction(ISD::SETLT, VT, Custom);
1324 setCondCodeAction(ISD::SETLE, VT, Custom);
1325 }
1326
1327 if (Subtarget.hasAnyFMA()) {
1328 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1329 MVT::v2f64, MVT::v4f64 }) {
1330 setOperationAction(ISD::FMA, VT, Legal);
1331 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1332 }
1333 }
1334
1335 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1336 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1337 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1338 }
1339
1340 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1341 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1342 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1343 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1344
1345 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1346 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1347 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1348 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1349 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1350 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1351
1352 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1353 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1354
1355 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1356 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1357 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1358 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1359 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1360
1361 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1362 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1363 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1364 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1365 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1366 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1367 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1368 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1369 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1370 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1371 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1372 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1373
1374 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1375 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1376 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1377 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1378 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1379 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1380 }
1381
1382 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1383 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1384 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1385 }
1386
1387 if (HasInt256) {
1388 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1389 // when we have a 256-bit-wide blend with immediate.
1390 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1391 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1392
1393 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1394 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1395 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1396 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1397 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1398 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1399 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1401 }
1402 }
1403
1404 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1405 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1406 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1407 setOperationAction(ISD::MSTORE, VT, Legal);
1408 }
1409
1410 // Extract subvector is special because the value type
1411 // (result) is 128-bit but the source is 256-bit wide.
1412 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1413 MVT::v4f32, MVT::v2f64 }) {
1414 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1415 }
1416
1417 // Custom lower several nodes for 256-bit types.
1418 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1419 MVT::v8f32, MVT::v4f64 }) {
1420 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1421 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1422 setOperationAction(ISD::VSELECT, VT, Custom);
1423 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1424 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1425 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1426 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1427 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1428 setOperationAction(ISD::STORE, VT, Custom);
1429 }
1430
1431 if (HasInt256) {
1432 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1433
1434 // Custom legalize 2x32 to get a little better code.
1435 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1436 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1437
1438 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1439 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1440 setOperationAction(ISD::MGATHER, VT, Custom);
1441 }
1442 }
1443
1444 // This block controls legalization of the mask vector sizes that are
1445 // available with AVX512. 512-bit vectors are in a separate block controlled
1446 // by useAVX512Regs.
1447 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1448 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1449 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1450 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1451 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1452 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1453
1454 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1455 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1456 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1457
1458 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1459 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1460 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1461 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1462 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1463 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1464 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1465 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1466 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1467 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1468 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1469 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1470
1471 // There is no byte-sized k-register load or store without AVX512DQ.
1472 if (!Subtarget.hasDQI()) {
1473 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1474 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1475 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1476 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1477
1478 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1479 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1480 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1481 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1482 }
1483
1484 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1485 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1486 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1487 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1488 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1489 }
1490
1491 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1492 setOperationAction(ISD::VSELECT, VT, Expand);
1493
1494 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1495 setOperationAction(ISD::SETCC, VT, Custom);
1496 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1497 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1498 setOperationAction(ISD::SELECT, VT, Custom);
1499 setOperationAction(ISD::TRUNCATE, VT, Custom);
1500
1501 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1502 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1503 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1504 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1505 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1506 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1507 }
1508
1509 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1510 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1511 }
1512
1513 // This block controls legalization for 512-bit operations with 32/64 bit
1514 // elements. 512-bits can be disabled based on prefer-vector-width and
1515 // required-vector-width function attributes.
1516 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1517 bool HasBWI = Subtarget.hasBWI();
1518
1519 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1520 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1521 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1522 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1523 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1524 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1525
1526 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1527 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1528 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1529 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1530 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1531 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1532 if (HasBWI)
1533 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1534 }
1535
1536 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1537 setOperationAction(ISD::FNEG, VT, Custom);
1538 setOperationAction(ISD::FABS, VT, Custom);
1539 setOperationAction(ISD::FMA, VT, Legal);
1540 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1541 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1542 }
1543
1544 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1545 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1546 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1547 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1548 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1549 }
1550 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1551 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1552 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1553 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1554 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1555 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1556 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1557 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1558
1559 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1560 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1561 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1562 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1563 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1564 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1565 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1566 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1567 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1568 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1569 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1570 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1571
1572 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1573 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1574 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1575 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1576 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1577 if (HasBWI)
1578 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1579
1580 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1581 // to 512-bit rather than use the AVX2 instructions so that we can use
1582 // k-masks.
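// (For example, a masked load of v8f32 would be widened here to a v16f32
// masked load whose mask is zero-extended into a k-register, rather than
// using the AVX2 VMASKMOV form.)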
1583 if (!Subtarget.hasVLX()) {
1584 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1585 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1586 setOperationAction(ISD::MLOAD, VT, Custom);
1587 setOperationAction(ISD::MSTORE, VT, Custom);
1588 }
1589 }
1590
1591 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1592 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1593 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1594 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1595 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1596 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1597 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1598 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1599 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1600 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1601 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1602 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1603 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1604
1605 if (HasBWI) {
1606 // Extends from v64i1 masks to 512-bit vectors.
1607 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1608 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1609 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1610 }
1611
1612 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1613 setOperationAction(ISD::FFLOOR, VT, Legal);
1614 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1615 setOperationAction(ISD::FCEIL, VT, Legal);
1616 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1617 setOperationAction(ISD::FTRUNC, VT, Legal);
1618 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1619 setOperationAction(ISD::FRINT, VT, Legal);
1620 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1621 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1622 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1623 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1624 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1625
1626 setOperationAction(ISD::FROUND, VT, Custom);
1627 }
1628
1629 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1630 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1631 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1632 }
1633
1634 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1635 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1636 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1637 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1638
1639 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1640 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1641 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1642 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1643
1644 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1645 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1646 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1647 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1648 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1649 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1650
1651 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1652 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1653
1654 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1655
1656 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1657 setOperationAction(ISD::SRL, VT, Custom);
1658 setOperationAction(ISD::SHL, VT, Custom);
1659 setOperationAction(ISD::SRA, VT, Custom);
1660 setOperationAction(ISD::SETCC, VT, Custom);
1661
1662 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1663 // setcc all the way to isel and prefer SETGT in some isel patterns.
1664 setCondCodeAction(ISD::SETLT, VT, Custom);
1665 setCondCodeAction(ISD::SETLE, VT, Custom);
1666 }
1667 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1668 setOperationAction(ISD::SMAX, VT, Legal);
1669 setOperationAction(ISD::UMAX, VT, Legal);
1670 setOperationAction(ISD::SMIN, VT, Legal);
1671 setOperationAction(ISD::UMIN, VT, Legal);
1672 setOperationAction(ISD::ABS, VT, Legal);
1673 setOperationAction(ISD::CTPOP, VT, Custom);
1674 setOperationAction(ISD::ROTL, VT, Custom);
1675 setOperationAction(ISD::ROTR, VT, Custom);
1676 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1677 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1678 }
1679
1680 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1681 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1682 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1683 setOperationAction(ISD::CTLZ, VT, Custom);
1684 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1685 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1686 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1687 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1688 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1689 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1690 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1691 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1692 }
1693
1694 if (Subtarget.hasDQI()) {
1695 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1696 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1697 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1698 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1699 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1700 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1701 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1702 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1703
1704 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1705 }
1706
1707 if (Subtarget.hasCDI()) {
1708 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1709 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1710 setOperationAction(ISD::CTLZ, VT, Legal);
1711 }
1712 } // Subtarget.hasCDI()
1713
1714 if (Subtarget.hasVPOPCNTDQ()) {
1715 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1716 setOperationAction(ISD::CTPOP, VT, Legal);
1717 }
1718
1719 // Extract subvector is special because the value type
1720 // (result) is 256-bit but the source is 512-bit wide.
1721 // 128-bit was made Legal under AVX1.
1722 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1723 MVT::v8f32, MVT::v4f64 })
1724 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1725
1726 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1727 MVT::v16f32, MVT::v8f64 }) {
1728 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1729 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1730 setOperationAction(ISD::SELECT, VT, Custom);
1731 setOperationAction(ISD::VSELECT, VT, Custom);
1732 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1733 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1734 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1735 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1736 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1737 }
1738
1739 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1740 setOperationAction(ISD::MLOAD, VT, Legal);
1741 setOperationAction(ISD::MSTORE, VT, Legal);
1742 setOperationAction(ISD::MGATHER, VT, Custom);
1743 setOperationAction(ISD::MSCATTER, VT, Custom);
1744 }
1745 if (HasBWI) {
1746 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1747 setOperationAction(ISD::MLOAD, VT, Legal);
1748 setOperationAction(ISD::MSTORE, VT, Legal);
1749 }
1750 } else {
1751 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1752 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1753 }
1754
1755 if (Subtarget.hasVBMI2()) {
1756 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1757 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1758 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1759 setOperationAction(ISD::FSHL, VT, Custom);
1760 setOperationAction(ISD::FSHR, VT, Custom);
1761 }
1762
1763 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1764 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1765 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1766 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1767 }
1768 }// useAVX512Regs
1769
1770 // This block controls legalization for operations that don't have
1771 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1772 // narrower widths.
1773 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1774 // These operations are handled on non-VLX by artificially widening in
1775 // isel patterns.
1776
1777 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1778 Subtarget.hasVLX() ? Legal : Custom);
1779 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1780 Subtarget.hasVLX() ? Legal : Custom);
1781 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1782 Subtarget.hasVLX() ? Legal : Custom);
1783 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1784 Subtarget.hasVLX() ? Legal : Custom);
1785 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1786 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1787 Subtarget.hasVLX() ? Legal : Custom);
1788 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1789 Subtarget.hasVLX() ? Legal : Custom);
1790 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1791 Subtarget.hasVLX() ? Legal : Custom);
1792 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1793 Subtarget.hasVLX() ? Legal : Custom);
1794
1795 if (Subtarget.hasDQI()) {
1796 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1797 // v2f32 UINT_TO_FP is already custom under SSE2.
1798 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1799 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1800 "Unexpected operation action!");
1801 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1802 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1803 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1804 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1805 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1806 }
1807
1808 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1809 setOperationAction(ISD::SMAX, VT, Legal);
1810 setOperationAction(ISD::UMAX, VT, Legal);
1811 setOperationAction(ISD::SMIN, VT, Legal);
1812 setOperationAction(ISD::UMIN, VT, Legal);
1813 setOperationAction(ISD::ABS, VT, Legal);
1814 }
1815
1816 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1817 setOperationAction(ISD::ROTL, VT, Custom);
1818 setOperationAction(ISD::ROTR, VT, Custom);
1819 }
1820
1821 // Custom legalize 2x32 to get a little better code.
1822 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1823 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1824
1825 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1826 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1827 setOperationAction(ISD::MSCATTER, VT, Custom);
1828
1829 if (Subtarget.hasDQI()) {
1830 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1831 setOperationAction(ISD::SINT_TO_FP, VT,
1832 Subtarget.hasVLX() ? Legal : Custom);
1833 setOperationAction(ISD::UINT_TO_FP, VT,
1834 Subtarget.hasVLX() ? Legal : Custom);
1835 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1836 Subtarget.hasVLX() ? Legal : Custom);
1837 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1838 Subtarget.hasVLX() ? Legal : Custom);
1839 setOperationAction(ISD::FP_TO_SINT, VT,
1840 Subtarget.hasVLX() ? Legal : Custom);
1841 setOperationAction(ISD::FP_TO_UINT, VT,
1842 Subtarget.hasVLX() ? Legal : Custom);
1843 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1844 Subtarget.hasVLX() ? Legal : Custom);
1845 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1846 Subtarget.hasVLX() ? Legal : Custom);
1847 setOperationAction(ISD::MUL, VT, Legal);
1848 }
1849 }
1850
1851 if (Subtarget.hasCDI()) {
1852 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1853 setOperationAction(ISD::CTLZ, VT, Legal);
1854 }
1855 } // Subtarget.hasCDI()
1856
1857 if (Subtarget.hasVPOPCNTDQ()) {
1858 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1859 setOperationAction(ISD::CTPOP, VT, Legal);
1860 }
1861 }
1862
1863 // This block controls legalization of v32i1/v64i1, which are available with
1864 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1865 // useBWIRegs.
1866 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1867 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1868 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1869
1870 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1871 setOperationAction(ISD::VSELECT, VT, Expand);
1872 setOperationAction(ISD::TRUNCATE, VT, Custom);
1873 setOperationAction(ISD::SETCC, VT, Custom);
1874 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1875 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1876 setOperationAction(ISD::SELECT, VT, Custom);
1877 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1878 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1879 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1880 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1881 }
1882
1883 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1884 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1885
1886 // Extends from v32i1 masks to 256-bit vectors.
1887 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1888 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1889 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1890
1891 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1892 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1893 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1894 }
1895
1896 // These operations are handled on non-VLX by artificially widening in
1897 // isel patterns.
1898 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1899
1900 if (Subtarget.hasBITALG()) {
1901 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1902 setOperationAction(ISD::CTPOP, VT, Legal);
1903 }
1904 }
1905
1906 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1907 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1908 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1909 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1910 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1911 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1912
1913 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1914 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1915 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1916 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1917 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1918
1919 if (Subtarget.hasBWI()) {
1920 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1921 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1922 }
1923
1924 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1925 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1926 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1927 }
1928
1929 if (Subtarget.hasAMXTILE()) {
1930 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
1931 }
1932
1933 // We want to custom lower some of our intrinsics.
1934 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1935 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1936 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1937 if (!Subtarget.is64Bit()) {
1938 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1939 }
1940
1941 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1942 // handle type legalization for these operations here.
1943 //
1944 // FIXME: We really should do custom legalization for addition and
1945 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1946 // than generic legalization for 64-bit multiplication-with-overflow, though.
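// Illustrative IR (not from this file): a call such as
//   %r = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
// is custom lowered so the i1 overflow result can come from the CPU flags of
// the addition rather than from a generic compare-based expansion.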
1947 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1948 if (VT == MVT::i64 && !Subtarget.is64Bit())
1949 continue;
1950 // Add/Sub/Mul with overflow operations are custom lowered.
1951 setOperationAction(ISD::SADDO, VT, Custom);
1952 setOperationAction(ISD::UADDO, VT, Custom);
1953 setOperationAction(ISD::SSUBO, VT, Custom);
1954 setOperationAction(ISD::USUBO, VT, Custom);
1955 setOperationAction(ISD::SMULO, VT, Custom);
1956 setOperationAction(ISD::UMULO, VT, Custom);
1957
1958 // Support carry in as value rather than glue.
1959 setOperationAction(ISD::ADDCARRY, VT, Custom);
1960 setOperationAction(ISD::SUBCARRY, VT, Custom);
1961 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1962 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
1963 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
1964 }
1965
1966 if (!Subtarget.is64Bit()) {
1967 // These libcalls are not available in 32-bit.
1968 setLibcallName(RTLIB::SHL_I128, nullptr);
1969 setLibcallName(RTLIB::SRL_I128, nullptr);
1970 setLibcallName(RTLIB::SRA_I128, nullptr);
1971 setLibcallName(RTLIB::MUL_I128, nullptr);
1972 }
1973
1974 // Combine sin / cos into _sincos_stret if it is available.
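// (Roughly: when the runtime provides __sincosf_stret/__sincos_stret, e.g. on
// Darwin, a sinf(x) and cosf(x) of the same operand can be folded into a
// single call that returns both results.)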
1975 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1976 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1977 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1978 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1979 }
1980
1981 if (Subtarget.isTargetWin64()) {
1982 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1983 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1984 setOperationAction(ISD::SREM, MVT::i128, Custom);
1985 setOperationAction(ISD::UREM, MVT::i128, Custom);
1986 }
1987
1988 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1989 // is. We should promote the value to 64-bits to solve this.
1990 // This is what the CRT headers do - `fmodf` is an inline header
1991 // function casting to f64 and calling `fmod`.
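// (So an f32 FREM effectively becomes (float)fmod((double)x, (double)y) once
// it is promoted below; the same idea applies to the other promoted calls.)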
1992 if (Subtarget.is32Bit() &&
1993 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1994 for (ISD::NodeType Op :
1995 {ISD::FCEIL, ISD::STRICT_FCEIL,
1996 ISD::FCOS, ISD::STRICT_FCOS,
1997 ISD::FEXP, ISD::STRICT_FEXP,
1998 ISD::FFLOOR, ISD::STRICT_FFLOOR,
1999 ISD::FREM, ISD::STRICT_FREM,
2000 ISD::FLOG, ISD::STRICT_FLOG,
2001 ISD::FLOG10, ISD::STRICT_FLOG10,
2002 ISD::FPOW, ISD::STRICT_FPOW,
2003 ISD::FSIN, ISD::STRICT_FSIN})
2004 if (isOperationExpand(Op, MVT::f32))
2005 setOperationAction(Op, MVT::f32, Promote);
2006
2007 // We have target-specific dag combine patterns for the following nodes:
2008 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2009 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2010 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2011 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2012 setTargetDAGCombine(ISD::CONCAT_VECTORS);
2013 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2014 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2015 setTargetDAGCombine(ISD::BITCAST);
2016 setTargetDAGCombine(ISD::VSELECT);
2017 setTargetDAGCombine(ISD::SELECT);
2018 setTargetDAGCombine(ISD::SHL);
2019 setTargetDAGCombine(ISD::SRA);
2020 setTargetDAGCombine(ISD::SRL);
2021 setTargetDAGCombine(ISD::OR);
2022 setTargetDAGCombine(ISD::AND);
2023 setTargetDAGCombine(ISD::ADD);
2024 setTargetDAGCombine(ISD::FADD);
2025 setTargetDAGCombine(ISD::FSUB);
2026 setTargetDAGCombine(ISD::FNEG);
2027 setTargetDAGCombine(ISD::FMA);
2028 setTargetDAGCombine(ISD::STRICT_FMA);
2029 setTargetDAGCombine(ISD::FMINNUM);
2030 setTargetDAGCombine(ISD::FMAXNUM);
2031 setTargetDAGCombine(ISD::SUB);
2032 setTargetDAGCombine(ISD::LOAD);
2033 setTargetDAGCombine(ISD::MLOAD);
2034 setTargetDAGCombine(ISD::STORE);
2035 setTargetDAGCombine(ISD::MSTORE);
2036 setTargetDAGCombine(ISD::TRUNCATE);
2037 setTargetDAGCombine(ISD::ZERO_EXTEND);
2038 setTargetDAGCombine(ISD::ANY_EXTEND);
2039 setTargetDAGCombine(ISD::SIGN_EXTEND);
2040 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2041 setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2042 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2043 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2044 setTargetDAGCombine(ISD::SINT_TO_FP);
2045 setTargetDAGCombine(ISD::UINT_TO_FP);
2046 setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2047 setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2048 setTargetDAGCombine(ISD::SETCC);
2049 setTargetDAGCombine(ISD::MUL);
2050 setTargetDAGCombine(ISD::XOR);
2051 setTargetDAGCombine(ISD::MSCATTER);
2052 setTargetDAGCombine(ISD::MGATHER);
2053 setTargetDAGCombine(ISD::FP16_TO_FP);
2054 setTargetDAGCombine(ISD::FP_EXTEND);
2055 setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2056 setTargetDAGCombine(ISD::FP_ROUND);
2057
2058 computeRegisterProperties(Subtarget.getRegisterInfo());
2059
2060 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2061 MaxStoresPerMemsetOptSize = 8;
2062 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2063 MaxStoresPerMemcpyOptSize = 4;
2064 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2065 MaxStoresPerMemmoveOptSize = 4;
2066
2067 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2068 // that needs to be benchmarked and balanced with the potential use of vector
2069 // load/store types (PR33329, PR33914).
2070 MaxLoadsPerMemcmp = 2;
2071 MaxLoadsPerMemcmpOptSize = 2;
2072
2073 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2074 setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2075
2076 // An out-of-order CPU can speculatively execute past a predictable branch,
2077 // but a conditional move could be stalled by an expensive earlier operation.
2078 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2079 EnableExtLdPromotion = true;
2080 setPrefFunctionAlignment(Align(16));
2081
2082 verifyIntrinsicTables();
2083
2084 // Default to having -disable-strictnode-mutation on
2085 IsStrictFPEnabled = true;
2086}
2087
2088// This has so far only been implemented for 64-bit MachO.
2089bool X86TargetLowering::useLoadStackGuardNode() const {
2090 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2091}
2092
2093bool X86TargetLowering::useStackGuardXorFP() const {
2094 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2095 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2096}
2097
2098SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2099 const SDLoc &DL) const {
2100 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2101 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2102 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2103 return SDValue(Node, 0);
2104}
2105
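// A rough picture of the choices below: with AVX512 but no BWI, v32i1/v64i1
// are split (their natural byte/word vector counterparts are not legal),
// while other multi-element non-i1 vectors are widened, e.g. v3i32 -> v4i32.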
2106TargetLoweringBase::LegalizeTypeAction
2107X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2108 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2109 !Subtarget.hasBWI())
2110 return TypeSplitVector;
2111
2112 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2113 VT.getVectorElementType() != MVT::i1)
2114 return TypeWidenVector;
2115
2116 return TargetLoweringBase::getPreferredVectorAction(VT);
2117}
2118
2119static std::pair<MVT, unsigned>
2120handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2121 const X86Subtarget &Subtarget) {
2122 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2123 // convention is one that uses k registers.
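// For example, under the default C calling convention a v8i1 argument is
// passed as a single v8i16 in an XMM register; with BWI but only 256-bit
// registers available, a v64i1 argument is split into two v32i8 halves.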
2124 if (NumElts == 2)
2125 return {MVT::v2i64, 1};
2126 if (NumElts == 4)
2127 return {MVT::v4i32, 1};
2128 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2129 CC != CallingConv::Intel_OCL_BI)
2130 return {MVT::v8i16, 1};
2131 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2132 CC != CallingConv::Intel_OCL_BI)
2133 return {MVT::v16i8, 1};
2134 // v32i1 passes in ymm unless we have BWI and the calling convention is
2135 // regcall.
2136 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2137 return {MVT::v32i8, 1};
2138 // Split v64i1 vectors if we don't have v64i8 available.
2139 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2140 if (Subtarget.useAVX512Regs())
2141 return {MVT::v64i8, 1};
2142 return {MVT::v32i8, 2};
2143 }
2144
2145 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2146 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2147 NumElts > 64)
2148 return {MVT::i8, NumElts};
2149
2150 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2151}
2152
2153MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2154 CallingConv::ID CC,
2155 EVT VT) const {
2156 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2157 Subtarget.hasAVX512()) {
2158 unsigned NumElts = VT.getVectorNumElements();
2159
2160 MVT RegisterVT;
2161 unsigned NumRegisters;
2162 std::tie(RegisterVT, NumRegisters) =
2163 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2164 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2165 return RegisterVT;
2166 }
2167
2168 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2169}
2170
2171unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2172 CallingConv::ID CC,
2173 EVT VT) const {
2174 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2175 Subtarget.hasAVX512()) {
2176 unsigned NumElts = VT.getVectorNumElements();
2177
2178 MVT RegisterVT;
2179 unsigned NumRegisters;
2180 std::tie(RegisterVT, NumRegisters) =
2181 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2182 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2183 return NumRegisters;
2184 }
2185
2186 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2187}
2188
2189unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2190 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2191 unsigned &NumIntermediates, MVT &RegisterVT) const {
2192 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2193 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2194 Subtarget.hasAVX512() &&
2195 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2196 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2197 VT.getVectorNumElements() > 64)) {
2198 RegisterVT = MVT::i8;
2199 IntermediateVT = MVT::i1;
2200 NumIntermediates = VT.getVectorNumElements();
2201 return NumIntermediates;
2202 }
2203
2204 // Split v64i1 vectors if we don't have v64i8 available.
2205 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2206 CC != CallingConv::X86_RegCall) {
2207 RegisterVT = MVT::v32i8;
2208 IntermediateVT = MVT::v32i1;
2209 NumIntermediates = 2;
2210 return 2;
2211 }
2212
2213 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2214 NumIntermediates, RegisterVT);
2215}
2216
2217EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2218 LLVMContext& Context,
2219 EVT VT) const {
2220 if (!VT.isVector())
2221 return MVT::i8;
2222
2223 if (Subtarget.hasAVX512()) {
2224 // Figure out what this type will be legalized to.
2225 EVT LegalVT = VT;
2226 while (getTypeAction(Context, LegalVT) != TypeLegal)
2227 LegalVT = getTypeToTransformTo(Context, LegalVT);
2228
2229 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2230 if (LegalVT.getSimpleVT().is512BitVector())
2231 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2232
2233 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2234 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2235 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2236 // vXi16/vXi8.
2237 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2238 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2239 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2240 }
2241 }
2242
2243 return VT.changeVectorElementTypeToInteger();
2244}
2245
2246/// Helper for getByValTypeAlignment to determine
2247/// the desired ByVal argument alignment.
2248static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2249 if (MaxAlign == 16)
2250 return;
2251 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2252 if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2253 MaxAlign = Align(16);
2254 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2255 Align EltAlign;
2256 getMaxByValAlign(ATy->getElementType(), EltAlign);
2257 if (EltAlign > MaxAlign)
2258 MaxAlign = EltAlign;
2259 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2260 for (auto *EltTy : STy->elements()) {
2261 Align EltAlign;
2262 getMaxByValAlign(EltTy, EltAlign);
2263 if (EltAlign > MaxAlign)
2264 MaxAlign = EltAlign;
2265 if (MaxAlign == 16)
2266 break;
2267 }
2268 }
2269}
2270
2271/// Return the desired alignment for ByVal aggregate
2272/// function arguments in the caller parameter area. For X86, aggregates
2273/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2274/// are at 4-byte boundaries.
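/// For example, a byval struct that contains a 128-bit SSE vector member is
/// placed on a 16-byte boundary (assuming SSE is available), while a struct
/// of plain ints keeps the default 4-byte alignment on 32-bit x86.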
2275unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2276 const DataLayout &DL) const {
2277 if (Subtarget.is64Bit()) {
2278 // Max of 8 and alignment of type.
2279 Align TyAlign = DL.getABITypeAlign(Ty);
2280 if (TyAlign > 8)
2281 return TyAlign.value();
2282 return 8;
2283 }
2284
2285 Align Alignment(4);
2286 if (Subtarget.hasSSE1())
2287 getMaxByValAlign(Ty, Alignment);
2288 return Alignment.value();
2289}
2290
2291/// It returns EVT::Other if the type should be determined using generic
2292/// target-independent logic.
2293/// For vector ops we check that the overall size isn't larger than our
2294/// preferred vector width.
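/// For example, a sufficiently large, suitably aligned memcpy on an AVX-512
/// target with a 512-bit preferred vector width would typically be expanded
/// with v64i8 (BWI) or v16i32 operations, per the checks below.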
2295EVT X86TargetLowering::getOptimalMemOpType(
2296 const MemOp &Op, const AttributeList &FuncAttributes) const {
2297 if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2298 if (Op.size() >= 16 &&
2299 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2300 // FIXME: Check if unaligned 64-byte accesses are slow.
2301 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2302 (Subtarget.getPreferVectorWidth() >= 512)) {
2303 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2304 }
2305 // FIXME: Check if unaligned 32-byte accesses are slow.
2306 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2307 (Subtarget.getPreferVectorWidth() >= 256)) {
2308 // Although this isn't a well-supported type for AVX1, we'll let
2309 // legalization and shuffle lowering produce the optimal codegen. If we
2310 // choose an optimal type with a vector element larger than a byte,
2311 // getMemsetStores() may create an intermediate splat (using an integer
2312 // multiply) before we splat as a vector.
2313 return MVT::v32i8;
2314 }
2315 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2316 return MVT::v16i8;
2317 // TODO: Can SSE1 handle a byte vector?
2318 // If we have SSE1 registers we should be able to use them.
2319 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2320 (Subtarget.getPreferVectorWidth() >= 128))
2321 return MVT::v4f32;
2322 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2323 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2324 // Do not use f64 to lower memcpy if source is string constant. It's
2325 // better to use i32 to avoid the loads.
2326 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2327 // The gymnastics of splatting a byte value into an XMM register and then
2328 // only using 8-byte stores (because this is a CPU with slow unaligned
2329 // 16-byte accesses) makes that a loser.
2330 return MVT::f64;
2331 }
2332 }
2333 // This is a compromise. If we reach here, unaligned accesses may be slow on
2334 // this target. However, creating smaller, aligned accesses could be even
2335 // slower and would certainly be a lot more code.
2336 if (Subtarget.is64Bit() && Op.size() >= 8)
2337 return MVT::i64;
2338 return MVT::i32;
2339}
2340
2341bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2342 if (VT == MVT::f32)
2343 return X86ScalarSSEf32;
2344 if (VT == MVT::f64)
2345 return X86ScalarSSEf64;
2346 return true;
2347}
2348
2349bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2350 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2351 bool *Fast) const {
2352 if (Fast) {
2353 switch (VT.getSizeInBits()) {
2354 default:
2355 // 8-byte and under are always assumed to be fast.
2356 *Fast = true;
2357 break;
2358 case 128:
2359 *Fast = !Subtarget.isUnalignedMem16Slow();
2360 break;
2361 case 256:
2362 *Fast = !Subtarget.isUnalignedMem32Slow();
2363 break;
2364 // TODO: What about AVX-512 (512-bit) accesses?
2365 }
2366 }
2367 // NonTemporal vector memory ops must be aligned.
2368 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2369 // NT loads can only be vector aligned, so if it's less aligned than the
2370 // minimum vector size (which we can split the vector down to), we might as
2371 // well use a regular unaligned vector load.
2372 // We don't have any NT loads pre-SSE41.
2373 if (!!(Flags & MachineMemOperand::MOLoad))
2374 return (Alignment < 16 || !Subtarget.hasSSE41());
2375 return false;
2376 }
2377 // Misaligned accesses of any size are always allowed.
2378 return true;
2379}
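// Illustrative sketch (not in the upstream source): a non-temporal 256-bit
// vector load with only 8-byte alignment reports "allowed" here, so it is
// lowered as a regular unaligned vector load, while the same NT load with
// 16-byte or better alignment on an SSE4.1+ target reports "not allowed",
// keeping it aligned so the non-temporal hint survives.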
2380
2381/// Return the entry encoding for a jump table in the
2382/// current function. The returned value is a member of the
2383/// MachineJumpTableInfo::JTEntryKind enum.
2384unsigned X86TargetLowering::getJumpTableEncoding() const {
2385 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2386 // symbol.
2387 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2388 return MachineJumpTableInfo::EK_Custom32;
2389
2390 // Otherwise, use the normal jump table encoding heuristics.
2391 return TargetLowering::getJumpTableEncoding();
2392}
2393
2394bool X86TargetLowering::useSoftFloat() const {
2395 return Subtarget.useSoftFloat();
2396}
2397
2398void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2399 ArgListTy &Args) const {
2400
2401 // Only relabel X86-32 for C / Stdcall CCs.
2402 if (Subtarget.is64Bit())
2403 return;
2404 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2405 return;
2406 unsigned ParamRegs = 0;
2407 if (auto *M = MF->getFunction().getParent())
2408 ParamRegs = M->getNumberRegisterParameters();
2409
2410 // Mark the first N integer arguments as being passed in registers.
2411 for (auto &Arg : Args) {
2412 Type *T = Arg.Ty;
2413 if (T->isIntOrPtrTy())
2414 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2415 unsigned numRegs = 1;
2416 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2417 numRegs = 2;
2418 if (ParamRegs < numRegs)
2419 return;
2420 ParamRegs -= numRegs;
2421 Arg.IsInReg = true;
2422 }
2423 }
2424}
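// Illustrative sketch with hypothetical values (not in the upstream source):
// with the module flag "NumRegisterParameters" set to 3 (i.e. -mregparm=3 on
// i386), a libcall taking (i32, i64, i32) marks the first two arguments
// IsInReg, consuming 1 + 2 register slots; the remaining budget of 0 is too
// small for the last i32, so the loop returns and it stays on the stack.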
2425
2426const MCExpr *
2427X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2428 const MachineBasicBlock *MBB,
2429 unsigned uid, MCContext &Ctx) const {
2430 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2431 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2432 // entries.
2433 return MCSymbolRefExpr::create(MBB->getSymbol(),
2434 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2435}
2436
2437/// Returns relocation base for the given PIC jumptable.
2438SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2439 SelectionDAG &DAG) const {
2440 if (!Subtarget.is64Bit())
2441 // This doesn't have SDLoc associated with it, but is not really the
2442 // same as a Register.
2443 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2444 getPointerTy(DAG.getDataLayout()));
2445 return Table;
2446}
2447
2448/// This returns the relocation base for the given PIC jumptable,
2449/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2450const MCExpr *X86TargetLowering::
2451getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2452 MCContext &Ctx) const {
2453 // X86-64 uses RIP relative addressing based on the jump table label.
2454 if (Subtarget.isPICStyleRIPRel())
2455 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2456
2457 // Otherwise, the reference is relative to the PIC base.
2458 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2459}
2460
2461std::pair<const TargetRegisterClass *, uint8_t>
2462X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2463 MVT VT) const {
2464 const TargetRegisterClass *RRC = nullptr;
2465 uint8_t Cost = 1;
2466 switch (VT.SimpleTy) {
2467 default:
2468 return TargetLowering::findRepresentativeClass(TRI, VT);
2469 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2470 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2471 break;
2472 case MVT::x86mmx:
2473 RRC = &X86::VR64RegClass;
2474 break;
2475 case MVT::f32: case MVT::f64:
2476 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2477 case MVT::v4f32: case MVT::v2f64:
2478 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2479 case MVT::v8f32: case MVT::v4f64:
2480 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2481 case MVT::v16f32: case MVT::v8f64:
2482 RRC = &X86::VR128XRegClass;
2483 break;
2484 }
2485 return std::make_pair(RRC, Cost);
2486}
2487
2488unsigned X86TargetLowering::getAddressSpace() const {
2489 if (Subtarget.is64Bit())
2490 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2491 return 256;
2492}
2493
2494static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2495 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2496 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2497}
2498
2499static Constant* SegmentOffset(IRBuilderBase &IRB,
2500 int Offset, unsigned AddressSpace) {
2501 return ConstantExpr::getIntToPtr(
2502 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2503 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2504}
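// Illustrative sketch (not in the upstream source): SegmentOffset(IRB, 0x28,
// 257) produces the constant 0x28 cast to a pointer in address space 257 to
// an i8*, which the X86 backend treats as an %fs-relative address, i.e.
// %fs:0x28 (address space 256 is %gs, 258 is %ss).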
2505
2506Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2507 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2508 // tcbhead_t; use it instead of the usual global variable (see
2509 // sysdeps/{i386,x86_64}/nptl/tls.h)
2510 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2511 if (Subtarget.isTargetFuchsia()) {
2512 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2513 return SegmentOffset(IRB, 0x10, getAddressSpace());
2514 } else {
2515 unsigned AddressSpace = getAddressSpace();
2516 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2517 // In particular, some users may customize the base register and offset.
2518 int Offset = M->getStackProtectorGuardOffset();
2519 // If -stack-protector-guard-offset was not set, the default is
2520 // %fs:0x28, unless we're using a Kernel code model, in which case
2521 // it's %gs:0x28; %gs:0x14 on i386.
2522 if (Offset == INT_MAX)
2523 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2524
2525 StringRef GuardReg = M->getStackProtectorGuardReg();
2526 if (GuardReg == "fs")
2527 AddressSpace = X86AS::FS;
2528 else if (GuardReg == "gs")
2529 AddressSpace = X86AS::GS;
2530 return SegmentOffset(IRB, Offset, AddressSpace);
2531 }
2532 }
2533 return TargetLowering::getIRStackGuard(IRB);
2534}
2535
2536void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2537 // MSVC CRT provides functionalities for stack protection.
2538 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2539 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2540 // MSVC CRT has a global variable holding security cookie.
2541 M.getOrInsertGlobal("__security_cookie",
2542 Type::getInt8PtrTy(M.getContext()));
2543
2544 // MSVC CRT has a function to validate security cookie.
2545 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2546 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2547 Type::getInt8PtrTy(M.getContext()));
2548 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2549 F->setCallingConv(CallingConv::X86_FastCall);
2550 F->addAttribute(1, Attribute::AttrKind::InReg);
2551 }
2552 return;
2553 }
2554
2555 StringRef GuardMode = M.getStackProtectorGuard();
2556
2557 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2558 if ((GuardMode == "tls" || GuardMode.empty()) &&
2559 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2560 return;
2561 TargetLowering::insertSSPDeclarations(M);
2562}
2563
2564Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2565 // MSVC CRT has a global variable holding security cookie.
2566 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2567 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2568 return M.getGlobalVariable("__security_cookie");
2569 }
2570 return TargetLowering::getSDagStackGuard(M);
2571}
2572
2573Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2574 // MSVC CRT has a function to validate security cookie.
2575 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2576 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2577 return M.getFunction("__security_check_cookie");
2578 }
2579 return TargetLowering::getSSPStackGuardCheck(M);
2580}
2581
2582Value *
2583X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2584 if (Subtarget.getTargetTriple().isOSContiki())
2585 return getDefaultSafeStackPointerLocation(IRB, false);
2586
2587 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2588 // definition of TLS_SLOT_SAFESTACK in
2589 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2590 if (Subtarget.isTargetAndroid()) {
2591 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2592 // %gs:0x48; %gs:0x24 on i386.
2593 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2594 return SegmentOffset(IRB, Offset, getAddressSpace());
2595 }
2596
2597 // Fuchsia is similar.
2598 if (Subtarget.isTargetFuchsia()) {
2599 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2600 return SegmentOffset(IRB, 0x18, getAddressSpace());
2601 }
2602
2603 return TargetLowering::getSafeStackPointerLocation(IRB);
2604}
2605
2606//===----------------------------------------------------------------------===//
2607// Return Value Calling Convention Implementation
2608//===----------------------------------------------------------------------===//
2609
2610bool X86TargetLowering::CanLowerReturn(
2611 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2612 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2613 SmallVector<CCValAssign, 16> RVLocs;
2614 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2615 return CCInfo.CheckReturn(Outs, RetCC_X86);
2616}
2617
2618const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2619 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2620 return ScratchRegs;
2621}
2622
2623 /// Lowers mask values (v*i1) to the local register values
2624/// \returns DAG node after lowering to register type
2625static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2626 const SDLoc &Dl, SelectionDAG &DAG) {
2627 EVT ValVT = ValArg.getValueType();
2628
2629 if (ValVT == MVT::v1i1)
2630 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2631 DAG.getIntPtrConstant(0, Dl));
2632
2633 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2634 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2635 // Two stage lowering might be required
2636 // bitcast: v8i1 -> i8 / v16i1 -> i16
2637 // anyextend: i8 -> i32 / i16 -> i32
2638 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2639 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2640 if (ValLoc == MVT::i32)
2641 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2642 return ValToCopy;
2643 }
2644
2645 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2646 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2647 // One stage lowering is required
2648 // bitcast: v32i1 -> i32 / v64i1 -> i64
2649 return DAG.getBitcast(ValLoc, ValArg);
2650 }
2651
2652 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2653}
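// Illustrative sketch (not in the upstream source): a v16i1 mask headed for
// an i32 location is bitcast to i16 and then any-extended to i32, while a
// v32i1 mask headed for i32 needs only the single bitcast; a v1i1 mask is
// simply an element extract into the scalar location type.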
2654
2655/// Breaks v64i1 value into two registers and adds the new node to the DAG
2656static void Passv64i1ArgInRegs(
2657 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2658 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2659 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2660 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2661 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2662 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2663 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2664 "The value should reside in two registers");
2665
2666 // Before splitting the value we cast it to i64
2667 Arg = DAG.getBitcast(MVT::i64, Arg);
2668
2669 // Splitting the value into two i32 types
2670 SDValue Lo, Hi;
2671 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2672 DAG.getConstant(0, Dl, MVT::i32));
2673 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2674 DAG.getConstant(1, Dl, MVT::i32));
2675
2676 // Place the two i32 values into the corresponding registers
2677 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2678 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2679}
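// Illustrative sketch (not in the upstream source): on a 32-bit AVX512BW
// target a v64i1 value is bitcast to i64, split into Lo/Hi i32 halves with
// EXTRACT_ELEMENT, and the halves are queued for the two consecutive 32-bit
// registers that the calling convention assigned (VA and NextVA).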
2680
2681SDValue
2682X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2683 bool isVarArg,
2684 const SmallVectorImpl<ISD::OutputArg> &Outs,
2685 const SmallVectorImpl<SDValue> &OutVals,
2686 const SDLoc &dl, SelectionDAG &DAG) const {
2687 MachineFunction &MF = DAG.getMachineFunction();
2688 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2689
2690 // In some cases we need to disable registers from the default CSR list.
2691 // For example, when they are used for argument passing.
2692 bool ShouldDisableCalleeSavedRegister =
2693 CallConv == CallingConv::X86_RegCall ||
2694 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2695
2696 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2697 report_fatal_error("X86 interrupts may not return any value");
2698
2699 SmallVector<CCValAssign, 16> RVLocs;
2700 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2701 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2702
2703 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2704 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2705 ++I, ++OutsIndex) {
2706 CCValAssign &VA = RVLocs[I];
2707 assert(VA.isRegLoc() && "Can only return in registers!");
2708
2709 // Add the register to the CalleeSaveDisableRegs list.
2710 if (ShouldDisableCalleeSavedRegister)
2711 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2712
2713 SDValue ValToCopy = OutVals[OutsIndex];
2714 EVT ValVT = ValToCopy.getValueType();
2715
2716 // Promote values to the appropriate types.
2717 if (VA.getLocInfo() == CCValAssign::SExt)
2718 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2719 else if (VA.getLocInfo() == CCValAssign::ZExt)
2720 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2721 else if (VA.getLocInfo() == CCValAssign::AExt) {
2722 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2723 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2724 else
2725 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2726 }
2727 else if (VA.getLocInfo() == CCValAssign::BCvt)
2728 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2729
2730 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2731 "Unexpected FP-extend for return value.");
2732
2733 // Report an error if we have attempted to return a value via an XMM
2734 // register and SSE was disabled.
2735 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2736 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2737 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2738 } else if (!Subtarget.hasSSE2() &&
2739 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2740 ValVT == MVT::f64) {
2741 // When returning a double via an XMM register, report an error if SSE2 is
2742 // not enabled.
2743 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2744 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2745 }
2746
2747 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2748 // the RET instruction and handled by the FP Stackifier.
2749 if (VA.getLocReg() == X86::FP0 ||
2750 VA.getLocReg() == X86::FP1) {
2751 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2752 // change the value to the FP stack register class.
2753 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2754 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2755 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2756 // Don't emit a copytoreg.
2757 continue;
2758 }
2759
2760 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2761 // which is returned in RAX / RDX.
2762 if (Subtarget.is64Bit()) {
2763 if (ValVT == MVT::x86mmx) {
2764 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2765 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2766 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2767 ValToCopy);
2768 // If we don't have SSE2 available, convert to v4f32 so the generated
2769 // register is legal.
2770 if (!Subtarget.hasSSE2())
2771 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2772 }
2773 }
2774 }
2775
2776 if (VA.needsCustom()) {
2777 assert(VA.getValVT() == MVT::v64i1 &&
2778 "Currently the only custom case is when we split v64i1 to 2 regs");
2779
2780 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2781 Subtarget);
2782
2783 // Add the second register to the CalleeSaveDisableRegs list.
2784 if (ShouldDisableCalleeSavedRegister)
2785 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2786 } else {
2787 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2788 }
2789 }
2790
2791 SDValue Flag;
2792 SmallVector<SDValue, 6> RetOps;
2793 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2794 // Operand #1 = Bytes To Pop
2795 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2796 MVT::i32));
2797
2798 // Copy the result values into the output registers.
2799 for (auto &RetVal : RetVals) {
2800 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2801 RetOps.push_back(RetVal.second);
2802 continue; // Don't emit a copytoreg.
2803 }
2804
2805 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2806 Flag = Chain.getValue(1);
2807 RetOps.push_back(
2808 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2809 }
2810
2811 // Swift calling convention does not require we copy the sret argument
2812 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2813
2814 // All x86 ABIs require that for returning structs by value we copy
2815 // the sret argument into %rax/%eax (depending on ABI) for the return.
2816 // We saved the argument into a virtual register in the entry block,
2817 // so now we copy the value out and into %rax/%eax.
2818 //
2819 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2820 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2821 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2822 // either case FuncInfo->setSRetReturnReg() will have been called.
2823 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
2824 // When we have both sret and another return value, we should use the
2825 // original Chain stored in RetOps[0], instead of the current Chain updated
2826 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2827
2828 // For the case of sret and another return value, we have
2829 // Chain_0 at the function entry
2830 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2831 // If we use Chain_1 in getCopyFromReg, we will have
2832 // Val = getCopyFromReg(Chain_1)
2833 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2834
2835 // getCopyToReg(Chain_0) will be glued together with
2836 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2837 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2838 // Data dependency from Unit B to Unit A due to usage of Val in
2839 // getCopyToReg(Chain_1, Val)
2840 // Chain dependency from Unit A to Unit B
2841
2842 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2843 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2844 getPointerTy(MF.getDataLayout()));
2845
2846 Register RetValReg
2847 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2848 X86::RAX : X86::EAX;
2849 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2850 Flag = Chain.getValue(1);
2851
2852 // RAX/EAX now acts like a return value.
2853 RetOps.push_back(
2854 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2855
2856 // Add the returned register to the CalleeSaveDisableRegs list.
2857 if (ShouldDisableCalleeSavedRegister)
2858 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2859 }
2860
2861 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2862 const MCPhysReg *I =
2863 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2864 if (I) {
2865 for (; *I; ++I) {
2866 if (X86::GR64RegClass.contains(*I))
2867 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2868 else
2869 llvm_unreachable("Unexpected register class in CSRsViaCopy!")__builtin_unreachable();
2870 }
2871 }
2872
2873 RetOps[0] = Chain; // Update chain.
2874
2875 // Add the flag if we have it.
2876 if (Flag.getNode())
2877 RetOps.push_back(Flag);
2878
2879 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2880 if (CallConv == CallingConv::X86_INTR)
2881 opcode = X86ISD::IRET;
2882 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2883}
2884
2885bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2886 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2887 return false;
2888
2889 SDValue TCChain = Chain;
2890 SDNode *Copy = *N->use_begin();
2891 if (Copy->getOpcode() == ISD::CopyToReg) {
2892 // If the copy has a glue operand, we conservatively assume it isn't safe to
2893 // perform a tail call.
2894 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2895 return false;
2896 TCChain = Copy->getOperand(0);
2897 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2898 return false;
2899
2900 bool HasRet = false;
2901 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2902 UI != UE; ++UI) {
2903 if (UI->getOpcode() != X86ISD::RET_FLAG)
2904 return false;
2905 // If we are returning more than one value, we can definitely
2906 // not make a tail call; see PR19530.
2907 if (UI->getNumOperands() > 4)
2908 return false;
2909 if (UI->getNumOperands() == 4 &&
2910 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2911 return false;
2912 HasRet = true;
2913 }
2914
2915 if (!HasRet)
2916 return false;
2917
2918 Chain = TCChain;
2919 return true;
2920}
2921
2922EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2923 ISD::NodeType ExtendKind) const {
2924 MVT ReturnMVT = MVT::i32;
2925
2926 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2927 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2928 // The ABI does not require i1, i8 or i16 to be extended.
2929 //
2930 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2931 // always extending i8/i16 return values, so keep doing that for now.
2932 // (PR26665).
2933 ReturnMVT = MVT::i8;
2934 }
2935
2936 EVT MinVT = getRegisterType(Context, ReturnMVT);
2937 return VT.bitsLT(MinVT) ? MinVT : VT;
2938}
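// Illustrative sketch (not in the upstream source): returning an i8 from a
// function on a non-Darwin target leaves MinVT at i8, so the value comes
// back unextended; on Darwin the i8 return is still widened to i32 to match
// the historical behaviour referenced above.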
2939
2940 /// Reads two 32 bit registers and creates a 64 bit mask value.
2941 /// \param VA The current 32 bit value that needs to be assigned.
2942 /// \param NextVA The next 32 bit value that needs to be assigned.
2943 /// \param Root The parent DAG node.
2944 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2945 /// glue purposes. In case the DAG is already using a
2946 /// physical register instead of a virtual one, we should glue
2947 /// our new SDValue to the InFlag SDValue.
2948 /// \return a new 64-bit SDValue.
2949static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2950 SDValue &Root, SelectionDAG &DAG,
2951 const SDLoc &Dl, const X86Subtarget &Subtarget,
2952 SDValue *InFlag = nullptr) {
2953 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2954 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2955 assert(VA.getValVT() == MVT::v64i1 &&
2956 "Expecting first location of 64 bit width type");
2957 assert(NextVA.getValVT() == VA.getValVT() &&
2958 "The locations should have the same type");
2959 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2960 "The values should reside in two registers");
2961
2962 SDValue Lo, Hi;
2963 SDValue ArgValueLo, ArgValueHi;
2964
2965 MachineFunction &MF = DAG.getMachineFunction();
2966 const TargetRegisterClass *RC = &X86::GR32RegClass;
2967
2968 // Read a 32 bit value from the registers.
2969 if (nullptr == InFlag) {
2970 // When no physical register is present,
2971 // create an intermediate virtual register.
2972 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
2973 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2974 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2975 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2976 } else {
2977 // When a physical register is available read the value from it and glue
2978 // the reads together.
2979 ArgValueLo =
2980 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2981 *InFlag = ArgValueLo.getValue(2);
2982 ArgValueHi =
2983 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2984 *InFlag = ArgValueHi.getValue(2);
2985 }
2986
2987 // Convert the i32 type into v32i1 type.
2988 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2989
2990 // Convert the i32 type into v32i1 type.
2991 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2992
2993 // Concatenate the two values together.
2994 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2995}
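// Illustrative sketch (not in the upstream source): this is the inverse of
// Passv64i1ArgInRegs; the two incoming i32 halves, read either from fresh
// live-in virtual registers or from glued physical-register copies, are each
// bitcast to v32i1 and rejoined with CONCAT_VECTORS into the original v64i1.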
2996
2997 /// The function will lower a register of various sizes (8/16/32/64)
2998 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2999 /// \returns a DAG node containing the operand after lowering to mask type.
3000static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3001 const EVT &ValLoc, const SDLoc &Dl,
3002 SelectionDAG &DAG) {
3003 SDValue ValReturned = ValArg;
3004
3005 if (ValVT == MVT::v1i1)
3006 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3007
3008 if (ValVT == MVT::v64i1) {
3009 // On a 32 bit machine, this case is handled by getv64i1Argument.
3010 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3011 // On a 64 bit machine, there is no need to truncate the value, only bitcast it.
3012 } else {
3013 MVT maskLen;
3014 switch (ValVT.getSimpleVT().SimpleTy) {
3015 case MVT::v8i1:
3016 maskLen = MVT::i8;
3017 break;
3018 case MVT::v16i1:
3019 maskLen = MVT::i16;
3020 break;
3021 case MVT::v32i1:
3022 maskLen = MVT::i32;
3023 break;
3024 default:
3025 llvm_unreachable("Expecting a vector of i1 types")__builtin_unreachable();
3026 }
3027
3028 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3029 }
3030 return DAG.getBitcast(ValVT, ValReturned);
3031}
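// Illustrative sketch (not in the upstream source): an i32 location carrying
// a v8i1 value is truncated to i8 and then bitcast to v8i1, while a v64i1
// value arriving in an i64 location is bitcast directly without truncation.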
3032
3033/// Lower the result values of a call into the
3034/// appropriate copies out of appropriate physical registers.
3035///
3036SDValue X86TargetLowering::LowerCallResult(
3037 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3038 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3039 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3040 uint32_t *RegMask) const {
3041
3042 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3043 // Assign locations to each value returned by this call.
3044 SmallVector<CCValAssign, 16> RVLocs;
3045 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3046 *DAG.getContext());
3047 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3048
3049 // Copy all of the result registers out of their specified physreg.
3050 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3051 ++I, ++InsIndex) {
3052 CCValAssign &VA = RVLocs[I];
3053 EVT CopyVT = VA.getLocVT();
3054
3055 // In some calling conventions we need to remove the used registers
3056 // from the register mask.
3057 if (RegMask) {
3058 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3059 SubRegs.isValid(); ++SubRegs)
3060 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3061 }
3062
3063 // Report an error if there was an attempt to return FP values via XMM
3064 // registers.
3065 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3066 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3067 if (VA.getLocReg() == X86::XMM1)
3068 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3069 else
3070 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3071 } else if (!Subtarget.hasSSE2() &&
3072 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3073 CopyVT == MVT::f64) {
3074 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3075 if (VA.getLocReg() == X86::XMM1)
3076 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3077 else
3078 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3079 }
3080
3081 // If we prefer to use the value in xmm registers, copy it out as f80 and
3082 // use a truncate to move it from fp stack reg to xmm reg.
3083 bool RoundAfterCopy = false;
3084 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3085 isScalarFPTypeInSSEReg(VA.getValVT())) {
3086 if (!Subtarget.hasX87())
3087 report_fatal_error("X87 register return with X87 disabled");
3088 CopyVT = MVT::f80;
3089 RoundAfterCopy = (CopyVT != VA.getLocVT());
3090 }
3091
3092 SDValue Val;
3093 if (VA.needsCustom()) {
3094 assert(VA.getValVT() == MVT::v64i1 &&
3095 "Currently the only custom case is when we split v64i1 to 2 regs");
3096 Val =
3097 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3098 } else {
3099 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3100 .getValue(1);
3101 Val = Chain.getValue(0);
3102 InFlag = Chain.getValue(2);
3103 }
3104
3105 if (RoundAfterCopy)
3106 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3107 // This truncation won't change the value.
3108 DAG.getIntPtrConstant(1, dl));
3109
3110 if (VA.isExtInLoc()) {
3111 if (VA.getValVT().isVector() &&
3112 VA.getValVT().getScalarType() == MVT::i1 &&
3113 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3114 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3115 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3116 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3117 } else
3118 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3119 }
3120
3121 if (VA.getLocInfo() == CCValAssign::BCvt)
3122 Val = DAG.getBitcast(VA.getValVT(), Val);
3123
3124 InVals.push_back(Val);
3125 }
3126
3127 return Chain;
3128}
3129
3130//===----------------------------------------------------------------------===//
3131// C & StdCall & Fast Calling Convention implementation
3132//===----------------------------------------------------------------------===//
3133 // The StdCall calling convention is the standard for many Windows API
3134 // routines. It differs from the C calling convention only slightly: the
3135 // callee cleans up the stack, not the caller, and symbols are also
3136 // decorated in some fancy way :) It doesn't support any vector arguments.
3137 // For info on the fast calling convention see the Fast Calling Convention
3138 // (tail call) implementation in LowerX86_32FastCCCallTo.
3139
3140/// CallIsStructReturn - Determines whether a call uses struct return
3141/// semantics.
3142enum StructReturnType {
3143 NotStructReturn,
3144 RegStructReturn,
3145 StackStructReturn
3146};
3147static StructReturnType
3148callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3149 if (Outs.empty())
3150 return NotStructReturn;
3151
3152 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3153 if (!Flags.isSRet())
3154 return NotStructReturn;
3155 if (Flags.isInReg() || IsMCU)
3156 return RegStructReturn;
3157 return StackStructReturn;
3158}
3159
3160/// Determines whether a function uses struct return semantics.
3161static StructReturnType
3162argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3163 if (Ins.empty())
3164 return NotStructReturn;
3165
3166 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3167 if (!Flags.isSRet())
3168 return NotStructReturn;
3169 if (Flags.isInReg() || IsMCU)
3170 return RegStructReturn;
3171 return StackStructReturn;
3172}
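// Illustrative sketch (not in the upstream source): on i386, a call whose
// first outgoing argument carries the sret flag is classified as
// StackStructReturn unless that argument is also marked inreg (or the target
// is an MCU), in which case it is RegStructReturn; without an sret argument
// the answer is NotStructReturn. LowerFormalArguments uses this below to
// decide whether the callee pops the hidden pointer on return.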
3173
3174/// Make a copy of an aggregate at address specified by "Src" to address
3175/// "Dst" with size and alignment information specified by the specific
3176/// parameter attribute. The copy will be passed as a byval function parameter.
3177static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3178 SDValue Chain, ISD::ArgFlagsTy Flags,
3179 SelectionDAG &DAG, const SDLoc &dl) {
3180 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3181
3182 return DAG.getMemcpy(
3183 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3184 /*isVolatile*/ false, /*AlwaysInline=*/true,
3185 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3186}
3187
3188/// Return true if the calling convention is one that we can guarantee TCO for.
3189static bool canGuaranteeTCO(CallingConv::ID CC) {
3190 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3191 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3192 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3193 CC == CallingConv::SwiftTail);
3194}
3195
3196/// Return true if we might ever do TCO for calls with this calling convention.
3197static bool mayTailCallThisCC(CallingConv::ID CC) {
3198 switch (CC) {
3199 // C calling conventions:
3200 case CallingConv::C:
3201 case CallingConv::Win64:
3202 case CallingConv::X86_64_SysV:
3203 // Callee pop conventions:
3204 case CallingConv::X86_ThisCall:
3205 case CallingConv::X86_StdCall:
3206 case CallingConv::X86_VectorCall:
3207 case CallingConv::X86_FastCall:
3208 // Swift:
3209 case CallingConv::Swift:
3210 return true;
3211 default:
3212 return canGuaranteeTCO(CC);
3213 }
3214}
3215
3216/// Return true if the function is being made into a tailcall target by
3217/// changing its ABI.
3218static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3219 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3220 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3221}
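// Illustrative sketch (not in the upstream source): a fastcc call gets the
// ABI-changing guaranteed TCO only when GuaranteedTailCallOpt (-tailcallopt)
// is enabled, whereas tailcc and swifttailcc force it unconditionally; a
// plain C call can still be tail-called opportunistically (mayTailCallThisCC
// returns true for it) but is never guaranteed.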
3222
3223bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3224 if (!CI->isTailCall())
3225 return false;
3226
3227 CallingConv::ID CalleeCC = CI->getCallingConv();
3228 if (!mayTailCallThisCC(CalleeCC))
3229 return false;
3230
3231 return true;
3232}
3233
3234SDValue
3235X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3236 const SmallVectorImpl<ISD::InputArg> &Ins,
3237 const SDLoc &dl, SelectionDAG &DAG,
3238 const CCValAssign &VA,
3239 MachineFrameInfo &MFI, unsigned i) const {
3240 // Create the nodes corresponding to a load from this parameter slot.
3241 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3242 bool AlwaysUseMutable = shouldGuaranteeTCO(
3243 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3244 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3245 EVT ValVT;
3246 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3247
3248 // If the value is passed by pointer, we have the address passed instead of
3249 // the value itself. No need to extend if the mask value and location share
3250 // the same absolute size.
3251 bool ExtendedInMem =
3252 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3253 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3254
3255 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3256 ValVT = VA.getLocVT();
3257 else
3258 ValVT = VA.getValVT();
3259
3260 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3261 // changed with more analysis.
3262 // In case of tail call optimization, mark all arguments mutable, since they
3263 // could be overwritten by the lowering of arguments in case of a tail call.
3264 if (Flags.isByVal()) {
3265 unsigned Bytes = Flags.getByValSize();
3266 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3267
3268 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3269 // can be improved with deeper analysis.
3270 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3271 /*isAliased=*/true);
3272 return DAG.getFrameIndex(FI, PtrVT);
3273 }
3274
3275 EVT ArgVT = Ins[i].ArgVT;
3276
3277 // If this is a vector that has been split into multiple parts, and the
3278 // scalar size of the parts doesn't match the vector element size, then we can't
3279 // elide the copy. The parts will have padding between them instead of being
3280 // packed like a vector.
3281 bool ScalarizedAndExtendedVector =
3282 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3283 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3284
3285 // This is an argument in memory. We might be able to perform copy elision.
3286 // If the argument is passed directly in memory without any extension, then we
3287 // can perform copy elision. Large vector types, for example, may be passed
3288 // indirectly by pointer.
3289 if (Flags.isCopyElisionCandidate() &&
3290 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3291 !ScalarizedAndExtendedVector) {
3292 SDValue PartAddr;
3293 if (Ins[i].PartOffset == 0) {
3294 // If this is a one-part value or the first part of a multi-part value,
3295 // create a stack object for the entire argument value type and return a
3296 // load from our portion of it. This assumes that if the first part of an
3297 // argument is in memory, the rest will also be in memory.
3298 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3299 /*IsImmutable=*/false);
3300 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3301 return DAG.getLoad(
3302 ValVT, dl, Chain, PartAddr,
3303 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3304 } else {
3305 // This is not the first piece of an argument in memory. See if there is
3306 // already a fixed stack object including this offset. If so, assume it
3307 // was created by the PartOffset == 0 branch above and create a load from
3308 // the appropriate offset into it.
3309 int64_t PartBegin = VA.getLocMemOffset();
3310 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3311 int FI = MFI.getObjectIndexBegin();
3312 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3313 int64_t ObjBegin = MFI.getObjectOffset(FI);
3314 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3315 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3316 break;
3317 }
3318 if (MFI.isFixedObjectIndex(FI)) {
3319 SDValue Addr =
3320 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3321 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3322 return DAG.getLoad(
3323 ValVT, dl, Chain, Addr,
3324 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3325 Ins[i].PartOffset));
3326 }
3327 }
3328 }
3329
3330 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3331 VA.getLocMemOffset(), isImmutable);
3332
3333 // Set SExt or ZExt flag.
3334 if (VA.getLocInfo() == CCValAssign::ZExt) {
3335 MFI.setObjectZExt(FI, true);
3336 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3337 MFI.setObjectSExt(FI, true);
3338 }
3339
3340 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3341 SDValue Val = DAG.getLoad(
3342 ValVT, dl, Chain, FIN,
3343 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3344 return ExtendedInMem
3345 ? (VA.getValVT().isVector()
3346 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3347 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3348 : Val;
3349}
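// Illustrative sketch (not in the upstream source): a byval struct parameter
// of, say, 24 bytes at stack offset 8 simply yields a frame index for a
// 24-byte fixed object; an ordinary i32 at offset 4 gets a 4-byte fixed
// object plus a load from it, with the slot tagged SExt/ZExt when the
// location info says the value was extended by the caller.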
3350
3351// FIXME: Get this from tablegen.
3352static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3353 const X86Subtarget &Subtarget) {
3354 assert(Subtarget.is64Bit());
3355
3356 if (Subtarget.isCallingConvWin64(CallConv)) {
3357 static const MCPhysReg GPR64ArgRegsWin64[] = {
3358 X86::RCX, X86::RDX, X86::R8, X86::R9
3359 };
3360 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3361 }
3362
3363 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3364 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3365 };
3366 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3367}
3368
3369// FIXME: Get this from tablegen.
3370static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3371 CallingConv::ID CallConv,
3372 const X86Subtarget &Subtarget) {
3373 assert(Subtarget.is64Bit());
3374 if (Subtarget.isCallingConvWin64(CallConv)) {
3375 // The XMM registers which might contain var arg parameters are shadowed
3376 // in their paired GPRs. So we only need to save the GPRs to their home
3377 // slots.
3378 // TODO: __vectorcall will change this.
3379 return None;
3380 }
3381
3382 bool isSoftFloat = Subtarget.useSoftFloat();
3383 if (isSoftFloat || !Subtarget.hasSSE1())
3384 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3385 // registers.
3386 return None;
3387
3388 static const MCPhysReg XMMArgRegs64Bit[] = {
3389 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3390 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3391 };
3392 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3393}
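// Illustrative sketch (not in the upstream source): under the Win64
// convention this returns an empty list, because variadic XMM arguments are
// shadowed by their paired GPR home slots; the SysV path returns XMM0-XMM7,
// or an empty list when soft float is in use or SSE1 is unavailable.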
3394
3395#ifndef NDEBUG
3396static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3397 return llvm::is_sorted(
3398 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3399 return A.getValNo() < B.getValNo();
3400 });
3401}
3402#endif
3403
3404namespace {
3405/// This is a helper class for lowering variable arguments parameters.
3406class VarArgsLoweringHelper {
3407public:
3408 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3409 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3410 CallingConv::ID CallConv, CCState &CCInfo)
3411 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3412 TheMachineFunction(DAG.getMachineFunction()),
3413 TheFunction(TheMachineFunction.getFunction()),
3414 FrameInfo(TheMachineFunction.getFrameInfo()),
3415 FrameLowering(*Subtarget.getFrameLowering()),
3416 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3417 CCInfo(CCInfo) {}
3418
3419 // Lower variable arguments parameters.
3420 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3421
3422private:
3423 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3424
3425 void forwardMustTailParameters(SDValue &Chain);
3426
3427 bool is64Bit() const { return Subtarget.is64Bit(); }
3428 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3429
3430 X86MachineFunctionInfo *FuncInfo;
3431 const SDLoc &DL;
3432 SelectionDAG &DAG;
3433 const X86Subtarget &Subtarget;
3434 MachineFunction &TheMachineFunction;
3435 const Function &TheFunction;
3436 MachineFrameInfo &FrameInfo;
3437 const TargetFrameLowering &FrameLowering;
3438 const TargetLowering &TargLowering;
3439 CallingConv::ID CallConv;
3440 CCState &CCInfo;
3441};
3442} // namespace
3443
3444void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3445 SDValue &Chain, unsigned StackSize) {
3446 // If the function takes variable number of arguments, make a frame index for
3447 // the start of the first vararg value... for expansion of llvm.va_start. We
3448 // can skip this if there are no va_start calls.
3449 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3450 CallConv != CallingConv::X86_ThisCall)) {
3451 FuncInfo->setVarArgsFrameIndex(
3452 FrameInfo.CreateFixedObject(1, StackSize, true));
3453 }
3454
3455 // 64-bit calling conventions support varargs and register parameters, so we
3456 // have to do extra work to spill them in the prologue.
3457 if (is64Bit()) {
3458 // Find the first unallocated argument registers.
3459 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3460 ArrayRef<MCPhysReg> ArgXMMs =
3461 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3462 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3463 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3464
3465 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3466 "SSE register cannot be used when SSE is disabled!");
3467
3468 if (isWin64()) {
3469 // Get to the caller-allocated home save location. Add 8 to account
3470 // for the return address.
3471 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3472 FuncInfo->setRegSaveFrameIndex(
3473 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3474 // Fixup to set vararg frame on shadow area (4 x i64).
3475 if (NumIntRegs < 4)
3476 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3477 } else {
3478 // For X86-64, if there are vararg parameters that are passed via
3479 // registers, then we must store them to their spots on the stack so
3480 // they may be loaded by dereferencing the result of va_next.
3481 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3482 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3483 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3484 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3485 }
3486
3487 SmallVector<SDValue, 6>
3488 LiveGPRs; // list of SDValue for GPR registers keeping live input value
3489 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
3490 // keeping live input value
3491 SDValue ALVal; // if applicable keeps SDValue for %al register
3492
3493 // Gather all the live in physical registers.
3494 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3495 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3496 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3497 }
3498 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3499 if (!AvailableXmms.empty()) {
3500 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3501 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3502 for (MCPhysReg Reg : AvailableXmms) {
3503 // FastRegisterAllocator spills virtual registers at basic
3504 // block boundaries. That leads to uses of xmm registers
3505 // outside of the check for %al. Pass physical registers to
3506 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3507 TheMachineFunction.getRegInfo().addLiveIn(Reg);
3508 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3509 }
3510 }
3511
3512 // Store the integer parameter registers.
3513 SmallVector<SDValue, 8> MemOps;
3514 SDValue RSFIN =
3515 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3516 TargLowering.getPointerTy(DAG.getDataLayout()));
3517 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3518 for (SDValue Val : LiveGPRs) {
3519 SDValue FIN = DAG.getNode(ISD::ADD, DL,
3520 TargLowering.getPointerTy(DAG.getDataLayout()),
3521 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3522 SDValue Store =
3523 DAG.getStore(Val.getValue(1), DL, Val, FIN,
3524 MachinePointerInfo::getFixedStack(
3525 DAG.getMachineFunction(),
3526 FuncInfo->getRegSaveFrameIndex(), Offset));
3527 MemOps.push_back(Store);
3528 Offset += 8;
3529 }
3530
3531 // Now store the XMM (fp + vector) parameter registers.
3532 if (!LiveXMMRegs.empty()) {
3533 SmallVector<SDValue, 12> SaveXMMOps;
3534 SaveXMMOps.push_back(Chain);
3535 SaveXMMOps.push_back(ALVal);
3536 SaveXMMOps.push_back(
3537 DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
3538 SaveXMMOps.push_back(
3539 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3540 llvm::append_range(SaveXMMOps, LiveXMMRegs);
3541 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
3542 MVT::Other, SaveXMMOps));
3543 }
3544
3545 if (!MemOps.empty())
3546 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3547 }
3548}
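// Illustrative sketch with hypothetical counts (not in the upstream source):
// a SysV x86-64 variadic function that has already consumed 2 of the 6
// argument GPRs and 1 of the 8 XMM registers gets a 176-byte register save
// area (6*8 + 8*16), with gp_offset starting at 16 and fp_offset at 64,
// exactly the va_list layout that va_arg expects.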
3549
3550void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3551 // Find the largest legal vector type.
3552 MVT VecVT = MVT::Other;
3553 // FIXME: Only some x86_32 calling conventions support AVX512.
3554 if (Subtarget.useAVX512Regs() &&
3555 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3556 CallConv == CallingConv::Intel_OCL_BI)))
3557 VecVT = MVT::v16f32;
3558 else if (Subtarget.hasAVX())
3559 VecVT = MVT::v8f32;
3560 else if (Subtarget.hasSSE2())
3561 VecVT = MVT::v4f32;
3562
3563 // We forward some GPRs and some vector types.
3564 SmallVector<MVT, 2> RegParmTypes;
3565 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3566 RegParmTypes.push_back(IntVT);
3567 if (VecVT != MVT::Other)
3568 RegParmTypes.push_back(VecVT);
3569
3570 // Compute the set of forwarded registers. The rest are scratch.
3571 SmallVectorImpl<ForwardedRegister> &Forwards =
3572 FuncInfo->getForwardedMustTailRegParms();
3573 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3574
3575 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3576 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3577 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3578 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3579 }
3580
3581 // Copy all forwards from physical to virtual registers.
3582 for (ForwardedRegister &FR : Forwards) {
3583 // FIXME: Can we use a less constrained schedule?
3584 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3585 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3586 TargLowering.getRegClassFor(FR.VT));
3587 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3588 }
3589}
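// Illustrative sketch (not in the upstream source): on SysV x86-64 the
// caller places an upper bound on the number of vector registers used by a
// variadic call in %al, so a musttail forwarder must preserve %al alongside
// the argument GPRs and vector registers copied above for va_arg to keep
// working in the ultimate callee.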
3590
3591void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3592 unsigned StackSize) {
3593 // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
3594 // If necessary, it will be set to the correct value later.
3595 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3596 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3597
3598 if (FrameInfo.hasVAStart())
3599 createVarArgAreaAndStoreRegisters(Chain, StackSize);
3600
3601 if (FrameInfo.hasMustTailInVarArgFunc())
3602 forwardMustTailParameters(Chain);
3603}
3604
3605SDValue X86TargetLowering::LowerFormalArguments(
3606 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3607 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3608 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3609 MachineFunction &MF = DAG.getMachineFunction();
3610 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3611
3612 const Function &F = MF.getFunction();
3613 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3614 F.getName() == "main")
3615 FuncInfo->setForceFramePointer(true);
3616
3617 MachineFrameInfo &MFI = MF.getFrameInfo();
3618 bool Is64Bit = Subtarget.is64Bit();
3619 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3620
3621 assert(
3622 !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3623 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3624
3625 // Assign locations to all of the incoming arguments.
3626 SmallVector<CCValAssign, 16> ArgLocs;
3627 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3628
3629 // Allocate shadow area for Win64.
3630 if (IsWin64)
3631 CCInfo.AllocateStack(32, Align(8));
3632
3633 CCInfo.AnalyzeArguments(Ins, CC_X86);
3634
3635 // In vectorcall calling convention a second pass is required for the HVA
3636 // types.
3637 if (CallingConv::X86_VectorCall == CallConv) {
3638 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3639 }
3640
3641 // The next loop assumes that the locations are in the same order as the
3642 // input arguments.
3643 assert(isSortedByValueNo(ArgLocs) &&
3644 "Argument Location list must be sorted before lowering");
3645
3646 SDValue ArgValue;
3647 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3648 ++I, ++InsIndex) {
3649 assert(InsIndex < Ins.size() && "Invalid Ins index");
3650 CCValAssign &VA = ArgLocs[I];
3651
3652 if (VA.isRegLoc()) {
3653 EVT RegVT = VA.getLocVT();
3654 if (VA.needsCustom()) {
3655 assert(
3656 VA.getValVT() == MVT::v64i1 &&
3657 "Currently the only custom case is when we split v64i1 to 2 regs");
3658
3659 // In the regcall calling convention, v64i1 values compiled for a
3660 // 32 bit arch are split up into two registers.
3661 ArgValue =
3662 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3663 } else {
3664 const TargetRegisterClass *RC;
3665 if (RegVT == MVT::i8)
3666 RC = &X86::GR8RegClass;
3667 else if (RegVT == MVT::i16)
3668 RC = &X86::GR16RegClass;
3669 else if (RegVT == MVT::i32)
3670 RC = &X86::GR32RegClass;
3671 else if (Is64Bit && RegVT == MVT::i64)
3672 RC = &X86::GR64RegClass;
3673 else if (RegVT == MVT::f32)
3674 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3675 else if (RegVT == MVT::f64)
3676 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3677 else if (RegVT == MVT::f80)
3678 RC = &X86::RFP80RegClass;
3679 else if (RegVT == MVT::f128)
3680 RC = &X86::VR128RegClass;
3681 else if (RegVT.is512BitVector())
3682 RC = &X86::VR512RegClass;
3683 else if (RegVT.is256BitVector())
3684 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3685 else if (RegVT.is128BitVector())
3686 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3687 else if (RegVT == MVT::x86mmx)
3688 RC = &X86::VR64RegClass;
3689 else if (RegVT == MVT::v1i1)
3690 RC = &X86::VK1RegClass;
3691 else if (RegVT == MVT::v8i1)
3692 RC = &X86::VK8RegClass;
3693 else if (RegVT == MVT::v16i1)
3694 RC = &X86::VK16RegClass;
3695 else if (RegVT == MVT::v32i1)
3696 RC = &X86::VK32RegClass;
3697 else if (RegVT == MVT::v64i1)
3698 RC = &X86::VK64RegClass;
3699 else
3700 llvm_unreachable("Unknown argument type!")__builtin_unreachable();
3701
3702 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3703 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3704 }
3705
3706 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3707 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3708 // right size.
3709 if (VA.getLocInfo() == CCValAssign::SExt)
3710 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3711 DAG.getValueType(VA.getValVT()));
3712 else if (VA.getLocInfo() == CCValAssign::ZExt)
3713 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3714 DAG.getValueType(VA.getValVT()));
3715 else if (VA.getLocInfo() == CCValAssign::BCvt)
3716 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3717
3718 if (VA.isExtInLoc()) {
3719 // Handle MMX values passed in XMM regs.
3720 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3721 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3722 else if (VA.getValVT().isVector() &&
3723 VA.getValVT().getScalarType() == MVT::i1 &&
3724 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3725 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3726 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3727 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3728 } else
3729 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3730 }
3731 } else {
3732 assert(VA.isMemLoc());
3733 ArgValue =
3734 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3735 }
3736
3737 // If value is passed via pointer - do a load.
3738 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3739 ArgValue =
3740 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3741
3742 InVals.push_back(ArgValue);
3743 }
3744
3745 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3746 if (Ins[I].Flags.isSwiftAsync()) {
3747 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3748 if (Subtarget.is64Bit())
3749 X86FI->setHasSwiftAsyncContext(true);
3750 else {
3751 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3752 X86FI->setSwiftAsyncContextFrameIdx(FI);
3753 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3754 DAG.getFrameIndex(FI, MVT::i32),
3755 MachinePointerInfo::getFixedStack(MF, FI));
3756 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
3757 }
3758 }
3759
3760 // Swift calling convention does not require we copy the sret argument
3761 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3762 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
3763 continue;
3764
3765 // All x86 ABIs require that for returning structs by value we copy the
3766 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3767 // the argument into a virtual register so that we can access it from the
3768 // return points.
3769 if (Ins[I].Flags.isSRet()) {
3770 Register Reg = FuncInfo->getSRetReturnReg();
3771 if (!Reg) {
3772 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3773 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3774 FuncInfo->setSRetReturnReg(Reg);
3775 }
3776 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3777 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3778 break;
3779 }
3780 }
3781
3782 unsigned StackSize = CCInfo.getNextStackOffset();
3783 // Align stack specially for tail calls.
3784 if (shouldGuaranteeTCO(CallConv,
3785 MF.getTarget().Options.GuaranteedTailCallOpt))
3786 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3787
3788 if (IsVarArg)
3789 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
3790 .lowerVarArgsParameters(Chain, StackSize);
3791
3792 // Some CCs need callee pop.
3793 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
3794 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3795 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3796 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3797 // X86 interrupts must pop the error code (and the alignment padding) if
3798 // present.
3799 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3800 } else {
3801 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3802 // If this is an sret function, the return should pop the hidden pointer.
3803 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3804 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3805 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3806 FuncInfo->setBytesToPopOnReturn(4);
3807 }
3808
3809 if (!Is64Bit) {
3810 // RegSaveFrameIndex is X86-64 only.
3811 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3812 }
3813
3814 FuncInfo->setArgumentStackSize(StackSize);
3815
3816 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3817 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3818 if (Personality == EHPersonality::CoreCLR) {
3819 assert(Is64Bit);
3820 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3821 // that we'd prefer this slot be allocated towards the bottom of the frame
3822 // (i.e. near the stack pointer after allocating the frame). Every
3823 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3824 // offset from the bottom of this and each funclet's frame must be the
3825 // same, so the size of funclets' (mostly empty) frames is dictated by
3826 // how far this slot is from the bottom (since they allocate just enough
3827 // space to accommodate holding this slot at the correct offset).
3828 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
3829 EHInfo->PSPSymFrameIdx = PSPSymFI;
3830 }
3831 }
3832
3833 if (CallConv == CallingConv::X86_RegCall ||
3834 F.hasFnAttribute("no_caller_saved_registers")) {
3835 MachineRegisterInfo &MRI = MF.getRegInfo();
3836 for (std::pair<Register, Register> Pair : MRI.liveins())
3837 MRI.disableCalleeSavedRegister(Pair.first);
3838 }
3839
3840 return Chain;
3841}
3842
3843SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3844 SDValue Arg, const SDLoc &dl,
3845 SelectionDAG &DAG,
3846 const CCValAssign &VA,
3847 ISD::ArgFlagsTy Flags,
3848 bool isByVal) const {
3849 unsigned LocMemOffset = VA.getLocMemOffset();
3850 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3851 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3852 StackPtr, PtrOff);
3853 if (isByVal)
3854 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3855
3856 return DAG.getStore(
3857 Chain, dl, Arg, PtrOff,
3858 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3859}
3860
3861 /// Emit a load of the return address if tail call
3862/// optimization is performed and it is required.
3863SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3864 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3865 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3866 // Adjust the Return address stack slot.
3867 EVT VT = getPointerTy(DAG.getDataLayout());
3868 OutRetAddr = getReturnAddressFrameIndex(DAG);
3869
3870 // Load the "old" Return address.
3871 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3872 return SDValue(OutRetAddr.getNode(), 1);
3873}
3874
3875/// Emit a store of the return address if tail call
3876/// optimization is performed and it is required (FPDiff!=0).
3877static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3878 SDValue Chain, SDValue RetAddrFrIdx,
3879 EVT PtrVT, unsigned SlotSize,
3880 int FPDiff, const SDLoc &dl) {
3881 // Store the return address to the appropriate stack slot.
3882 if (!FPDiff) return Chain;
3883 // Calculate the new stack slot for the return address.
3884 int NewReturnAddrFI =
3885 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3886 false);
3887 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3888 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3889 MachinePointerInfo::getFixedStack(
3890 DAG.getMachineFunction(), NewReturnAddrFI));
3891 return Chain;
3892}
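// A small worked sketch of the slot arithmetic used above (illustrative values
// only, not taken from the analyzed build): the relocated return-address slot is
// created at offset FPDiff - SlotSize, and when FPDiff == 0 the store is skipped
// entirely by the early return. The helper below only mirrors that arithmetic.
#include <cstdint>

inline int64_t newReturnAddrOffset(int FPDiff, unsigned SlotSize) {
  return static_cast<int64_t>(FPDiff) - SlotSize;
}
// e.g. with 8-byte slots and FPDiff == -24 (the callee needs 24 more bytes of
// argument space than the caller provided), the fixed object lands at offset -32.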
3893
3894 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3895 /// operation of the specified width.
3896static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3897 SDValue V2) {
3898 unsigned NumElems = VT.getVectorNumElements();
3899 SmallVector<int, 8> Mask;
3900 Mask.push_back(NumElems);
3901 for (unsigned i = 1; i != NumElems; ++i)
3902 Mask.push_back(i);
3903 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3904}
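// A minimal standalone sketch of the mask getMOVL builds (illustrative only, not
// part of X86ISelLowering.cpp): element 0 of the result is taken from V2 and the
// remaining elements from V1, matching the movss/movsd semantics of replacing
// only the low element.
#include <vector>

inline std::vector<int> buildMOVLMaskSketch(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(NumElems);      // low element comes from the second input vector
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);           // the rest come from the first input vector
  return Mask;                   // NumElems == 4 yields {4, 1, 2, 3}
}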
3905
3906SDValue
3907X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3908 SmallVectorImpl<SDValue> &InVals) const {
3909 SelectionDAG &DAG = CLI.DAG;
3910 SDLoc &dl = CLI.DL;
3911 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3912 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3913 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3914 SDValue Chain = CLI.Chain;
3915 SDValue Callee = CLI.Callee;
3916 CallingConv::ID CallConv = CLI.CallConv;
3917 bool &isTailCall = CLI.IsTailCall;
3918 bool isVarArg = CLI.IsVarArg;
3919 const auto *CB = CLI.CB;
3920
3921 MachineFunction &MF = DAG.getMachineFunction();
3922 bool Is64Bit = Subtarget.is64Bit();
3923 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3924 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3925 bool IsSibcall = false;
3926 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3927 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
3928 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3929 bool HasNCSR = (CB && isa<CallInst>(CB) &&
3930 CB->hasFnAttr("no_caller_saved_registers"));
3931 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
3932 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
3933 const Module *M = MF.getMMI().getModule();
3934 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3935
3936 MachineFunction::CallSiteInfo CSInfo;
3937 if (CallConv == CallingConv::X86_INTR)
3938 report_fatal_error("X86 interrupts may not be called directly");
3939
3940 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
3941 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
3942 // If we are using a GOT, disable tail calls to external symbols with
3943 // default visibility. Tail calling such a symbol requires using a GOT
3944 // relocation, which forces early binding of the symbol. This breaks code
3945 // that requires lazy function symbol resolution. Using musttail or
3946 // GuaranteedTailCallOpt will override this.
3947 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3948 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3949 G->getGlobal()->hasDefaultVisibility()))
3950 isTailCall = false;
3951 }
3952
3953
3954 if (isTailCall && !IsMustTail) {
3955 // Check if it's really possible to do a tail call.
3956 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3957 isVarArg, SR != NotStructReturn,
3958 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3959 Outs, OutVals, Ins, DAG);
3960
3961 // Sibcalls are automatically detected tailcalls which do not require
3962 // ABI changes.
3963 if (!IsGuaranteeTCO && isTailCall)
3964 IsSibcall = true;
3965
3966 if (isTailCall)
3967 ++NumTailCalls;
3968 }
3969
3970 if (IsMustTail && !isTailCall)
3971 report_fatal_error("failed to perform tail call elimination on a call "
3972 "site marked musttail");
3973
3974 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3975 "Var args not supported with calling convention fastcc, ghc or hipe");
3976
3977 // Analyze operands of the call, assigning locations to each operand.
3978 SmallVector<CCValAssign, 16> ArgLocs;
3979 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3980
3981 // Allocate shadow area for Win64.
3982 if (IsWin64)
3983 CCInfo.AllocateStack(32, Align(8));
3984
3985 CCInfo.AnalyzeArguments(Outs, CC_X86);
3986
3987 // In vectorcall calling convention a second pass is required for the HVA
3988 // types.
3989 if (CallingConv::X86_VectorCall == CallConv) {
3990 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3991 }
3992
3993 // Get a count of how many bytes are to be pushed on the stack.
3994 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3995 if (IsSibcall)
3996 // This is a sibcall. The memory operands are already in place in the
3997 // caller's incoming argument area (its caller's stack frame).
3998 NumBytes = 0;
3999 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4000 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4001
4002 int FPDiff = 0;
4003 if (isTailCall &&
4004 shouldGuaranteeTCO(CallConv,
4005 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4006 // Lower arguments at fp - stackoffset + fpdiff.
4007 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4008
4009 FPDiff = NumBytesCallerPushed - NumBytes;
4010
4011 // Set the delta of movement of the returnaddr stackslot.
4012 // But only set if delta is greater than previous delta.
4013 if (FPDiff < X86Info->getTCReturnAddrDelta())
4014 X86Info->setTCReturnAddrDelta(FPDiff);
4015 }
4016
4017 unsigned NumBytesToPush = NumBytes;
4018 unsigned NumBytesToPop = NumBytes;
4019
4020 // If we have an inalloca argument, all stack space has already been allocated
4021 // for us and is right at the top of the stack. We don't support multiple
4022 // arguments passed in memory when using inalloca.
4023 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4024 NumBytesToPush = 0;
4025 if (!ArgLocs.back().isMemLoc())
4026 report_fatal_error("cannot use inalloca attribute on a register "
4027 "parameter");
4028 if (ArgLocs.back().getLocMemOffset() != 0)
4029 report_fatal_error("any parameter with the inalloca attribute must be "
4030 "the only memory argument");
4031 } else if (CLI.IsPreallocated) {
4032 assert(ArgLocs.back().isMemLoc() &&
4033 "cannot use preallocated attribute on a register "
4034 "parameter");
4035 SmallVector<size_t, 4> PreallocatedOffsets;
4036 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4037 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4038 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4039 }
4040 }
4041 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4042 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4043 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4044 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4045 NumBytesToPush = 0;
4046 }
4047
4048 if (!IsSibcall && !IsMustTail)
4049 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4050 NumBytes - NumBytesToPush, dl);
4051
4052 SDValue RetAddrFrIdx;
4053 // Load return address for tail calls.
4054 if (isTailCall && FPDiff)
4055 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4056 Is64Bit, FPDiff, dl);
4057
4058 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4059 SmallVector<SDValue, 8> MemOpChains;
4060 SDValue StackPtr;
4061
4062 // The next loop assumes that the locations are in the same order as the
4063 // input arguments.
4064 assert(isSortedByValueNo(ArgLocs) &&
4065 "Argument Location list must be sorted before lowering");
4066
4067 // Walk the register/memloc assignments, inserting copies/loads. In the case
4068 // of tail call optimization, arguments are handled later.
4069 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4070 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4071 ++I, ++OutIndex) {
4072 assert(OutIndex < Outs.size() && "Invalid Out index");
4073 // Skip inalloca/preallocated arguments, they have already been written.
4074 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4075 if (Flags.isInAlloca() || Flags.isPreallocated())
4076 continue;
4077
4078 CCValAssign &VA = ArgLocs[I];
4079 EVT RegVT = VA.getLocVT();
4080 SDValue Arg = OutVals[OutIndex];
4081 bool isByVal = Flags.isByVal();
4082
4083 // Promote the value if needed.
4084 switch (VA.getLocInfo()) {
4085 default: llvm_unreachable("Unknown loc info!")__builtin_unreachable();
4086 case CCValAssign::Full: break;
4087 case CCValAssign::SExt:
4088 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4089 break;
4090 case CCValAssign::ZExt:
4091 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4092 break;
4093 case CCValAssign::AExt:
4094 if (Arg.getValueType().isVector() &&
4095 Arg.getValueType().getVectorElementType() == MVT::i1)
4096 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4097 else if (RegVT.is128BitVector()) {
4098 // Special case: passing MMX values in XMM registers.
4099 Arg = DAG.getBitcast(MVT::i64, Arg);
4100 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4101 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4102 } else
4103 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4104 break;
4105 case CCValAssign::BCvt:
4106 Arg = DAG.getBitcast(RegVT, Arg);
4107 break;
4108 case CCValAssign::Indirect: {
4109 if (isByVal) {
4110 // Memcpy the argument to a temporary stack slot to prevent
4111 // the caller from seeing any modifications the callee may make
4112 // as guaranteed by the `byval` attribute.
4113 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4114 Flags.getByValSize(),
4115 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4116 SDValue StackSlot =
4117 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4118 Chain =
4119 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4120 // From now on treat this as a regular pointer
4121 Arg = StackSlot;
4122 isByVal = false;
4123 } else {
4124 // Store the argument.
4125 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4126 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4127 Chain = DAG.getStore(
4128 Chain, dl, Arg, SpillSlot,
4129 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4130 Arg = SpillSlot;
4131 }
4132 break;
4133 }
4134 }
4135
4136 if (VA.needsCustom()) {
4137 assert(VA.getValVT() == MVT::v64i1 &&
4138 "Currently the only custom case is when we split v64i1 to 2 regs");
4139 // Split v64i1 value into two registers
4140 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4141 } else if (VA.isRegLoc()) {
4142 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4143 const TargetOptions &Options = DAG.getTarget().Options;
4144 if (Options.EmitCallSiteInfo)
4145 CSInfo.emplace_back(VA.getLocReg(), I);
4146 if (isVarArg && IsWin64) {
4147 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4148 // shadow reg if callee is a varargs function.
4149 Register ShadowReg;
4150 switch (VA.getLocReg()) {
4151 case X86::XMM0: ShadowReg = X86::RCX; break;
4152 case X86::XMM1: ShadowReg = X86::RDX; break;
4153 case X86::XMM2: ShadowReg = X86::R8; break;
4154 case X86::XMM3: ShadowReg = X86::R9; break;
4155 }
4156 if (ShadowReg)
4157 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4158 }
4159 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4160 assert(VA.isMemLoc());
4161 if (!StackPtr.getNode())
4162 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4163 getPointerTy(DAG.getDataLayout()));
4164 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4165 dl, DAG, VA, Flags, isByVal));
4166 }
4167 }
4168
4169 if (!MemOpChains.empty())
4170 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4171
4172 if (Subtarget.isPICStyleGOT()) {
4173 // ELF / PIC requires the GOT pointer to be in the EBX register before function
4174 // calls via the PLT (except for regcall).
4175 if (!isTailCall) {
4176 // An indirect call with the RegCall calling convention may use up all the
4177 // general registers, so it is not suitable to bind the EBX register to the
4178 // GOT address; just let the register allocator handle it.
4179 if (CallConv != CallingConv::X86_RegCall)
4180 RegsToPass.push_back(std::make_pair(
4181 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4182 getPointerTy(DAG.getDataLayout()))));
4183 } else {
4184 // If we are tail calling and generating PIC/GOT style code load the
4185 // address of the callee into ECX. The value in ecx is used as target of
4186 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4187 // for tail calls on PIC/GOT architectures. Normally we would just put the
4188 // address of GOT into ebx and then call target@PLT. But for tail calls
4189 // ebx would be restored (since ebx is callee saved) before jumping to the
4190 // target@PLT.
4191
4192 // Note: The actual moving to ECX is done further down.
4193 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4194 if (G && !G->getGlobal()->hasLocalLinkage() &&
4195 G->getGlobal()->hasDefaultVisibility())
4196 Callee = LowerGlobalAddress(Callee, DAG);
4197 else if (isa<ExternalSymbolSDNode>(Callee))
4198 Callee = LowerExternalSymbol(Callee, DAG);
4199 }
4200 }
4201
4202 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4203 // From AMD64 ABI document:
4204 // For calls that may call functions that use varargs or stdargs
4205 // (prototype-less calls or calls to functions containing ellipsis (...) in
4206 // the declaration) %al is used as hidden argument to specify the number
4207 // of SSE registers used. The contents of %al do not need to match exactly
4208 // the number of registers, but must be an upper bound on the number of SSE
4209 // registers used and is in the range 0 - 8 inclusive.
4210
4211 // Count the number of XMM registers allocated.
4212 static const MCPhysReg XMMArgRegs[] = {
4213 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4214 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4215 };
4216 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4217 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4218 && "SSE registers cannot be used when SSE is disabled");
4219 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4220 DAG.getConstant(NumXMMRegs, dl,
4221 MVT::i8)));
4222 }
4223
4224 if (isVarArg && IsMustTail) {
4225 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4226 for (const auto &F : Forwards) {
4227 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4228 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4229 }
4230 }
4231
4232 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4233 // don't need this because the eligibility check rejects calls that require
4234 // shuffling arguments passed in memory.
4235 if (!IsSibcall && isTailCall) {
4236 // Force all the incoming stack arguments to be loaded from the stack
4237 // before any new outgoing arguments are stored to the stack, because the
4238 // outgoing stack slots may alias the incoming argument stack slots, and
4239 // the alias isn't otherwise explicit. This is slightly more conservative
4240 // than necessary, because it means that each store effectively depends
4241 // on every argument instead of just those arguments it would clobber.
4242 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4243
4244 SmallVector<SDValue, 8> MemOpChains2;
4245 SDValue FIN;
4246 int FI = 0;
4247 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4248 ++I, ++OutsIndex) {
4249 CCValAssign &VA = ArgLocs[I];
4250
4251 if (VA.isRegLoc()) {
4252 if (VA.needsCustom()) {
4253 assert((CallConv == CallingConv::X86_RegCall) &&
4254 "Expecting custom case only in regcall calling convention");
4255 // This means that we are in the special case where one argument was
4256 // passed through two register locations; skip the next location.
4257 ++I;
4258 }
4259
4260 continue;
4261 }
4262
4263 assert(VA.isMemLoc());
4264 SDValue Arg = OutVals[OutsIndex];
4265 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4266 // Skip inalloca/preallocated arguments. They don't require any work.
4267 if (Flags.isInAlloca() || Flags.isPreallocated())
4268 continue;
4269 // Create frame index.
4270 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4271 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4272 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4273 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4274
4275 if (Flags.isByVal()) {
4276 // Copy relative to framepointer.
4277 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4278 if (!StackPtr.getNode())
4279 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4280 getPointerTy(DAG.getDataLayout()));
4281 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4282 StackPtr, Source);
4283
4284 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4285 ArgChain,
4286 Flags, DAG, dl));
4287 } else {
4288 // Store relative to framepointer.
4289 MemOpChains2.push_back(DAG.getStore(
4290 ArgChain, dl, Arg, FIN,
4291 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4292 }
4293 }
4294
4295 if (!MemOpChains2.empty())
4296 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4297
4298 // Store the return address to the appropriate stack slot.
4299 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4300 getPointerTy(DAG.getDataLayout()),
4301 RegInfo->getSlotSize(), FPDiff, dl);
4302 }
4303
4304 // Build a sequence of copy-to-reg nodes chained together with token chain
4305 // and flag operands which copy the outgoing args into registers.
4306 SDValue InFlag;
4307 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4308 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4309 RegsToPass[i].second, InFlag);
4310 InFlag = Chain.getValue(1);
4311 }
4312
4313 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4314 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4315 // In the 64-bit large code model, we have to make all calls
4316 // through a register, since the call instruction's 32-bit
4317 // pc-relative offset may not be large enough to hold the whole
4318 // address.
4319 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4320 Callee->getOpcode() == ISD::ExternalSymbol) {
4321 // Lower direct calls to global addresses and external symbols. Setting
4322 // ForCall to true here has the effect of removing WrapperRIP when possible
4323 // to allow direct calls to be selected without first materializing the
4324 // address into a register.
4325 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4326 } else if (Subtarget.isTarget64BitILP32() &&
4327 Callee->getValueType(0) == MVT::i32) {
4328 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4329 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4330 }
4331
4332 // Returns a chain & a flag for retval copy to use.
4333 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4334 SmallVector<SDValue, 8> Ops;
4335
4336 if (!IsSibcall && isTailCall && !IsMustTail) {
4337 Chain = DAG.getCALLSEQ_END(Chain,
4338 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4339 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4340 InFlag = Chain.getValue(1);
4341 }
4342
4343 Ops.push_back(Chain);
4344 Ops.push_back(Callee);
4345
4346 if (isTailCall)
4347 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4348
4349 // Add argument registers to the end of the list so that they are known live
4350 // into the call.
4351 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4352 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4353 RegsToPass[i].second.getValueType()));
4354
4355 // Add a register mask operand representing the call-preserved registers.
4356 const uint32_t *Mask = [&]() {
4357 auto AdaptedCC = CallConv;
4358 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4359 // use X86_INTR calling convention because it has the same CSR mask
4360 // (same preserved registers).
4361 if (HasNCSR)
4362 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4363 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4364 // to use the CSR_NoRegs_RegMask.
4365 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4366 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4367 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4368 }();
4369 assert(Mask && "Missing call preserved mask for calling convention");
4370
4371 // If this is an invoke in a 32-bit function using a funclet-based
4372 // personality, assume the function clobbers all registers. If an exception
4373 // is thrown, the runtime will not restore CSRs.
4374 // FIXME: Model this more precisely so that we can register allocate across
4375 // the normal edge and spill and fill across the exceptional edge.
4376 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4377 const Function &CallerFn = MF.getFunction();
4378 EHPersonality Pers =
4379 CallerFn.hasPersonalityFn()
4380 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4381 : EHPersonality::Unknown;
4382 if (isFuncletEHPersonality(Pers))
4383 Mask = RegInfo->getNoPreservedMask();
4384 }
4385
4386 // Define a new register mask from the existing mask.
4387 uint32_t *RegMask = nullptr;
4388
4389 // In some calling conventions we need to remove the used physical registers
4390 // from the reg mask.
4391 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4392 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4393
4394 // Allocate a new Reg Mask and copy Mask.
4395 RegMask = MF.allocateRegMask();
4396 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4397 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4398
4399 // Make sure all sub registers of the argument registers are reset
4400 // in the RegMask.
4401 for (auto const &RegPair : RegsToPass)
4402 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4403 SubRegs.isValid(); ++SubRegs)
4404 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4405
4406 // Create the RegMask Operand according to our updated mask.
4407 Ops.push_back(DAG.getRegisterMask(RegMask));
4408 } else {
4409 // Create the RegMask Operand according to the static mask.
4410 Ops.push_back(DAG.getRegisterMask(Mask));
4411 }
4412
4413 if (InFlag.getNode())
4414 Ops.push_back(InFlag);
4415
4416 if (isTailCall) {
4417 // We used to do:
4418 //// If this is the first return lowered for this function, add the regs
4419 //// to the liveout set for the function.
4420 // This isn't right, although it's probably harmless on x86; liveouts
4421 // should be computed from returns not tail calls. Consider a void
4422 // function making a tail call to a function returning int.
4423 MF.getFrameInfo().setHasTailCall();
4424 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4425 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4426 return Ret;
4427 }
4428
4429 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4430 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4431 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4432 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4433 // expanded to the call, directly followed by a special marker sequence and
4434 // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
4435 assert(!isTailCall &&
4436 "tail calls cannot be marked with clang.arc.attachedcall");
4437 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4438
4439 // Add a target constant to select the ObjC runtime call just before the call
4440 // target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue,
4441 // RuntimeCallType == 1 selects objc_unsafeClaimAutoreleasedReturnValue when
4442 // expanding the pseudo.
4443 unsigned RuntimeCallType =
4444 objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
4445 Ops.insert(Ops.begin() + 1,
4446 DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
4447 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4448 } else {
4449 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4450 }
4451
4452 InFlag = Chain.getValue(1);
4453 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4454 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4455
4456 // Save heapallocsite metadata.
4457 if (CLI.CB)
4458 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4459 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4460
4461 // Create the CALLSEQ_END node.
4462 unsigned NumBytesForCalleeToPop;
4463 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4464 DAG.getTarget().Options.GuaranteedTailCallOpt))
4465 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4466 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4467 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4468 SR == StackStructReturn)
4469 // If this is a call to a struct-return function, the callee
4470 // pops the hidden struct pointer, so we have to push it back.
4471 // This is common for Darwin/X86, Linux & Mingw32 targets.
4472 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4473 NumBytesForCalleeToPop = 4;
4474 else
4475 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4476
4477 // Returns a flag for retval copy to use.
4478 if (!IsSibcall) {
4479 Chain = DAG.getCALLSEQ_END(Chain,
4480 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4481 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4482 true),
4483 InFlag, dl);
4484 InFlag = Chain.getValue(1);
4485 }
4486
4487 // Handle result values, copying them out of physregs into vregs that we
4488 // return.
4489 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4490 InVals, RegMask);
4491}
4492
4493//===----------------------------------------------------------------------===//
4494// Fast Calling Convention (tail call) implementation
4495//===----------------------------------------------------------------------===//
4496
4497 // Like stdcall, the callee cleans up the arguments, except that ECX is
4498 // reserved for storing the tail-called function's address. Only 2 registers are
4499// free for argument passing (inreg). Tail call optimization is performed
4500// provided:
4501// * tailcallopt is enabled
4502// * caller/callee are fastcc
4503// On X86_64 architecture with GOT-style position independent code only local
4504// (within module) calls are supported at the moment.
4505 // To keep the stack aligned according to the platform ABI, the function
4506 // GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
4507 // of the stack alignment. (Dynamic linkers need this - Darwin's dyld, for example.)
4508 // If the tail-called callee has more arguments than the caller, the
4509 // caller needs to make sure that there is room to move the RETADDR to. This is
4510// achieved by reserving an area the size of the argument delta right after the
4511// original RETADDR, but before the saved framepointer or the spilled registers
4512// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4513// stack layout:
4514// arg1
4515// arg2
4516// RETADDR
4517// [ new RETADDR
4518// move area ]
4519// (possible EBP)
4520// ESI
4521// EDI
4522// local1 ..
4523
4524 /// Align the stack size to be, e.g., 16n + 12 bytes, so that together with the
4525 /// return-address slot it satisfies a 16-byte alignment requirement.
4526unsigned
4527X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4528 SelectionDAG &DAG) const {
4529 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4530 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4531 assert(StackSize % SlotSize == 0 &&
4532 "StackSize must be a multiple of SlotSize");
4533 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4534}
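// A worked example of the computation above, assuming a 16-byte stack alignment
// and a 4-byte slot (32-bit target); the concrete numbers are illustrative:
//   StackSize = 20 -> alignTo(20 + 4, 16) - 4 = 32 - 4 = 28  (== 16*1 + 12)
//   StackSize = 28 -> alignTo(28 + 4, 16) - 4 = 32 - 4 = 28
// so the returned size plus the return-address slot is always a multiple of the
// stack alignment, which is the "16n + 12" shape the function comment refers to.
#include <cstdint>

inline uint64_t alignedArgStackSizeSketch(uint64_t StackSize, uint64_t SlotSize,
                                          uint64_t Alignment) {
  uint64_t Total = StackSize + SlotSize;
  uint64_t Aligned = (Total + Alignment - 1) / Alignment * Alignment; // alignTo
  return Aligned - SlotSize;
}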
4535
4536/// Return true if the given stack call argument is already available in the
4537/// same position (relatively) of the caller's incoming argument stack.
4538static
4539bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4540 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4541 const X86InstrInfo *TII, const CCValAssign &VA) {
4542 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4543
4544 for (;;) {
4545 // Look through nodes that don't alter the bits of the incoming value.
4546 unsigned Op = Arg.getOpcode();
4547 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4548 Arg = Arg.getOperand(0);
4549 continue;
4550 }
4551 if (Op == ISD::TRUNCATE) {
4552 const SDValue &TruncInput = Arg.getOperand(0);
4553 if (TruncInput.getOpcode() == ISD::AssertZext &&
4554 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4555 Arg.getValueType()) {
4556 Arg = TruncInput.getOperand(0);
4557 continue;
4558 }
4559 }
4560 break;
4561 }
4562
4563 int FI = INT_MAX;
4564 if (Arg.getOpcode() == ISD::CopyFromReg) {
4565 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4566 if (!VR.isVirtual())
4567 return false;
4568 MachineInstr *Def = MRI->getVRegDef(VR);
4569 if (!Def)
4570 return false;
4571 if (!Flags.isByVal()) {
4572 if (!TII->isLoadFromStackSlot(*Def, FI))
4573 return false;
4574 } else {
4575 unsigned Opcode = Def->getOpcode();
4576 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4577 Opcode == X86::LEA64_32r) &&
4578 Def->getOperand(1).isFI()) {
4579 FI = Def->getOperand(1).getIndex();
4580 Bytes = Flags.getByValSize();
4581 } else
4582 return false;
4583 }
4584 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4585 if (Flags.isByVal())
4586 // ByVal argument is passed in as a pointer but it's now being
4587 // dereferenced. e.g.
4588 // define @foo(%struct.X* %A) {
4589 // tail call @bar(%struct.X* byval %A)
4590 // }
4591 return false;
4592 SDValue Ptr = Ld->getBasePtr();
4593 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4594 if (!FINode)
4595 return false;
4596 FI = FINode->getIndex();
4597 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4598 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4599 FI = FINode->getIndex();
4600 Bytes = Flags.getByValSize();
4601 } else
4602 return false;
4603
4604 assert(FI != INT_MAX);
4605 if (!MFI.isFixedObjectIndex(FI))
4606 return false;
4607
4608 if (Offset != MFI.getObjectOffset(FI))
4609 return false;
4610
4611 // If this is not byval, check that the argument stack object is immutable.
4612 // inalloca and argument copy elision can create mutable argument stack
4613 // objects. Byval objects can be mutated, but a byval call intends to pass the
4614 // mutated memory.
4615 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4616 return false;
4617
4618 if (VA.getLocVT().getFixedSizeInBits() >
4619 Arg.getValueSizeInBits().getFixedSize()) {
4620 // If the argument location is wider than the argument type, check that any
4621 // extension flags match.
4622 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4623 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4624 return false;
4625 }
4626 }
4627
4628 return Bytes == MFI.getObjectSize(FI);
4629}
4630
4631/// Check whether the call is eligible for tail call optimization. Targets
4632/// that want to do tail call optimization should implement this function.
4633bool X86TargetLowering::IsEligibleForTailCallOptimization(
4634 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4635 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4636 const SmallVectorImpl<ISD::OutputArg> &Outs,
4637 const SmallVectorImpl<SDValue> &OutVals,
4638 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4639 if (!mayTailCallThisCC(CalleeCC))
4640 return false;
4641
4642 // If -tailcallopt is specified, make fastcc functions tail-callable.
4643 MachineFunction &MF = DAG.getMachineFunction();
4644 const Function &CallerF = MF.getFunction();
4645
4646 // If the function return type is x86_fp80 and the callee return type is not,
4647 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4648 // perform a tailcall optimization here.
4649 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4650 return false;
4651
4652 CallingConv::ID CallerCC = CallerF.getCallingConv();
4653 bool CCMatch = CallerCC == CalleeCC;
4654 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4655 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4656 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4657 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4658
4659 // Win64 functions have extra shadow space for argument homing. Don't do the
4660 // sibcall if the caller and callee have mismatched expectations for this
4661 // space.
4662 if (IsCalleeWin64 != IsCallerWin64)
4663 return false;
4664
4665 if (IsGuaranteeTCO) {
4666 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4667 return true;
4668 return false;
4669 }
4670
4671 // Look for obvious safe cases to perform tail call optimization that do not
4672 // require ABI changes. This is what gcc calls sibcall.
4673
4674 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4675 // emit a special epilogue.
4676 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4677 if (RegInfo->hasStackRealignment(MF))
4678 return false;
4679
4680 // Also avoid sibcall optimization if either caller or callee uses struct
4681 // return semantics.
4682 if (isCalleeStructRet || isCallerStructRet)
4683 return false;
4684
4685 // Do not sibcall optimize vararg calls unless all arguments are passed via
4686 // registers.
4687 LLVMContext &C = *DAG.getContext();
4688 if (isVarArg && !Outs.empty()) {
4689 // Optimizing for varargs on Win64 is unlikely to be safe without
4690 // additional testing.
4691 if (IsCalleeWin64 || IsCallerWin64)
4692 return false;
4693
4694 SmallVector<CCValAssign, 16> ArgLocs;
4695 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4696
4697 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4698 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4699 if (!ArgLocs[i].isRegLoc())
4700 return false;
4701 }
4702
4703 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4704 // stack. Therefore, if it's not used by the call it is not safe to optimize
4705 // this into a sibcall.
4706 bool Unused = false;
4707 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4708 if (!Ins[i].Used) {
4709 Unused = true;
4710 break;
4711 }
4712 }
4713 if (Unused) {
4714 SmallVector<CCValAssign, 16> RVLocs;
4715 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4716 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4717 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4718 CCValAssign &VA = RVLocs[i];
4719 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4720 return false;
4721 }
4722 }
4723
4724 // Check that the call results are passed in the same way.
4725 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4726 RetCC_X86, RetCC_X86))
4727 return false;
4728 // The callee has to preserve all registers the caller needs to preserve.
4729 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4730 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4731 if (!CCMatch) {
4732 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4733 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4734 return false;
4735 }
4736
4737 unsigned StackArgsSize = 0;
4738
4739 // If the callee takes no arguments then go on to check the results of the
4740 // call.
4741 if (!Outs.empty()) {
4742 // Check if stack adjustment is needed. For now, do not do this if any
4743 // argument is passed on the stack.
4744 SmallVector<CCValAssign, 16> ArgLocs;
4745 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4746
4747 // Allocate shadow area for Win64
4748 if (IsCalleeWin64)
4749 CCInfo.AllocateStack(32, Align(8));
4750
4751 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4752 StackArgsSize = CCInfo.getNextStackOffset();
4753
4754 if (CCInfo.getNextStackOffset()) {
4755 // Check if the arguments are already laid out in the right way as
4756 // the caller's fixed stack objects.
4757 MachineFrameInfo &MFI = MF.getFrameInfo();
4758 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4759 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4760 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4761 CCValAssign &VA = ArgLocs[i];
4762 SDValue Arg = OutVals[i];
4763 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4764 if (VA.getLocInfo() == CCValAssign::Indirect)
4765 return false;
4766 if (!VA.isRegLoc()) {
4767 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4768 MFI, MRI, TII, VA))
4769 return false;
4770 }
4771 }
4772 }
4773
4774 bool PositionIndependent = isPositionIndependent();
4775 // If the tailcall address may be in a register, then make sure it's
4776 // possible to register allocate for it. In 32-bit, the call address can
4777 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4778 // callee-saved registers are restored. These happen to be the same
4779 // registers used to pass 'inreg' arguments so watch out for those.
4780 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4781 !isa<ExternalSymbolSDNode>(Callee)) ||
4782 PositionIndependent)) {
4783 unsigned NumInRegs = 0;
4784 // In PIC we need an extra register to formulate the address computation
4785 // for the callee.
4786 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4787
4788 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4789 CCValAssign &VA = ArgLocs[i];
4790 if (!VA.isRegLoc())
4791 continue;
4792 Register Reg = VA.getLocReg();
4793 switch (Reg) {
4794 default: break;
4795 case X86::EAX: case X86::EDX: case X86::ECX:
4796 if (++NumInRegs == MaxInRegs)
4797 return false;
4798 break;
4799 }
4800 }
4801 }
4802
4803 const MachineRegisterInfo &MRI = MF.getRegInfo();
4804 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4805 return false;
4806 }
4807
4808 bool CalleeWillPop =
4809 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4810 MF.getTarget().Options.GuaranteedTailCallOpt);
4811
4812 if (unsigned BytesToPop =
4813 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4814 // If we have bytes to pop, the callee must pop them.
4815 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4816 if (!CalleePopMatches)
4817 return false;
4818 } else if (CalleeWillPop && StackArgsSize > 0) {
4819 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4820 return false;
4821 }
4822
4823 return true;
4824}
4825
4826FastISel *
4827X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4828 const TargetLibraryInfo *libInfo) const {
4829 return X86::createFastISel(funcInfo, libInfo);
4830}
4831
4832//===----------------------------------------------------------------------===//
4833// Other Lowering Hooks
4834//===----------------------------------------------------------------------===//
4835
4836static bool MayFoldLoad(SDValue Op) {
4837 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4838}
4839
4840static bool MayFoldIntoStore(SDValue Op) {
4841 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4842}
4843
4844static bool MayFoldIntoZeroExtend(SDValue Op) {
4845 if (Op.hasOneUse()) {
4846 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4847 return (ISD::ZERO_EXTEND == Opcode);
4848 }
4849 return false;
4850}
4851
4852static bool isTargetShuffle(unsigned Opcode) {
4853 switch(Opcode) {
4854 default: return false;
4855 case X86ISD::BLENDI:
4856 case X86ISD::PSHUFB:
4857 case X86ISD::PSHUFD:
4858 case X86ISD::PSHUFHW:
4859 case X86ISD::PSHUFLW:
4860 case X86ISD::SHUFP:
4861 case X86ISD::INSERTPS:
4862 case X86ISD::EXTRQI:
4863 case X86ISD::INSERTQI:
4864 case X86ISD::VALIGN:
4865 case X86ISD::PALIGNR:
4866 case X86ISD::VSHLDQ:
4867 case X86ISD::VSRLDQ:
4868 case X86ISD::MOVLHPS:
4869 case X86ISD::MOVHLPS:
4870 case X86ISD::MOVSHDUP:
4871 case X86ISD::MOVSLDUP:
4872 case X86ISD::MOVDDUP:
4873 case X86ISD::MOVSS:
4874 case X86ISD::MOVSD:
4875 case X86ISD::UNPCKL:
4876 case X86ISD::UNPCKH:
4877 case X86ISD::VBROADCAST:
4878 case X86ISD::VPERMILPI:
4879 case X86ISD::VPERMILPV:
4880 case X86ISD::VPERM2X128:
4881 case X86ISD::SHUF128:
4882 case X86ISD::VPERMIL2:
4883 case X86ISD::VPERMI:
4884 case X86ISD::VPPERM:
4885 case X86ISD::VPERMV:
4886 case X86ISD::VPERMV3:
4887 case X86ISD::VZEXT_MOVL:
4888 return true;
4889 }
4890}
4891
4892static bool isTargetShuffleVariableMask(unsigned Opcode) {
4893 switch (Opcode) {
4894 default: return false;
4895 // Target Shuffles.
4896 case X86ISD::PSHUFB:
4897 case X86ISD::VPERMILPV:
4898 case X86ISD::VPERMIL2:
4899 case X86ISD::VPPERM:
4900 case X86ISD::VPERMV:
4901 case X86ISD::VPERMV3:
4902 return true;
4903 // 'Faux' Target Shuffles.
4904 case ISD::OR:
4905 case ISD::AND:
4906 case X86ISD::ANDNP:
4907 return true;
4908 }
4909}
4910
4911static bool isTargetShuffleSplat(SDValue Op) {
4912 unsigned Opcode = Op.getOpcode();
4913 if (Opcode == ISD::EXTRACT_SUBVECTOR)
4914 return isTargetShuffleSplat(Op.getOperand(0));
4915 return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
4916}
4917
4918SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4919 MachineFunction &MF = DAG.getMachineFunction();
4920 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4921 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4922 int ReturnAddrIndex = FuncInfo->getRAIndex();
4923
4924 if (ReturnAddrIndex == 0) {
4925 // Set up a frame object for the return address.
4926 unsigned SlotSize = RegInfo->getSlotSize();
4927 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4928 -(int64_t)SlotSize,
4929 false);
4930 FuncInfo->setRAIndex(ReturnAddrIndex);
4931 }
4932
4933 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4934}
4935
4936bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4937 bool hasSymbolicDisplacement) {
4938 // Offset should fit into 32 bit immediate field.
4939 if (!isInt<32>(Offset))
4940 return false;
4941
4942 // If we don't have a symbolic displacement - we don't have any extra
4943 // restrictions.
4944 if (!hasSymbolicDisplacement)
4945 return true;
4946
4947 // FIXME: Some tweaks might be needed for medium code model.
4948 if (M != CodeModel::Small && M != CodeModel::Kernel)
4949 return false;
4950
4951 // For the small code model we assume that the last object ends at least 16MB
4952 // before the 31-bit boundary. We may also accept pretty large negative constants,
4953 // knowing that all objects are in the positive half of the address space.
4954 if (M == CodeModel::Small && Offset < 16*1024*1024)
4955 return true;
4956
4957 // For the kernel code model we know that all objects reside in the negative half
4958 // of the 32-bit address space. We may not accept negative offsets, since they may
4959 // be just off, and we may accept pretty large positive ones.
4960 if (M == CodeModel::Kernel && Offset >= 0)
4961 return true;
4962
4963 return false;
4964}
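// A standalone sketch of the decision logic above (illustrative only; the real
// predicate is X86::isOffsetSuitableForCodeModel). Example outcomes:
//   8MB offset,  small model,  symbolic displacement   -> accepted (< 16MB)
//   32MB offset, small model,  symbolic displacement   -> rejected
//   -64 offset,  kernel model, symbolic displacement   -> rejected (negative)
//   any model, no symbolic displacement, fits in 32 bits -> accepted
#include <cstdint>
#include <limits>

inline bool offsetFitsSketch(int64_t Offset, bool SmallModel, bool KernelModel,
                             bool HasSymbolicDisplacement) {
  if (Offset < std::numeric_limits<int32_t>::min() ||
      Offset > std::numeric_limits<int32_t>::max())
    return false;                          // must fit a 32-bit immediate
  if (!HasSymbolicDisplacement)
    return true;                           // no extra restrictions
  if (SmallModel)
    return Offset < 16 * 1024 * 1024;      // stay clear of the 31-bit boundary
  if (KernelModel)
    return Offset >= 0;                    // objects live in the negative half
  return false;                            // medium/large models: be conservative
}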
4965
4966/// Determines whether the callee is required to pop its own arguments.
4967/// Callee pop is necessary to support tail calls.
4968bool X86::isCalleePop(CallingConv::ID CallingConv,
4969 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4970 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4971 // can guarantee TCO.
4972 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4973 return true;
4974
4975 switch (CallingConv) {
4976 default:
4977 return false;
4978 case CallingConv::X86_StdCall:
4979 case CallingConv::X86_FastCall:
4980 case CallingConv::X86_ThisCall:
4981 case CallingConv::X86_VectorCall:
4982 return !is64Bit;
4983 }
4984}
4985
4986 /// Return true if the condition is a signed comparison operation.
4987static bool isX86CCSigned(unsigned X86CC) {
4988 switch (X86CC) {
4989 default:
4990 llvm_unreachable("Invalid integer condition!")__builtin_unreachable();
4991 case X86::COND_E:
4992 case X86::COND_NE:
4993 case X86::COND_B:
4994 case X86::COND_A:
4995 case X86::COND_BE:
4996 case X86::COND_AE:
4997 return false;
4998 case X86::COND_G:
4999 case X86::COND_GE:
5000 case X86::COND_L:
5001 case X86::COND_LE:
5002 return true;
5003 }
5004}
5005
5006static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5007 switch (SetCCOpcode) {
5008 default: llvm_unreachable("Invalid integer condition!")__builtin_unreachable();
5009 case ISD::SETEQ: return X86::COND_E;
5010 case ISD::SETGT: return X86::COND_G;
5011 case ISD::SETGE: return X86::COND_GE;
5012 case ISD::SETLT: return X86::COND_L;
5013 case ISD::SETLE: return X86::COND_LE;
5014 case ISD::SETNE: return X86::COND_NE;
5015 case ISD::SETULT: return X86::COND_B;
5016 case ISD::SETUGT: return X86::COND_A;
5017 case ISD::SETULE: return X86::COND_BE;
5018 case ISD::SETUGE: return X86::COND_AE;
5019 }
5020}
5021
5022 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
5023/// condition code, returning the condition code and the LHS/RHS of the
5024/// comparison to make.
5025static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5026 bool isFP, SDValue &LHS, SDValue &RHS,
5027 SelectionDAG &DAG) {
5028 if (!isFP) {
5029 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5030 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
5031 // X > -1 -> X == 0, jump !sign.
5032 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5033 return X86::COND_NS;
5034 }
5035 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
5036 // X < 0 -> X == 0, jump on sign.
5037 return X86::COND_S;
5038 }
5039 if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
5040 // X >= 0 -> X == 0, jump on !sign.
5041 return X86::COND_NS;
5042 }
5043 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5044 // X < 1 -> X <= 0
5045 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5046 return X86::COND_LE;
5047 }
5048 }
5049
5050 return TranslateIntegerX86CC(SetCCOpcode);
5051 }
5052
5053 // First determine if it is required or is profitable to flip the operands.
5054
5055 // If LHS is a foldable load, but RHS is not, flip the condition.
5056 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5057 !ISD::isNON_EXTLoad(RHS.getNode())) {
5058 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5059 std::swap(LHS, RHS);
5060 }
5061
5062 switch (SetCCOpcode) {
5063 default: break;
5064 case ISD::SETOLT:
5065 case ISD::SETOLE:
5066 case ISD::SETUGT:
5067 case ISD::SETUGE:
5068 std::swap(LHS, RHS);
5069 break;
5070 }
5071
5072 // On a floating point condition, the flags are set as follows:
5073 // ZF PF CF op
5074 // 0 | 0 | 0 | X > Y
5075 // 0 | 0 | 1 | X < Y
5076 // 1 | 0 | 0 | X == Y
5077 // 1 | 1 | 1 | unordered
5078 switch (SetCCOpcode) {
5079 default: llvm_unreachable("Condcode should be pre-legalized away")__builtin_unreachable();
5080 case ISD::SETUEQ:
5081 case ISD::SETEQ: return X86::COND_E;
5082 case ISD::SETOLT: // flipped
5083 case ISD::SETOGT:
5084 case ISD::SETGT: return X86::COND_A;
5085 case ISD::SETOLE: // flipped
5086 case ISD::SETOGE:
5087 case ISD::SETGE: return X86::COND_AE;
5088 case ISD::SETUGT: // flipped
5089 case ISD::SETULT:
5090 case ISD::SETLT: return X86::COND_B;
5091 case ISD::SETUGE: // flipped
5092 case ISD::SETULE:
5093 case ISD::SETLE: return X86::COND_BE;
5094 case ISD::SETONE:
5095 case ISD::SETNE: return X86::COND_NE;
5096 case ISD::SETUO: return X86::COND_P;
5097 case ISD::SETO: return X86::COND_NP;
5098 case ISD::SETOEQ:
5099 case ISD::SETUNE: return X86::COND_INVALID;
5100 }
5101}
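// A small check of the integer fast paths near the top of TranslateX86CC (sketch
// only, not part of the analyzed source): for signed integers, "x > -1" is the
// same predicate as "x >= 0" and "x < 1" is the same as "x <= 0", which is what
// lets the code rewrite the RHS constant to 0 and use a sign-flag test
// (COND_NS/COND_S) or COND_LE instead.
inline bool equivGtMinusOne(int x) { return (x > -1) == (x >= 0); } // always true
inline bool equivLtOne(int x)      { return (x <  1) == (x <= 0); } // always true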
5102
5103/// Is there a floating point cmov for the specific X86 condition code?
5104 /// The current x86 ISA includes the following FP cmov instructions:
5105 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5106static bool hasFPCMov(unsigned X86CC) {
5107 switch (X86CC) {
5108 default:
5109 return false;
5110 case X86::COND_B:
5111 case X86::COND_BE:
5112 case X86::COND_E:
5113 case X86::COND_P:
5114 case X86::COND_A:
5115 case X86::COND_AE:
5116 case X86::COND_NE:
5117 case X86::COND_NP:
5118 return true;
5119 }
5120}
5121
5122
5123bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5124 const CallInst &I,
5125 MachineFunction &MF,
5126 unsigned Intrinsic) const {
5127 Info.flags = MachineMemOperand::MONone;
5128 Info.offset = 0;
5129
5130 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5131 if (!IntrData) {
5132 switch (Intrinsic) {
5133 case Intrinsic::x86_aesenc128kl:
5134 case Intrinsic::x86_aesdec128kl:
5135 Info.opc = ISD::INTRINSIC_W_CHAIN;
5136 Info.ptrVal = I.getArgOperand(1);
5137 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5138 Info.align = Align(1);
5139 Info.flags |= MachineMemOperand::MOLoad;
5140 return true;
5141 case Intrinsic::x86_aesenc256kl:
5142 case Intrinsic::x86_aesdec256kl:
5143 Info.opc = ISD::INTRINSIC_W_CHAIN;
5144 Info.ptrVal = I.getArgOperand(1);
5145 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5146 Info.align = Align(1);
5147 Info.flags |= MachineMemOperand::MOLoad;
5148 return true;
5149 case Intrinsic::x86_aesencwide128kl:
5150 case Intrinsic::x86_aesdecwide128kl:
5151 Info.opc = ISD::INTRINSIC_W_CHAIN;
5152 Info.ptrVal = I.getArgOperand(0);
5153 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5154 Info.align = Align(1);
5155 Info.flags |= MachineMemOperand::MOLoad;
5156 return true;
5157 case Intrinsic::x86_aesencwide256kl:
5158 case Intrinsic::x86_aesdecwide256kl:
5159 Info.opc = ISD::INTRINSIC_W_CHAIN;
5160 Info.ptrVal = I.getArgOperand(0);
5161 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5162 Info.align = Align(1);
5163 Info.flags |= MachineMemOperand::MOLoad;
5164 return true;
5165 }
5166 return false;
5167 }
5168
5169 switch (IntrData->Type) {
5170 case TRUNCATE_TO_MEM_VI8:
5171 case TRUNCATE_TO_MEM_VI16:
5172 case TRUNCATE_TO_MEM_VI32: {
5173 Info.opc = ISD::INTRINSIC_VOID;
5174 Info.ptrVal = I.getArgOperand(0);
5175 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5176 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5177 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5178 ScalarVT = MVT::i8;
5179 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5180 ScalarVT = MVT::i16;
5181 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5182 ScalarVT = MVT::i32;
5183
5184 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5185 Info.align = Align(1);
5186 Info.flags |= MachineMemOperand::MOStore;
5187 break;
5188 }
5189 case GATHER:
5190 case GATHER_AVX2: {
5191 Info.opc = ISD::INTRINSIC_W_CHAIN;
5192 Info.ptrVal = nullptr;
5193 MVT DataVT = MVT::getVT(I.getType());
5194 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5195 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5196 IndexVT.getVectorNumElements());
5197 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5198 Info.align = Align(1);
5199 Info.flags |= MachineMemOperand::MOLoad;
5200 break;
5201 }
5202 case SCATTER: {
5203 Info.opc = ISD::INTRINSIC_VOID;
5204 Info.ptrVal = nullptr;
5205 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5206 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5207 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5208 IndexVT.getVectorNumElements());
5209 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5210 Info.align = Align(1);
5211 Info.flags |= MachineMemOperand::MOStore;
5212 break;
5213 }
5214 default:
5215 return false;
5216 }
5217
5218 return true;
5219}
5220
5221/// Returns true if the target can instruction select the
5222/// specified FP immediate natively. If false, the legalizer will
5223/// materialize the FP immediate as a load from a constant pool.
5224bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5225 bool ForCodeSize) const {
5226 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5227 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5228 return true;
5229 }
5230 return false;
5231}
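
A minimal standalone sketch (not part of the analyzed file; bitwiseEqual is a hypothetical stand-in for APFloat::bitwiseIsEqual) of why the loop above compares bit patterns instead of using operator==:

#include <cstdio>
#include <cstring>

// Exact encoding comparison: unlike operator==, it distinguishes +0.0 from
// -0.0 and can match a NaN pattern, which is what matters when deciding
// whether an immediate can be materialized directly.
static bool bitwiseEqual(double A, double B) {
  return std::memcmp(&A, &B, sizeof(double)) == 0;
}

int main() {
  std::printf("%d %d\n", 0.0 == -0.0, bitwiseEqual(0.0, -0.0)); // prints "1 0"
}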
5232
5233bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5234 ISD::LoadExtType ExtTy,
5235 EVT NewVT) const {
5236 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow")((void)0);
5237
5238 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5239 // relocation targets a movq or addq instruction: don't let the load shrink.
5240 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5241 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5242 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5243 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5244
5245 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5246 // those uses are extracted directly into a store, then the extract + store
5247 // can be store-folded. Therefore, it's probably not worth splitting the load.
5248 EVT VT = Load->getValueType(0);
5249 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5250 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5251 // Skip uses of the chain value. Result 0 of the node is the load value.
5252 if (UI.getUse().getResNo() != 0)
5253 continue;
5254
5255 // If this use is not an extract + store, it's probably worth splitting.
5256 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5257 UI->use_begin()->getOpcode() != ISD::STORE)
5258 return true;
5259 }
5260 // All non-chain uses are extract + store.
5261 return false;
5262 }
5263
5264 return true;
5265}
5266
5267/// Returns true if it is beneficial to convert a load of a constant
5268/// to just the constant itself.
5269bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5270 Type *Ty) const {
5271 assert(Ty->isIntegerTy())((void)0);
5272
5273 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5274 if (BitSize == 0 || BitSize > 64)
5275 return false;
5276 return true;
5277}
5278
5279bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5280 // If we are using XMM registers in the ABI and the condition of the select is
5281 // a floating-point compare and we have blendv or conditional move, then it is
5282 // cheaper to select instead of doing a cross-register move and creating a
5283 // load that depends on the compare result.
5284 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5285 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5286}
5287
5288bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5289 // TODO: It might be a win to ease or lift this restriction, but the generic
5290 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5291 if (VT.isVector() && Subtarget.hasAVX512())
5292 return false;
5293
5294 return true;
5295}
5296
5297bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5298 SDValue C) const {
5299 // TODO: We handle scalars using custom code, but generic combining could make
5300 // that unnecessary.
5301 APInt MulC;
5302 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5303 return false;
5304
5305 // Find the type this will be legalized to. Otherwise we might prematurely
5306 // convert this to shl+add/sub and then still have to type legalize those ops.
5307 // Another choice would be to defer the decision for illegal types until
5308 // after type legalization. But constant splat vectors of i64 can't make it
5309 // through type legalization on 32-bit targets so we would need to special
5310 // case vXi64.
5311 while (getTypeAction(Context, VT) != TypeLegal)
5312 VT = getTypeToTransformTo(Context, VT);
5313
5314 // If vector multiply is legal, assume that's faster than shl + add/sub.
5315 // TODO: Multiply is a complex op with higher latency and lower throughput in
5316 // most implementations, so this check could be loosened based on type
5317 // and/or a CPU attribute.
5318 if (isOperationLegal(ISD::MUL, VT))
5319 return false;
5320
5321 // shl+add, shl+sub, shl+add+neg
5322 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5323 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5324}
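
The final return above encodes the four decompositions of a constant multiply into shift plus add/sub. A standalone sketch over a scalar 64-bit constant (isPow2 and decomposableMul are illustrative helpers, not LLVM APIs):

#include <cstdint>
#include <cstdio>

static bool isPow2(int64_t V) { return V > 0 && (V & (V - 1)) == 0; }

// Mirrors the four cases in the return statement above:
//   C = 2^k - 1    ->  (x << k) - x
//   C = 2^k + 1    ->  (x << k) + x
//   C = 1 - 2^k    ->  x - (x << k)
//   C = -(2^k + 1) ->  -((x << k) + x)
static bool decomposableMul(int64_t C) {
  return isPow2(C + 1) || isPow2(C - 1) || isPow2(1 - C) || isPow2(-(C + 1));
}

int main() {
  for (long long C : {3LL, 5LL, 7LL, 9LL, -3LL, -5LL, 6LL, 10LL})
    std::printf("%lld -> %s\n", C, decomposableMul(C) ? "shl+add/sub" : "keep mul");
}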
5325
5326bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5327 unsigned Index) const {
5328 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5329 return false;
5330
5331 // Mask vectors support all subregister combinations and operations that
5332 // extract half of vector.
5333 if (ResVT.getVectorElementType() == MVT::i1)
5334 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5335 (Index == ResVT.getVectorNumElements()));
5336
5337 return (Index % ResVT.getVectorNumElements()) == 0;
5338}
5339
5340bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5341 unsigned Opc = VecOp.getOpcode();
5342
5343 // Assume target opcodes can't be scalarized.
5344 // TODO - do we have any exceptions?
5345 if (Opc >= ISD::BUILTIN_OP_END)
5346 return false;
5347
5348 // If the vector op is not supported, try to convert to scalar.
5349 EVT VecVT = VecOp.getValueType();
5350 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5351 return true;
5352
5353 // If the vector op is supported, but the scalar op is not, the transform may
5354 // not be worthwhile.
5355 EVT ScalarVT = VecVT.getScalarType();
5356 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5357}
5358
5359bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5360 bool) const {
5361 // TODO: Allow vectors?
5362 if (VT.isVector())
5363 return false;
5364 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5365}
5366
5367bool X86TargetLowering::isCheapToSpeculateCttz() const {
5368 // Speculate cttz only if we can directly use TZCNT.
5369 return Subtarget.hasBMI();
5370}
5371
5372bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5373 // Speculate ctlz only if we can directly use LZCNT.
5374 return Subtarget.hasLZCNT();
5375}
5376
5377bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5378 const SelectionDAG &DAG,
5379 const MachineMemOperand &MMO) const {
5380 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5381 BitcastVT.getVectorElementType() == MVT::i1)
5382 return false;
5383
5384 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5385 return false;
5386
5387 // If both types are legal vectors, it's always ok to convert them.
5388 if (LoadVT.isVector() && BitcastVT.isVector() &&
5389 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5390 return true;
5391
5392 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5393}
5394
5395bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5396 const SelectionDAG &DAG) const {
5397 // Do not merge to float value size (128 bits) if no implicit
5398 // float attribute is set.
5399 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5400 Attribute::NoImplicitFloat);
5401
5402 if (NoFloat) {
5403 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5404 return (MemVT.getSizeInBits() <= MaxIntSize);
5405 }
5406 // Make sure we don't merge greater than our preferred vector
5407 // width.
5408 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5409 return false;
5410
5411 return true;
5412}
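
A sketch of the sizing policy implemented above (canMerge and its parameters are illustrative, not the real TargetLowering hook): with NoImplicitFloat the merged store is capped at the native integer width, otherwise at the preferred vector width.

#include <cstdio>

static bool canMerge(unsigned MemBits, bool NoImplicitFloat, bool Is64Bit,
                     unsigned PreferVectorWidth) {
  if (NoImplicitFloat)
    return MemBits <= (Is64Bit ? 64u : 32u);
  return MemBits <= PreferVectorWidth;
}

int main() {
  std::printf("%d\n", canMerge(128, /*NoImplicitFloat=*/true,  true, 256)); // 0
  std::printf("%d\n", canMerge(128, /*NoImplicitFloat=*/false, true, 256)); // 1
}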
5413
5414bool X86TargetLowering::isCtlzFast() const {
5415 return Subtarget.hasFastLZCNT();
5416}
5417
5418bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5419 const Instruction &AndI) const {
5420 return true;
5421}
5422
5423bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5424 EVT VT = Y.getValueType();
5425
5426 if (VT.isVector())
5427 return false;
5428
5429 if (!Subtarget.hasBMI())
5430 return false;
5431
5432 // There are only 32-bit and 64-bit forms for 'andn'.
5433 if (VT != MVT::i32 && VT != MVT::i64)
5434 return false;
5435
5436 return !isa<ConstantSDNode>(Y);
5437}
5438
5439bool X86TargetLowering::hasAndNot(SDValue Y) const {
5440 EVT VT = Y.getValueType();
5441
5442 if (!VT.isVector())
5443 return hasAndNotCompare(Y);
5444
5445 // Vector.
5446
5447 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5448 return false;
5449
5450 if (VT == MVT::v4i32)
5451 return true;
5452
5453 return Subtarget.hasSSE2();
5454}
5455
5456bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5457 return X.getValueType().isScalarInteger(); // 'bt'
5458}
5459
5460bool X86TargetLowering::
5461 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5462 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5463 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5464 SelectionDAG &DAG) const {
5465 // Does baseline recommend not to perform the fold by default?
5466 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5467 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5468 return false;
5469 // For scalars this transform is always beneficial.
5470 if (X.getValueType().isScalarInteger())
5471 return true;
5472 // If all the shift amounts are identical, then transform is beneficial even
5473 // with rudimentary SSE2 shifts.
5474 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5475 return true;
5476 // If we have AVX2 with its powerful shift operations, then it's also good.
5477 if (Subtarget.hasAVX2())
5478 return true;
5479 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5480 return NewShiftOpcode == ISD::SHL;
5481}
5482
5483bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5484 const SDNode *N, CombineLevel Level) const {
5485 assert(((N->getOpcode() == ISD::SHL &&((void)0)
5486 N->getOperand(0).getOpcode() == ISD::SRL) ||((void)0)
5487 (N->getOpcode() == ISD::SRL &&((void)0)
5488 N->getOperand(0).getOpcode() == ISD::SHL)) &&((void)0)
5489 "Expected shift-shift mask")((void)0);
5490 EVT VT = N->getValueType(0);
5491 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5492 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5493 // Only fold if the shift values are equal - so it folds to AND.
5494 // TODO - we should fold if either is a non-uniform vector but we don't do
5495 // the fold for non-splats yet.
5496 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5497 }
5498 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5499}
5500
5501bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5502 EVT VT = Y.getValueType();
5503
5504 // For vectors, we don't have a preference, but we probably want a mask.
5505 if (VT.isVector())
5506 return false;
5507
5508 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5509 if (VT == MVT::i64 && !Subtarget.is64Bit())
5510 return false;
5511
5512 return true;
5513}
5514
5515bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5516 SDNode *N) const {
5517 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5518 !Subtarget.isOSWindows())
5519 return false;
5520 return true;
5521}
5522
5523bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5524 // Any legal vector type can be splatted more efficiently than
5525 // loading/spilling from memory.
5526 return isTypeLegal(VT);
5527}
5528
5529MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5530 MVT VT = MVT::getIntegerVT(NumBits);
5531 if (isTypeLegal(VT))
5532 return VT;
5533
5534 // PMOVMSKB can handle this.
5535 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5536 return MVT::v16i8;
5537
5538 // VPMOVMSKB can handle this.
5539 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5540 return MVT::v32i8;
5541
5542 // TODO: Allow 64-bit type for 32-bit target.
5543 // TODO: 512-bit types should be allowed, but make sure that those
5544 // cases are handled in combineVectorSizedSetCCEquality().
5545
5546 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5547}
5548
5549/// Val is the undef sentinel value or equal to the specified value.
5550static bool isUndefOrEqual(int Val, int CmpVal) {
5551 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5552}
5553
5554/// Return true if every element in Mask is the undef sentinel value or equal to
5555 /// the specified value.
5556static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5557 return llvm::all_of(Mask, [CmpVal](int M) {
5558 return (M == SM_SentinelUndef) || (M == CmpVal);
5559 });
5560}
5561
5562/// Val is either the undef or zero sentinel value.
5563static bool isUndefOrZero(int Val) {
5564 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5565}
5566
5567/// Return true if every element in Mask, beginning from position Pos and ending
5568/// in Pos+Size is the undef sentinel value.
5569static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5570 return llvm::all_of(Mask.slice(Pos, Size),
5571 [](int M) { return M == SM_SentinelUndef; });
5572}
5573
5574/// Return true if the mask creates a vector whose lower half is undefined.
5575static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5576 unsigned NumElts = Mask.size();
5577 return isUndefInRange(Mask, 0, NumElts / 2);
5578}
5579
5580/// Return true if the mask creates a vector whose upper half is undefined.
5581static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5582 unsigned NumElts = Mask.size();
5583 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5584}
5585
5586 /// Return true if Val falls within the specified range [Low, Hi).
5587static bool isInRange(int Val, int Low, int Hi) {
5588 return (Val >= Low && Val < Hi);
5589}
5590
5591/// Return true if the value of any element in Mask falls within the specified
5592 /// range [Low, Hi).
5593static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5594 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5595}
5596
5597/// Return true if the value of any element in Mask is the zero sentinel value.
5598static bool isAnyZero(ArrayRef<int> Mask) {
5599 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5600}
5601
5602/// Return true if the value of any element in Mask is the zero or undef
5603/// sentinel values.
5604static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5605 return llvm::any_of(Mask, [](int M) {
5606 return M == SM_SentinelZero || M == SM_SentinelUndef;
5607 });
5608}
5609
5610/// Return true if Val is undef or if its value falls within the
5611 /// specified range [Low, Hi).
5612static bool isUndefOrInRange(int Val, int Low, int Hi) {
5613 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5614}
5615
5616/// Return true if every element in Mask is undef or if its value
5617 /// falls within the specified range [Low, Hi).
5618static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5619 return llvm::all_of(
5620 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5621}
5622
5623/// Return true if Val is undef, zero or if its value falls within the
5624 /// specified range [Low, Hi).
5625static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5626 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5627}
5628
5629/// Return true if every element in Mask is undef, zero or if its value
5630 /// falls within the specified range [Low, Hi).
5631static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5632 return llvm::all_of(
5633 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5634}
5635
5636/// Return true if every element in Mask, beginning
5637/// from position Pos and ending in Pos + Size, falls within the specified
5638/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5639static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5640 unsigned Size, int Low, int Step = 1) {
5641 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5642 if (!isUndefOrEqual(Mask[i], Low))
5643 return false;
5644 return true;
5645}
5646
5647/// Return true if every element in Mask, beginning
5648/// from position Pos and ending in Pos+Size, falls within the specified
5649 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or is zero.
5650static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5651 unsigned Size, int Low,
5652 int Step = 1) {
5653 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5654 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5655 return false;
5656 return true;
5657}
5658
5659/// Return true if every element in Mask, beginning
5660/// from position Pos and ending in Pos+Size is undef or is zero.
5661static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5662 unsigned Size) {
5663 return llvm::all_of(Mask.slice(Pos, Size),
5664 [](int M) { return isUndefOrZero(M); });
5665}
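
The helpers above all revolve around two shuffle-mask sentinels: -1 (SM_SentinelUndef) for a don't-care lane and -2 (SM_SentinelZero) for a lane known to be zero. A self-contained sketch of a few of them over a plain std::vector (an approximation of the ArrayRef-based originals):

#include <algorithm>
#include <cstdio>
#include <vector>

constexpr int kUndef = -1; // SM_SentinelUndef
constexpr int kZero  = -2; // SM_SentinelZero

static bool isUndefOrEqual(int Val, int Cmp) { return Val == kUndef || Val == Cmp; }
static bool isUndefOrZero(int Val) { return Val == kUndef || Val == kZero; }

static bool isUndefInRange(const std::vector<int> &Mask, unsigned Pos, unsigned Size) {
  return std::all_of(Mask.begin() + Pos, Mask.begin() + Pos + Size,
                     [](int M) { return M == kUndef; });
}

int main() {
  std::vector<int> Mask = {0, kUndef, kZero, 3, kUndef, kUndef, kUndef, kUndef};
  std::printf("upper half undef: %d\n", isUndefInRange(Mask, 4, 4)); // 1
  std::printf("elt1 matches 1:   %d\n", isUndefOrEqual(Mask[1], 1)); // 1
  std::printf("elt2 undef/zero:  %d\n", isUndefOrZero(Mask[2]));     // 1
}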
5666
5667/// Helper function to test whether a shuffle mask could be
5668/// simplified by widening the elements being shuffled.
5669///
5670/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5671/// leaves it in an unspecified state.
5672///
5673/// NOTE: This must handle normal vector shuffle masks and *target* vector
5674/// shuffle masks. The latter have the special property of a '-2' representing
5675/// a zero-ed lane of a vector.
5676static bool canWidenShuffleElements(ArrayRef<int> Mask,
5677 SmallVectorImpl<int> &WidenedMask) {
5678 WidenedMask.assign(Mask.size() / 2, 0);
5679 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5680 int M0 = Mask[i];
5681 int M1 = Mask[i + 1];
5682
5683 // If both elements are undef, it's trivial.
5684 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5685 WidenedMask[i / 2] = SM_SentinelUndef;
5686 continue;
5687 }
5688
5689 // Check for an undef mask and a mask value properly aligned to fit with
5690 // a pair of values. If we find such a case, use the non-undef mask's value.
5691 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5692 WidenedMask[i / 2] = M1 / 2;
5693 continue;
5694 }
5695 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5696 WidenedMask[i / 2] = M0 / 2;
5697 continue;
5698 }
5699
5700 // When zeroing, we need to spread the zeroing across both lanes to widen.
5701 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5702 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5703 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5704 WidenedMask[i / 2] = SM_SentinelZero;
5705 continue;
5706 }
5707 return false;
5708 }
5709
5710 // Finally check if the two mask values are adjacent and aligned with
5711 // a pair.
5712 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5713 WidenedMask[i / 2] = M0 / 2;
5714 continue;
5715 }
5716
5717 // Otherwise we can't safely widen the elements used in this shuffle.
5718 return false;
5719 }
5720 assert(WidenedMask.size() == Mask.size() / 2 &&((void)0)
5721 "Incorrect size of mask after widening the elements!")((void)0);
5722
5723 return true;
5724}
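
A standalone rendering of the pairing rules above, using plain std::vector instead of ArrayRef/SmallVectorImpl (the function name widenMask is illustrative): two adjacent lanes collapse into one wide lane only when they are both undef, both zero/undef, or form an aligned (even, even+1) pair.

#include <cstdio>
#include <vector>

static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Wide) {
  const int Undef = -1, Zero = -2;
  Wide.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == Undef && M1 == Undef)              { Wide[i / 2] = Undef;  continue; }
    if (M0 == Undef && M1 >= 0 && (M1 % 2) == 1) { Wide[i / 2] = M1 / 2; continue; }
    if (M1 == Undef && M0 >= 0 && (M0 % 2) == 0) { Wide[i / 2] = M0 / 2; continue; }
    if (M0 == Zero || M1 == Zero) {
      if ((M0 == Zero || M0 == Undef) && (M1 == Zero || M1 == Undef)) {
        Wide[i / 2] = Zero;
        continue;
      }
      return false;
    }
    if (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) { Wide[i / 2] = M0 / 2; continue; }
    return false;
  }
  return true;
}

int main() {
  std::vector<int> Mask = {0, 1, -1, 3, 4, 5, -2, -2}, Wide;
  if (widenMask(Mask, Wide))                       // -> 0 1 2 -2
    for (int M : Wide) std::printf("%d ", M);
  std::printf("\n");
}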
5725
5726static bool canWidenShuffleElements(ArrayRef<int> Mask,
5727 const APInt &Zeroable,
5728 bool V2IsZero,
5729 SmallVectorImpl<int> &WidenedMask) {
5730 // Create an alternative mask with info about zeroable elements.
5731 // Here we do not set undef elements as zeroable.
5732 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5733 if (V2IsZero) {
5734 assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!")((void)0);
5735 for (int i = 0, Size = Mask.size(); i != Size; ++i)
5736 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5737 ZeroableMask[i] = SM_SentinelZero;
5738 }
5739 return canWidenShuffleElements(ZeroableMask, WidenedMask);
5740}
5741
5742static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5743 SmallVector<int, 32> WidenedMask;
5744 return canWidenShuffleElements(Mask, WidenedMask);
5745}
5746
5747// Attempt to narrow/widen shuffle mask until it matches the target number of
5748// elements.
5749static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
5750 SmallVectorImpl<int> &ScaledMask) {
5751 unsigned NumSrcElts = Mask.size();
5752 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&((void)0)
5753 "Illegal shuffle scale factor")((void)0);
5754
5755 // Narrowing is guaranteed to work.
5756 if (NumDstElts >= NumSrcElts) {
5757 int Scale = NumDstElts / NumSrcElts;
5758 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
5759 return true;
5760 }
5761
5762 // We have to repeat the widening until we reach the target size, but we can
5763 // split out the first widening as it sets up ScaledMask for us.
5764 if (canWidenShuffleElements(Mask, ScaledMask)) {
5765 while (ScaledMask.size() > NumDstElts) {
5766 SmallVector<int, 16> WidenedMask;
5767 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
5768 return false;
5769 ScaledMask = std::move(WidenedMask);
5770 }
5771 return true;
5772 }
5773
5774 return false;
5775}
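
The narrowing branch above delegates to llvm::narrowShuffleMaskElts; the sketch below reproduces only its index arithmetic (a standalone approximation, not the library routine): every source lane expands into Scale destination lanes, and negative sentinels are replicated as-is.

#include <cstdio>
#include <vector>

static void narrowMask(int Scale, const std::vector<int> &Mask, std::vector<int> &Out) {
  Out.clear();
  for (int M : Mask)
    for (int j = 0; j != Scale; ++j)
      Out.push_back(M < 0 ? M : M * Scale + j);
}

int main() {
  std::vector<int> Mask = {1, -1, 0, 3}, Out;
  narrowMask(2, Mask, Out); // -> 2 3 -1 -1 0 1 6 7
  for (int M : Out) std::printf("%d ", M);
  std::printf("\n");
}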
5776
5777/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5778bool X86::isZeroNode(SDValue Elt) {
5779 return isNullConstant(Elt) || isNullFPConstant(Elt);
5780}
5781
5782// Build a vector of constants.
5783// Use an UNDEF node if MaskElt == -1.
5784 // Split 64-bit constants in 32-bit mode.
5785static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5786 const SDLoc &dl, bool IsMask = false) {
5787
5788 SmallVector<SDValue, 32> Ops;
5789 bool Split = false;
5790
5791 MVT ConstVecVT = VT;
5792 unsigned NumElts = VT.getVectorNumElements();
5793 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5794 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5795 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5796 Split = true;
5797 }
5798
5799 MVT EltVT = ConstVecVT.getVectorElementType();
5800 for (unsigned i = 0; i < NumElts; ++i) {
5801 bool IsUndef = Values[i] < 0 && IsMask;
5802 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5803 DAG.getConstant(Values[i], dl, EltVT);
5804 Ops.push_back(OpNode);
5805 if (Split)
5806 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5807 DAG.getConstant(0, dl, EltVT));
5808 }
5809 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5810 if (Split)
5811 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5812 return ConstsNode;
5813}
5814
5815static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5816 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5817 assert(Bits.size() == Undefs.getBitWidth() &&((void)0)
5818 "Unequal constant and undef arrays")((void)0);
5819 SmallVector<SDValue, 32> Ops;
5820 bool Split = false;
5821
5822 MVT ConstVecVT = VT;
5823 unsigned NumElts = VT.getVectorNumElements();
5824 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5825 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5826 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5827 Split = true;
5828 }
5829
5830 MVT EltVT = ConstVecVT.getVectorElementType();
5831 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5832 if (Undefs[i]) {
5833 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5834 continue;
5835 }
5836 const APInt &V = Bits[i];
5837 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes")((void)0);
5838 if (Split) {
5839 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5840 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5841 } else if (EltVT == MVT::f32) {
5842 APFloat FV(APFloat::IEEEsingle(), V);
5843 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5844 } else if (EltVT == MVT::f64) {
5845 APFloat FV(APFloat::IEEEdouble(), V);
5846 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5847 } else {
5848 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5849 }
5850 }
5851
5852 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5853 return DAG.getBitcast(VT, ConstsNode);
5854}
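
When i64 is not legal (32-bit mode), both getConstVector overloads above emit two 32-bit elements per 64-bit constant and bitcast the result back. The arithmetic for one element, as a standalone sketch:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t V = 0x1122334455667788ULL;
  uint32_t Lo = (uint32_t)V;          // V.trunc(32)
  uint32_t Hi = (uint32_t)(V >> 32);  // V.lshr(32).trunc(32)
  std::printf("lo=0x%08x hi=0x%08x\n", (unsigned)Lo, (unsigned)Hi);
}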
5855
5856/// Returns a vector of specified type with all zero elements.
5857static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5858 SelectionDAG &DAG, const SDLoc &dl) {
5859 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||((void)0)
5860 VT.getVectorElementType() == MVT::i1) &&((void)0)
5861 "Unexpected vector type")((void)0);
5862
5863 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5864 // type. This ensures they get CSE'd. But if the integer type is not
5865 // available, use a floating-point +0.0 instead.
5866 SDValue Vec;
5867 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5868 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5869 } else if (VT.isFloatingPoint()) {
5870 Vec = DAG.getConstantFP(+0.0, dl, VT);
5871 } else if (VT.getVectorElementType() == MVT::i1) {
5872 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&((void)0)
5873 "Unexpected vector type")((void)0);
5874 Vec = DAG.getConstant(0, dl, VT);
5875 } else {
5876 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5877 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5878 }
5879 return DAG.getBitcast(VT, Vec);
5880}
5881
5882static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5883 const SDLoc &dl, unsigned vectorWidth) {
5884 EVT VT = Vec.getValueType();
5885 EVT ElVT = VT.getVectorElementType();
5886 unsigned Factor = VT.getSizeInBits() / vectorWidth;
5887 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5888 VT.getVectorNumElements() / Factor);
5889
5890 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5891 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5892 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2")((void)0);
5893
5894 // This is the index of the first element of the vectorWidth-bit chunk
5895 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5896 IdxVal &= ~(ElemsPerChunk - 1);
5897
5898 // If the input is a buildvector just emit a smaller one.
5899 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5900 return DAG.getBuildVector(ResultVT, dl,
5901 Vec->ops().slice(IdxVal, ElemsPerChunk));
5902
5903 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5904 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5905}
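
The line "IdxVal &= ~(ElemsPerChunk - 1)" is the whole trick for picking the chunk: because ElemsPerChunk is a power of two, clearing its low bits rounds an arbitrary element index down to the start of the 128-bit (or 256-bit) chunk containing it. A tiny illustration with example values:

#include <cstdio>

int main() {
  unsigned ElemsPerChunk = 4;                 // e.g. 128 bits / 32-bit elements
  for (unsigned IdxVal : {0u, 3u, 4u, 6u, 11u})
    std::printf("%u -> %u\n", IdxVal, IdxVal & ~(ElemsPerChunk - 1));
}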
5906
5907/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5908/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5909/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5910/// instructions or a simple subregister reference. Idx is an index in the
5911/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5912/// lowering EXTRACT_VECTOR_ELT operations easier.
5913static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5914 SelectionDAG &DAG, const SDLoc &dl) {
5915 assert((Vec.getValueType().is256BitVector() ||((void)0)
5916 Vec.getValueType().is512BitVector()) && "Unexpected vector size!")((void)0);
5917 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5918}
5919
5920/// Generate a DAG to grab 256-bits from a 512-bit vector.
5921static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5922 SelectionDAG &DAG, const SDLoc &dl) {
5923 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!")((void)0);
5924 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5925}
5926
5927static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5928 SelectionDAG &DAG, const SDLoc &dl,
5929 unsigned vectorWidth) {
5930 assert((vectorWidth == 128 || vectorWidth == 256) &&((void)0)
5931 "Unsupported vector width")((void)0);
5932 // Inserting UNDEF leaves Result unchanged.
5933 if (Vec.isUndef())
5934 return Result;
5935 EVT VT = Vec.getValueType();
5936 EVT ElVT = VT.getVectorElementType();
5937 EVT ResultVT = Result.getValueType();
5938
5939 // Insert the relevant vectorWidth bits.
5940 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5941 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2")((void)0);
5942
5943 // This is the index of the first element of the vectorWidth-bit chunk
5944 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5945 IdxVal &= ~(ElemsPerChunk - 1);
5946
5947 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5948 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5949}
5950
5951/// Generate a DAG to put 128-bits into a vector > 128 bits. This
5952/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5953/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5954/// simple superregister reference. Idx is an index in the 128 bits
5955/// we want. It need not be aligned to a 128-bit boundary. That makes
5956/// lowering INSERT_VECTOR_ELT operations easier.
5957static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5958 SelectionDAG &DAG, const SDLoc &dl) {
5959 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!")((void)0);
5960 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5961}
5962
5963/// Widen a vector to a larger size with the same scalar type, with the new
5964/// elements either zero or undef.
5965static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5966 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5967 const SDLoc &dl) {
5968 assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&((void)0)
5969 Vec.getValueType().getScalarType() == VT.getScalarType() &&((void)0)
5970 "Unsupported vector widening type")((void)0);
5971 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5972 : DAG.getUNDEF(VT);
5973 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5974 DAG.getIntPtrConstant(0, dl));
5975}
5976
5977/// Widen a vector to a larger size with the same scalar type, with the new
5978/// elements either zero or undef.
5979static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5980 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5981 const SDLoc &dl, unsigned WideSizeInBits) {
5982 assert(Vec.getValueSizeInBits() < WideSizeInBits &&((void)0)
5983 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&((void)0)
5984 "Unsupported vector widening type")((void)0);
5985 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5986 MVT SVT = Vec.getSimpleValueType().getScalarType();
5987 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5988 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5989}
5990
5991// Helper function to collect subvector ops that are concatenated together,
5992 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
5993// The subvectors in Ops are guaranteed to be the same type.
5994static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
5995 assert(Ops.empty() && "Expected an empty ops vector")((void)0);
5996
5997 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
5998 Ops.append(N->op_begin(), N->op_end());
5999 return true;
6000 }
6001
6002 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6003 SDValue Src = N->getOperand(0);
6004 SDValue Sub = N->getOperand(1);
6005 const APInt &Idx = N->getConstantOperandAPInt(2);
6006 EVT VT = Src.getValueType();
6007 EVT SubVT = Sub.getValueType();
6008
6009 // TODO - Handle more general insert_subvector chains.
6010 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6011 Idx == (VT.getVectorNumElements() / 2)) {
6012 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6013 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6014 Src.getOperand(1).getValueType() == SubVT &&
6015 isNullConstant(Src.getOperand(2))) {
6016 Ops.push_back(Src.getOperand(1));
6017 Ops.push_back(Sub);
6018 return true;
6019 }
6020 // insert_subvector(x, extract_subvector(x, lo), hi)
6021 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6022 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6023 Ops.append(2, Sub);
6024 return true;
6025 }
6026 }
6027 }
6028
6029 return false;
6030}
6031
6032static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6033 const SDLoc &dl) {
6034 EVT VT = Op.getValueType();
6035 unsigned NumElems = VT.getVectorNumElements();
6036 unsigned SizeInBits = VT.getSizeInBits();
6037 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&((void)0)
6038 "Can't split odd sized vector")((void)0);
6039
6040 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6041 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6042 return std::make_pair(Lo, Hi);
6043}
6044
6045 // Split a unary integer op into 2 half-sized ops.
6046static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6047 EVT VT = Op.getValueType();
6048
6049 // Make sure we only try to split 256/512-bit types to avoid creating
6050 // narrow vectors.
6051 assert((Op.getOperand(0).getValueType().is256BitVector() ||((void)0)
6052 Op.getOperand(0).getValueType().is512BitVector()) &&((void)0)
6053 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!")((void)0);
6054 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==((void)0)
6055 VT.getVectorNumElements() &&((void)0)
6056 "Unexpected VTs!")((void)0);
6057
6058 SDLoc dl(Op);
6059
6060 // Extract the Lo/Hi vectors
6061 SDValue Lo, Hi;
6062 std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
6063
6064 EVT LoVT, HiVT;
6065 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6066 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6067 DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
6068 DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
6069}
6070
6071/// Break a binary integer operation into 2 half sized ops and then
6072/// concatenate the result back.
6073static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6074 EVT VT = Op.getValueType();
6075
6076 // Sanity check that all the types match.
6077 assert(Op.getOperand(0).getValueType() == VT &&((void)0)
6078 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!")((void)0);
6079 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!")((void)0);
6080
6081 SDLoc dl(Op);
6082
6083 // Extract the LHS Lo/Hi vectors
6084 SDValue LHS1, LHS2;
6085 std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
6086
6087 // Extract the RHS Lo/Hi vectors
6088 SDValue RHS1, RHS2;
6089 std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
6090
6091 EVT LoVT, HiVT;
6092 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6093 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6094 DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
6095 DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
6096}
6097
6098// Helper for splitting operands of an operation to legal target size and
6099// apply a function on each part.
6100// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6101// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6102// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6103// The argument Builder is a function that will be applied on each split part:
6104// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6105template <typename F>
6106SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6107 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6108 F Builder, bool CheckBWI = true) {
6109 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2")((void)0);
6110 unsigned NumSubs = 1;
6111 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6112 (!CheckBWI && Subtarget.useAVX512Regs())) {
6113 if (VT.getSizeInBits() > 512) {
6114 NumSubs = VT.getSizeInBits() / 512;
6115 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size")((void)0);
6116 }
6117 } else if (Subtarget.hasAVX2()) {
6118 if (VT.getSizeInBits() > 256) {
6119 NumSubs = VT.getSizeInBits() / 256;
6120 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size")((void)0);
6121 }
6122 } else {
6123 if (VT.getSizeInBits() > 128) {
6124 NumSubs = VT.getSizeInBits() / 128;
6125 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size")((void)0);
6126 }
6127 }
6128
6129 if (NumSubs == 1)
6130 return Builder(DAG, DL, Ops);
6131
6132 SmallVector<SDValue, 4> Subs;
6133 for (unsigned i = 0; i != NumSubs; ++i) {
6134 SmallVector<SDValue, 2> SubOps;
6135 for (SDValue Op : Ops) {
6136 EVT OpVT = Op.getValueType();
6137 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6138 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6139 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6140 }
6141 Subs.push_back(Builder(DAG, DL, SubOps));
6142 }
6143 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6144}
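
A sketch of how NumSubs is chosen above: the operation is split into the fewest pieces that still fit the widest vector register the subtarget can actually use (numSubVectors is an illustrative helper, and the widths in main are example values):

#include <cstdio>

static unsigned numSubVectors(unsigned VTSizeInBits, unsigned MaxLegalBits) {
  return VTSizeInBits > MaxLegalBits ? VTSizeInBits / MaxLegalBits : 1;
}

int main() {
  // A 1024-bit operation on an AVX2 target (256-bit registers) -> 4 parts.
  std::printf("%u\n", numSubVectors(1024, 256));
  // The same operation when 512-bit registers are usable -> 2 parts.
  std::printf("%u\n", numSubVectors(1024, 512));
}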
6145
6146/// Insert i1-subvector to i1-vector.
6147static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6148 const X86Subtarget &Subtarget) {
6149
6150 SDLoc dl(Op);
6151 SDValue Vec = Op.getOperand(0);
6152 SDValue SubVec = Op.getOperand(1);
6153 SDValue Idx = Op.getOperand(2);
6154 unsigned IdxVal = Op.getConstantOperandVal(2);
6155
6156 // Inserting undef is a nop. We can just return the original vector.
6157 if (SubVec.isUndef())
6158 return Vec;
6159
6160 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6161 return Op;
6162
6163 MVT OpVT = Op.getSimpleValueType();
6164 unsigned NumElems = OpVT.getVectorNumElements();
6165 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6166
6167 // Extend to natively supported kshift.
6168 MVT WideOpVT = OpVT;
6169 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6170 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6171
6172 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6173 // if necessary.
6174 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6175 // May need to promote to a legal type.
6176 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6177 DAG.getConstant(0, dl, WideOpVT),
6178 SubVec, Idx);
6179 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6180 }
6181
6182 MVT SubVecVT = SubVec.getSimpleValueType();
6183 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6184 assert(IdxVal + SubVecNumElems <= NumElems &&((void)0)
6185 IdxVal % SubVecVT.getSizeInBits() == 0 &&((void)0)
6186 "Unexpected index value in INSERT_SUBVECTOR")((void)0);
6187
6188 SDValue Undef = DAG.getUNDEF(WideOpVT);
6189
6190 if (IdxVal == 0) {
6191 // Zero lower bits of the Vec
6192 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6193 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6194 ZeroIdx);
6195 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6196 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6197 // Merge them together, SubVec should be zero extended.
6198 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6199 DAG.getConstant(0, dl, WideOpVT),
6200 SubVec, ZeroIdx);
6201 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6202 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6203 }
6204
6205 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6206 Undef, SubVec, ZeroIdx);
6207
6208 if (Vec.isUndef()) {
6209 assert(IdxVal != 0 && "Unexpected index")((void)0);
6210 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6211 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6212 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6213 }
6214
6215 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6216 assert(IdxVal != 0 && "Unexpected index")((void)0);
6217 NumElems = WideOpVT.getVectorNumElements();
6218 unsigned ShiftLeft = NumElems - SubVecNumElems;
6219 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6220 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6221 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6222 if (ShiftRight != 0)
6223 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6224 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6225 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6226 }
6227
6228 // Simple case when we put subvector in the upper part
6229 if (IdxVal + SubVecNumElems == NumElems) {
6230 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6231 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6232 if (SubVecNumElems * 2 == NumElems) {
6233 // Special case, use legal zero extending insert_subvector. This allows
6234 // isel to optimize when bits are known zero.
6235 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6236 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6237 DAG.getConstant(0, dl, WideOpVT),
6238 Vec, ZeroIdx);
6239 } else {
6240 // Otherwise use explicit shifts to zero the bits.
6241 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6242 Undef, Vec, ZeroIdx);
6243 NumElems = WideOpVT.getVectorNumElements();
6244 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6245 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6246 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6247 }
6248 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6249 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6250 }
6251
6252 // Inserting into the middle is more complicated.
6253
6254 NumElems = WideOpVT.getVectorNumElements();
6255
6256 // Widen the vector if needed.
6257 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6258
6259 unsigned ShiftLeft = NumElems - SubVecNumElems;
6260 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6261
6262 // Do an optimization for the most frequently used types.
6263 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6264 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6265 Mask0.flipAllBits();
6266 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6267 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6268 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6269 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6270 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6271 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6272 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6273 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6274
6275 // Reduce to original width if needed.
6276 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6277 }
6278
6279 // Clear the upper bits of the subvector and move it to its insert position.
6280 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6281 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6282 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6283 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6284
6285 // Isolate the bits below the insertion point.
6286 unsigned LowShift = NumElems - IdxVal;
6287 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6288 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6289 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6290 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6291
6292 // Isolate the bits after the last inserted bit.
6293 unsigned HighShift = IdxVal + SubVecNumElems;
6294 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6295 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6296 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6297 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6298
6299 // Now OR all 3 pieces together.
6300 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6301 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6302
6303 // Reduce to original width if needed.
6304 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6305}
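
The fast path above clears the destination field with an AND mask and positions the subvector with a KSHIFTL/KSHIFTR pair before ORing the pieces together. The same idea on a scalar bit mask, as a standalone sketch (insertBitField is illustrative and assumes Sub fits in SubBits and Idx + SubBits <= NumElems <= 64):

#include <cstdint>
#include <cstdio>

static uint64_t insertBitField(uint64_t Vec, uint64_t Sub, unsigned NumElems,
                               unsigned SubBits, unsigned Idx) {
  unsigned ShiftLeft = NumElems - SubBits;
  unsigned ShiftRight = NumElems - SubBits - Idx;
  // Emulate KSHIFTL/KSHIFTR on an NumElems-bit mask register.
  uint64_t Width = NumElems == 64 ? ~0ULL : ((1ULL << NumElems) - 1);
  uint64_t Field = ((Sub << ShiftLeft) & Width) >> ShiftRight;
  // Clear the destination field (the flipped Mask0 above), then merge.
  uint64_t Hole = ~(((1ULL << SubBits) - 1) << Idx);
  return (Vec & Hole) | Field;
}

int main() {
  // Insert the 2-bit value 0b11 at position 3 of the 8-bit mask 0b10000001.
  std::printf("0x%llx\n", (unsigned long long)insertBitField(0x81, 0x3, 8, 2, 3)); // 0x99
}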
6306
6307static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6308 const SDLoc &dl) {
6309 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch")((void)0);
6310 EVT SubVT = V1.getValueType();
6311 EVT SubSVT = SubVT.getScalarType();
6312 unsigned SubNumElts = SubVT.getVectorNumElements();
6313 unsigned SubVectorWidth = SubVT.getSizeInBits();
6314 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6315 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6316 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6317}
6318
6319/// Returns a vector of specified type with all bits set.
6320/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6321/// Then bitcast to their original type, ensuring they get CSE'd.
6322static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6323 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&((void)0)
6324 "Expected a 128/256/512-bit vector type")((void)0);
6325
6326 APInt Ones = APInt::getAllOnesValue(32);
6327 unsigned NumElts = VT.getSizeInBits() / 32;
6328 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6329 return DAG.getBitcast(VT, Vec);
6330}
6331
6332// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6333static unsigned getOpcode_EXTEND(unsigned Opcode) {
6334 switch (Opcode) {
6335 case ISD::ANY_EXTEND:
6336 case ISD::ANY_EXTEND_VECTOR_INREG:
6337 return ISD::ANY_EXTEND;
6338 case ISD::ZERO_EXTEND:
6339 case ISD::ZERO_EXTEND_VECTOR_INREG:
6340 return ISD::ZERO_EXTEND;
6341 case ISD::SIGN_EXTEND:
6342 case ISD::SIGN_EXTEND_VECTOR_INREG:
6343 return ISD::SIGN_EXTEND;
6344 }
6345 llvm_unreachable("Unknown opcode")__builtin_unreachable();
6346}
6347
6348// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6349static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6350 switch (Opcode) {
6351 case ISD::ANY_EXTEND:
6352 case ISD::ANY_EXTEND_VECTOR_INREG:
6353 return ISD::ANY_EXTEND_VECTOR_INREG;
6354 case ISD::ZERO_EXTEND:
6355 case ISD::ZERO_EXTEND_VECTOR_INREG:
6356 return ISD::ZERO_EXTEND_VECTOR_INREG;
6357 case ISD::SIGN_EXTEND:
6358 case ISD::SIGN_EXTEND_VECTOR_INREG:
6359 return ISD::SIGN_EXTEND_VECTOR_INREG;
6360 }
6361 llvm_unreachable("Unknown opcode")__builtin_unreachable();
6362}
6363
6364static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6365 SDValue In, SelectionDAG &DAG) {
6366 EVT InVT = In.getValueType();
6367 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.")((void)0);
6368 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||((void)0)
6369 ISD::ZERO_EXTEND == Opcode) &&((void)0)
6370 "Unknown extension opcode")((void)0);
6371
6372 // For 256-bit vectors, we only need the lower (128-bit) input half.
6373 // For 512-bit vectors, we only need the lower input half or quarter.
6374 if (InVT.getSizeInBits() > 128) {
6375 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&((void)0)
6376 "Expected VTs to be the same size!")((void)0);
6377 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6378 In = extractSubVector(In, 0, DAG, DL,
6379 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6380 InVT = In.getValueType();
6381 }
6382
6383 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6384 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6385
6386 return DAG.getNode(Opcode, DL, VT, In);
6387}
6388
6389// Match (xor X, -1) -> X.
6390// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6391// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6392static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6393 V = peekThroughBitcasts(V);
6394 if (V.getOpcode() == ISD::XOR &&
6395 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6396 return V.getOperand(0);
6397 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6398 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6399 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6400 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6401 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6402 Not, V.getOperand(1));
6403 }
6404 }
6405 SmallVector<SDValue, 2> CatOps;
6406 if (collectConcatOps(V.getNode(), CatOps)) {
6407 for (SDValue &CatOp : CatOps) {
6408 SDValue NotCat = IsNOT(CatOp, DAG);
6409 if (!NotCat) return SDValue();
6410 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6411 }
6412 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6413 }
6414 return SDValue();
6415}
6416
6417void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6418 bool Lo, bool Unary) {
6419 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&((void)0)
6420 "Illegal vector type to unpack")((void)0);
6421 assert(Mask.empty() && "Expected an empty shuffle mask vector")((void)0);
6422 int NumElts = VT.getVectorNumElements();
6423 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6424 for (int i = 0; i < NumElts; ++i) {
6425 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6426 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6427 Pos += (Unary ? 0 : NumElts * (i % 2));
6428 Pos += (Lo ? 0 : NumEltsInLane / 2);
6429 Mask.push_back(Pos);
6430 }
6431}
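
A standalone copy of the loop above (unpackMask is illustrative), showing the mask it produces for a two-input v8i32 unpacklo, where 0..7 index the first operand and 8..15 the second:

#include <cstdio>
#include <vector>

static std::vector<int> unpackMask(int NumElts, int NumEltsInLane, bool Lo, bool Unary) {
  std::vector<int> Mask;
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
  return Mask;
}

int main() {
  // v8i32, 4 elements per 128-bit lane -> <0, 8, 1, 9, 4, 12, 5, 13>
  for (int M : unpackMask(8, 4, /*Lo=*/true, /*Unary=*/false))
    std::printf("%d ", M);
  std::printf("\n");
}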
6432
6433/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6434/// imposed by AVX and specific to the unary pattern. Example:
6435/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6436/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6437void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6438 bool Lo) {
6439 assert(Mask.empty() && "Expected an empty shuffle mask vector")((void)0);
6440 int NumElts = VT.getVectorNumElements();
6441 for (int i = 0; i < NumElts; ++i) {
6442 int Pos = i / 2;
6443 Pos += (Lo ? 0 : NumElts / 2);
6444 Mask.push_back(Pos);
6445 }
6446}
6447
6448/// Returns a vector_shuffle node for an unpackl operation.
6449static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6450 SDValue V1, SDValue V2) {
6451 SmallVector<int, 8> Mask;
6452 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6453 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6454}
6455
6456/// Returns a vector_shuffle node for an unpackh operation.
6457static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6458 SDValue V1, SDValue V2) {
6459 SmallVector<int, 8> Mask;
6460 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6461 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6462}
6463
6464 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
6465/// This produces a shuffle where the low element of V2 is swizzled into the
6466/// zero/undef vector, landing at element Idx.
6467/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6468static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6469 bool IsZero,
6470 const X86Subtarget &Subtarget,
6471 SelectionDAG &DAG) {
6472 MVT VT = V2.getSimpleValueType();
6473 SDValue V1 = IsZero
6474 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6475 int NumElems = VT.getVectorNumElements();
6476 SmallVector<int, 16> MaskVec(NumElems);
6477 for (int i = 0; i != NumElems; ++i)
6478 // If this is the insertion idx, put the low elt of V2 here.
6479 MaskVec[i] = (i == Idx) ? NumElems : i;
6480 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6481}
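
For a 4-element vector with Idx == 0 the loop above yields the mask <4, 1, 2, 3> mentioned in the comment; a tiny illustration:

#include <cstdio>

int main() {
  const int NumElems = 4, Idx = 0;
  for (int i = 0; i != NumElems; ++i)
    std::printf("%d ", i == Idx ? NumElems : i); // 4 1 2 3
  std::printf("\n");
}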
6482
6483static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6484 if (Ptr.getOpcode() == X86ISD::Wrapper ||
6485 Ptr.getOpcode() == X86ISD::WrapperRIP)
6486 Ptr = Ptr.getOperand(0);
6487
6488 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6489 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6490 return nullptr;
6491
6492 return CNode->getConstVal();
6493}
6494
6495static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6496 if (!Load || !ISD::isNormalLoad(Load))
6497 return nullptr;
6498 return getTargetConstantFromBasePtr(Load->getBasePtr());
6499}
6500
6501static const Constant *getTargetConstantFromNode(SDValue Op) {
6502 Op = peekThroughBitcasts(Op);
6503 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6504}
6505
6506const Constant *
6507X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6508 assert(LD && "Unexpected null LoadSDNode")((void)0);
6509 return getTargetConstantFromNode(LD);
6510}
6511
6512// Extract raw constant bits from constant pools.
6513static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6514 APInt &UndefElts,
6515 SmallVectorImpl<APInt> &EltBits,
6516 bool AllowWholeUndefs = true,
6517 bool AllowPartialUndefs = true) {
6518 assert(EltBits.empty() && "Expected an empty EltBits vector")((void)0);
6519
6520 Op = peekThroughBitcasts(Op);
6521
6522 EVT VT = Op.getValueType();
6523 unsigned SizeInBits = VT.getSizeInBits();
6524 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!")((void)0);
6525 unsigned NumElts = SizeInBits / EltSizeInBits;
6526
6527 // Bitcast a source array of element bits to the target size.
6528 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6529 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6530 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6531 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&((void)0)
6532 "Constant bit sizes don't match")((void)0);
6533
6534 // Don't split if we don't allow undef bits.
6535 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6536 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6537 return false;
6538
6539 // If we're already the right size, don't bother bitcasting.
6540 if (NumSrcElts == NumElts) {
6541 UndefElts = UndefSrcElts;
6542 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6543 return true;
6544 }
6545
6546 // Extract all the undef/constant element data and pack into single bitsets.
6547 APInt UndefBits(SizeInBits, 0);
6548 APInt MaskBits(SizeInBits, 0);
6549
6550 for (unsigned i = 0; i != NumSrcElts; ++i) {
6551 unsigned BitOffset = i * SrcEltSizeInBits;
6552 if (UndefSrcElts[i])
6553 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6554 MaskBits.insertBits(SrcEltBits[i], BitOffset);
6555 }
6556
6557 // Split the undef/constant single bitset data into the target elements.
6558 UndefElts = APInt(NumElts, 0);
6559 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6560
6561 for (unsigned i = 0; i != NumElts; ++i) {
6562 unsigned BitOffset = i * EltSizeInBits;
6563 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6564
6565 // Only treat an element as UNDEF if all bits are UNDEF.
6566 if (UndefEltBits.isAllOnesValue()) {
6567 if (!AllowWholeUndefs)
6568 return false;
6569 UndefElts.setBit(i);
6570 continue;
6571 }
6572
6573 // If only some bits are UNDEF then treat them as zero (or bail if not
6574 // supported).
6575 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6576 return false;
6577
6578 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6579 }
6580 return true;
6581 };
6582
6583 // Collect constant bits and insert into mask/undef bit masks.
6584 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6585 unsigned UndefBitIndex) {
6586 if (!Cst)
6587 return false;
6588 if (isa<UndefValue>(Cst)) {
6589 Undefs.setBit(UndefBitIndex);
6590 return true;
6591 }
6592 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6593 Mask = CInt->getValue();
6594 return true;
6595 }
6596 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6597 Mask = CFP->getValueAPF().bitcastToAPInt();
6598 return true;
6599 }
6600 return false;
6601 };
6602
6603 // Handle UNDEFs.
6604 if (Op.isUndef()) {
6605 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6606 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6607 return CastBitData(UndefSrcElts, SrcEltBits);
6608 }
6609
6610 // Extract scalar constant bits.
6611 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6612 APInt UndefSrcElts = APInt::getNullValue(1);
6613 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6614 return CastBitData(UndefSrcElts, SrcEltBits);
6615 }
6616 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6617 APInt UndefSrcElts = APInt::getNullValue(1);
6618 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6619 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6620 return CastBitData(UndefSrcElts, SrcEltBits);
6621 }
6622
6623 // Extract constant bits from build vector.
6624 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6625 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6626 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6627
6628 APInt UndefSrcElts(NumSrcElts, 0);
6629 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6630 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6631 const SDValue &Src = Op.getOperand(i);
6632 if (Src.isUndef()) {
6633 UndefSrcElts.setBit(i);
6634 continue;
6635 }
6636 auto *Cst = cast<ConstantSDNode>(Src);
6637 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6638 }
6639 return CastBitData(UndefSrcElts, SrcEltBits);
6640 }
6641 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6642 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6643 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6644
6645 APInt UndefSrcElts(NumSrcElts, 0);
6646 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6647 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6648 const SDValue &Src = Op.getOperand(i);
6649 if (Src.isUndef()) {
6650 UndefSrcElts.setBit(i);
6651 continue;
6652 }
6653 auto *Cst = cast<ConstantFPSDNode>(Src);
6654 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6655 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6656 }
6657 return CastBitData(UndefSrcElts, SrcEltBits);
6658 }
6659
6660 // Extract constant bits from constant pool vector.
6661 if (auto *Cst = getTargetConstantFromNode(Op)) {
6662 Type *CstTy = Cst->getType();
6663 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6664 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6665 return false;
6666
6667 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6668 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6669
6670 APInt UndefSrcElts(NumSrcElts, 0);
6671 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6672 for (unsigned i = 0; i != NumSrcElts; ++i)
6673 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6674 UndefSrcElts, i))
6675 return false;
6676
6677 return CastBitData(UndefSrcElts, SrcEltBits);
6678 }
6679
6680 // Extract constant bits from a broadcasted constant pool scalar.
6681 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6682 EltSizeInBits <= VT.getScalarSizeInBits()) {
6683 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6684 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6685 return false;
6686
6687 SDValue Ptr = MemIntr->getBasePtr();
6688 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
6689 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6690 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6691
6692 APInt UndefSrcElts(NumSrcElts, 0);
6693 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6694 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
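// Splat the scalar's constant bits (and its undef state) across all NumSrcElts
// broadcast elements before repacking to the requested element width.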
6695 if (UndefSrcElts[0])
6696 UndefSrcElts.setBits(0, NumSrcElts);
6697 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6698 return CastBitData(UndefSrcElts, SrcEltBits);
6699 }
6700 }
6701 }
6702
6703 // Extract constant bits from a subvector broadcast.
6704 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
6705 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6706 SDValue Ptr = MemIntr->getBasePtr();
6707 // The source constant may be larger than the subvector broadcast, so make
6708 // sure we extract the correct subvector constants.
6709 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
6710 Type *CstTy = Cst->getType();
6711 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6712 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
6713 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
6714 (SizeInBits % SubVecSizeInBits) != 0)
6715 return false;
6716 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
6717 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
6718 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
6719 APInt UndefSubElts(NumSubElts, 0);
6720 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
6721 APInt(CstEltSizeInBits, 0));
6722 for (unsigned i = 0; i != NumSubElts; ++i) {
6723 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
6724 UndefSubElts, i))
6725 return false;
6726 for (unsigned j = 1; j != NumSubVecs; ++j)
6727 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
6728 }
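// The same undef pattern applies to every broadcast copy of the subvector, so
// splat the subvector undef mask across all NumSubVecs copies as well.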
6729 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
6730 UndefSubElts);
6731 return CastBitData(UndefSubElts, SubEltBits);
6732 }
6733 }
6734
6735 // Extract a rematerialized scalar constant insertion.
6736 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6737 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6738 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6739 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6740 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6741
6742 APInt UndefSrcElts(NumSrcElts, 0);
6743 SmallVector<APInt, 64> SrcEltBits;
6744 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6745 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6746 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6747 return CastBitData(UndefSrcElts, SrcEltBits);
6748 }
6749
6750 // Insert constant bits from base and subvector sources.
6751 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
6752 // If this bitcasts to larger elements we might lose track of undefs, so
6753 // don't allow any undefs, to be safe.
6754 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6755 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
6756
6757 APInt UndefSrcElts, UndefSubElts;
6758 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
6759 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
6760 UndefSubElts, EltSubBits,
6761 AllowWholeUndefs && AllowUndefs,
6762 AllowPartialUndefs && AllowUndefs) &&
6763 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
6764 UndefSrcElts, EltSrcBits,
6765 AllowWholeUndefs && AllowUndefs,
6766 AllowPartialUndefs && AllowUndefs)) {
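// Overwrite the base vector's undef mask and constant bits with the
// subvector's data at the constant insertion index.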
6767 unsigned BaseIdx = Op.getConstantOperandVal(2);
6768 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
6769 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6770 EltSrcBits[BaseIdx + i] = EltSubBits[i];
6771 return CastBitData(UndefSrcElts, EltSrcBits);
6772 }
6773 }
6774
6775 // Extract constant bits from a subvector's source.
6776 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6777 // TODO - support extract_subvector through bitcasts.
6778 if (EltSizeInBits != VT.getScalarSizeInBits())
6779 return false;
6780
6781 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6782 UndefElts, EltBits, AllowWholeUndefs,
6783 AllowPartialUndefs)) {
6784 EVT SrcVT = Op.getOperand(0).getValueType();
6785 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6786 unsigned NumSubElts = VT.getVectorNumElements();
6787 unsigned BaseIdx = Op.getConstantOperandVal(1);
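// Trim the extracted undef/constant data down to just the elements covered by
// the referenced subvector.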
6788 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6789 if ((BaseIdx + NumSubElts) != NumSrcElts)
6790 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6791 if (BaseIdx != 0)
6792 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6793 return true;
6794 }
6795 }
6796
6797 // Extract constant bits from shuffle node sources.
6798 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6799 // TODO - support shuffle through bitcasts.
6800 if (EltSizeInBits != VT.getScalarSizeInBits())
6801 return false;
6802
6803 ArrayRef<int> Mask = SVN->getMask();
6804 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6805 llvm::any_of(Mask, [](int M) { return M < 0; }))
6806 return false;
6807
6808 APInt UndefElts0, UndefElts1;
6809 SmallVector<APInt, 32> EltBits0, EltBits1;
6810 if (isAnyInRange(Mask, 0, NumElts) &&
6811 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6812 UndefElts0, EltBits0, AllowWholeUndefs,
6813 AllowPartialUndefs))
6814 return false;
6815 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6816 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6817 UndefElts1, EltBits1, AllowWholeUndefs,
6818 AllowPartialUndefs))
6819 return false;
6820
6821 UndefElts = APInt::getNullValue(NumElts);
6822 for (int i = 0; i != (int)NumElts; ++i) {
6823 int M = Mask[i];
6824 if (M < 0) {
6825 UndefElts.setBit(i);
6826 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6827 } else if (M < (int)NumElts) {
6828 if (UndefElts0[M])
6829 UndefElts.setBit(i);
6830 EltBits.push_back(EltBits0[M]);
6831 } else {
6832 if (UndefElts1[M - NumElts])
6833 UndefElts.setBit(i);
6834 EltBits.push_back(EltBits1[M - NumElts]);
6835 }
6836 }
6837 return true;
6838 }
6839
6840 return false;
6841}
6842
6843namespace llvm {
6844namespace X86 {
6845bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
6846 APInt UndefElts;
6847 SmallVector<APInt, 16> EltBits;
6848 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6849 UndefElts, EltBits, true,
6850 AllowPartialUndefs)) {
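// Look for a single value shared by all defined (non-undef) elements; if two
// defined elements differ there is no splat and SplatIndex is reset to -1.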
6851 int SplatIndex = -1;
6852 for (int i = 0, e = EltBits.size(); i != e; ++i) {
6853 if (UndefElts[i])
6854 continue;
6855 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6856 SplatIndex = -1;
6857 break;
6858 }
6859 SplatIndex = i;
6860 }
6861 if (0 <= SplatIndex) {
6862 SplatVal = EltBits[SplatIndex];
6863 return true;
6864 }
6865 }
6866
6867 return false;
6868}
6869} // namespace X86
6870} // namespace llvm
6871
6872static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6873 unsigned MaskEltSizeInBits,
6874 SmallVectorImpl<uint64_t> &RawMask,
6875 APInt &UndefElts) {
6876 // Extract the raw target constant bits.
6877 SmallVector<APInt, 64> EltBits;
6878 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6879 EltBits, /* AllowWholeUndefs */ true,
6880 /* AllowPartialUndefs */ false))
6881 return false;
6882
6883 // Insert the extracted elements into the mask.
6884 for (const APInt &Elt : EltBits)
6885 RawMask.push_back(Elt.getZExtValue());
6886
6887 return true;
6888}
6889
6890/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6891/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
6892/// Note: This ignores saturation, so inputs must be checked first.
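/// e.g. for VT == v16i8 with NumStages == 1, the unary mask is
/// { 0,2,4,...,14, 0,2,4,...,14 } and the binary mask is
/// { 0,2,4,...,14, 16,18,...,30 }.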
6893static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6894 bool Unary, unsigned NumStages = 1) {
6895 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6896 unsigned NumElts = VT.getVectorNumElements();
6897 unsigned NumLanes = VT.getSizeInBits() / 128;
6898 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6899 unsigned Offset = Unary ? 0 : NumElts;
6900 unsigned Repetitions = 1u << (NumStages - 1);
6901 unsigned Increment = 1u << NumStages;
6902 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
6903
6904 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6905 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
6906 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6907 Mask.push_back(Elt + (Lane * NumEltsPerLane));
6908 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
6909 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6910 }
6911 }
6912}
6913
6914// Split the demanded elts of a PACKSS/PACKUS node between its operands.
6915static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6916 APInt &DemandedLHS, APInt &DemandedRHS) {
6917 int NumLanes = VT.getSizeInBits() / 128;
6918 int NumElts = DemandedElts.getBitWidth();
6919 int NumInnerElts = NumElts / 2;
6920 int NumEltsPerLane = NumElts / NumLanes;
6921 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6922
6923 DemandedLHS = APInt::getNullValue(NumInnerElts);
6924 DemandedRHS = APInt::getNullValue(NumInnerElts);
6925
6926 // Map DemandedElts to the packed operands.
6927 for (int Lane = 0; Lane != NumLanes; ++Lane) {
6928 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6929 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6930 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6931 if (DemandedElts[OuterIdx])
6932 DemandedLHS.setBit(InnerIdx);
6933 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6934 DemandedRHS.setBit(InnerIdx);
6935 }
6936 }
6937}
6938
6939// Split the demanded elts of a HADD/HSUB node between its operands.
6940static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6941 APInt &DemandedLHS, APInt &DemandedRHS) {
6942 int NumLanes = VT.getSizeInBits() / 128;
6943 int NumElts = DemandedElts.getBitWidth();
6944 int NumEltsPerLane = NumElts / NumLanes;
6945 int HalfEltsPerLane = NumEltsPerLane / 2;
6946
6947 DemandedLHS = APInt::getNullValue(NumElts);
6948 DemandedRHS = APInt::getNullValue(NumElts);
6949
6950 // Map DemandedElts to the horizontal operands.
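// Each output element is the horizontal op of two adjacent input elements; the
// low half of each output lane reads from the LHS, the high half from the RHS.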
6951 for (int Idx = 0; Idx != NumElts; ++Idx) {
6952 if (!DemandedElts[Idx])
6953 continue;
6954 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6955 int LocalIdx = Idx % NumEltsPerLane;
6956 if (LocalIdx < HalfEltsPerLane) {
6957 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6958 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6959 } else {
6960 LocalIdx -= HalfEltsPerLane;
6961 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6962 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6963 }
6964 }
6965}
6966
6967/// Calculates the shuffle mask corresponding to the target-specific opcode.
6968/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6969/// operands in \p Ops, and returns true.
6970/// Sets \p IsUnary to true if only one source is used. Note that this will set
6971/// IsUnary for shuffles which use a single input multiple times, and in those
6972/// cases it will adjust the mask to only have indices within that single input.
6973/// It is an error to call this with non-empty Mask/Ops vectors.
6974static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6975 SmallVectorImpl<SDValue> &Ops,
6976 SmallVectorImpl<int> &Mask, bool &IsUnary) {
6977 unsigned NumElems = VT.getVectorNumElements();
6978 unsigned MaskEltSize = VT.getScalarSizeInBits();
6979 SmallVector<uint64_t, 32> RawMask;
6980 APInt RawUndefs;
6981 uint64_t ImmN;
6982
6983 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
6984 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
6985
6986 IsUnary = false;
6987 bool IsFakeUnary = false;
6988 switch (N->getOpcode()) {
6989 case X86ISD::BLENDI:
6990 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6991 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6992 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6993 DecodeBLENDMask(NumElems, ImmN, Mask);
6994 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6995 break;
6996 case X86ISD::SHUFP:
6997 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6998 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6999 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7000 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7001 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7002 break;
7003 case X86ISD::INSERTPS:
7004 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7005 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7006 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7007 DecodeINSERTPSMask(ImmN, Mask);
7008 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7009 break;
7010 case X86ISD::EXTRQI:
7011 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7012 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7013 isa<ConstantSDNode>(N->getOperand(2))) {
7014 int BitLen = N->getConstantOperandVal(1);
7015 int BitIdx = N->getConstantOperandVal(2);
7016 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7017 IsUnary = true;
7018 }
7019 break;
7020 case X86ISD::INSERTQI:
7021 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7022 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7023 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7024 isa<ConstantSDNode>(N->getOperand(3))) {
7025 int BitLen = N->getConstantOperandVal(2);
7026 int BitIdx = N->getConstantOperandVal(3);
7027 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7028 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7029 }
7030 break;
7031 case X86ISD::UNPCKH:
7032 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7033 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7034 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7035 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7036 break;
7037 case X86ISD::UNPCKL:
7038 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7039 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7040 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7041 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7042 break;
7043 case X86ISD::MOVHLPS:
7044 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7045 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7046 DecodeMOVHLPSMask(NumElems, Mask);
7047 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7048 break;
7049 case X86ISD::MOVLHPS:
7050 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7051 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7052 DecodeMOVLHPSMask(NumElems, Mask);
7053 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7054 break;
7055 case X86ISD::VALIGN:
7056 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7057 "Only 32-bit and 64-bit elements are supported!");
7058 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7059 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7060 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7061 DecodeVALIGNMask(NumElems, ImmN, Mask);
7062 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7063 Ops.push_back(N->getOperand(1));
7064 Ops.push_back(N->getOperand(0));
7065 break;
7066 case X86ISD::PALIGNR:
7067 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7068 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7069 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7070 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7071 DecodePALIGNRMask(NumElems, ImmN, Mask);
7072 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7073 Ops.push_back(N->getOperand(1));
7074 Ops.push_back(N->getOperand(0));
7075 break;
7076 case X86ISD::VSHLDQ:
7077 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7078 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7079 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7080 DecodePSLLDQMask(NumElems, ImmN, Mask);
7081 IsUnary = true;
7082 break;
7083 case X86ISD::VSRLDQ:
7084 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7085 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7086 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7087 DecodePSRLDQMask(NumElems, ImmN, Mask);
7088 IsUnary = true;
7089 break;
7090 case X86ISD::PSHUFD:
7091 case X86ISD::VPERMILPI:
7092 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7093 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7094 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7095 IsUnary = true;
7096 break;
7097 case X86ISD::PSHUFHW:
7098 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7099 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7100 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7101 IsUnary = true;
7102 break;
7103 case X86ISD::PSHUFLW:
7104 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7105 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7106 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7107 IsUnary = true;
7108 break;
7109 case X86ISD::VZEXT_MOVL:
7110 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7111 DecodeZeroMoveLowMask(NumElems, Mask);
7112 IsUnary = true;
7113 break;
7114 case X86ISD::VBROADCAST:
7115 // We only decode broadcasts of same-sized vectors; peeking through to
7116 // extracted subvectors is likely to cause hasOneUse issues with
7117 // SimplifyDemandedBits etc.
7118 if (N->getOperand(0).getValueType() == VT) {
7119 DecodeVectorBroadcast(NumElems, Mask);
7120 IsUnary = true;
7121 break;
7122 }
7123 return false;
7124 case X86ISD::VPERMILPV: {
7125 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7126 IsUnary = true;
7127 SDValue MaskNode = N->getOperand(1);
7128 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7129 RawUndefs)) {
7130 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7131 break;
7132 }
7133 return false;
7134 }
7135 case X86ISD::PSHUFB: {
7136 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7137 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7138 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7139 IsUnary = true;
7140 SDValue MaskNode = N->getOperand(1);
7141 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7142 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7143 break;
7144 }
7145 return false;
7146 }
7147 case X86ISD::VPERMI:
7148 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7149 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7150 DecodeVPERMMask(NumElems, ImmN, Mask);
7151 IsUnary = true;
7152 break;
7153 case X86ISD::MOVSS:
7154 case X86ISD::MOVSD:
7155 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7156 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7157 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7158 break;
7159 case X86ISD::VPERM2X128:
7160 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7161 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7162 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7163 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7164 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7165 break;
7166 case X86ISD::SHUF128:
7167 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7168 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7169 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7170 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7171 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7172 break;
7173 case X86ISD::MOVSLDUP:
7174 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7175 DecodeMOVSLDUPMask(NumElems, Mask);
7176 IsUnary = true;
7177 break;
7178 case X86ISD::MOVSHDUP:
7179 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7180 DecodeMOVSHDUPMask(NumElems, Mask);
7181 IsUnary = true;
7182 break;
7183 case X86ISD::MOVDDUP:
7184 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7185 DecodeMOVDDUPMask(NumElems, Mask);
7186 IsUnary = true;
7187 break;
7188 case X86ISD::VPERMIL2: {
7189 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7190 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7191 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7192 SDValue MaskNode = N->getOperand(2);
7193 SDValue CtrlNode = N->getOperand(3);
7194 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7195 unsigned CtrlImm = CtrlOp->getZExtValue();
7196 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7197 RawUndefs)) {
7198 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7199 Mask);
7200 break;
7201 }
7202 }
7203 return false;
7204 }
7205 case X86ISD::VPPERM: {
7206 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7207 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7208 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7209 SDValue MaskNode = N->getOperand(2);
7210 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7211 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7212 break;
7213 }
7214 return false;
7215 }
7216 case X86ISD::VPERMV: {
7217 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7218 IsUnary = true;
7219 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7220 Ops.push_back(N->getOperand(1));
7221 SDValue MaskNode = N->getOperand(0);
7222 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7223 RawUndefs)) {
7224 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7225 break;
7226 }
7227 return false;
7228 }
7229 case X86ISD::VPERMV3: {
7230 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7231 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7232 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7233 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7234 Ops.push_back(N->getOperand(0));
7235 Ops.push_back(N->getOperand(2));
7236 SDValue MaskNode = N->getOperand(1);
7237 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7238 RawUndefs)) {
7239 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7240 break;
7241 }
7242 return false;
7243 }
7244 default: llvm_unreachable("unknown target shuffle node");
7245 }
7246
7247 // Empty mask indicates the decode failed.
7248 if (Mask.empty())
7249 return false;
7250
7251 // Check if we're getting a shuffle mask with zero'd elements.
7252 if (!AllowSentinelZero && isAnyZero(Mask))
7253 return false;
7254
7255 // If we have a fake unary shuffle, the shuffle mask is spread across two
7256 // inputs that are actually the same node. Re-map the mask to always point
7257 // into the first input.
7258 if (IsFakeUnary)
7259 for (int &M : Mask)
7260 if (M >= (int)Mask.size())
7261 M -= Mask.size();
7262
7263 // If we didn't already add operands in the opcode-specific code, default to
7264 // adding 1 or 2 operands starting at 0.
7265 if (Ops.empty()) {
7266 Ops.push_back(N->getOperand(0));
7267 if (!IsUnary || IsFakeUnary)
7268 Ops.push_back(N->getOperand(1));
7269 }
7270
7271 return true;
7272}
7273
7274 // Wrapper for getTargetShuffleMask that ignores the IsUnary result.
7275static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7276 SmallVectorImpl<SDValue> &Ops,
7277 SmallVectorImpl<int> &Mask) {
7278 bool IsUnary;
7279 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7280}
7281
7282/// Compute whether each element of a shuffle is zeroable.
7283///
7284/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7285/// Either it is an undef element in the shuffle mask, the element of the input
7286/// referenced is undef, or the element of the input referenced is known to be
7287/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7288/// as many lanes with this technique as possible to simplify the remaining
7289/// shuffle.
7290static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7291 SDValue V1, SDValue V2,
7292 APInt &KnownUndef, APInt &KnownZero) {
7293 int Size = Mask.size();
7294 KnownUndef = KnownZero = APInt::getNullValue(Size);
7295
7296 V1 = peekThroughBitcasts(V1);
7297 V2 = peekThroughBitcasts(V2);
7298
7299 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7300 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7301
7302 int VectorSizeInBits = V1.getValueSizeInBits();
7303 int ScalarSizeInBits = VectorSizeInBits / Size;
7304 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7305
7306 for (int i = 0; i < Size; ++i) {
7307 int M = Mask[i];
7308 // Handle the easy cases.
7309 if (M < 0) {
7310 KnownUndef.setBit(i);
7311 continue;
7312 }
7313 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7314 KnownZero.setBit(i);
7315 continue;
7316 }
7317
7318 // Determine shuffle input and normalize the mask.
7319 SDValue V = M < Size ? V1 : V2;
7320 M %= Size;
7321
7322 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7323 if (V.getOpcode() != ISD::BUILD_VECTOR)
7324 continue;
7325
7326 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
7327 // portion of the (larger) source element must be UNDEF/ZERO.
7328 if ((Size % V.getNumOperands()) == 0) {
7329 int Scale = Size / V->getNumOperands();
7330 SDValue Op = V.getOperand(M / Scale);
7331 if (Op.isUndef())
7332 KnownUndef.setBit(i);
7333 if (X86::isZeroNode(Op))
7334 KnownZero.setBit(i);
7335 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7336 APInt Val = Cst->getAPIntValue();
7337 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7338 if (Val == 0)
7339 KnownZero.setBit(i);
7340 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7341 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7342 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7343 if (Val == 0)
7344 KnownZero.setBit(i);
7345 }
7346 continue;
7347 }
7348
7349 // If the BUILD_VECTOR has more elements than the mask, then all of the
7350 // (smaller) source elements must be UNDEF or ZERO.
7351 if ((V.getNumOperands() % Size) == 0) {
7352 int Scale = V->getNumOperands() / Size;
7353 bool AllUndef = true;
7354 bool AllZero = true;
7355 for (int j = 0; j < Scale; ++j) {
7356 SDValue Op = V.getOperand((M * Scale) + j);
7357 AllUndef &= Op.isUndef();
7358 AllZero &= X86::isZeroNode(Op);
7359 }
7360 if (AllUndef)
7361 KnownUndef.setBit(i);
7362 if (AllZero)
7363 KnownZero.setBit(i);
7364 continue;
7365 }
7366 }
7367}
7368
7369/// Decode a target shuffle mask and inputs and see if any values are
7370/// known to be undef or zero from their inputs.
7371/// Returns true if the target shuffle mask was decoded.
7372/// FIXME: Merge this with computeZeroableShuffleElements?
7373static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7374 SmallVectorImpl<SDValue> &Ops,
7375 APInt &KnownUndef, APInt &KnownZero) {
7376 bool IsUnary;
7377 if (!isTargetShuffle(N.getOpcode()))
7378 return false;
7379
7380 MVT VT = N.getSimpleValueType();
7381 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7382 return false;
7383
7384 int Size = Mask.size();
7385 SDValue V1 = Ops[0];
7386 SDValue V2 = IsUnary ? V1 : Ops[1];
7387 KnownUndef = KnownZero = APInt::getNullValue(Size);
7388
7389 V1 = peekThroughBitcasts(V1);
7390 V2 = peekThroughBitcasts(V2);
7391
7392 assert((VT.getSizeInBits() % Size) == 0 &&
7393 "Illegal split of shuffle value type");
7394 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7395
7396 // Extract known constant input data.
7397 APInt UndefSrcElts[2];
7398 SmallVector<APInt, 32> SrcEltBits[2];
7399 bool IsSrcConstant[2] = {
7400 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7401 SrcEltBits[0], true, false),
7402 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7403 SrcEltBits[1], true, false)};
7404
7405 for (int i = 0; i < Size; ++i) {
7406 int M = Mask[i];
7407
7408 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7409 if (M < 0) {
7410 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7411 if (SM_SentinelUndef == M)
7412 KnownUndef.setBit(i);
7413 if (SM_SentinelZero == M)
7414 KnownZero.setBit(i);
7415 continue;
7416 }
7417
7418 // Determine shuffle input and normalize the mask.
7419 unsigned SrcIdx = M / Size;
7420 SDValue V = M < Size ? V1 : V2;
7421 M %= Size;
7422
7423 // We are referencing an UNDEF input.
7424 if (V.isUndef()) {
7425 KnownUndef.setBit(i);
7426 continue;
7427 }
7428
7429 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7430 // TODO: We currently only set UNDEF for integer types - floats use the same
7431 // registers as vectors and many of the scalar folded loads rely on the
7432 // SCALAR_TO_VECTOR pattern.
7433 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7434 (Size % V.getValueType().getVectorNumElements()) == 0) {
7435 int Scale = Size / V.getValueType().getVectorNumElements();
7436 int Idx = M / Scale;
7437 if (Idx != 0 && !VT.isFloatingPoint())
7438 KnownUndef.setBit(i);
7439 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7440 KnownZero.setBit(i);
7441 continue;
7442 }
7443
7444 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7445 // base vectors.
7446 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7447 SDValue Vec = V.getOperand(0);
7448 int NumVecElts = Vec.getValueType().getVectorNumElements();
7449 if (Vec.isUndef() && Size == NumVecElts) {
7450 int Idx = V.getConstantOperandVal(2);
7451 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7452 if (M < Idx || (Idx + NumSubElts) <= M)
7453 KnownUndef.setBit(i);
7454 }
7455 continue;
7456 }
7457
7458 // Attempt to extract from the source's constant bits.
7459 if (IsSrcConstant[SrcIdx]) {
7460 if (UndefSrcElts[SrcIdx][M])
7461 KnownUndef.setBit(i);
7462 else if (SrcEltBits[SrcIdx][M] == 0)
7463 KnownZero.setBit(i);
7464 }
7465 }
7466
7467 assert(VT.getVectorNumElements() == (unsigned)Size &&
7468 "Different mask size from vector size!");
7469 return true;
7470}
7471
7472// Replace target shuffle mask elements with known undef/zero sentinels.
7473static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7474 const APInt &KnownUndef,
7475 const APInt &KnownZero,
7476 bool ResolveKnownZeros = true) {
7477 unsigned NumElts = Mask.size();
7478 assert(KnownUndef.getBitWidth() == NumElts &&
7479 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7480
7481 for (unsigned i = 0; i != NumElts; ++i) {
7482 if (KnownUndef[i])
7483 Mask[i] = SM_SentinelUndef;
7484 else if (ResolveKnownZeros && KnownZero[i])
7485 Mask[i] = SM_SentinelZero;
7486 }
7487}
7488
7489// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7490static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7491 APInt &KnownUndef,
7492 APInt &KnownZero) {
7493 unsigned NumElts = Mask.size();
7494 KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7495
7496 for (unsigned i = 0; i != NumElts; ++i) {
7497 int M = Mask[i];
7498 if (SM_SentinelUndef == M)
7499 KnownUndef.setBit(i);
7500 if (SM_SentinelZero == M)
7501 KnownZero.setBit(i);
7502 }
7503}
7504
7505// Forward declaration (for getFauxShuffleMask recursive check).
7506// TODO: Use DemandedElts variant.
7507static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7508 SmallVectorImpl<int> &Mask,
7509 const SelectionDAG &DAG, unsigned Depth,
7510 bool ResolveKnownElts);
7511
7512// Attempt to decode ops that could be represented as a shuffle mask.
7513 // The decoded shuffle mask may contain a different number of elements than
7514 // the destination value type.
7515static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7516 SmallVectorImpl<int> &Mask,
7517 SmallVectorImpl<SDValue> &Ops,
7518 const SelectionDAG &DAG, unsigned Depth,
7519 bool ResolveKnownElts) {
7520 Mask.clear();
7521 Ops.clear();
7522
7523 MVT VT = N.getSimpleValueType();
7524 unsigned NumElts = VT.getVectorNumElements();
7525 unsigned NumSizeInBits = VT.getSizeInBits();
7526 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7527 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7528 return false;
7529 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7530 unsigned NumSizeInBytes = NumSizeInBits / 8;
7531 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7532
7533 unsigned Opcode = N.getOpcode();
7534 switch (Opcode) {
7535 case ISD::VECTOR_SHUFFLE: {
7536 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
7537 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7538 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7539 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7540 Ops.push_back(N.getOperand(0));
7541 Ops.push_back(N.getOperand(1));
7542 return true;
7543 }
7544 return false;
7545 }
7546 case ISD::AND:
7547 case X86ISD::ANDNP: {
7548 // Attempt to decode as a per-byte mask.
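// Each constant byte must be all-zeros or all-ones: a byte equal to ZeroMask
// forces the result byte to zero, the other value passes the source byte through.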
7549 APInt UndefElts;
7550 SmallVector<APInt, 32> EltBits;
7551 SDValue N0 = N.getOperand(0);
7552 SDValue N1 = N.getOperand(1);
7553 bool IsAndN = (X86ISD::ANDNP == Opcode);
7554 uint64_t ZeroMask = IsAndN ? 255 : 0;
7555 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7556 return false;
7557 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7558 if (UndefElts[i]) {
7559 Mask.push_back(SM_SentinelUndef);
7560 continue;
7561 }
7562 const APInt &ByteBits = EltBits[i];
7563 if (ByteBits != 0 && ByteBits != 255)
7564 return false;
7565 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7566 }
7567 Ops.push_back(IsAndN ? N1 : N0);
7568 return true;
7569 }
7570 case ISD::OR: {
7571 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7572 // is a valid shuffle index.
7573 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
7574 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
7575 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7576 return false;
7577 SmallVector<int, 64> SrcMask0, SrcMask1;
7578 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7579 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7580 true) ||
7581 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7582 true))
7583 return false;
7584
7585 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7586 SmallVector<int, 64> Mask0, Mask1;
7587 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7588 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7589 for (int i = 0; i != (int)MaskSize; ++i) {
7590 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
7591 // loops converting between OR and BLEND shuffles due to
7592 // canWidenShuffleElements merging away undef elements, meaning we
7593 // fail to recognise the OR as the undef element isn't known zero.
7594 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7595 Mask.push_back(SM_SentinelZero);
7596 else if (Mask1[i] == SM_SentinelZero)
7597 Mask.push_back(i);
7598 else if (Mask0[i] == SM_SentinelZero)
7599 Mask.push_back(i + MaskSize);
7600 else
7601 return false;
7602 }
7603 Ops.push_back(N0);
7604 Ops.push_back(N1);
7605 return true;
7606 }
7607 case ISD::INSERT_SUBVECTOR: {
7608 SDValue Src = N.getOperand(0);
7609 SDValue Sub = N.getOperand(1);
7610 EVT SubVT = Sub.getValueType();
7611 unsigned NumSubElts = SubVT.getVectorNumElements();
7612 if (!N->isOnlyUserOf(Sub.getNode()))
7613 return false;
7614 uint64_t InsertIdx = N.getConstantOperandVal(2);
7615 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7616 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7617 Sub.getOperand(0).getValueType() == VT) {
7618 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7619 for (int i = 0; i != (int)NumElts; ++i)
7620 Mask.push_back(i);
7621 for (int i = 0; i != (int)NumSubElts; ++i)
7622 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7623 Ops.push_back(Src);
7624 Ops.push_back(Sub.getOperand(0));
7625 return true;
7626 }
7627 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7628 SmallVector<int, 64> SubMask;
7629 SmallVector<SDValue, 2> SubInputs;
7630 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7631 SubMask, DAG, Depth + 1, ResolveKnownElts))
7632 return false;
7633
7634 // Subvector shuffle inputs must not be larger than the subvector.
7635 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7636 return SubVT.getFixedSizeInBits() <
7637 SubInput.getValueSizeInBits().getFixedSize();
7638 }))
7639 return false;
7640
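// Bring the subvector mask and the outer mask to a common element granularity:
// either split the submask entries into finer elements or rescale the outer
// element counts and insertion index to the submask's granularity.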
7641 if (SubMask.size() != NumSubElts) {
7642 assert(((SubMask.size() % NumSubElts) == 0 ||
7643 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7644 if ((NumSubElts % SubMask.size()) == 0) {
7645 int Scale = NumSubElts / SubMask.size();
7646 SmallVector<int,64> ScaledSubMask;
7647 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
7648 SubMask = ScaledSubMask;
7649 } else {
7650 int Scale = SubMask.size() / NumSubElts;
7651 NumSubElts = SubMask.size();
7652 NumElts *= Scale;
7653 InsertIdx *= Scale;
7654 }
7655 }
7656 Ops.push_back(Src);
7657 Ops.append(SubInputs.begin(), SubInputs.end());
7658 if (ISD::isBuildVectorAllZeros(Src.getNode()))
7659 Mask.append(NumElts, SM_SentinelZero);
7660 else
7661 for (int i = 0; i != (int)NumElts; ++i)
7662 Mask.push_back(i);
7663 for (int i = 0; i != (int)NumSubElts; ++i) {
7664 int M = SubMask[i];
7665 if (0 <= M) {
7666 int InputIdx = M / NumSubElts;
7667 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7668 }
7669 Mask[i + InsertIdx] = M;
7670 }
7671 return true;
7672 }
7673 case X86ISD::PINSRB:
7674 case X86ISD::PINSRW:
7675 case ISD::SCALAR_TO_VECTOR:
7676 case ISD::INSERT_VECTOR_ELT: {
7677 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
7678 // vector with matching src/dst vector types.
7679 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
7680
7681 unsigned DstIdx = 0;
7682 if (Opcode != ISD::SCALAR_TO_VECTOR) {
7683 // Check we have an in-range constant insertion index.
7684 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7685 N.getConstantOperandAPInt(2).uge(NumElts))
7686 return false;
7687 DstIdx = N.getConstantOperandVal(2);
7688
7689 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
7690 if (X86::isZeroNode(Scl)) {
7691 Ops.push_back(N.getOperand(0));
7692 for (unsigned i = 0; i != NumElts; ++i)
7693 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
7694 return true;
7695 }
7696 }
7697
7698 // Peek through trunc/aext/zext.
7699 // TODO: aext shouldn't require SM_SentinelZero padding.
7700 // TODO: handle shift of scalars.
7701 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
7702 while (Scl.getOpcode() == ISD::TRUNCATE ||
7703 Scl.getOpcode() == ISD::ANY_EXTEND ||
7704 Scl.getOpcode() == ISD::ZERO_EXTEND) {
7705 Scl = Scl.getOperand(0);
7706 MinBitsPerElt =
7707 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
7708 }
7709 if ((MinBitsPerElt % 8) != 0)
7710 return false;
7711
7712 // Attempt to find the source vector the scalar was extracted from.
7713 SDValue SrcExtract;
7714 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
7715 Scl.getOpcode() == X86ISD::PEXTRW ||
7716 Scl.getOpcode() == X86ISD::PEXTRB) &&
7717 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
7718 SrcExtract = Scl;
7719 }
7720 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7721 return false;
7722
7723 SDValue SrcVec = SrcExtract.getOperand(0);
7724 EVT SrcVT = SrcVec.getValueType();
7725 if (!SrcVT.getScalarType().isByteSized())
7726 return false;
7727 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7728 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
7729 unsigned DstByte = DstIdx * NumBytesPerElt;
7730 MinBitsPerElt =
7731 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
7732
7733 // Create 'identity' byte level shuffle mask and then add inserted bytes.
7734 if (Opcode == ISD::SCALAR_TO_VECTOR) {
7735 Ops.push_back(SrcVec);
7736 Mask.append(NumSizeInBytes, SM_SentinelUndef);
7737 } else {
7738 Ops.push_back(SrcVec);
7739 Ops.push_back(N.getOperand(0));
7740 for (int i = 0; i != (int)NumSizeInBytes; ++i)
7741 Mask.push_back(NumSizeInBytes + i);
7742 }
7743
7744 unsigned MinBytesPerElts = MinBitsPerElt / 8;
7745 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
7746 for (unsigned i = 0; i != MinBytesPerElts; ++i)
7747 Mask[DstByte + i] = SrcByte + i;
7748 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
7749 Mask[DstByte + i] = SM_SentinelZero;
7750 return true;
7751 }
7752 case X86ISD::PACKSS:
7753 case X86ISD::PACKUS: {
7754 SDValue N0 = N.getOperand(0);
7755 SDValue N1 = N.getOperand(1);
7756 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7757 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7758 "Unexpected input value type");
7759
7760 APInt EltsLHS, EltsRHS;
7761 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7762
7763 // If we know input saturation won't happen (or we don't care about
7764 // particular lanes), we can treat this as a truncation shuffle.
7765 bool Offset0 = false, Offset1 = false;
7766 if (Opcode == X86ISD::PACKSS) {
7767 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7768 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7769 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7770 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7771 return false;
7772 // We can't easily fold ASHR into a shuffle, but if it was feeding a
7773 // PACKSS then it was likely being used for sign-extension for a
7774 // truncation, so just peek through and adjust the mask accordingly.
7775 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
7776 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
7777 Offset0 = true;
7778 N0 = N0.getOperand(0);
7779 }
7780 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
7781 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
7782 Offset1 = true;
7783 N1 = N1.getOperand(0);
7784 }
7785 } else {
7786 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7787 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7788 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7789 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7790 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7791 return false;
7792 }
7793
7794 bool IsUnary = (N0 == N1);
7795
7796 Ops.push_back(N0);
7797 if (!IsUnary)
7798 Ops.push_back(N1);
7799
7800 createPackShuffleMask(VT, Mask, IsUnary);
7801
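// Peeking through a VSRAI by NumBitsPerElt means the bits we want now live one
// sub-element higher in the replaced source, so bump those mask indices by one.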
7802 if (Offset0 || Offset1) {
7803 for (int &M : Mask)
7804 if ((Offset0 && isInRange(M, 0, NumElts)) ||
7805 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
7806 ++M;
7807 }
7808 return true;
7809 }
7810 case X86ISD::VTRUNC: {
7811 SDValue Src = N.getOperand(0);
7812 EVT SrcVT = Src.getValueType();
7813 // Truncated source must be a simple vector.
7814 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7815 (SrcVT.getScalarSizeInBits() % 8) != 0)
7816 return false;
7817 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7818 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
7819 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
7820 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
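// Each truncated result element keeps the lowest sub-element (index i * Scale)
// of its source element; the remaining destination elements are known zero.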
7821 for (unsigned i = 0; i != NumSrcElts; ++i)
7822 Mask.push_back(i * Scale);
7823 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
7824 Ops.push_back(Src);
7825 return true;
7826 }
7827 case X86ISD::VSHLI:
7828 case X86ISD::VSRLI: {
7829 uint64_t ShiftVal = N.getConstantOperandVal(1);
7830 // Out of range bit shifts are guaranteed to be zero.
7831 if (NumBitsPerElt <= ShiftVal) {
7832 Mask.append(NumElts, SM_SentinelZero);
7833 return true;
7834 }
7835
7836 // We can only decode 'whole byte' bit shifts as shuffles.
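// e.g. a VSHLI by 8 on v8i16 moves each element's low byte into its high byte
// and zeroes the vacated low byte.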
7837 if ((ShiftVal % 8) != 0)
7838 break;
7839
7840 uint64_t ByteShift = ShiftVal / 8;
7841 Ops.push_back(N.getOperand(0));
7842
7843 // Clear mask to all zeros and insert the shifted byte indices.
7844 Mask.append(NumSizeInBytes, SM_SentinelZero);
7845
7846 if (X86ISD::VSHLI == Opcode) {
7847 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7848 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7849 Mask[i + j] = i + j - ByteShift;
7850 } else {
7851 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
7852 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7853 Mask[i + j - ByteShift] = i + j;
7854 }
7855 return true;
7856 }
7857 case X86ISD::VROTLI:
7858 case X86ISD::VROTRI: {
7859 // We can only decode 'whole byte' bit rotates as shuffles.
7860 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7861 if ((RotateVal % 8) != 0)
7862 return false;
7863 Ops.push_back(N.getOperand(0));
7864 int Offset = RotateVal / 8;
7865 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
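// A left rotate by k bytes permutes bytes exactly like a right rotate by
// (NumBytesPerElt - k), so both forms are emitted as one per-element byte rotation.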
7866 for (int i = 0; i != (int)NumElts; ++i) {
7867 int BaseIdx = i * NumBytesPerElt;
7868 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
7869 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7870 }
7871 }
7872 return true;
7873 }
7874 case X86ISD::VBROADCAST: {
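// A broadcast repeats element 0, so a non-vector source is only handled when it
// was extracted from element 0 of a vector with the same scalar type.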
7875 SDValue Src = N.getOperand(0);
7876 if (!Src.getSimpleValueType().isVector()) {
7877 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7878 !isNullConstant(Src.getOperand(1)) ||
7879 Src.getOperand(0).getValueType().getScalarType() !=
7880 VT.getScalarType())
7881 return false;
7882 Src = Src.getOperand(0);
7883 }
7884 Ops.push_back(Src);
7885 Mask.append(NumElts, 0);
7886 return true;
7887 }
7888 case ISD::ZERO_EXTEND:
7889 case ISD::ANY_EXTEND:
7890 case ISD::ZERO_EXTEND_VECTOR_INREG:
7891 case ISD::ANY_EXTEND_VECTOR_INREG: {
7892 SDValue Src = N.getOperand(0);
7893 EVT SrcVT = Src.getValueType();
7894
7895 // Extended source must be a simple vector.
7896 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7897 (SrcVT.getScalarSizeInBits() % 8) != 0)
7898 return false;
7899
7900 bool IsAnyExtend =
7901 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7902 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
7903 IsAnyExtend, Mask);
7904 Ops.push_back(Src);
7905 return true;
7906 }
7907 }
7908
7909 return false;
7910}
7911
7912/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
7913static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7914 SmallVectorImpl<int> &Mask) {
7915 int MaskWidth = Mask.size();
7916 SmallVector<SDValue, 16> UsedInputs;
7917 for (int i = 0, e = Inputs.size(); i < e; ++i) {
7918 int lo = UsedInputs.size() * MaskWidth;
7919 int hi = lo + MaskWidth;
7920
7921 // Strip UNDEF input usage.
7922 if (Inputs[i].isUndef())
7923 for (int &M : Mask)
7924 if ((lo <= M) && (M < hi))
7925 M = SM_SentinelUndef;
7926
7927 // Check for unused inputs.
7928 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7929 for (int &M : Mask)
7930 if (lo <= M)
7931 M -= MaskWidth;
7932 continue;
7933 }
7934
7935 // Check for repeated inputs.
7936 bool IsRepeat = false;
7937 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7938 if (UsedInputs[j] != Inputs[i])
7939 continue;
7940 for (int &M : Mask)
7941 if (lo <= M)
7942 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7943 IsRepeat = true;
7944 break;
7945 }
7946 if (IsRepeat)
7947 continue;
7948
7949 UsedInputs.push_back(Inputs[i]);
7950 }
7951 Inputs = UsedInputs;
7952}
7953
7954/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7955/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7956/// Returns true if the target shuffle mask was decoded.
7957static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7958 SmallVectorImpl<SDValue> &Inputs,
7959 SmallVectorImpl<int> &Mask,
7960 APInt &KnownUndef, APInt &KnownZero,
7961 const SelectionDAG &DAG, unsigned Depth,
7962 bool ResolveKnownElts) {
7963 EVT VT = Op.getValueType();
7964 if (!VT.isSimple() || !VT.isVector())
7965 return false;
7966
7967 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
7968 if (ResolveKnownElts)
7969 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7970 return true;
7971 }
7972 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7973 ResolveKnownElts)) {
7974 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7975 return true;
7976 }
7977 return false;
7978}
7979
7980static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7981 SmallVectorImpl<int> &Mask,
7982 const SelectionDAG &DAG, unsigned Depth = 0,
7983 bool ResolveKnownElts = true) {
7984 EVT VT = Op.getValueType();
7985 if (!VT.isSimple() || !VT.isVector())
7986 return false;
7987
7988 APInt KnownUndef, KnownZero;
7989 unsigned NumElts = Op.getValueType().getVectorNumElements();
7990 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7991 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7992 KnownZero, DAG, Depth, ResolveKnownElts);
7993}
7994
7995// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
7996static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
7997 EVT MemVT, MemSDNode *Mem, unsigned Offset,
7998 SelectionDAG &DAG) {
7999 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8000 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8001 "Unknown broadcast load type");
8002
8003 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8004 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8005 return SDValue();
8006
8007 SDValue Ptr =
8008 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8009 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8010 SDValue Ops[] = {Mem->getChain(), Ptr};
8011 SDValue BcstLd = DAG.getMemIntrinsicNode(
8012 Opcode, DL, Tys, Ops, MemVT,
8013 DAG.getMachineFunction().getMachineMemOperand(
8014 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8015 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8016 return BcstLd;
8017}
8018
8019/// Returns the scalar element that will make up the i'th
8020/// element of the result of the vector shuffle.
8021static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8022 SelectionDAG &DAG, unsigned Depth) {
8023 if (Depth >= SelectionDAG::MaxRecursionDepth)
8024 return SDValue(); // Limit search depth.
8025
8026 EVT VT = Op.getValueType();
8027 unsigned Opcode = Op.getOpcode();
8028 unsigned NumElems = VT.getVectorNumElements();
8029
8030 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8031 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8032 int Elt = SV->getMaskElt(Index);
8033
8034 if (Elt < 0)
8035 return DAG.getUNDEF(VT.getVectorElementType());
8036
8037 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8038 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8039 }
8040
8041 // Recurse into target specific vector shuffles to find scalars.
8042 if (isTargetShuffle(Opcode)) {
8043 MVT ShufVT = VT.getSimpleVT();
8044 MVT ShufSVT = ShufVT.getVectorElementType();
8045 int NumElems = (int)ShufVT.getVectorNumElements();
8046 SmallVector<int, 16> ShuffleMask;
8047 SmallVector<SDValue, 16> ShuffleOps;
8048 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8049 ShuffleMask))
8050 return SDValue();
8051
8052 int Elt = ShuffleMask[Index];
8053 if (Elt == SM_SentinelZero)
8054 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8055 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8056 if (Elt == SM_SentinelUndef)
8057 return DAG.getUNDEF(ShufSVT);
8058
8059 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8060 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8061 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8062 }
8063
8064 // Recurse into insert_subvector base/sub vector to find scalars.
8065 if (Opcode == ISD::INSERT_SUBVECTOR) {
8066 SDValue Vec = Op.getOperand(0);
8067 SDValue Sub = Op.getOperand(1);
8068 uint64_t SubIdx = Op.getConstantOperandVal(2);
8069 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8070
8071 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8072 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8073 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8074 }
8075
8076 // Recurse into concat_vectors sub vector to find scalars.
8077 if (Opcode == ISD::CONCAT_VECTORS) {
8078 EVT SubVT = Op.getOperand(0).getValueType();
8079 unsigned NumSubElts = SubVT.getVectorNumElements();
8080 uint64_t SubIdx = Index / NumSubElts;
8081 uint64_t SubElt = Index % NumSubElts;
8082 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8083 }
8084
8085 // Recurse into extract_subvector src vector to find scalars.
8086 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8087 SDValue Src = Op.getOperand(0);
8088 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8089 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8090 }
8091
8092 // We only peek through bitcasts of the same vector width.
8093 if (Opcode == ISD::BITCAST) {
8094 SDValue Src = Op.getOperand(0);
8095 EVT SrcVT = Src.getValueType();
8096 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8097 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8098 return SDValue();
8099 }
8100
8101 // Actual nodes that may contain scalar elements
8102
8103 // For insert_vector_elt - either return the index matching scalar or recurse
8104 // into the base vector.
8105 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8106 isa<ConstantSDNode>(Op.getOperand(2))) {
8107 if (Op.getConstantOperandAPInt(2) == Index)
8108 return Op.getOperand(1);
8109 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8110 }
8111
8112 if (Opcode == ISD::SCALAR_TO_VECTOR)
8113 return (Index == 0) ? Op.getOperand(0)
8114 : DAG.getUNDEF(VT.getVectorElementType());
8115
8116 if (Opcode == ISD::BUILD_VECTOR)
8117 return Op.getOperand(Index);
8118
8119 return SDValue();
8120}
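
// Illustrative sketch, not part of X86ISelLowering.cpp: the index arithmetic
// performed by the ISD::VECTOR_SHUFFLE case above, replayed on plain arrays.
// The name resolveShuffleElt and the "-1 means undef" convention are
// assumptions made only for this example.
static int resolveShuffleElt(const int *Op0, const int *Op1, const int *Mask,
                             unsigned NumElems, unsigned Index) {
  int Elt = Mask[Index];
  if (Elt < 0)
    return -1; // negative mask entry == undef lane
  // Mask entries [0, NumElems) select operand 0, [NumElems, 2*NumElems)
  // select operand 1; in both cases the lane inside the chosen source is
  // Elt % NumElems, matching the recursion above.
  const int *Src = (Elt < (int)NumElems) ? Op0 : Op1;
  return Src[Elt % NumElems];
}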
8121
8122// Use PINSRB/PINSRW/PINSRD to create a build vector.
8123static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8124 unsigned NumNonZero, unsigned NumZero,
8125 SelectionDAG &DAG,
8126 const X86Subtarget &Subtarget) {
8127 MVT VT = Op.getSimpleValueType();
8128 unsigned NumElts = VT.getVectorNumElements();
8129 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8130 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8131 "Illegal vector insertion");
8132
8133 SDLoc dl(Op);
8134 SDValue V;
8135 bool First = true;
8136
8137 for (unsigned i = 0; i < NumElts; ++i) {
8138 bool IsNonZero = NonZeroMask[i];
8139 if (!IsNonZero)
8140 continue;
8141
8142 // If the build vector contains zeros or our first insertion is not at the
8143 // first index, then insert into a zero vector to break any register
8144 // dependency; otherwise use SCALAR_TO_VECTOR.
8145 if (First) {
8146 First = false;
8147 if (NumZero || 0 != i)
8148 V = getZeroVector(VT, Subtarget, DAG, dl);
8149 else {
8150 assert(0 == i && "Expected insertion into zero-index");
8151 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8152 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8153 V = DAG.getBitcast(VT, V);
8154 continue;
8155 }
8156 }
8157 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8158 DAG.getIntPtrConstant(i, dl));
8159 }
8160
8161 return V;
8162}
8163
8164/// Custom lower build_vector of v16i8.
8165static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8166 unsigned NumNonZero, unsigned NumZero,
8167 SelectionDAG &DAG,
8168 const X86Subtarget &Subtarget) {
8169 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8170 return SDValue();
8171
8172 // SSE4.1 - use PINSRB to insert each byte directly.
8173 if (Subtarget.hasSSE41())
8174 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8175 Subtarget);
8176
8177 SDLoc dl(Op);
8178 SDValue V;
8179
8180 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8181 for (unsigned i = 0; i < 16; i += 2) {
8182 bool ThisIsNonZero = NonZeroMask[i];
8183 bool NextIsNonZero = NonZeroMask[i + 1];
8184 if (!ThisIsNonZero && !NextIsNonZero)
8185 continue;
8186
8187 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
8188 SDValue Elt;
8189 if (ThisIsNonZero) {
8190 if (NumZero || NextIsNonZero)
8191 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8192 else
8193 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8194 }
8195
8196 if (NextIsNonZero) {
8197 SDValue NextElt = Op.getOperand(i + 1);
8198 if (i == 0 && NumZero)
8199 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8200 else
8201 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8202 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8203 DAG.getConstant(8, dl, MVT::i8));
8204 if (ThisIsNonZero)
8205 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8206 else
8207 Elt = NextElt;
8208 }
8209
8210 // If our first insertion is not the first index or zeros are needed, then
8211 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8212 // elements undefined).
8213 if (!V) {
8214 if (i != 0 || NumZero)
8215 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8216 else {
8217 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8218 V = DAG.getBitcast(MVT::v8i16, V);
8219 continue;
8220 }
8221 }
8222 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8223 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8224 DAG.getIntPtrConstant(i / 2, dl));
8225 }
8226
8227 return DAG.getBitcast(MVT::v16i8, V);
8228}
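
// Illustrative sketch, not part of X86ISelLowering.cpp: the byte-pair packing
// used by the pre-SSE4.1 path above, shown on plain integers. packBytePair is
// a hypothetical helper name used only for this example.
static unsigned packBytePair(unsigned LoByte, unsigned HiByte) {
  // Byte i stays in bits [0,8) and byte i+1 is shifted into bits [8,16),
  // mirroring (NextElt << 8) | Elt before the PINSRW insert at lane i/2.
  return ((HiByte & 0xff) << 8) | (LoByte & 0xff);
}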
8229
8230/// Custom lower build_vector of v8i16.
8231static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8232 unsigned NumNonZero, unsigned NumZero,
8233 SelectionDAG &DAG,
8234 const X86Subtarget &Subtarget) {
8235 if (NumNonZero > 4 && !Subtarget.hasSSE41())
8236 return SDValue();
8237
8238 // Use PINSRW to insert each element directly.
8239 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8240 Subtarget);
8241}
8242
8243/// Custom lower build_vector of v4i32 or v4f32.
8244static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8245 const X86Subtarget &Subtarget) {
8246 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8247 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8248 // Because we're creating a less complicated build vector here, we may enable
8249 // further folding of the MOVDDUP via shuffle transforms.
8250 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8251 Op.getOperand(0) == Op.getOperand(2) &&
8252 Op.getOperand(1) == Op.getOperand(3) &&
8253 Op.getOperand(0) != Op.getOperand(1)) {
8254 SDLoc DL(Op);
8255 MVT VT = Op.getSimpleValueType();
8256 MVT EltVT = VT.getVectorElementType();
8257 // Create a new build vector with the first 2 elements followed by undef
8258 // padding, bitcast to v2f64, duplicate, and bitcast back.
8259 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8260 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8261 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8262 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8263 return DAG.getBitcast(VT, Dup);
8264 }
8265
8266 // Find all zeroable elements.
8267 std::bitset<4> Zeroable, Undefs;
8268 for (int i = 0; i < 4; ++i) {
8269 SDValue Elt = Op.getOperand(i);
8270 Undefs[i] = Elt.isUndef();
8271 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8272 }
8273 assert(Zeroable.size() - Zeroable.count() > 1 &&
8274 "We expect at least two non-zero elements!");
8275
8276 // We only know how to deal with build_vector nodes where elements are either
8277 // zeroable or extract_vector_elt with constant index.
8278 SDValue FirstNonZero;
8279 unsigned FirstNonZeroIdx;
8280 for (unsigned i = 0; i < 4; ++i) {
8281 if (Zeroable[i])
8282 continue;
8283 SDValue Elt = Op.getOperand(i);
8284 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8285 !isa<ConstantSDNode>(Elt.getOperand(1)))
8286 return SDValue();
8287 // Make sure that this node is extracting from a 128-bit vector.
8288 MVT VT = Elt.getOperand(0).getSimpleValueType();
8289 if (!VT.is128BitVector())
8290 return SDValue();
8291 if (!FirstNonZero.getNode()) {
8292 FirstNonZero = Elt;
8293 FirstNonZeroIdx = i;
8294 }
8295 }
8296
8297 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8298 SDValue V1 = FirstNonZero.getOperand(0);
8299 MVT VT = V1.getSimpleValueType();
8300
8301 // See if this build_vector can be lowered as a blend with zero.
8302 SDValue Elt;
8303 unsigned EltMaskIdx, EltIdx;
8304 int Mask[4];
8305 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8306 if (Zeroable[EltIdx]) {
8307 // The zero vector will be on the right hand side.
8308 Mask[EltIdx] = EltIdx+4;
8309 continue;
8310 }
8311
8312 Elt = Op->getOperand(EltIdx);
8313 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
8314 EltMaskIdx = Elt.getConstantOperandVal(1);
8315 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8316 break;
8317 Mask[EltIdx] = EltIdx;
8318 }
8319
8320 if (EltIdx == 4) {
8321 // Let the shuffle legalizer deal with blend operations.
8322 SDValue VZeroOrUndef = (Zeroable == Undefs)
8323 ? DAG.getUNDEF(VT)
8324 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8325 if (V1.getSimpleValueType() != VT)
8326 V1 = DAG.getBitcast(VT, V1);
8327 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8328 }
8329
8330 // See if we can lower this build_vector to an INSERTPS.
8331 if (!Subtarget.hasSSE41())
8332 return SDValue();
8333
8334 SDValue V2 = Elt.getOperand(0);
8335 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8336 V1 = SDValue();
8337
8338 bool CanFold = true;
8339 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8340 if (Zeroable[i])
8341 continue;
8342
8343 SDValue Current = Op->getOperand(i);
8344 SDValue SrcVector = Current->getOperand(0);
8345 if (!V1.getNode())
8346 V1 = SrcVector;
8347 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8348 }
8349
8350 if (!CanFold)
8351 return SDValue();
8352
8353 assert(V1.getNode() && "Expected at least two non-zero elements!");
8354 if (V1.getSimpleValueType() != MVT::v4f32)
8355 V1 = DAG.getBitcast(MVT::v4f32, V1);
8356 if (V2.getSimpleValueType() != MVT::v4f32)
8357 V2 = DAG.getBitcast(MVT::v4f32, V2);
8358
8359 // Ok, we can emit an INSERTPS instruction.
8360 unsigned ZMask = Zeroable.to_ulong();
8361
8362 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8363 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8364 SDLoc DL(Op);
8365 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8366 DAG.getIntPtrConstant(InsertPSMask, DL, true));
8367 return DAG.getBitcast(VT, Result);
8368}
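
// Illustrative sketch, not part of X86ISelLowering.cpp: how the INSERTPS
// immediate is assembled above -- source element in bits [7:6], destination
// element in bits [5:4], zeroed lanes in bits [3:0]. makeInsertPSImm is a
// hypothetical name used only for this example.
static unsigned makeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                unsigned ZeroMask) {
  // SrcIdx and DstIdx must be in [0,3] and ZeroMask in [0,15]; the result
  // matches the InsertPSMask computed above.
  return (SrcIdx << 6) | (DstIdx << 4) | ZeroMask;
}
// e.g. makeInsertPSImm(/*SrcIdx=*/2, /*DstIdx=*/1, /*ZeroMask=*/0x8) == 0x98.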
8369
8370/// Return a vector logical shift node.
8371static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8372 SelectionDAG &DAG, const TargetLowering &TLI,
8373 const SDLoc &dl) {
8374 assert(VT.is128BitVector() && "Unknown type for VShift");
8375 MVT ShVT = MVT::v16i8;
8376 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8377 SrcOp = DAG.getBitcast(ShVT, SrcOp);
8378 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8379 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8380 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8381}
8382
8383static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8384 SelectionDAG &DAG) {
8385
8386 // Check if the scalar load can be widened into a vector load, and if
8387 // the address is "base + cst", see if the cst can be "absorbed" into
8388 // the shuffle mask.
8389 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8390 SDValue Ptr = LD->getBasePtr();
8391 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8392 return SDValue();
8393 EVT PVT = LD->getValueType(0);
8394 if (PVT != MVT::i32 && PVT != MVT::f32)
8395 return SDValue();
8396
8397 int FI = -1;
8398 int64_t Offset = 0;
8399 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8400 FI = FINode->getIndex();
8401 Offset = 0;
8402 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8403 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8404 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8405 Offset = Ptr.getConstantOperandVal(1);
8406 Ptr = Ptr.getOperand(0);
8407 } else {
8408 return SDValue();
8409 }
8410
8411 // FIXME: 256-bit vector instructions don't require a strict alignment,
8412 // improve this code to support it better.
8413 Align RequiredAlign(VT.getSizeInBits() / 8);
8414 SDValue Chain = LD->getChain();
8415 // Make sure the stack object alignment is at least 16 or 32.
8416 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8417 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8418 if (!InferredAlign || *InferredAlign < RequiredAlign) {
8419 if (MFI.isFixedObjectIndex(FI)) {
8420 // Can't change the alignment. FIXME: It's possible to compute
8421 // the exact stack offset and reference FI + adjust offset instead.
8422 // If someone *really* cares about this, that's the way to implement it.
8423 return SDValue();
8424 } else {
8425 MFI.setObjectAlignment(FI, RequiredAlign);
8426 }
8427 }
8428
8429 // (Offset % 16 or 32) must be a multiple of 4. The address is then
8430 // Ptr + (Offset & ~15).
8431 if (Offset < 0)
8432 return SDValue();
8433 if ((Offset % RequiredAlign.value()) & 3)
8434 return SDValue();
8435 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8436 if (StartOffset) {
8437 SDLoc DL(Ptr);
8438 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8439 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8440 }
8441
8442 int EltNo = (Offset - StartOffset) >> 2;
8443 unsigned NumElems = VT.getVectorNumElements();
8444
8445 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8446 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8447 LD->getPointerInfo().getWithOffset(StartOffset));
8448
8449 SmallVector<int, 8> Mask(NumElems, EltNo);
8450
8451 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8452 }
8453
8454 return SDValue();
8455}
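
// Illustrative sketch, not part of X86ISelLowering.cpp: the offset-absorption
// arithmetic used above. SplatLoadSplit and splitSplatLoadOffset are
// hypothetical names introduced only for this example.
struct SplatLoadSplit { long long StartOffset; int EltNo; };

static SplatLoadSplit splitSplatLoadOffset(long long Offset,
                                           long long RequiredAlign) {
  long long StartOffset = Offset & ~(RequiredAlign - 1); // round down
  int EltNo = (int)((Offset - StartOffset) >> 2);        // 4-byte elements
  return {StartOffset, EltNo};
}
// e.g. for RequiredAlign == 16 and Offset == 20 the widened load starts at
// base+16 and the splat reads element (20-16)/4 == 1.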
8456
8457 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8458static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8459 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8460 auto *BaseLd = cast<LoadSDNode>(Elt);
8461 if (!BaseLd->isSimple())
8462 return false;
8463 Ld = BaseLd;
8464 ByteOffset = 0;
8465 return true;
8466 }
8467
8468 switch (Elt.getOpcode()) {
8469 case ISD::BITCAST:
8470 case ISD::TRUNCATE:
8471 case ISD::SCALAR_TO_VECTOR:
8472 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8473 case ISD::SRL:
8474 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8475 uint64_t Idx = IdxC->getZExtValue();
8476 if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8477 ByteOffset += Idx / 8;
8478 return true;
8479 }
8480 }
8481 break;
8482 case ISD::EXTRACT_VECTOR_ELT:
8483 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8484 SDValue Src = Elt.getOperand(0);
8485 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8486 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8487 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8488 findEltLoadSrc(Src, Ld, ByteOffset)) {
8489 uint64_t Idx = IdxC->getZExtValue();
8490 ByteOffset += Idx * (SrcSizeInBits / 8);
8491 return true;
8492 }
8493 }
8494 break;
8495 }
8496
8497 return false;
8498}
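
// Illustrative sketch, not part of X86ISelLowering.cpp: the byte-offset
// bookkeeping done by the SRL and EXTRACT_VECTOR_ELT cases above. Both helper
// names are hypothetical and used only for this example.
static long long srlByteOffset(unsigned long long ShiftBits) {
  // A right shift by a multiple of 8 bits reads ShiftBits/8 bytes further
  // into the load; anything else cannot be expressed as a byte offset.
  return (ShiftBits % 8) == 0 ? (long long)(ShiftBits / 8) : -1;
}

static long long extractEltByteOffset(unsigned long long EltIdx,
                                      unsigned SrcSizeInBits) {
  // Extracting element EltIdx from a vector of SrcSizeInBits-wide lanes
  // advances the byte offset by EltIdx * SrcSizeInBits / 8.
  return (long long)(EltIdx * (SrcSizeInBits / 8));
}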
8499
8500/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8501/// elements can be replaced by a single large load which has the same value as
8502/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8503///
8504/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8505static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8506 const SDLoc &DL, SelectionDAG &DAG,
8507 const X86Subtarget &Subtarget,
8508 bool IsAfterLegalize) {
8509 if ((VT.getScalarSizeInBits() % 8) != 0)
8510 return SDValue();
8511
8512 unsigned NumElems = Elts.size();
8513
8514 int LastLoadedElt = -1;
8515 APInt LoadMask = APInt::getNullValue(NumElems);
8516 APInt ZeroMask = APInt::getNullValue(NumElems);
8517 APInt UndefMask = APInt::getNullValue(NumElems);
8518
8519 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8520 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8521
8522 // For each element in the initializer, see if we've found a load, zero or an
8523 // undef.
8524 for (unsigned i = 0; i < NumElems; ++i) {
8525 SDValue Elt = peekThroughBitcasts(Elts[i]);
8526 if (!Elt.getNode())
8527 return SDValue();
8528 if (Elt.isUndef()) {
8529 UndefMask.setBit(i);
8530 continue;
8531 }
8532 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8533 ZeroMask.setBit(i);
8534 continue;
8535 }
8536
8537 // Each loaded element must be the correct fractional portion of the
8538 // requested vector load.
8539 unsigned EltSizeInBits = Elt.getValueSizeInBits();
8540 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8541 return SDValue();
8542
8543 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8544 return SDValue();
8545 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8546 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8547 return SDValue();
8548
8549 LoadMask.setBit(i);
8550 LastLoadedElt = i;
8551 }
8552 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8553 LoadMask.countPopulation()) == NumElems &&
8554 "Incomplete element masks");
8555
8556 // Handle Special Cases - all undef or undef/zero.
8557 if (UndefMask.countPopulation() == NumElems)
8558 return DAG.getUNDEF(VT);
8559 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8560 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8561 : DAG.getConstantFP(0.0, DL, VT);
8562
8563 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8564 int FirstLoadedElt = LoadMask.countTrailingZeros();
8565 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8566 EVT EltBaseVT = EltBase.getValueType();
8567 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8568 "Register/Memory size mismatch");
8569 LoadSDNode *LDBase = Loads[FirstLoadedElt];
8570 assert(LDBase && "Did not find base load for merging consecutive loads");
8571 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8572 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8573 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
8574 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
8575 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8576
8577 // TODO: Support offsetting the base load.
8578 if (ByteOffsets[FirstLoadedElt] != 0)
8579 return SDValue();
8580
8581 // Check to see if the element's load is consecutive to the base load
8582 // or offset from a previous (already checked) load.
8583 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8584 LoadSDNode *Ld = Loads[EltIdx];
8585 int64_t ByteOffset = ByteOffsets[EltIdx];
8586 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8587 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8588 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8589 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8590 }
8591 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8592 EltIdx - FirstLoadedElt);
8593 };
8594
8595 // Consecutive loads can contain UNDEF but not ZERO elements.
8596 // Consecutive loads with UNDEF and ZERO elements require an
8597 // additional shuffle stage to clear the ZERO elements.
8598 bool IsConsecutiveLoad = true;
8599 bool IsConsecutiveLoadWithZeros = true;
8600 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8601 if (LoadMask[i]) {
8602 if (!CheckConsecutiveLoad(LDBase, i)) {
8603 IsConsecutiveLoad = false;
8604 IsConsecutiveLoadWithZeros = false;
8605 break;
8606 }
8607 } else if (ZeroMask[i]) {
8608 IsConsecutiveLoad = false;
8609 }
8610 }
8611
8612 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8613 auto MMOFlags = LDBase->getMemOperand()->getFlags();
8614 assert(LDBase->isSimple() &&
8615 "Cannot merge volatile or atomic loads.");
8616 SDValue NewLd =
8617 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8618 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
8619 MMOFlags);
8620 for (auto *LD : Loads)
8621 if (LD)
8622 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8623 return NewLd;
8624 };
8625
8626 // Check if the base load is entirely dereferenceable.
8627 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8628 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8629
8630 // LOAD - all consecutive load/undefs (must start/end with a load or be
8631 // entirely dereferenceable). If we have found an entire vector of loads and
8632 // undefs, then return a large load of the entire vector width starting at the
8633 // base pointer. If the vector contains zeros, then attempt to shuffle those
8634 // elements.
8635 if (FirstLoadedElt == 0 &&
8636 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
8637 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8638 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8639 return SDValue();
8640
8641 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8642 // will lower to regular temporal loads and use the cache.
8643 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8644 VT.is256BitVector() && !Subtarget.hasInt256())
8645 return SDValue();
8646
8647 if (NumElems == 1)
8648 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8649
8650 if (!ZeroMask)
8651 return CreateLoad(VT, LDBase);
8652
8653 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8654 // vector and a zero vector to clear out the zero elements.
8655 if (!IsAfterLegalize && VT.isVector()) {
8656 unsigned NumMaskElts = VT.getVectorNumElements();
8657 if ((NumMaskElts % NumElems) == 0) {
8658 unsigned Scale = NumMaskElts / NumElems;
8659 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8660 for (unsigned i = 0; i < NumElems; ++i) {
8661 if (UndefMask[i])
8662 continue;
8663 int Offset = ZeroMask[i] ? NumMaskElts : 0;
8664 for (unsigned j = 0; j != Scale; ++j)
8665 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8666 }
8667 SDValue V = CreateLoad(VT, LDBase);
8668 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8669 : DAG.getConstantFP(0.0, DL, VT);
8670 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8671 }
8672 }
8673 }
8674
8675 // If the upper half of a ymm/zmm load is undef then just load the lower half.
8676 if (VT.is256BitVector() || VT.is512BitVector()) {
8677 unsigned HalfNumElems = NumElems / 2;
8678 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8679 EVT HalfVT =
8680 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8681 SDValue HalfLD =
8682 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8683 DAG, Subtarget, IsAfterLegalize);
8684 if (HalfLD)
8685 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8686 HalfLD, DAG.getIntPtrConstant(0, DL));
8687 }
8688 }
8689
8690 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
8691 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8692 (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8693 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8694 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8695 : MVT::getIntegerVT(LoadSizeInBits);
8696 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8697 // Allow v4f32 on SSE1 only targets.
8698 // FIXME: Add more isel patterns so we can just use VT directly.
8699 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8700 VecVT = MVT::v4f32;
8701 if (TLI.isTypeLegal(VecVT)) {
8702 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8703 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8704 SDValue ResNode = DAG.getMemIntrinsicNode(
8705 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
8706 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
8707 for (auto *LD : Loads)
8708 if (LD)
8709 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8710 return DAG.getBitcast(VT, ResNode);
8711 }
8712 }
8713
8714 // BROADCAST - match the smallest possible repetition pattern, load that
8715 // scalar/subvector element and then broadcast to the entire vector.
8716 if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8717 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8718 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8719 unsigned RepeatSize = SubElems * BaseSizeInBits;
8720 unsigned ScalarSize = std::min(RepeatSize, 64u);
8721 if (!Subtarget.hasAVX2() && ScalarSize < 32)
8722 continue;
8723
8724 // Don't attempt a 1:N subvector broadcast - it should be caught by
8725 // combineConcatVectorOps, otherwise it will cause infinite loops.
8726 if (RepeatSize > ScalarSize && SubElems == 1)
8727 continue;
8728
8729 bool Match = true;
8730 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8731 for (unsigned i = 0; i != NumElems && Match; ++i) {
8732 if (!LoadMask[i])
8733 continue;
8734 SDValue Elt = peekThroughBitcasts(Elts[i]);
8735 if (RepeatedLoads[i % SubElems].isUndef())
8736 RepeatedLoads[i % SubElems] = Elt;
8737 else
8738 Match &= (RepeatedLoads[i % SubElems] == Elt);
8739 }
8740
8741 // We must have loads at both ends of the repetition.
8742 Match &= !RepeatedLoads.front().isUndef();
8743 Match &= !RepeatedLoads.back().isUndef();
8744 if (!Match)
8745 continue;
8746
8747 EVT RepeatVT =
8748 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8749 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8750 : EVT::getFloatingPointVT(ScalarSize);
8751 if (RepeatSize > ScalarSize)
8752 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8753 RepeatSize / ScalarSize);
8754 EVT BroadcastVT =
8755 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8756 VT.getSizeInBits() / ScalarSize);
8757 if (TLI.isTypeLegal(BroadcastVT)) {
8758 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8759 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
8760 SDValue Broadcast = RepeatLoad;
8761 if (RepeatSize > ScalarSize) {
8762 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
8763 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
8764 } else {
8765 Broadcast =
8766 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
8767 }
8768 return DAG.getBitcast(VT, Broadcast);
8769 }
8770 }
8771 }
8772 }
8773
8774 return SDValue();
8775}
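
// Illustrative sketch, not part of X86ISelLowering.cpp: the index remapping
// inside the CheckConsecutiveLoad lambda above. An element carrying a non-zero
// byte offset into its source load is treated as a re-read of an earlier
// element, provided the offset is a whole number of base elements.
// baseIdxForOffset is a hypothetical name used only for this example.
static long long baseIdxForOffset(long long EltIdx, long long ByteOffset,
                                  long long BaseSizeInBytes) {
  if (ByteOffset == 0 || (ByteOffset % BaseSizeInBytes) != 0)
    return -1; // fall back to the consecutive-address check
  return EltIdx - ByteOffset / BaseSizeInBytes;
}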
8776
8777 // Combine a vector op (shuffle etc.) that is equal to build_vector load1,
8778// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8779// are consecutive, non-overlapping, and in the right order.
8780static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
8781 SelectionDAG &DAG,
8782 const X86Subtarget &Subtarget,
8783 bool IsAfterLegalize) {
8784 SmallVector<SDValue, 64> Elts;
8785 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8786 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
8787 Elts.push_back(Elt);
8788 continue;
8789 }
8790 return SDValue();
8791 }
8792 assert(Elts.size() == VT.getVectorNumElements());
8793 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8794 IsAfterLegalize);
8795}
8796
8797static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8798 unsigned SplatBitSize, LLVMContext &C) {
8799 unsigned ScalarSize = VT.getScalarSizeInBits();
8800 unsigned NumElm = SplatBitSize / ScalarSize;
8801
8802 SmallVector<Constant *, 32> ConstantVec;
8803 for (unsigned i = 0; i < NumElm; i++) {
8804 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8805 Constant *Const;
8806 if (VT.isFloatingPoint()) {
8807 if (ScalarSize == 32) {
8808 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8809 } else {
8810 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8811 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8812 }
8813 } else
8814 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8815 ConstantVec.push_back(Const);
8816 }
8817 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8818}
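
// Illustrative sketch, not part of X86ISelLowering.cpp: the bit slicing done
// above, shown for a 64-bit splat value split into two 32-bit scalars with
// plain integers. splitSplat64 is a hypothetical name for this example only.
static void splitSplat64(unsigned long long SplatValue, unsigned Out[2]) {
  // Chunk i holds bits [32*i, 32*i + 32), matching
  // SplatValue.extractBits(ScalarSize, ScalarSize * i) for ScalarSize == 32.
  Out[0] = (unsigned)(SplatValue & 0xffffffffu);
  Out[1] = (unsigned)(SplatValue >> 32);
}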
8819
8820static bool isFoldableUseOfShuffle(SDNode *N) {
8821 for (auto *U : N->uses()) {
8822 unsigned Opc = U->getOpcode();
8823 // VPERMV/VPERMV3 shuffles can never fold their index operands.
8824 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8825 return false;
8826 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8827 return false;
8828 if (isTargetShuffle(Opc))
8829 return true;
8830 if (Opc == ISD::BITCAST) // Ignore bitcasts
8831 return isFoldableUseOfShuffle(U);
8832 if (N->hasOneUse())
8833 return true;
8834 }
8835 return false;
8836}
8837
8838/// Attempt to use the vbroadcast instruction to generate a splat value
8839/// from a splat BUILD_VECTOR which uses:
8840/// a. A single scalar load, or a constant.
8841/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8842///
8843/// The VBROADCAST node is returned when a pattern is found,
8844/// or SDValue() otherwise.
8845static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8846 const X86Subtarget &Subtarget,
8847 SelectionDAG &DAG) {
8848 // VBROADCAST requires AVX.
8849 // TODO: Splats could be generated for non-AVX CPUs using SSE
8850 // instructions, but there's less potential gain for only 128-bit vectors.
8851 if (!Subtarget.hasAVX())
8852 return SDValue();
8853
8854 MVT VT = BVOp->getSimpleValueType(0);
8855 unsigned NumElts = VT.getVectorNumElements();
8856 SDLoc dl(BVOp);
8857
8858 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8859 "Unsupported vector type for broadcast.");
8860
8861 // See if the build vector is a repeating sequence of scalars (inc. splat).
8862 SDValue Ld;
8863 BitVector UndefElements;
8864 SmallVector<SDValue, 16> Sequence;
8865 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
8866 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
8867 if (Sequence.size() == 1)
8868 Ld = Sequence[0];
8869 }
8870
8871 // Attempt to use VBROADCASTM
8872 // From this pattern:
8873 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8874 // b. t1 = (build_vector t0 t0)
8875 //
8876 // Create (VBROADCASTM v2i1 X)
8877 if (!Sequence.empty() && Subtarget.hasCDI()) {
8878 // If not a splat, are the upper sequence values zeroable?
8879 unsigned SeqLen = Sequence.size();
8880 bool UpperZeroOrUndef =
8881 SeqLen == 1 ||
8882 llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
8883 return !V || V.isUndef() || isNullConstant(V);
8884 });
8885 SDValue Op0 = Sequence[0];
8886 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
8887 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
8888 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
8889 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
8890 ? Op0.getOperand(0)
8891 : Op0.getOperand(0).getOperand(0);
8892 MVT MaskVT = BOperand.getSimpleValueType();
8893 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
8894 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
8895 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8896 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
8897 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
8898 unsigned Scale = 512 / VT.getSizeInBits();
8899 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
8900 }
8901 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
8902 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
8903 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
8904 return DAG.getBitcast(VT, Bcst);
8905 }
8906 }
8907 }
8908
8909 unsigned NumUndefElts = UndefElements.count();
8910 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8911 APInt SplatValue, Undef;
8912 unsigned SplatBitSize;
8913 bool HasUndef;
8914 // Check if this is a repeated constant pattern suitable for broadcasting.
8915 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8916 SplatBitSize > VT.getScalarSizeInBits() &&
8917 SplatBitSize < VT.getSizeInBits()) {
8918 // Avoid replacing with broadcast when it's a use of a shuffle
8919 // instruction to preserve the present custom lowering of shuffles.
8920 if (isFoldableUseOfShuffle(BVOp))
8921 return SDValue();
8922 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
8923 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8924 LLVMContext *Ctx = DAG.getContext();
8925 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8926 if (Subtarget.hasAVX()) {
8927 if (SplatBitSize == 32 || SplatBitSize == 64 ||
8928 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8929 // Splatted value can fit in one INTEGER constant in constant pool.
8930 // Load the constant and broadcast it.
8931 MVT CVT = MVT::getIntegerVT(SplatBitSize);
8932 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8933 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8934 SDValue CP = DAG.getConstantPool(C, PVT);
8935 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8936
8937 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
8938 SDVTList Tys =
8939 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8940 SDValue Ops[] = {DAG.getEntryNode(), CP};
8941 MachinePointerInfo MPI =
8942 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8943 SDValue Brdcst = DAG.getMemIntrinsicNode(
8944 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
8945 MachineMemOperand::MOLoad);
8946 return DAG.getBitcast(VT, Brdcst);
8947 }
8948 if (SplatBitSize > 64) {
8949 // Load the vector of constants and broadcast it.
8950 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8951 *Ctx);
8952 SDValue VCP = DAG.getConstantPool(VecC, PVT);
8953 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8954 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
8955 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
8956 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8957 SDValue Ops[] = {DAG.getEntryNode(), VCP};
8958 MachinePointerInfo MPI =
8959 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8960 return DAG.getMemIntrinsicNode(
8961 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
8962 MachineMemOperand::MOLoad);
8963 }
8964 }
8965 }
8966
8967 // If we are moving a scalar into a vector (Ld must be set and all elements
8968 // but 1 are undef) and that operation is not obviously supported by
8969 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8970 // That's better than general shuffling and may eliminate a load to GPR and
8971 // move from scalar to vector register.
8972 if (!Ld || NumElts - NumUndefElts != 1)
8973 return SDValue();
8974 unsigned ScalarSize = Ld.getValueSizeInBits();
8975 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8976 return SDValue();
8977 }
8978
8979 bool ConstSplatVal =
8980 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8981 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8982
8983 // TODO: Handle broadcasts of non-constant sequences.
8984
8985 // Make sure that all of the users of a non-constant load are from the
8986 // BUILD_VECTOR node.
8987 // FIXME: Is the use count needed for non-constant, non-load case?
8988 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8989 return SDValue();
8990
8991 unsigned ScalarSize = Ld.getValueSizeInBits();
8992 bool IsGE256 = (VT.getSizeInBits() >= 256);
8993
8994 // When optimizing for size, generate up to 5 extra bytes for a broadcast
8995 // instruction to save 8 or more bytes of constant pool data.
8996 // TODO: If multiple splats are generated to load the same constant,
8997 // it may be detrimental to overall size. There needs to be a way to detect
8998 // that condition to know if this is truly a size win.
8999 bool OptForSize = DAG.shouldOptForSize();
9000
9001 // Handle broadcasting a single constant scalar from the constant pool
9002 // into a vector.
9003 // On Sandybridge (no AVX2), it is still better to load a constant vector
9004 // from the constant pool and not to broadcast it from a scalar.
9005 // But override that restriction when optimizing for size.
9006 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9007 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9008 EVT CVT = Ld.getValueType();
9009 assert(!CVT.isVector() && "Must not broadcast a vector type");
9010
9011 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
9012 // For size optimization, also splat v2f64 and v2i64, and for size opt
9013 // with AVX2, also splat i8 and i16.
9014 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9015 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9016 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9017 const Constant *C = nullptr;
9018 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9019 C = CI->getConstantIntValue();
9020 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9021 C = CF->getConstantFPValue();
9022
9023 assert(C && "Invalid constant type");
9024
9025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9026 SDValue CP =
9027 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9028 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9029
9030 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9031 SDValue Ops[] = {DAG.getEntryNode(), CP};
9032 MachinePointerInfo MPI =
9033 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9034 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9035 MPI, Alignment, MachineMemOperand::MOLoad);
9036 }
9037 }
9038
9039 // Handle AVX2 in-register broadcasts.
9040 if (!IsLoad && Subtarget.hasInt256() &&
9041 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9042 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9043
9044 // The scalar source must be a normal load.
9045 if (!IsLoad)
9046 return SDValue();
9047
9048 // Make sure the non-chain result is only used by this build vector.
9049 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9050 return SDValue();
9051
9052 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9053 (Subtarget.hasVLX() && ScalarSize == 64)) {
9054 auto *LN = cast<LoadSDNode>(Ld);
9055 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9056 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9057 SDValue BCast =
9058 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9059 LN->getMemoryVT(), LN->getMemOperand());
9060 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9061 return BCast;
9062 }
9063
9064 // The integer check is needed for the 64-bit-into-128-bit case so it doesn't
9065 // match double, since there is no vbroadcastsd xmm.
9066 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9067 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9068 auto *LN = cast<LoadSDNode>(Ld);
9069 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9070 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9071 SDValue BCast =
9072 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9073 LN->getMemoryVT(), LN->getMemOperand());
9074 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9075 return BCast;
9076 }
9077
9078 // Unsupported broadcast.
9079 return SDValue();
9080}
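
// Illustrative sketch, not part of X86ISelLowering.cpp: the repeated-sequence
// test that drives the broadcast lowering above, replayed on a plain array of
// operand ids. isRepeatedSequence is a hypothetical name used only here, and
// it ignores undef elements for simplicity.
static bool isRepeatedSequence(const int *Ops, unsigned NumElts,
                               unsigned SeqLen) {
  if (SeqLen == 0 || (NumElts % SeqLen) != 0)
    return false;
  for (unsigned i = SeqLen; i != NumElts; ++i)
    if (Ops[i] != Ops[i % SeqLen]) // every element must repeat the prefix
      return false;
  return true;
}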
9081
9082/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9083/// underlying vector and index.
9084///
9085/// Modifies \p ExtractedFromVec to the real vector and returns the real
9086/// index.
9087static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9088 SDValue ExtIdx) {
9089 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9090 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9091 return Idx;
9092
9093 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9094 // lowered this:
9095 // (extract_vector_elt (v8f32 %1), Constant<6>)
9096 // to:
9097 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9098 // (extract_subvector (v8f32 %0), Constant<4>),
9099 // undef)
9100 // Constant<0>)
9101 // In this case the vector is the extract_subvector expression and the index
9102 // is 2, as specified by the shuffle.
9103 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9104 SDValue ShuffleVec = SVOp->getOperand(0);
9105 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9106 assert(ShuffleVecVT.getVectorElementType() ==
9107 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9108
9109 int ShuffleIdx = SVOp->getMaskElt(Idx);
9110 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9111 ExtractedFromVec = ShuffleVec;
9112 return ShuffleIdx;
9113 }
9114 return Idx;
9115}
9116
9117static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9118 MVT VT = Op.getSimpleValueType();
9119
9120 // Skip if insert_vec_elt is not supported.
9121 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9122 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9123 return SDValue();
9124
9125 SDLoc DL(Op);
9126 unsigned NumElems = Op.getNumOperands();
9127
9128 SDValue VecIn1;
9129 SDValue VecIn2;
9130 SmallVector<unsigned, 4> InsertIndices;
9131 SmallVector<int, 8> Mask(NumElems, -1);
9132
9133 for (unsigned i = 0; i != NumElems; ++i) {
9134 unsigned Opc = Op.getOperand(i).getOpcode();
9135
9136 if (Opc == ISD::UNDEF)
9137 continue;
9138
9139 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9140 // Quit if more than 1 element needs inserting.
9141 if (InsertIndices.size() > 1)
9142 return SDValue();
9143
9144 InsertIndices.push_back(i);
9145 continue;
9146 }
9147
9148 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9149 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9150
9151 // Quit if non-constant index.
9152 if (!isa<ConstantSDNode>(ExtIdx))
9153 return SDValue();
9154 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9155
9156 // Quit if extracted from vector of different type.
9157 if (ExtractedFromVec.getValueType() != VT)
9158 return SDValue();
9159
9160 if (!VecIn1.getNode())
9161 VecIn1 = ExtractedFromVec;
9162 else if (VecIn1 != ExtractedFromVec) {
9163 if (!VecIn2.getNode())
9164 VecIn2 = ExtractedFromVec;
9165 else if (VecIn2 != ExtractedFromVec)
9166 // Quit if more than 2 vectors to shuffle
9167 return SDValue();
9168 }
9169
9170 if (ExtractedFromVec == VecIn1)
9171 Mask[i] = Idx;
9172 else if (ExtractedFromVec == VecIn2)
9173 Mask[i] = Idx + NumElems;
9174 }
9175
9176 if (!VecIn1.getNode())
9177 return SDValue();
9178
9179 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9180 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9181
9182 for (unsigned Idx : InsertIndices)
9183 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9184 DAG.getIntPtrConstant(Idx, DL));
9185
9186 return NV;
9187}
9188
9189// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9190static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9191 const X86Subtarget &Subtarget) {
9192
9193 MVT VT = Op.getSimpleValueType();
9194 assert((VT.getVectorElementType() == MVT::i1) &&
9195 "Unexpected type in LowerBUILD_VECTORvXi1!");
9196
9197 SDLoc dl(Op);
9198 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9199 ISD::isBuildVectorAllOnes(Op.getNode()))
9200 return Op;
9201
9202 uint64_t Immediate = 0;
9203 SmallVector<unsigned, 16> NonConstIdx;
9204 bool IsSplat = true;
9205 bool HasConstElts = false;
9206 int SplatIdx = -1;
9207 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9208 SDValue In = Op.getOperand(idx);
9209 if (In.isUndef())
9210 continue;
9211 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9212 Immediate |= (InC->getZExtValue() & 0x1) << idx;
9213 HasConstElts = true;
9214 } else {
9215 NonConstIdx.push_back(idx);
9216 }
9217 if (SplatIdx < 0)
9218 SplatIdx = idx;
9219 else if (In != Op.getOperand(SplatIdx))
9220 IsSplat = false;
9221 }
9222
9223 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
9224 if (IsSplat) {
9225 // The build_vector allows the scalar element to be larger than the vector
9226 // element type. We need to mask it to use as a condition unless we know
9227 // the upper bits are zero.
9228 // FIXME: Use computeKnownBits instead of checking specific opcode?
9229 SDValue Cond = Op.getOperand(SplatIdx);
9230 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9231 if (Cond.getOpcode() != ISD::SETCC)
9232 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9233 DAG.getConstant(1, dl, MVT::i8));
9234
9235 // Perform the select in the scalar domain so we can use cmov.
9236 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9237 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9238 DAG.getAllOnesConstant(dl, MVT::i32),
9239 DAG.getConstant(0, dl, MVT::i32));
9240 Select = DAG.getBitcast(MVT::v32i1, Select);
9241 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9242 } else {
9243 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9244 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9245 DAG.getAllOnesConstant(dl, ImmVT),
9246 DAG.getConstant(0, dl, ImmVT));
9247 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9248 Select = DAG.getBitcast(VecVT, Select);
9249 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9250 DAG.getIntPtrConstant(0, dl));
9251 }
9252 }
9253
9254 // insert elements one by one
9255 SDValue DstVec;
9256 if (HasConstElts) {
9257 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9258 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9259 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9260 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9261 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9262 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9263 } else {
9264 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9265 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9266 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9267 DstVec = DAG.getBitcast(VecVT, Imm);
9268 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9269 DAG.getIntPtrConstant(0, dl));
9270 }
9271 } else
9272 DstVec = DAG.getUNDEF(VT);
9273
9274 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9275 unsigned InsertIdx = NonConstIdx[i];
9276 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9277 Op.getOperand(InsertIdx),
9278 DAG.getIntPtrConstant(InsertIdx, dl));
9279 }
9280 return DstVec;
9281}
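
// Illustrative sketch, not part of X86ISelLowering.cpp: how the constant i1
// elements are packed into the Immediate bitmask above. packMaskImmediate is
// a hypothetical name used only for this example.
static unsigned long long packMaskImmediate(const int *Bits, unsigned NumElts) {
  unsigned long long Immediate = 0;
  for (unsigned idx = 0; idx != NumElts; ++idx)
    Immediate |= (unsigned long long)(Bits[idx] & 0x1) << idx; // element -> bit
  return Immediate;
}
// e.g. for elements {1, 0, 1, 1} the immediate is 0b1101.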
9282
9283 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9284 switch (Opcode) {
9285 case X86ISD::PACKSS:
9286 case X86ISD::PACKUS:
9287 case X86ISD::FHADD:
9288 case X86ISD::FHSUB:
9289 case X86ISD::HADD:
9290 case X86ISD::HSUB:
9291 return true;
9292 }
9293 return false;
9294}
9295
9296/// This is a helper function of LowerToHorizontalOp().
9297 /// This function checks that the input build_vector \p N implements a
9298/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9299/// may not match the layout of an x86 256-bit horizontal instruction.
9300/// In other words, if this returns true, then some extraction/insertion will
9301/// be required to produce a valid horizontal instruction.
9302///
9303/// Parameter \p Opcode defines the kind of horizontal operation to match.
9304/// For example, if \p Opcode is equal to ISD::ADD, then this function
9305/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9306/// is equal to ISD::SUB, then this function checks if this is a horizontal
9307/// arithmetic sub.
9308///
9309/// This function only analyzes elements of \p N whose indices are
9310/// in range [BaseIdx, LastIdx).
9311///
9312/// TODO: This function was originally used to match both real and fake partial
9313/// horizontal operations, but the index-matching logic is incorrect for that.
9314/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9315/// code because it is only used for partial h-op matching now?
9316static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9317 SelectionDAG &DAG,
9318 unsigned BaseIdx, unsigned LastIdx,
9319 SDValue &V0, SDValue &V1) {
9320 EVT VT = N->getValueType(0);
9321 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9322 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9323 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9324 "Invalid Vector in input!");
9325
9326 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9327 bool CanFold = true;
9328 unsigned ExpectedVExtractIdx = BaseIdx;
9329 unsigned NumElts = LastIdx - BaseIdx;
9330 V0 = DAG.getUNDEF(VT);
9331 V1 = DAG.getUNDEF(VT);
9332
9333 // Check if N implements a horizontal binop.
9334 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9335 SDValue Op = N->getOperand(i + BaseIdx);
9336
9337 // Skip UNDEFs.
9338 if (Op->isUndef()) {
9339 // Update the expected vector extract index.
9340 if (i * 2 == NumElts)
9341 ExpectedVExtractIdx = BaseIdx;
9342 ExpectedVExtractIdx += 2;
9343 continue;
9344 }
9345
9346 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9347
9348 if (!CanFold)
9349 break;
9350
9351 SDValue Op0 = Op.getOperand(0);
9352 SDValue Op1 = Op.getOperand(1);
9353
9354 // Try to match the following pattern:
9355 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9356 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9357 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9358 Op0.getOperand(0) == Op1.getOperand(0) &&
9359 isa<ConstantSDNode>(Op0.getOperand(1)) &&
9360 isa<ConstantSDNode>(Op1.getOperand(1)));
9361 if (!CanFold)
9362 break;
9363
9364 unsigned I0 = Op0.getConstantOperandVal(1);
9365 unsigned I1 = Op1.getConstantOperandVal(1);
9366
9367 if (i * 2 < NumElts) {
9368 if (V0.isUndef()) {
9369 V0 = Op0.getOperand(0);
9370 if (V0.getValueType() != VT)
9371 return false;
9372 }
9373 } else {
9374 if (V1.isUndef()) {
9375 V1 = Op0.getOperand(0);
9376 if (V1.getValueType() != VT)
9377 return false;
9378 }
9379 if (i * 2 == NumElts)
9380 ExpectedVExtractIdx = BaseIdx;
9381 }
9382
9383 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9384 if (I0 == ExpectedVExtractIdx)
9385 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9386 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9387 // Try to match the following dag sequence:
9388 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9389 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9390 } else
9391 CanFold = false;
9392
9393 ExpectedVExtractIdx += 2;
9394 }
9395
9396 return CanFold;
9397}
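
// Illustrative sketch, not part of X86ISelLowering.cpp: the extract-index
// pattern the matcher above expects, simplified to one 128-bit half and
// ignoring the commuted-operand case and the reset at the upper half.
// matchHorizBinOpLanes is a hypothetical name used only for this example.
static bool matchHorizBinOpLanes(const unsigned *I0, const unsigned *I1,
                                 unsigned NumElts) {
  // Result element i must be BINOP(extract(A, 2*i), extract(A, 2*i + 1));
  // this is the ExpectedVExtractIdx bookkeeping in the loop above.
  for (unsigned i = 0; i != NumElts; ++i)
    if (I0[i] != 2 * i || I1[i] != 2 * i + 1)
      return false;
  return true;
}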
9398
9399/// Emit a sequence of two 128-bit horizontal add/sub followed by
9400/// a concat_vector.
9401///
9402/// This is a helper function of LowerToHorizontalOp().
9403/// This function expects two 256-bit vectors called V0 and V1.
9404/// At first, each vector is split into two separate 128-bit vectors.
9405/// Then, the resulting 128-bit vectors are used to implement two
9406/// horizontal binary operations.
9407///
9408/// The kind of horizontal binary operation is defined by \p X86Opcode.
9409///
9410 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
9411 /// the two new horizontal binops.
9412/// When Mode is set, the first horizontal binop dag node would take as input
9413/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
9414/// horizontal binop dag node would take as input the lower 128-bit of V1
9415/// and the upper 128-bit of V1.
9416/// Example:
9417/// HADD V0_LO, V0_HI
9418/// HADD V1_LO, V1_HI
9419///
9420/// Otherwise, the first horizontal binop dag node takes as input the lower
9421/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
9422/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
9423/// Example:
9424/// HADD V0_LO, V1_LO
9425/// HADD V0_HI, V1_HI
9426///
9427/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9428/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9429/// the upper 128-bits of the result.
9430static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9431 const SDLoc &DL, SelectionDAG &DAG,
9432 unsigned X86Opcode, bool Mode,
9433 bool isUndefLO, bool isUndefHI) {
9434 MVT VT = V0.getSimpleValueType();
9435 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9436 "Invalid nodes in input!");
9437
9438 unsigned NumElts = VT.getVectorNumElements();
9439 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9440 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9441 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9442 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9443 MVT NewVT = V0_LO.getSimpleValueType();
9444
9445 SDValue LO = DAG.getUNDEF(NewVT);
9446 SDValue HI = DAG.getUNDEF(NewVT);
9447
9448 if (Mode) {
9449 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9450 if (!isUndefLO && !V0->isUndef())
9451 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9452 if (!isUndefHI && !V1->isUndef())
9453 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9454 } else {
9455 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9456 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9457 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9458
9459 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9460 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9461 }
9462
9463 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9464}
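
// Illustrative sketch, not part of X86ISelLowering.cpp: the two operand
// pairings that the Mode flag above selects, written out for scalar "halves".
// pairHalves and the HOp callback are assumptions made only for this example.
static void pairHalves(bool Mode, int V0_LO, int V0_HI, int V1_LO, int V1_HI,
                       int (*HOp)(int, int), int &LO, int &HI) {
  if (Mode) {
    LO = HOp(V0_LO, V0_HI); // first binop combines both halves of V0
    HI = HOp(V1_LO, V1_HI); // second binop combines both halves of V1
  } else {
    LO = HOp(V0_LO, V1_LO); // low halves of V0 and V1 paired together
    HI = HOp(V0_HI, V1_HI); // high halves of V0 and V1 paired together
  }
}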
9465
9466/// Returns true iff \p BV builds a vector with the result equivalent to
9467/// the result of ADDSUB/SUBADD operation.
9468/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9469/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9470/// \p Opnd0 and \p Opnd1.
9471static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9472 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9473 SDValue &Opnd0, SDValue &Opnd1,
9474 unsigned &NumExtracts,
9475 bool &IsSubAdd) {
9476
9477 MVT VT = BV->getSimpleValueType(0);
9478 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9479 return false;
9480
9481 unsigned NumElts = VT.getVectorNumElements();
9482 SDValue InVec0 = DAG.getUNDEF(VT);
9483 SDValue InVec1 = DAG.getUNDEF(VT);
9484
9485 NumExtracts = 0;
9486
9487 // Odd-numbered elements in the input build vector are obtained from
9488 // adding/subtracting two integer/float elements.
9489 // Even-numbered elements in the input build vector are obtained from
9490 // subtracting/adding two integer/float elements.
9491 unsigned Opc[2] = {0, 0};
9492 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9493 SDValue Op = BV->getOperand(i);
9494
9495 // Skip 'undef' values.
9496 unsigned Opcode = Op.getOpcode();
9497 if (Opcode == ISD::UNDEF)
9498 continue;
9499
9500 // Early exit if we found an unexpected opcode.
9501 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9502 return false;
9503
9504 SDValue Op0 = Op.getOperand(0);
9505 SDValue Op1 = Op.getOperand(1);
9506
9507 // Try to match the following pattern:
9508 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9509 // Early exit if we cannot match that sequence.
9510 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9511 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9512 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9513 Op0.getOperand(1) != Op1.getOperand(1))
9514 return false;
9515
9516 unsigned I0 = Op0.getConstantOperandVal(1);
9517 if (I0 != i)
9518 return false;
9519
9520 // We found a valid add/sub node; make sure it's the same opcode as the
9521 // previous elements for this parity.
9522 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9523 return false;
9524 Opc[i % 2] = Opcode;
9525
9526 // Update InVec0 and InVec1.
9527 if (InVec0.isUndef()) {
9528 InVec0 = Op0.getOperand(0);
9529 if (InVec0.getSimpleValueType() != VT)
9530 return false;
9531 }
9532 if (InVec1.isUndef()) {
9533 InVec1 = Op1.getOperand(0);
9534 if (InVec1.getSimpleValueType() != VT)
9535 return false;
9536 }
9537
9538 // Make sure that the operands of each add/sub node always
9539 // come from the same pair of vectors.
9540 if (InVec0 != Op0.getOperand(0)) {
9541 if (Opcode == ISD::FSUB)
9542 return false;
9543
9544 // FADD is commutable. Try to commute the operands
9545 // and then test again.
9546 std::swap(Op0, Op1);
9547 if (InVec0 != Op0.getOperand(0))
9548 return false;
9549 }
9550
9551 if (InVec1 != Op1.getOperand(0))
9552 return false;
9553
9554 // Increment the number of extractions done.
9555 ++NumExtracts;
9556 }
9557
9558 // Ensure we have found an opcode for both parities and that they are
9559 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9560 // inputs are undef.
9561 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9562 InVec0.isUndef() || InVec1.isUndef())
9563 return false;
9564
9565 IsSubAdd = Opc[0] == ISD::FADD;
9566
9567 Opnd0 = InVec0;
9568 Opnd1 = InVec1;
9569 return true;
9570}
9571
9572 /// Returns true if it is possible to fold a MUL and an idiom that has already
9573 /// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9574/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9575/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
9576///
9577/// Prior to calling this function it should be known that there is some
9578/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9579/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9580/// before replacement of such SDNode with ADDSUB operation. Thus the number
9581/// of \p Opnd0 uses is expected to be equal to 2.
9582/// For example, this function may be called for the following IR:
9583/// %AB = fmul fast <2 x double> %A, %B
9584/// %Sub = fsub fast <2 x double> %AB, %C
9585/// %Add = fadd fast <2 x double> %AB, %C
9586/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9587/// <2 x i32> <i32 0, i32 3>
9588/// There is a def for %Addsub here, which potentially can be replaced by
9589/// X86ISD::ADDSUB operation:
9590/// %Addsub = X86ISD::ADDSUB %AB, %C
9591/// and such ADDSUB can further be replaced with FMADDSUB:
9592/// %Addsub = FMADDSUB %A, %B, %C.
9593///
9594/// The main reason why this method is called before the replacement of the
9595/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9596/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9597/// FMADDSUB is.
9598static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9599 SelectionDAG &DAG,
9600 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9601 unsigned ExpectedUses) {
9602 if (Opnd0.getOpcode() != ISD::FMUL ||
9603 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9604 return false;
9605
9606 // FIXME: These checks must match the similar ones in
9607 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9608 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9609 // or MUL + ADDSUB to FMADDSUB.
9610 const TargetOptions &Options = DAG.getTarget().Options;
9611 bool AllowFusion =
9612 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9613 if (!AllowFusion)
9614 return false;
9615
9616 Opnd2 = Opnd1;
9617 Opnd1 = Opnd0.getOperand(1);
9618 Opnd0 = Opnd0.getOperand(0);
9619
9620 return true;
9621}
9622
9623/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
9624/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
9625/// X86ISD::FMSUBADD node.
9626static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9627 const X86Subtarget &Subtarget,
9628 SelectionDAG &DAG) {
9629 SDValue Opnd0, Opnd1;
9630 unsigned NumExtracts;
9631 bool IsSubAdd;
9632 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9633 IsSubAdd))
9634 return SDValue();
9635
9636 MVT VT = BV->getSimpleValueType(0);
9637 SDLoc DL(BV);
9638
9639 // Try to generate X86ISD::FMADDSUB node here.
9640 SDValue Opnd2;
9641 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9642 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9643 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9644 }
9645
9646 // We only support ADDSUB.
9647 if (IsSubAdd)
9648 return SDValue();
9649
9650 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9651 // the ADDSUB idiom has been successfully recognized. There are no known
9652 // X86 targets with 512-bit ADDSUB instructions!
9653 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9654 // recognition.
9655 if (VT.is512BitVector())
9656 return SDValue();
9657
9658 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9659}
9660
9661static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9662 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9663 // Initialize outputs to known values.
9664 MVT VT = BV->getSimpleValueType(0);
9665 HOpcode = ISD::DELETED_NODE;
9666 V0 = DAG.getUNDEF(VT);
9667 V1 = DAG.getUNDEF(VT);
9668
9669 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9670 // half of the result is calculated independently from the 128-bit halves of
9671 // the inputs, so that makes the index-checking logic below more complicated.
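// As a worked example (purely illustrative): a v8i32 build_vector matched as
// HADD of A and B must use these extract-index pairs:
//   low half (result elts 0..3):  A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3]
//   high half (result elts 4..7): A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7]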
9672 unsigned NumElts = VT.getVectorNumElements();
9673 unsigned GenericOpcode = ISD::DELETED_NODE;
9674 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9675 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9676 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9677 for (unsigned i = 0; i != Num128BitChunks; ++i) {
9678 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9679 // Ignore undef elements.
9680 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9681 if (Op.isUndef())
9682 continue;
9683
9684 // If there's an opcode mismatch, we're done.
9685 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9686 return false;
9687
9688 // Initialize horizontal opcode.
9689 if (HOpcode == ISD::DELETED_NODE) {
9690 GenericOpcode = Op.getOpcode();
9691 switch (GenericOpcode) {
9692 case ISD::ADD: HOpcode = X86ISD::HADD; break;
9693 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9694 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9695 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9696 default: return false;
9697 }
9698 }
9699
9700 SDValue Op0 = Op.getOperand(0);
9701 SDValue Op1 = Op.getOperand(1);
9702 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9703 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9704 Op0.getOperand(0) != Op1.getOperand(0) ||
9705 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9706 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9707 return false;
9708
9709 // The source vector is chosen based on which 64-bit half of the
9710 // destination vector is being calculated.
9711 if (j < NumEltsIn64Bits) {
9712 if (V0.isUndef())
9713 V0 = Op0.getOperand(0);
9714 } else {
9715 if (V1.isUndef())
9716 V1 = Op0.getOperand(0);
9717 }
9718
9719 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9720 if (SourceVec != Op0.getOperand(0))
9721 return false;
9722
9723 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9724 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9725 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9726 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9727 (j % NumEltsIn64Bits) * 2;
9728 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9729 continue;
9730
9731 // If this is not a commutative op, this does not match.
9732 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9733 return false;
9734
9735 // Addition is commutative, so try swapping the extract indexes.
9736 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9737 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9738 continue;
9739
9740 // Extract indexes do not match horizontal requirement.
9741 return false;
9742 }
9743 }
9744 // We matched. Opcode and operands are returned by reference as arguments.
9745 return true;
9746}
9747
9748static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9749 SelectionDAG &DAG, unsigned HOpcode,
9750 SDValue V0, SDValue V1) {
9751 // If either input vector is not the same size as the build vector,
9752 // extract/insert the low bits to the correct size.
9753 // This is free (examples: zmm --> xmm, xmm --> ymm).
9754 MVT VT = BV->getSimpleValueType(0);
9755 unsigned Width = VT.getSizeInBits();
9756 if (V0.getValueSizeInBits() > Width)
9757 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9758 else if (V0.getValueSizeInBits() < Width)
9759 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9760
9761 if (V1.getValueSizeInBits() > Width)
9762 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9763 else if (V1.getValueSizeInBits() < Width)
9764 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9765
9766 unsigned NumElts = VT.getVectorNumElements();
9767 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9768 for (unsigned i = 0; i != NumElts; ++i)
9769 if (BV->getOperand(i).isUndef())
9770 DemandedElts.clearBit(i);
9771
9772 // If we don't need the upper xmm, then perform as an xmm hop.
9773 unsigned HalfNumElts = NumElts / 2;
9774 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9775 MVT HalfVT = VT.getHalfNumVectorElementsVT();
9776 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9777 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9778 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9779 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9780 }
9781
9782 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9783}
9784
9785/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9786static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9787 const X86Subtarget &Subtarget,
9788 SelectionDAG &DAG) {
9789 // We need at least 2 non-undef elements to make this worthwhile by default.
9790 unsigned NumNonUndefs =
9791 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9792 if (NumNonUndefs < 2)
9793 return SDValue();
9794
9795 // There are 4 sets of horizontal math operations distinguished by type:
9796 // int/FP at 128-bit/256-bit. Each type was introduced with a different
9797 // subtarget feature. Try to match those "native" patterns first.
9798 MVT VT = BV->getSimpleValueType(0);
9799 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9800 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9801 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9802 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9803 unsigned HOpcode;
9804 SDValue V0, V1;
9805 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9806 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9807 }
9808
9809 // Try harder to match 256-bit ops by using extract/concat.
9810 if (!Subtarget.hasAVX() || !VT.is256BitVector())
9811 return SDValue();
9812
9813 // Count the number of UNDEF operands in the input build_vector.
9814 unsigned NumElts = VT.getVectorNumElements();
9815 unsigned Half = NumElts / 2;
9816 unsigned NumUndefsLO = 0;
9817 unsigned NumUndefsHI = 0;
9818 for (unsigned i = 0, e = Half; i != e; ++i)
9819 if (BV->getOperand(i)->isUndef())
9820 NumUndefsLO++;
9821
9822 for (unsigned i = Half, e = NumElts; i != e; ++i)
9823 if (BV->getOperand(i)->isUndef())
9824 NumUndefsHI++;
9825
9826 SDLoc DL(BV);
9827 SDValue InVec0, InVec1;
9828 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9829 SDValue InVec2, InVec3;
9830 unsigned X86Opcode;
9831 bool CanFold = true;
9832
9833 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9834 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9835 InVec3) &&
9836 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9837 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9838 X86Opcode = X86ISD::HADD;
9839 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9840 InVec1) &&
9841 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9842 InVec3) &&
9843 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9844 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9845 X86Opcode = X86ISD::HSUB;
9846 else
9847 CanFold = false;
9848
9849 if (CanFold) {
9850 // Do not try to expand this build_vector into a pair of horizontal
9851 // add/sub if we can emit a pair of scalar add/sub.
9852 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9853 return SDValue();
9854
9855 // Convert this build_vector into a pair of horizontal binops followed by
9856 // a concat vector. We must adjust the outputs from the partial horizontal
9857 // matching calls above to account for undefined vector halves.
9858 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9859 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9860 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?")((void)0);
9861 bool isUndefLO = NumUndefsLO == Half;
9862 bool isUndefHI = NumUndefsHI == Half;
9863 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9864 isUndefHI);
9865 }
9866 }
9867
9868 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9869 VT == MVT::v16i16) {
9870 unsigned X86Opcode;
9871 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9872 X86Opcode = X86ISD::HADD;
9873 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9874 InVec1))
9875 X86Opcode = X86ISD::HSUB;
9876 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9877 InVec1))
9878 X86Opcode = X86ISD::FHADD;
9879 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9880 InVec1))
9881 X86Opcode = X86ISD::FHSUB;
9882 else
9883 return SDValue();
9884
9885 // Don't try to expand this build_vector into a pair of horizontal add/sub
9886 // if we can simply emit a pair of scalar add/sub.
9887 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9888 return SDValue();
9889
9890 // Convert this build_vector into two horizontal add/sub followed by
9891 // a concat vector.
9892 bool isUndefLO = NumUndefsLO == Half;
9893 bool isUndefHI = NumUndefsHI == Half;
9894 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9895 isUndefLO, isUndefHI);
9896 }
9897
9898 return SDValue();
9899}
9900
9901static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
9902 SelectionDAG &DAG);
9903
9904/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9905 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
9906 /// just apply the bit operation to the vectors.
9907 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
9908/// from this, but enough scalar bit operations are created from the later
9909/// legalization + scalarization stages to need basic support.
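/// For illustration (operand names are hypothetical), a v4i32 build_vector
///   (build_vector (shl %a, 2), (shl %b, 2), (shl %c, 2), (shl %d, 2))
/// is rebuilt as (shl (build_vector %a, %b, %c, %d), splat(2)), and the shift
/// is then lowered immediately via LowerShift below.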
9910static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9911 const X86Subtarget &Subtarget,
9912 SelectionDAG &DAG) {
9913 SDLoc DL(Op);
9914 MVT VT = Op->getSimpleValueType(0);
9915 unsigned NumElems = VT.getVectorNumElements();
9916 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9917
9918 // Check that all elements have the same opcode.
9919 // TODO: Should we allow UNDEFS and if so how many?
9920 unsigned Opcode = Op->getOperand(0).getOpcode();
9921 for (unsigned i = 1; i < NumElems; ++i)
9922 if (Opcode != Op->getOperand(i).getOpcode())
9923 return SDValue();
9924
9925 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9926 bool IsShift = false;
9927 switch (Opcode) {
9928 default:
9929 return SDValue();
9930 case ISD::SHL:
9931 case ISD::SRL:
9932 case ISD::SRA:
9933 IsShift = true;
9934 break;
9935 case ISD::AND:
9936 case ISD::XOR:
9937 case ISD::OR:
9938 // Don't do this if the buildvector is a splat - we'd replace one
9939 // constant with an entire vector.
9940 if (Op->getSplatValue())
9941 return SDValue();
9942 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9943 return SDValue();
9944 break;
9945 }
9946
9947 SmallVector<SDValue, 4> LHSElts, RHSElts;
9948 for (SDValue Elt : Op->ops()) {
9949 SDValue LHS = Elt.getOperand(0);
9950 SDValue RHS = Elt.getOperand(1);
9951
9952 // We expect the canonicalized RHS operand to be the constant.
9953 if (!isa<ConstantSDNode>(RHS))
9954 return SDValue();
9955
9956 // Extend shift amounts.
9957 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9958 if (!IsShift)
9959 return SDValue();
9960 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9961 }
9962
9963 LHSElts.push_back(LHS);
9964 RHSElts.push_back(RHS);
9965 }
9966
9967 // Limit to shifts by uniform immediates.
9968 // TODO: Only accept vXi8/vXi64 special cases?
9969 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9970 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9971 return SDValue();
9972
9973 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9974 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9975 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
9976
9977 if (!IsShift)
9978 return Res;
9979
9980 // Immediately lower the shift to ensure the constant build vector doesn't
9981 // get converted to a constant pool before the shift is lowered.
9982 return LowerShift(Res, Subtarget, DAG);
9983}
9984
9985/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9986/// functionality to do this, so it's all zeros, all ones, or some derivation
9987/// that is cheap to calculate.
9988static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9989 const X86Subtarget &Subtarget) {
9990 SDLoc DL(Op);
9991 MVT VT = Op.getSimpleValueType();
9992
9993 // Vectors containing all zeros can be matched by pxor and xorps.
9994 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9995 return Op;
9996
9997 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9998 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9999 // vpcmpeqd on 256-bit vectors.
10000 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10001 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10002 return Op;
10003
10004 return getOnesVector(VT, DAG, DL);
10005 }
10006
10007 return SDValue();
10008}
10009
10010/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10011/// from a vector of source values and a vector of extraction indices.
10012/// The vectors might be manipulated to match the type of the permute op.
10013static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10014 SDLoc &DL, SelectionDAG &DAG,
10015 const X86Subtarget &Subtarget) {
10016 MVT ShuffleVT = VT;
10017 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10018 unsigned NumElts = VT.getVectorNumElements();
10019 unsigned SizeInBits = VT.getSizeInBits();
10020
10021 // Adjust IndicesVec to match VT size.
10022 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&((void)0)
10023 "Illegal variable permute mask size")((void)0);
10024 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10025 // Narrow/widen the indices vector to the correct size.
10026 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10027 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10028 NumElts * VT.getScalarSizeInBits());
10029 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10030 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10031 SDLoc(IndicesVec), SizeInBits);
10032 // Zero-extend the index elements within the vector.
10033 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10034 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10035 IndicesVT, IndicesVec);
10036 }
10037 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10038
10039 // Handle a SrcVec that doesn't match the VT size.
10040 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10041 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10042 // Handle larger SrcVec by treating it as a larger permute.
10043 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10044 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10045 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10046 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10047 Subtarget, DAG, SDLoc(IndicesVec));
10048 SDValue NewSrcVec =
10049 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10050 if (NewSrcVec)
10051 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10052 return SDValue();
10053 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10054 // Widen smaller SrcVec to match VT.
10055 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10056 } else
10057 return SDValue();
10058 }
10059
10060 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10061 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale")((void)0);
10062 EVT SrcVT = Idx.getValueType();
10063 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10064 uint64_t IndexScale = 0;
10065 uint64_t IndexOffset = 0;
10066
10067 // If we're scaling a smaller permute op, then we need to repeat the
10068 // indices, scaling and offsetting them as well.
10069 // e.g. v4i32 -> v16i8 (Scale = 4)
10070 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10071 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
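// Worked example (illustrative): with Scale = 4, an original i32 index value
// of 2 is multiplied by 0x04040404 and offset by 0x03020100, producing the
// byte indices 8, 9, 10 and 11, i.e. the four bytes of source element 2.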
10072 for (uint64_t i = 0; i != Scale; ++i) {
10073 IndexScale |= Scale << (i * NumDstBits);
10074 IndexOffset |= i << (i * NumDstBits);
10075 }
10076
10077 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10078 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10079 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10080 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10081 return Idx;
10082 };
10083
10084 unsigned Opcode = 0;
10085 switch (VT.SimpleTy) {
10086 default:
10087 break;
10088 case MVT::v16i8:
10089 if (Subtarget.hasSSSE3())
10090 Opcode = X86ISD::PSHUFB;
10091 break;
10092 case MVT::v8i16:
10093 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10094 Opcode = X86ISD::VPERMV;
10095 else if (Subtarget.hasSSSE3()) {
10096 Opcode = X86ISD::PSHUFB;
10097 ShuffleVT = MVT::v16i8;
10098 }
10099 break;
10100 case MVT::v4f32:
10101 case MVT::v4i32:
10102 if (Subtarget.hasAVX()) {
10103 Opcode = X86ISD::VPERMILPV;
10104 ShuffleVT = MVT::v4f32;
10105 } else if (Subtarget.hasSSSE3()) {
10106 Opcode = X86ISD::PSHUFB;
10107 ShuffleVT = MVT::v16i8;
10108 }
10109 break;
10110 case MVT::v2f64:
10111 case MVT::v2i64:
10112 if (Subtarget.hasAVX()) {
10113 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10114 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10115 Opcode = X86ISD::VPERMILPV;
10116 ShuffleVT = MVT::v2f64;
10117 } else if (Subtarget.hasSSE41()) {
10118 // SSE41 can compare v2i64 - select between indices 0 and 1.
10119 return DAG.getSelectCC(
10120 DL, IndicesVec,
10121 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10122 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10123 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10124 ISD::CondCode::SETEQ);
10125 }
10126 break;
10127 case MVT::v32i8:
10128 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10129 Opcode = X86ISD::VPERMV;
10130 else if (Subtarget.hasXOP()) {
10131 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10132 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10133 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10134 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10135 return DAG.getNode(
10136 ISD::CONCAT_VECTORS, DL, VT,
10137 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10138 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10139 } else if (Subtarget.hasAVX()) {
10140 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10141 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10142 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10143 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10144 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10145 ArrayRef<SDValue> Ops) {
10146 // Permute Lo and Hi and then select based on index range.
10147 // This works because PSHUFB uses bits[3:0] to permute elements and we
10148 // don't care about bit[7], as it's just an index vector.
10149 SDValue Idx = Ops[2];
10150 EVT VT = Idx.getValueType();
10151 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10152 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10153 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10154 ISD::CondCode::SETGT);
10155 };
10156 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10157 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10158 PSHUFBBuilder);
10159 }
10160 break;
10161 case MVT::v16i16:
10162 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10163 Opcode = X86ISD::VPERMV;
10164 else if (Subtarget.hasAVX()) {
10165 // Scale to v32i8 and perform as v32i8.
10166 IndicesVec = ScaleIndices(IndicesVec, 2);
10167 return DAG.getBitcast(
10168 VT, createVariablePermute(
10169 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10170 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10171 }
10172 break;
10173 case MVT::v8f32:
10174 case MVT::v8i32:
10175 if (Subtarget.hasAVX2())
10176 Opcode = X86ISD::VPERMV;
10177 else if (Subtarget.hasAVX()) {
10178 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10179 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10180 {0, 1, 2, 3, 0, 1, 2, 3});
10181 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10182 {4, 5, 6, 7, 4, 5, 6, 7});
10183 if (Subtarget.hasXOP())
10184 return DAG.getBitcast(
10185 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10186 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10187 // Permute Lo and Hi and then select based on index range.
10188 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10189 SDValue Res = DAG.getSelectCC(
10190 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10191 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10192 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10193 ISD::CondCode::SETGT);
10194 return DAG.getBitcast(VT, Res);
10195 }
10196 break;
10197 case MVT::v4i64:
10198 case MVT::v4f64:
10199 if (Subtarget.hasAVX512()) {
10200 if (!Subtarget.hasVLX()) {
10201 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10202 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10203 SDLoc(SrcVec));
10204 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10205 DAG, SDLoc(IndicesVec));
10206 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10207 DAG, Subtarget);
10208 return extract256BitVector(Res, 0, DAG, DL);
10209 }
10210 Opcode = X86ISD::VPERMV;
10211 } else if (Subtarget.hasAVX()) {
10212 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10213 SDValue LoLo =
10214 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10215 SDValue HiHi =
10216 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10217 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10218 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10219 if (Subtarget.hasXOP())
10220 return DAG.getBitcast(
10221 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10222 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10223 // Permute Lo and Hi and then select based on index range.
10224 // This works as VPERMILPD only uses index bit[1] to permute elements.
10225 SDValue Res = DAG.getSelectCC(
10226 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10227 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10228 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10229 ISD::CondCode::SETGT);
10230 return DAG.getBitcast(VT, Res);
10231 }
10232 break;
10233 case MVT::v64i8:
10234 if (Subtarget.hasVBMI())
10235 Opcode = X86ISD::VPERMV;
10236 break;
10237 case MVT::v32i16:
10238 if (Subtarget.hasBWI())
10239 Opcode = X86ISD::VPERMV;
10240 break;
10241 case MVT::v16f32:
10242 case MVT::v16i32:
10243 case MVT::v8f64:
10244 case MVT::v8i64:
10245 if (Subtarget.hasAVX512())
10246 Opcode = X86ISD::VPERMV;
10247 break;
10248 }
10249 if (!Opcode)
10250 return SDValue();
10251
10252 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&((void)0)
10253 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&((void)0)
10254 "Illegal variable permute shuffle type")((void)0);
10255
10256 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10257 if (Scale > 1)
10258 IndicesVec = ScaleIndices(IndicesVec, Scale);
10259
10260 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10261 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10262
10263 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10264 SDValue Res = Opcode == X86ISD::VPERMV
10265 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10266 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10267 return DAG.getBitcast(VT, Res);
10268}
10269
10270 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10271 // shown to be a permutation of a vector by indices in a non-constant vector.
10272// (build_vector (extract_elt V, (extract_elt I, 0)),
10273// (extract_elt V, (extract_elt I, 1)),
10274// ...
10275// ->
10276// (vpermv I, V)
10277//
10278// TODO: Handle undefs
10279// TODO: Utilize pshufb and zero mask blending to support more efficient
10280// construction of vectors with constant-0 elements.
10281static SDValue
10282LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10283 const X86Subtarget &Subtarget) {
10284 SDValue SrcVec, IndicesVec;
10285 // Check for a match of the permute source vector and permute index elements.
10286 // This is done by checking that the i-th build_vector operand is of the form:
10287 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10288 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10289 SDValue Op = V.getOperand(Idx);
10290 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10291 return SDValue();
10292
10293 // If this is the first extract encountered in V, set the source vector,
10294 // otherwise verify the extract is from the previously defined source
10295 // vector.
10296 if (!SrcVec)
10297 SrcVec = Op.getOperand(0);
10298 else if (SrcVec != Op.getOperand(0))
10299 return SDValue();
10300 SDValue ExtractedIndex = Op->getOperand(1);
10301 // Peek through extends.
10302 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10303 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10304 ExtractedIndex = ExtractedIndex.getOperand(0);
10305 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10306 return SDValue();
10307
10308 // If this is the first extract from the index vector candidate, set the
10309 // indices vector, otherwise verify the extract is from the previously
10310 // defined indices vector.
10311 if (!IndicesVec)
10312 IndicesVec = ExtractedIndex.getOperand(0);
10313 else if (IndicesVec != ExtractedIndex.getOperand(0))
10314 return SDValue();
10315
10316 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10317 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10318 return SDValue();
10319 }
10320
10321 SDLoc DL(V);
10322 MVT VT = V.getSimpleValueType();
10323 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10324}
10325
10326SDValue
10327X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10328 SDLoc dl(Op);
10329
10330 MVT VT = Op.getSimpleValueType();
10331 MVT EltVT = VT.getVectorElementType();
10332 unsigned NumElems = Op.getNumOperands();
10333
10334 // Generate vectors for predicate vectors.
10335 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10336 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10337
10338 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10339 return VectorConstant;
10340
10341 unsigned EVTBits = EltVT.getSizeInBits();
10342 APInt UndefMask = APInt::getNullValue(NumElems);
10343 APInt ZeroMask = APInt::getNullValue(NumElems);
10344 APInt NonZeroMask = APInt::getNullValue(NumElems);
10345 bool IsAllConstants = true;
10346 SmallSet<SDValue, 8> Values;
10347 unsigned NumConstants = NumElems;
10348 for (unsigned i = 0; i < NumElems; ++i) {
10349 SDValue Elt = Op.getOperand(i);
10350 if (Elt.isUndef()) {
10351 UndefMask.setBit(i);
10352 continue;
10353 }
10354 Values.insert(Elt);
10355 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10356 IsAllConstants = false;
10357 NumConstants--;
10358 }
10359 if (X86::isZeroNode(Elt)) {
10360 ZeroMask.setBit(i);
10361 } else {
10362 NonZeroMask.setBit(i);
10363 }
10364 }
10365
10366 // All undef vector. Return an UNDEF. All zero vectors were handled above.
10367 if (NonZeroMask == 0) {
10368 assert(UndefMask.isAllOnesValue() && "Fully undef mask expected")((void)0);
10369 return DAG.getUNDEF(VT);
10370 }
10371
10372 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10373
10374 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10375 // lowering to a smaller build vector and padding with undef/zero.
10376 if ((VT.is256BitVector() || VT.is512BitVector()) &&
10377 !isFoldableUseOfShuffle(BV)) {
10378 unsigned UpperElems = NumElems / 2;
10379 APInt UndefOrZeroMask = UndefMask | ZeroMask;
10380 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10381 if (NumUpperUndefsOrZeros >= UpperElems) {
10382 if (VT.is512BitVector() &&
10383 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10384 UpperElems = NumElems - (NumElems / 4);
10385 bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10386 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10387 SDValue NewBV =
10388 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10389 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10390 }
10391 }
10392
10393 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10394 return AddSub;
10395 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10396 return HorizontalOp;
10397 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10398 return Broadcast;
10399 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10400 return BitOp;
10401
10402 unsigned NumZero = ZeroMask.countPopulation();
10403 unsigned NumNonZero = NonZeroMask.countPopulation();
10404
10405 // If we are inserting one variable into a vector of non-zero constants, try
10406 // to avoid loading each constant element as a scalar. Load the constants as a
10407 // vector and then insert the variable scalar element. If insertion is not
10408 // supported, fall back to a shuffle to get the scalar blended with the
10409 // constants. Insertion into a zero vector is handled as a special-case
10410 // somewhere below here.
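// For illustration (values here are hypothetical): a v4f32 build_vector
// (1.0, 2.0, %x, 4.0) becomes a constant-pool load of <1.0, 2.0, undef, 4.0>
// followed by an INSERT_VECTOR_ELT of %x at index 2.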
10411 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10412 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10413 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10414 // Create an all-constant vector. The variable element in the old
10415 // build vector is replaced by undef in the constant vector. Save the
10416 // variable scalar element and its index for use in the insertelement.
10417 LLVMContext &Context = *DAG.getContext();
10418 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10419 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10420 SDValue VarElt;
10421 SDValue InsIndex;
10422 for (unsigned i = 0; i != NumElems; ++i) {
10423 SDValue Elt = Op.getOperand(i);
10424 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10425 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10426 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10427 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10428 else if (!Elt.isUndef()) {
10429 assert(!VarElt.getNode() && !InsIndex.getNode() &&((void)0)
10430 "Expected one variable element in this vector")((void)0);
10431 VarElt = Elt;
10432 InsIndex = DAG.getVectorIdxConstant(i, dl);
10433 }
10434 }
10435 Constant *CV = ConstantVector::get(ConstVecOps);
10436 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10437
10438 // The constants we just created may not be legal (e.g., floating point). We
10439 // must lower the vector right here because we cannot guarantee that we'll
10440 // legalize it before loading it. This is also why we could not just create
10441 // a new build vector here. If the build vector contains illegal constants,
10442 // it could get split back up into a series of insert elements.
10443 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10444 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10445 MachineFunction &MF = DAG.getMachineFunction();
10446 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10447 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10448 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10449 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10450 if (InsertC < NumEltsInLow128Bits)
10451 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10452
10453 // There's no good way to insert into the high elements of a >128-bit
10454 // vector, so use shuffles to avoid an extract/insert sequence.
10455 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?")((void)0);
10456 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector")((void)0);
10457 SmallVector<int, 8> ShuffleMask;
10458 unsigned NumElts = VT.getVectorNumElements();
10459 for (unsigned i = 0; i != NumElts; ++i)
10460 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10461 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10462 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10463 }
10464
10465 // Special case for single non-zero, non-undef, element.
10466 if (NumNonZero == 1) {
10467 unsigned Idx = NonZeroMask.countTrailingZeros();
10468 SDValue Item = Op.getOperand(Idx);
10469
10470 // If we have a constant or non-constant insertion into the low element of
10471 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10472 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10473 // depending on what the source datatype is.
10474 if (Idx == 0) {
10475 if (NumZero == 0)
10476 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10477
10478 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
10479 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
10480 assert((VT.is128BitVector() || VT.is256BitVector() ||((void)0)
10481 VT.is512BitVector()) &&((void)0)
10482 "Expected an SSE value type!")((void)0);
10483 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10484 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
10485 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10486 }
10487
10488 // We can't directly insert an i8 or i16 into a vector, so zero extend
10489 // it to i32 first.
10490 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10491 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10492 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10493 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10494 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10495 return DAG.getBitcast(VT, Item);
10496 }
10497 }
10498
10499 // Is it a vector logical left shift?
10500 if (NumElems == 2 && Idx == 1 &&
10501 X86::isZeroNode(Op.getOperand(0)) &&
10502 !X86::isZeroNode(Op.getOperand(1))) {
10503 unsigned NumBits = VT.getSizeInBits();
10504 return getVShift(true, VT,
10505 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10506 VT, Op.getOperand(1)),
10507 NumBits/2, DAG, *this, dl);
10508 }
10509
10510 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10511 return SDValue();
10512
10513 // Otherwise, if this is a vector with i32 or f32 elements, and the element
10514 // is a non-constant being inserted into an element other than the low one,
10515 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
10516 // movd/movss) to move this into the low element, then shuffle it into
10517 // place.
10518 if (EVTBits == 32) {
10519 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10520 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10521 }
10522 }
10523
10524 // Splat is obviously ok. Let legalizer expand it to a shuffle.
10525 if (Values.size() == 1) {
10526 if (EVTBits == 32) {
10527 // Instead of a shuffle like this:
10528 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10529 // Check if it's possible to issue this instead.
10530 // shuffle (vload ptr), undef, <1, 1, 1, 1>
10531 unsigned Idx = NonZeroMask.countTrailingZeros();
10532 SDValue Item = Op.getOperand(Idx);
10533 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10534 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10535 }
10536 return SDValue();
10537 }
10538
10539 // A vector full of immediates; various special cases are already
10540 // handled, so this is best done with a single constant-pool load.
10541 if (IsAllConstants)
10542 return SDValue();
10543
10544 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10545 return V;
10546
10547 // See if we can use a vector load to get all of the elements.
10548 {
10549 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10550 if (SDValue LD =
10551 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10552 return LD;
10553 }
10554
10555 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10556 // build_vector and broadcast it.
10557 // TODO: We could probably generalize this more.
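// For illustration (operand names are hypothetical): a v8f32 build_vector
// (a, b, a, b, a, b, a, b) is built as a v4f32 (a, b, undef, undef), bitcast
// to v2f64, broadcast to v4f64 and bitcast back to v8f32.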
10558 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10559 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10560 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10561 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10562 // Make sure all the even/odd operands match.
10563 for (unsigned i = 2; i != NumElems; ++i)
10564 if (Ops[i % 2] != Op.getOperand(i))
10565 return false;
10566 return true;
10567 };
10568 if (CanSplat(Op, NumElems, Ops)) {
10569 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10570 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10571 // Create a new build vector and cast to v2i64/v2f64.
10572 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10573 DAG.getBuildVector(NarrowVT, dl, Ops));
10574 // Broadcast from v2i64/v2f64 and cast to final VT.
10575 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10576 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10577 NewBV));
10578 }
10579 }
10580
10581 // For AVX-length vectors, build the individual 128-bit pieces and use
10582 // shuffles to put them in place.
10583 if (VT.getSizeInBits() > 128) {
10584 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10585
10586 // Build both the lower and upper subvector.
10587 SDValue Lower =
10588 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10589 SDValue Upper = DAG.getBuildVector(
10590 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10591
10592 // Recreate the wider vector with the lower and upper part.
10593 return concatSubVectors(Lower, Upper, DAG, dl);
10594 }
10595
10596 // Let legalizer expand 2-wide build_vectors.
10597 if (EVTBits == 64) {
10598 if (NumNonZero == 1) {
10599 // One half is zero or undef.
10600 unsigned Idx = NonZeroMask.countTrailingZeros();
10601 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10602 Op.getOperand(Idx));
10603 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10604 }
10605 return SDValue();
10606 }
10607
10608 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10609 if (EVTBits == 8 && NumElems == 16)
10610 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
10611 DAG, Subtarget))
10612 return V;
10613
10614 if (EVTBits == 16 && NumElems == 8)
10615 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
10616 DAG, Subtarget))
10617 return V;
10618
10619 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10620 if (EVTBits == 32 && NumElems == 4)
10621 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10622 return V;
10623
10624 // If element VT is == 32 bits, turn it into a number of shuffles.
10625 if (NumElems == 4 && NumZero > 0) {
10626 SmallVector<SDValue, 8> Ops(NumElems);
10627 for (unsigned i = 0; i < 4; ++i) {
10628 bool isZero = !NonZeroMask[i];
10629 if (isZero)
10630 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10631 else
10632 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10633 }
10634
10635 for (unsigned i = 0; i < 2; ++i) {
10636 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10637 default: llvm_unreachable("Unexpected NonZero count")__builtin_unreachable();
10638 case 0:
10639 Ops[i] = Ops[i*2]; // Must be a zero vector.
10640 break;
10641 case 1:
10642 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10643 break;
10644 case 2:
10645 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10646 break;
10647 case 3:
10648 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10649 break;
10650 }
10651 }
10652
10653 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10654 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10655 int MaskVec[] = {
10656 Reverse1 ? 1 : 0,
10657 Reverse1 ? 0 : 1,
10658 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10659 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10660 };
10661 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10662 }
10663
10664 assert(Values.size() > 1 && "Expected non-undef and non-splat vector")((void)0);
10665
10666 // Check for a build vector from mostly shuffle plus few inserting.
10667 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10668 return Sh;
10669
10670 // For SSE 4.1, use insertps to put the high elements into the low element.
10671 if (Subtarget.hasSSE41()) {
10672 SDValue Result;
10673 if (!Op.getOperand(0).isUndef())
10674 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10675 else
10676 Result = DAG.getUNDEF(VT);
10677
10678 for (unsigned i = 1; i < NumElems; ++i) {
10679 if (Op.getOperand(i).isUndef()) continue;
10680 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10681 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10682 }
10683 return Result;
10684 }
10685
10686 // Otherwise, expand into a number of unpckl*; start by extending each of
10687 // our (non-undef) elements to the full vector width with the element in the
10688 // bottom slot of the vector (which generates no code for SSE).
10689 SmallVector<SDValue, 8> Ops(NumElems);
10690 for (unsigned i = 0; i < NumElems; ++i) {
10691 if (!Op.getOperand(i).isUndef())
10692 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10693 else
10694 Ops[i] = DAG.getUNDEF(VT);
10695 }
10696
10697 // Next, we iteratively mix elements, e.g. for v4f32:
10698 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10699 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10700 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10701 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10702 // Generate scaled UNPCKL shuffle mask.
10703 SmallVector<int, 16> Mask;
10704 for(unsigned i = 0; i != Scale; ++i)
10705 Mask.push_back(i);
10706 for (unsigned i = 0; i != Scale; ++i)
10707 Mask.push_back(NumElems+i);
10708 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10709
10710 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10711 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10712 }
10713 return Ops[0];
10714}
10715
10716// 256-bit AVX can use the vinsertf128 instruction
10717// to create 256-bit vectors from two other 128-bit ones.
10718// TODO: Detect subvector broadcast here instead of DAG combine?
10719static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10720 const X86Subtarget &Subtarget) {
10721 SDLoc dl(Op);
10722 MVT ResVT = Op.getSimpleValueType();
10723
10724 assert((ResVT.is256BitVector() ||((void)0)
10725 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide")((void)0);
10726
10727 unsigned NumOperands = Op.getNumOperands();
10728 unsigned NumZero = 0;
10729 unsigned NumNonZero = 0;
10730 unsigned NonZeros = 0;
10731 for (unsigned i = 0; i != NumOperands; ++i) {
10732 SDValue SubVec = Op.getOperand(i);
10733 if (SubVec.isUndef())
10734 continue;
10735 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10736 ++NumZero;
10737 else {
10738 assert(i < sizeof(NonZeros) * CHAR_BIT)((void)0); // Ensure the shift is in range.
10739 NonZeros |= 1 << i;
10740 ++NumNonZero;
10741 }
10742 }
10743
10744 // If we have more than 2 non-zeros, build each half separately.
10745 if (NumNonZero > 2) {
10746 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10747 ArrayRef<SDUse> Ops = Op->ops();
10748 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10749 Ops.slice(0, NumOperands/2));
10750 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10751 Ops.slice(NumOperands/2));
10752 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10753 }
10754
10755 // Otherwise, build it up through insert_subvectors.
10756 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10757 : DAG.getUNDEF(ResVT);
10758
10759 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10760 unsigned NumSubElems = SubVT.getVectorNumElements();
10761 for (unsigned i = 0; i != NumOperands; ++i) {
10762 if ((NonZeros & (1 << i)) == 0)
10763 continue;
10764
10765 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10766 Op.getOperand(i),
10767 DAG.getIntPtrConstant(i * NumSubElems, dl));
10768 }
10769
10770 return Vec;
10771}
10772
10773// Returns true if the given node is a type promotion (by concatenating i1
10774// zeros) of the result of a node that already zeros all upper bits of
10775// k-register.
10776// TODO: Merge this with LowerAVXCONCAT_VECTORS?
10777static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10778 const X86Subtarget &Subtarget,
10779 SelectionDAG & DAG) {
10780 SDLoc dl(Op);
10781 MVT ResVT = Op.getSimpleValueType();
10782 unsigned NumOperands = Op.getNumOperands();
10783
10784 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&((void)0)
10785 "Unexpected number of operands in CONCAT_VECTORS")((void)0);
10786
10787 uint64_t Zeros = 0;
10788 uint64_t NonZeros = 0;
10789 for (unsigned i = 0; i != NumOperands; ++i) {
10790 SDValue SubVec = Op.getOperand(i);
10791 if (SubVec.isUndef())
10792 continue;
10793 assert(i < sizeof(NonZeros) * CHAR_BIT)((void)0); // Ensure the shift is in range.
10794 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10795 Zeros |= (uint64_t)1 << i;
10796 else
10797 NonZeros |= (uint64_t)1 << i;
10798 }
10799
10800 unsigned NumElems = ResVT.getVectorNumElements();
10801
10802 // If we are inserting a non-zero vector and there are zeros in the LSBs and
10803 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
10804 // insert_subvector will give us two kshifts.
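// For illustration (a hypothetical case): a concat_vectors of
// (v2i1 zeroinitializer, v2i1 %X, undef, undef) has NonZeros == 0b0010 and
// Zeros == 0b0001, so %X is widened and moved into result elements 2..3 with
// a single KSHIFTL by 2.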
10805 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10806 Log2_64(NonZeros) != NumOperands - 1) {
10807 MVT ShiftVT = ResVT;
10808 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10809 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10810 unsigned Idx = Log2_64(NonZeros);
10811 SDValue SubVec = Op.getOperand(Idx);
10812 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10813 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10814 DAG.getUNDEF(ShiftVT), SubVec,
10815 DAG.getIntPtrConstant(0, dl));
10816 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10817 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10818 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10819 DAG.getIntPtrConstant(0, dl));
10820 }
10821
10822 // If there are zero or one non-zero operands, we can handle this very simply.
10823 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10824 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10825 if (!NonZeros)
10826 return Vec;
10827 unsigned Idx = Log2_64(NonZeros);
10828 SDValue SubVec = Op.getOperand(Idx);
10829 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10830 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10831 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10832 }
10833
10834 if (NumOperands > 2) {
10835 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10836 ArrayRef<SDUse> Ops = Op->ops();
10837 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10838 Ops.slice(0, NumOperands/2));
10839 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10840 Ops.slice(NumOperands/2));
10841 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10842 }
10843
10844 assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?")((void)0);
10845
10846 if (ResVT.getVectorNumElements() >= 16)
10847 return Op; // The operation is legal with KUNPCK
10848
10849 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10850 DAG.getUNDEF(ResVT), Op.getOperand(0),
10851 DAG.getIntPtrConstant(0, dl));
10852 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10853 DAG.getIntPtrConstant(NumElems/2, dl));
10854}
10855
10856static SDValue LowerCONCAT_VECTORS(SDValue Op,
10857 const X86Subtarget &Subtarget,
10858 SelectionDAG &DAG) {
10859 MVT VT = Op.getSimpleValueType();
10860 if (VT.getVectorElementType() == MVT::i1)
10861 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10862
10863 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||((void)0)
10864 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||((void)0)
10865 Op.getNumOperands() == 4)))((void)0);
10866
10867 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10868 // from two other 128-bit ones.
10869
10870 // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
10871 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10872}
10873
10874//===----------------------------------------------------------------------===//
10875// Vector shuffle lowering
10876//
10877// This is an experimental code path for lowering vector shuffles on x86. It is
10878// designed to handle arbitrary vector shuffles and blends, gracefully
10879// degrading performance as necessary. It works hard to recognize idiomatic
10880// shuffles and lower them to optimal instruction patterns without leaving
10881// a framework that allows reasonably efficient handling of all vector shuffle
10882// patterns.
10883//===----------------------------------------------------------------------===//
10884
10885/// Tiny helper function to identify a no-op mask.
10886///
10887/// This is a somewhat boring predicate function. It checks whether the mask
10888/// array input, which is assumed to be a single-input shuffle mask of the kind
10889/// used by the X86 shuffle instructions (not a fully general
10890/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10891/// in-place shuffle are 'no-op's.
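/// For example (illustrative): the mask <-1, 1, -1, 3> is a no-op, while
/// <1, 0, 2, 3> is not, since its first two elements are swapped.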
10892static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10893 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10894 assert(Mask[i] >= -1 && "Out of bound mask element!")((void)0);
10895 if (Mask[i] >= 0 && Mask[i] != i)
10896 return false;
10897 }
10898 return true;
10899}
10900
10901/// Test whether there are elements crossing LaneSizeInBits lanes in this
10902/// shuffle mask.
10903///
10904/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10905/// and we routinely test for these.
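/// For example (illustrative): for v8i32, the mask <4, 5, 6, 7, 0, 1, 2, 3>
/// crosses 128-bit lanes, whereas <1, 0, 3, 2, 5, 4, 7, 6> stays within them.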
10906static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10907 unsigned ScalarSizeInBits,
10908 ArrayRef<int> Mask) {
10909 assert(LaneSizeInBits && ScalarSizeInBits &&((void)0)
10910 (LaneSizeInBits % ScalarSizeInBits) == 0 &&((void)0)
10911 "Illegal shuffle lane size")((void)0);
10912 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10913 int Size = Mask.size();
10914 for (int i = 0; i < Size; ++i)
10915 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10916 return true;
10917 return false;
10918}
10919
10920/// Test whether there are elements crossing 128-bit lanes in this
10921/// shuffle mask.
10922static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10923 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10924}
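// Illustrative example: for v8f32 (two 128-bit lanes of four f32 elements),
// the mask {0, 1, 2, 3, 4, 5, 6, 7} stays within its lanes, whereas
// {4, 5, 6, 7, 0, 1, 2, 3} does not: result element 0 is sourced from lane 1,
// so is128BitLaneCrossingShuffleMask returns true for it.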
10925
10926/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
10927/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
10928/// better support 'repeated mask + lane permute' style shuffles.
10929static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
10930 unsigned ScalarSizeInBits,
10931 ArrayRef<int> Mask) {
10932 assert(LaneSizeInBits && ScalarSizeInBits &&
10933 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10934 "Illegal shuffle lane size");
10935 int NumElts = Mask.size();
10936 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
10937 int NumLanes = NumElts / NumEltsPerLane;
10938 if (NumLanes > 1) {
10939 for (int i = 0; i != NumLanes; ++i) {
10940 int SrcLane = -1;
10941 for (int j = 0; j != NumEltsPerLane; ++j) {
10942 int M = Mask[(i * NumEltsPerLane) + j];
10943 if (M < 0)
10944 continue;
10945 int Lane = (M % NumElts) / NumEltsPerLane;
10946 if (SrcLane >= 0 && SrcLane != Lane)
10947 return true;
10948 SrcLane = Lane;
10949 }
10950 }
10951 }
10952 return false;
10953}
10954
10955/// Test whether a shuffle mask is equivalent within each sub-lane.
10956///
10957/// This checks a shuffle mask to see if it is performing the same
10958/// lane-relative shuffle in each sub-lane. This trivially implies
10959/// that it is also not lane-crossing. It may however involve a blend from the
10960/// same lane of a second vector.
10961///
10962/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10963/// non-trivial to compute in the face of undef lanes. The representation is
10964/// suitable for use with existing 128-bit shuffles as entries from the second
10965/// vector have been remapped to [LaneSize, 2*LaneSize).
10966static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10967 ArrayRef<int> Mask,
10968 SmallVectorImpl<int> &RepeatedMask) {
10969 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10970 RepeatedMask.assign(LaneSize, -1);
10971 int Size = Mask.size();
10972 for (int i = 0; i < Size; ++i) {
10973 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10974 if (Mask[i] < 0)
10975 continue;
10976 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10977 // This entry crosses lanes, so there is no way to model this shuffle.
10978 return false;
10979
10980 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10981 // Adjust second vector indices to start at LaneSize instead of Size.
10982 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10983 : Mask[i] % LaneSize + LaneSize;
10984 if (RepeatedMask[i % LaneSize] < 0)
10985 // This is the first non-undef entry in this slot of a 128-bit lane.
10986 RepeatedMask[i % LaneSize] = LocalM;
10987 else if (RepeatedMask[i % LaneSize] != LocalM)
10988 // Found a mismatch with the repeated mask.
10989 return false;
10990 }
10991 return true;
10992}
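// Illustrative example: for v8f32, the blend-style mask
// {0, 9, 2, 11, 4, 13, 6, 15} performs the same lane-relative shuffle in both
// 128-bit lanes, so is128BitLaneRepeatedShuffleMask returns true with
// RepeatedMask = {0, 5, 2, 7}; second-vector entries are remapped into
// [LaneSize, 2*LaneSize) = [4, 8).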
10993
10994/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10995static bool
10996is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10997 SmallVectorImpl<int> &RepeatedMask) {
10998 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10999}
11000
11001static bool
11002is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11003 SmallVector<int, 32> RepeatedMask;
11004 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11005}
11006
11007/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11008static bool
11009is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11010 SmallVectorImpl<int> &RepeatedMask) {
11011 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11012}
11013
11014/// Test whether a target shuffle mask is equivalent within each sub-lane.
11015/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11016static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11017 unsigned EltSizeInBits,
11018 ArrayRef<int> Mask,
11019 SmallVectorImpl<int> &RepeatedMask) {
11020 int LaneSize = LaneSizeInBits / EltSizeInBits;
11021 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11022 int Size = Mask.size();
11023 for (int i = 0; i < Size; ++i) {
11024 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11025 if (Mask[i] == SM_SentinelUndef)
11026 continue;
11027 if (Mask[i] == SM_SentinelZero) {
11028 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11029 return false;
11030 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11031 continue;
11032 }
11033 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11034 // This entry crosses lanes, so there is no way to model this shuffle.
11035 return false;
11036
11037 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11038 // later vector indices to start at multiples of LaneSize instead of Size.
11039 int LaneM = Mask[i] / Size;
11040 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11041 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11042 // This is the first non-undef entry in this slot of a 128-bit lane.
11043 RepeatedMask[i % LaneSize] = LocalM;
11044 else if (RepeatedMask[i % LaneSize] != LocalM)
11045 // Found a mismatch with the repeated mask.
11046 return false;
11047 }
11048 return true;
11049}
11050
11051/// Test whether a target shuffle mask is equivalent within each sub-lane.
11052/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11053static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11054 ArrayRef<int> Mask,
11055 SmallVectorImpl<int> &RepeatedMask) {
11056 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11057 Mask, RepeatedMask);
11058}
11059
11060/// Checks whether the vector elements referenced by two shuffle masks are
11061/// equivalent.
11062static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11063 int Idx, int ExpectedIdx) {
11064 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11065 ExpectedIdx < MaskSize && "Out of range element index");
11066 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11067 return false;
11068
11069 switch (Op.getOpcode()) {
11070 case ISD::BUILD_VECTOR:
11071 // If the values are build vectors, we can look through them to find
11072 // equivalent inputs that make the shuffles equivalent.
11073 // TODO: Handle MaskSize != Op.getNumOperands()?
11074 if (MaskSize == (int)Op.getNumOperands() &&
11075 MaskSize == (int)ExpectedOp.getNumOperands())
11076 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11077 break;
11078 case X86ISD::VBROADCAST:
11079 case X86ISD::VBROADCAST_LOAD:
11080 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11081 return (Op == ExpectedOp &&
11082 (int)Op.getValueType().getVectorNumElements() == MaskSize);
11083 case X86ISD::HADD:
11084 case X86ISD::HSUB:
11085 case X86ISD::FHADD:
11086 case X86ISD::FHSUB:
11087 case X86ISD::PACKSS:
11088 case X86ISD::PACKUS:
11089 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11090 // TODO: Handle MaskSize != NumElts?
11091 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11092 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11093 MVT VT = Op.getSimpleValueType();
11094 int NumElts = VT.getVectorNumElements();
11095 if (MaskSize == NumElts) {
11096 int NumLanes = VT.getSizeInBits() / 128;
11097 int NumEltsPerLane = NumElts / NumLanes;
11098 int NumHalfEltsPerLane = NumEltsPerLane / 2;
11099 bool SameLane =
11100 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11101 bool SameElt =
11102 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11103 return SameLane && SameElt;
11104 }
11105 }
11106 break;
11107 }
11108
11109 return false;
11110}
11111
11112/// Checks whether a shuffle mask is equivalent to an explicit list of
11113/// arguments.
11114///
11115/// This is a fast way to test a shuffle mask against a fixed pattern:
11116///
11117/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11118///
11119/// It returns true if the mask is exactly as wide as ExpectedMask, and each
11120/// element of the mask is either -1 (signifying undef) or the value given in
11121/// ExpectedMask.
11122static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11123 SDValue V1 = SDValue(),
11124 SDValue V2 = SDValue()) {
11125 int Size = Mask.size();
11126 if (Size != (int)ExpectedMask.size())
11127 return false;
11128
11129 for (int i = 0; i < Size; ++i) {
11130 assert(Mask[i] >= -1 && "Out of bound mask element!");
11131 int MaskIdx = Mask[i];
11132 int ExpectedIdx = ExpectedMask[i];
11133 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11134 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11135 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11136 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11137 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11138 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11139 return false;
11140 }
11141 }
11142 return true;
11143}
11144
11145/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11146///
11147/// The masks must be exactly the same width.
11148///
11149/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11150/// value in ExpectedMask is always accepted. Otherwise the indices must match.
11151///
11152/// SM_SentinelZero is accepted as a valid negative index but must match in
11153/// both.
11154static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11155 ArrayRef<int> ExpectedMask,
11156 SDValue V1 = SDValue(),
11157 SDValue V2 = SDValue()) {
11158 int Size = Mask.size();
11159 if (Size != (int)ExpectedMask.size())
11160 return false;
11161 assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11162 "Illegal target shuffle mask");
11163
11164 // Check for out-of-range target shuffle mask indices.
11165 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11166 return false;
11167
11168 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11169 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11170 V1 = SDValue();
11171 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11172 V2 = SDValue();
11173
11174 for (int i = 0; i < Size; ++i) {
11175 int MaskIdx = Mask[i];
11176 int ExpectedIdx = ExpectedMask[i];
11177 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11178 continue;
11179 if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11180 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11181 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11182 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11183 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11184 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11185 continue;
11186 }
11187 // TODO - handle SM_Sentinel equivalences.
11188 return false;
11189 }
11190 return true;
11191}
11192
11193// Attempt to create a shuffle mask from a VSELECT condition mask.
11194static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11195 SDValue Cond) {
11196 EVT CondVT = Cond.getValueType();
11197 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11198 unsigned NumElts = CondVT.getVectorNumElements();
11199
11200 APInt UndefElts;
11201 SmallVector<APInt, 32> EltBits;
11202 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11203 true, false))
11204 return false;
11205
11206 Mask.resize(NumElts, SM_SentinelUndef);
11207
11208 for (int i = 0; i != (int)NumElts; ++i) {
11209 Mask[i] = i;
11210 // Arbitrarily choose from the 2nd operand if the select condition element
11211 // is undef.
11212 // TODO: Can we do better by matching patterns such as even/odd?
11213 if (UndefElts[i] || EltBits[i].isNullValue())
11214 Mask[i] += NumElts;
11215 }
11216
11217 return true;
11218}
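// Illustrative example: a four-element VSELECT whose condition constant is
// <-1, 0, -1, 0> picks operand 1 for elements 0 and 2 and operand 2 for
// elements 1 and 3, which this helper expresses as the shuffle mask
// {0, 5, 2, 7} (indices >= NumElts refer to the second operand).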
11219
11220// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11221// instructions.
11222static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11223 if (VT != MVT::v8i32 && VT != MVT::v8f32)
11224 return false;
11225
11226 SmallVector<int, 8> Unpcklwd;
11227 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11228 /* Unary = */ false);
11229 SmallVector<int, 8> Unpckhwd;
11230 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11231 /* Unary = */ false);
11232 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11233 isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11234 return IsUnpackwdMask;
11235}
11236
11237static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11238 // Create 128-bit vector type based on mask size.
11239 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11240 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11241
11242 // We can't assume a canonical shuffle mask, so try the commuted version too.
11243 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11244 ShuffleVectorSDNode::commuteMask(CommutedMask);
11245
11246 // Match any of unary/binary or low/high.
11247 for (unsigned i = 0; i != 4; ++i) {
11248 SmallVector<int, 16> UnpackMask;
11249 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11250 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11251 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11252 return true;
11253 }
11254 return false;
11255}
11256
11257/// Return true if a shuffle mask chooses elements identically in its top and
11258/// bottom halves. For example, any splat mask has the same top and bottom
11259/// halves. If an element is undefined in only one half of the mask, the halves
11260/// are not considered identical.
11261static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11262 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11263 unsigned HalfSize = Mask.size() / 2;
11264 for (unsigned i = 0; i != HalfSize; ++i) {
11265 if (Mask[i] != Mask[i + HalfSize])
11266 return false;
11267 }
11268 return true;
11269}
11270
11271/// Get a 4-lane 8-bit shuffle immediate for a mask.
11272///
11273/// This helper function produces an 8-bit shuffle immediate corresponding to
11274/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11275/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11276/// example.
11277///
11278/// NB: We rely heavily on "undef" masks preserving the input lane.
11279static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11280 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11281 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11282 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11283 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11284 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11285
11286 // If the mask only uses one non-undef element, then fully 'splat' it to
11287 // improve later broadcast matching.
11288 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11289 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11290
11291 int FirstElt = Mask[FirstIndex];
11292 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11293 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11294
11295 unsigned Imm = 0;
11296 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11297 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11298 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11299 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11300 return Imm;
11301}
11302
11303static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11304 SelectionDAG &DAG) {
11305 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11306}
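// A minimal standalone sketch of the 2-bit-per-lane encoding performed by
// getV4X86ShuffleImm above, ignoring its single-element splat special case;
// the helper name encodeShufImm is hypothetical and not part of LLVM:
//
//   #include <array>
//
//   // Pack a 4-element shuffle mask into the 8-bit PSHUF-style immediate.
//   // Undef (-1) entries default to their own lane, as in the code above.
//   unsigned encodeShufImm(const std::array<int, 4> &M) {
//     unsigned Imm = 0;
//     for (unsigned i = 0; i != 4; ++i)
//       Imm |= unsigned(M[i] < 0 ? int(i) : M[i]) << (2 * i);
//     return Imm; // e.g. {3, 2, 1, 0} -> 0x1B, {1, 0, 3, 2} -> 0xB1
//   }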
11307
11308// The shuffle result has the form:
11309// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
11310// Each element of Zeroable corresponds to a particular element of Mask,
11311// as described in the computeZeroableShuffleElements function.
11312//
11313// The function looks for a sub-mask whose nonzero elements are in
11314// increasing order. If such a sub-mask exists, the function returns true.
11315static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11316 ArrayRef<int> Mask, const EVT &VectorType,
11317 bool &IsZeroSideLeft) {
11318 int NextElement = -1;
11319 // Check if the Mask's nonzero elements are in increasing order.
11320 for (int i = 0, e = Mask.size(); i < e; i++) {
11321 // Checks if the mask's zeros elements are built from only zeros.
11322 assert(Mask[i] >= -1 && "Out of bound mask element!");
11323 if (Mask[i] < 0)
11324 return false;
11325 if (Zeroable[i])
11326 continue;
11327 // Find the lowest non zero element
11328 if (NextElement < 0) {
11329 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11330 IsZeroSideLeft = NextElement != 0;
11331 }
11332 // Exit if the mask's non zero elements are not in increasing order.
11333 if (NextElement != Mask[i])
11334 return false;
11335 NextElement++;
11336 }
11337 return true;
11338}
11339
11340/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11341static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11342 ArrayRef<int> Mask, SDValue V1,
11343 SDValue V2, const APInt &Zeroable,
11344 const X86Subtarget &Subtarget,
11345 SelectionDAG &DAG) {
11346 int Size = Mask.size();
11347 int LaneSize = 128 / VT.getScalarSizeInBits();
11348 const int NumBytes = VT.getSizeInBits() / 8;
11349 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11350
11351 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11352 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11353 (Subtarget.hasBWI() && VT.is512BitVector()));
11354
11355 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11356 // Sign bit set in i8 mask means zero element.
11357 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11358
11359 SDValue V;
11360 for (int i = 0; i < NumBytes; ++i) {
11361 int M = Mask[i / NumEltBytes];
11362 if (M < 0) {
11363 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11364 continue;
11365 }
11366 if (Zeroable[i / NumEltBytes]) {
11367 PSHUFBMask[i] = ZeroMask;
11368 continue;
11369 }
11370
11371 // We can only use a single input of V1 or V2.
11372 SDValue SrcV = (M >= Size ? V2 : V1);
11373 if (V && V != SrcV)
11374 return SDValue();
11375 V = SrcV;
11376 M %= Size;
11377
11378 // PSHUFB can't cross lanes, ensure this doesn't happen.
11379 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11380 return SDValue();
11381
11382 M = M % LaneSize;
11383 M = M * NumEltBytes + (i % NumEltBytes);
11384 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11385 }
11386 assert(V && "Failed to find a source input");
11387
11388 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11389 return DAG.getBitcast(
11390 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11391 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11392}
11393
11394static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11395 const X86Subtarget &Subtarget, SelectionDAG &DAG,
11396 const SDLoc &dl);
11397
11398// X86 has dedicated shuffle that can be lowered to VEXPAND
11399static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11400 const APInt &Zeroable,
11401 ArrayRef<int> Mask, SDValue &V1,
11402 SDValue &V2, SelectionDAG &DAG,
11403 const X86Subtarget &Subtarget) {
11404 bool IsLeftZeroSide = true;
11405 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11406 IsLeftZeroSide))
11407 return SDValue();
11408 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11409 MVT IntegerType =
11410 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11411 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11412 unsigned NumElts = VT.getVectorNumElements();
11413 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11414 "Unexpected number of vector elements");
11415 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11416 Subtarget, DAG, DL);
11417 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11418 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11419 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11420}
11421
11422static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11423 unsigned &UnpackOpcode, bool IsUnary,
11424 ArrayRef<int> TargetMask, const SDLoc &DL,
11425 SelectionDAG &DAG,
11426 const X86Subtarget &Subtarget) {
11427 int NumElts = VT.getVectorNumElements();
11428
11429 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11430 for (int i = 0; i != NumElts; i += 2) {
11431 int M1 = TargetMask[i + 0];
11432 int M2 = TargetMask[i + 1];
11433 Undef1 &= (SM_SentinelUndef == M1);
11434 Undef2 &= (SM_SentinelUndef == M2);
11435 Zero1 &= isUndefOrZero(M1);
11436 Zero2 &= isUndefOrZero(M2);
11437 }
11438 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11439 "Zeroable shuffle detected");
11440
11441 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11442 SmallVector<int, 64> Unpckl, Unpckh;
11443 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11444 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11445 (IsUnary ? V1 : V2))) {
11446 UnpackOpcode = X86ISD::UNPCKL;
11447 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11448 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11449 return true;
11450 }
11451
11452 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11453 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11454 (IsUnary ? V1 : V2))) {
11455 UnpackOpcode = X86ISD::UNPCKH;
11456 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11457 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11458 return true;
11459 }
11460
11461 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11462 if (IsUnary && (Zero1 || Zero2)) {
11463 // Don't bother if we can blend instead.
11464 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11465 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11466 return false;
11467
11468 bool MatchLo = true, MatchHi = true;
11469 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11470 int M = TargetMask[i];
11471
11472 // Ignore if the input is known to be zero or the index is undef.
11473 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11474 (M == SM_SentinelUndef))
11475 continue;
11476
11477 MatchLo &= (M == Unpckl[i]);
11478 MatchHi &= (M == Unpckh[i]);
11479 }
11480
11481 if (MatchLo || MatchHi) {
11482 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11483 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11484 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11485 return true;
11486 }
11487 }
11488
11489 // If a binary shuffle, commute and try again.
11490 if (!IsUnary) {
11491 ShuffleVectorSDNode::commuteMask(Unpckl);
11492 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11493 UnpackOpcode = X86ISD::UNPCKL;
11494 std::swap(V1, V2);
11495 return true;
11496 }
11497
11498 ShuffleVectorSDNode::commuteMask(Unpckh);
11499 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11500 UnpackOpcode = X86ISD::UNPCKH;
11501 std::swap(V1, V2);
11502 return true;
11503 }
11504 }
11505
11506 return false;
11507}
11508
11509// X86 has dedicated unpack instructions that can handle specific blend
11510// operations: UNPCKH and UNPCKL.
11511static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11512 ArrayRef<int> Mask, SDValue V1, SDValue V2,
11513 SelectionDAG &DAG) {
11514 SmallVector<int, 8> Unpckl;
11515 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11516 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11517 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11518
11519 SmallVector<int, 8> Unpckh;
11520 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11521 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11522 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11523
11524 // Commute and try again.
11525 ShuffleVectorSDNode::commuteMask(Unpckl);
11526 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11527 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11528
11529 ShuffleVectorSDNode::commuteMask(Unpckh);
11530 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11531 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11532
11533 return SDValue();
11534}
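// Illustrative example: for v4i32, createUnpackShuffleMask produces
// {0, 4, 1, 5} for UNPCKL and {2, 6, 3, 7} for UNPCKH, i.e. the low or high
// halves of V1 and V2 interleaved; a shuffle mask equivalent to either form
// (or to its commuted version) is lowered directly to that instruction.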
11535
11536/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11537/// followed by unpack 256-bit.
11538static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11539 ArrayRef<int> Mask, SDValue V1,
11540 SDValue V2, SelectionDAG &DAG) {
11541 SmallVector<int, 32> Unpckl, Unpckh;
11542 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11543 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11544
11545 unsigned UnpackOpcode;
11546 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11547 UnpackOpcode = X86ISD::UNPCKL;
11548 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11549 UnpackOpcode = X86ISD::UNPCKH;
11550 else
11551 return SDValue();
11552
11553 // This is a "natural" unpack operation (rather than the 128-bit sectored
11554 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11555 // input in order to use the x86 instruction.
11556 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11557 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11558 V1 = DAG.getBitcast(VT, V1);
11559 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11560}
11561
11562// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11563// source into the lower elements and zeroing the upper elements.
11564static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11565 ArrayRef<int> Mask, const APInt &Zeroable,
11566 const X86Subtarget &Subtarget) {
11567 if (!VT.is512BitVector() && !Subtarget.hasVLX())
11568 return false;
11569
11570 unsigned NumElts = Mask.size();
11571 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11572 unsigned MaxScale = 64 / EltSizeInBits;
11573
11574 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11575 unsigned SrcEltBits = EltSizeInBits * Scale;
11576 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11577 continue;
11578 unsigned NumSrcElts = NumElts / Scale;
11579 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11580 continue;
11581 unsigned UpperElts = NumElts - NumSrcElts;
11582 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11583 continue;
11584 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11585 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11586 DstVT = MVT::getIntegerVT(EltSizeInBits);
11587 if ((NumSrcElts * EltSizeInBits) >= 128) {
11588 // ISD::TRUNCATE
11589 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11590 } else {
11591 // X86ISD::VTRUNC
11592 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11593 }
11594 return true;
11595 }
11596
11597 return false;
11598}
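// Illustrative example: for a v16i8 shuffle whose mask begins
// {0, 2, 4, 6, 8, 10, 12, 14, ...} with the upper eight elements zeroable,
// the Scale == 2 step matches (on a subtarget with the required VLX and BWI
// features) and reports SrcVT = v8i16, DstVT = v16i8, i.e. a truncation from
// 16-bit to 8-bit elements with the upper half of the result zeroed.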
11599
11600// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11601// element padding to the final DstVT.
11602static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11603 const X86Subtarget &Subtarget,
11604 SelectionDAG &DAG, bool ZeroUppers) {
11605 MVT SrcVT = Src.getSimpleValueType();
11606 MVT DstSVT = DstVT.getScalarType();
11607 unsigned NumDstElts = DstVT.getVectorNumElements();
11608 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11609 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11610
11611 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11612 return SDValue();
11613
11614 // Perform a direct ISD::TRUNCATE if possible.
11615 if (NumSrcElts == NumDstElts)
11616 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11617
11618 if (NumSrcElts > NumDstElts) {
11619 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11620 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11621 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11622 }
11623
11624 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11625 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11626 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11627 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11628 DstVT.getSizeInBits());
11629 }
11630
11631 // Non-VLX targets must truncate from a 512-bit type, so we need to
11632 // widen, truncate and then possibly extract the original subvector.
11633 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11634 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11635 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11636 }
11637
11638 // Fallback to a X86ISD::VTRUNC, padding if necessary.
11639 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11640 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11641 if (DstVT != TruncVT)
11642 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11643 DstVT.getSizeInBits());
11644 return Trunc;
11645}
11646
11647// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11648//
11649// An example is the following:
11650//
11651// t0: ch = EntryToken
11652// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11653// t25: v4i32 = truncate t2
11654// t41: v8i16 = bitcast t25
11655// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11656// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11657// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11658// t18: v2i64 = bitcast t51
11659//
11660// One can just use a single vpmovdw instruction; without avx512vl we need to
11661// use the zmm variant and extract the lower subvector, padding with zeroes.
11662// TODO: Merge with lowerShuffleAsVTRUNC.
11663static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11664 SDValue V2, ArrayRef<int> Mask,
11665 const APInt &Zeroable,
11666 const X86Subtarget &Subtarget,
11667 SelectionDAG &DAG) {
11668 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11669 if (!Subtarget.hasAVX512())
11670 return SDValue();
11671
11672 unsigned NumElts = VT.getVectorNumElements();
11673 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11674 unsigned MaxScale = 64 / EltSizeInBits;
11675 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11676 unsigned NumSrcElts = NumElts / Scale;
11677 unsigned UpperElts = NumElts - NumSrcElts;
11678 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11679 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11680 continue;
11681
11682 SDValue Src = V1;
11683 if (!Src.hasOneUse())
11684 return SDValue();
11685
11686 Src = peekThroughOneUseBitcasts(Src);
11687 if (Src.getOpcode() != ISD::TRUNCATE ||
11688 Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11689 return SDValue();
11690 Src = Src.getOperand(0);
11691
11692 // VPMOVWB is only available with avx512bw.
11693 MVT SrcVT = Src.getSimpleValueType();
11694 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11695 !Subtarget.hasBWI())
11696 return SDValue();
11697
11698 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11699 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11700 }
11701
11702 return SDValue();
11703}
11704
11705// Attempt to match binary shuffle patterns as a truncate.
11706static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11707 SDValue V2, ArrayRef<int> Mask,
11708 const APInt &Zeroable,
11709 const X86Subtarget &Subtarget,
11710 SelectionDAG &DAG) {
11711 assert((VT.is128BitVector() || VT.is256BitVector()) &&
11712 "Unexpected VTRUNC type");
11713 if (!Subtarget.hasAVX512())
11714 return SDValue();
11715
11716 unsigned NumElts = VT.getVectorNumElements();
11717 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11718 unsigned MaxScale = 64 / EltSizeInBits;
11719 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11720 // TODO: Support non-BWI VPMOVWB truncations?
11721 unsigned SrcEltBits = EltSizeInBits * Scale;
11722 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11723 continue;
11724
11725 // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11726 // Bail if the V2 elements are undef.
11727 unsigned NumHalfSrcElts = NumElts / Scale;
11728 unsigned NumSrcElts = 2 * NumHalfSrcElts;
11729 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11730 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11731 continue;
11732
11733 // The elements beyond the truncation must be undef/zero.
11734 unsigned UpperElts = NumElts - NumSrcElts;
11735 if (UpperElts > 0 &&
11736 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11737 continue;
11738 bool UndefUppers =
11739 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11740
11741 // As we're using both sources, we need to concat them together
11742 // and truncate from the double-sized src.
11743 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11744 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11745
11746 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11747 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11748 Src = DAG.getBitcast(SrcVT, Src);
11749 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11750 }
11751
11752 return SDValue();
11753}
11754
11755/// Check whether a compaction lowering can be done by dropping even
11756/// elements and compute how many times even elements must be dropped.
11757///
11758/// This handles shuffles which take every Nth element where N is a power of
11759/// two. Example shuffle masks:
11760///
11761/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11762/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11763/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11764/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11765/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11766/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11767///
11768/// Any of these lanes can of course be undef.
11769///
11770/// This routine only supports N <= 3.
11771/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11772/// for larger N.
11773///
11774/// \returns N above, or the number of times even elements must be dropped if
11775/// there is such a number. Otherwise returns zero.
11776static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11777 bool IsSingleInput) {
11778 // The modulus for the shuffle vector entries is based on whether this is
11779 // a single input or not.
11780 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11781 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11782 "We should only be called with masks with a power-of-2 size!");
11783
11784 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11785
11786 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11787 // and 2^3 simultaneously. This is because we may have ambiguity with
11788 // partially undef inputs.
11789 bool ViableForN[3] = {true, true, true};
11790
11791 for (int i = 0, e = Mask.size(); i < e; ++i) {
11792 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11793 // want.
11794 if (Mask[i] < 0)
11795 continue;
11796
11797 bool IsAnyViable = false;
11798 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11799 if (ViableForN[j]) {
11800 uint64_t N = j + 1;
11801
11802 // The shuffle mask must be equal to (i * 2^N) % M.
11803 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11804 IsAnyViable = true;
11805 else
11806 ViableForN[j] = false;
11807 }
11808 // Early exit if we exhaust the possible powers of two.
11809 if (!IsAnyViable)
11810 break;
11811 }
11812
11813 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11814 if (ViableForN[j])
11815 return j + 1;
11816
11817 // Return 0 as there is no viable power of two.
11818 return 0;
11819}
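// Illustrative example: with a single input, the 16-element mask
// {0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12} satisfies
// Mask[i] == (i << 2) % 16 for every element, so the routine returns N = 2:
// even elements must be dropped twice, keeping every fourth element.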
11820
11821// X86 has dedicated pack instructions that can handle specific truncation
11822// operations: PACKSS and PACKUS.
11823// Checks for compaction shuffle masks if MaxStages > 1.
11824// TODO: Add support for matching multiple PACKSS/PACKUS stages.
11825static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11826 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11827 const SelectionDAG &DAG,
11828 const X86Subtarget &Subtarget,
11829 unsigned MaxStages = 1) {
11830 unsigned NumElts = VT.getVectorNumElements();
11831 unsigned BitSize = VT.getScalarSizeInBits();
11832 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
11833 "Illegal maximum compaction");
11834
11835 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
11836 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
11837 unsigned NumPackedBits = NumSrcBits - BitSize;
11838 N1 = peekThroughBitcasts(N1);
11839 N2 = peekThroughBitcasts(N2);
11840 unsigned NumBits1 = N1.getScalarValueSizeInBits();
11841 unsigned NumBits2 = N2.getScalarValueSizeInBits();
11842 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
11843 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
11844 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
11845 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
11846 return false;
11847 if (Subtarget.hasSSE41() || BitSize == 8) {
11848 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
11849 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
11850 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
11851 V1 = N1;
11852 V2 = N2;
11853 SrcVT = PackVT;
11854 PackOpcode = X86ISD::PACKUS;
11855 return true;
11856 }
11857 }
11858 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
11859 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
11860 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
11861 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
11862 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
11863 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
11864 V1 = N1;
11865 V2 = N2;
11866 SrcVT = PackVT;
11867 PackOpcode = X86ISD::PACKSS;
11868 return true;
11869 }
11870 return false;
11871 };
11872
11873 // Attempt to match against wider and wider compaction patterns.
11874 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
11875 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
11876 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
11877
11878 // Try binary shuffle.
11879 SmallVector<int, 32> BinaryMask;
11880 createPackShuffleMask(VT, BinaryMask, false, NumStages);
11881 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
11882 if (MatchPACK(V1, V2, PackVT))
11883 return true;
11884
11885 // Try unary shuffle.
11886 SmallVector<int, 32> UnaryMask;
11887 createPackShuffleMask(VT, UnaryMask, true, NumStages);
11888 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
11889 if (MatchPACK(V1, V1, PackVT))
11890 return true;
11891 }
11892
11893 return false;
11894}
11895
11896static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11897 SDValue V1, SDValue V2, SelectionDAG &DAG,
11898 const X86Subtarget &Subtarget) {
11899 MVT PackVT;
11900 unsigned PackOpcode;
11901 unsigned SizeBits = VT.getSizeInBits();
11902 unsigned EltBits = VT.getScalarSizeInBits();
11903 unsigned MaxStages = Log2_32(64 / EltBits);
11904 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11905 Subtarget, MaxStages))
11906 return SDValue();
11907
11908 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
11909 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
11910
11911 // Don't lower multi-stage packs on AVX512, truncation is better.
11912 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
11913 return SDValue();
11914
11915 // Pack to the largest type possible:
11916 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
11917 unsigned MaxPackBits = 16;
11918 if (CurrentEltBits > 16 &&
11919 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
11920 MaxPackBits = 32;
11921
11922 // Repeatedly pack down to the target size.
11923 SDValue Res;
11924 for (unsigned i = 0; i != NumStages; ++i) {
11925 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
11926 unsigned NumSrcElts = SizeBits / SrcEltBits;
11927 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11928 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
11929 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11930 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
11931 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
11932 DAG.getBitcast(SrcVT, V2));
11933 V1 = V2 = Res;
11934 CurrentEltBits /= 2;
11935 }
11936 assert(Res && Res.getValueType() == VT &&
11937 "Failed to lower compaction shuffle");
11938 return Res;
11939}
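// Illustrative example: packing 32-bit source elements down to a v16i8 result
// with PACKSS takes NumStages = 2 iterations of the loop above; the first
// stage packs v4i32 -> v8i16 (PACKSSDW), the second packs v8i16 -> v16i8
// (PACKSSWB), feeding the intermediate result back in as both operands of the
// next stage.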
11940
11941/// Try to emit a bitmask instruction for a shuffle.
11942///
11943/// This handles cases where we can model a blend exactly as a bitmask due to
11944/// one of the inputs being zeroable.
11945static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11946 SDValue V2, ArrayRef<int> Mask,
11947 const APInt &Zeroable,
11948 const X86Subtarget &Subtarget,
11949 SelectionDAG &DAG) {
11950 MVT MaskVT = VT;
11951 MVT EltVT = VT.getVectorElementType();
11952 SDValue Zero, AllOnes;
11953 // Use f64 if i64 isn't legal.
11954 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11955 EltVT = MVT::f64;
11956 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11957 }
11958
11959 MVT LogicVT = VT;
11960 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11961 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11962 APFloat AllOnesValue = APFloat::getAllOnesValue(
11963 SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
11964 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11965 LogicVT =
11966 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11967 } else {
11968 Zero = DAG.getConstant(0, DL, EltVT);
11969 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11970 }
11971
11972 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11973 SDValue V;
11974 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11975 if (Zeroable[i])
11976 continue;
11977 if (Mask[i] % Size != i)
11978 return SDValue(); // Not a blend.
11979 if (!V)
11980 V = Mask[i] < Size ? V1 : V2;
11981 else if (V != (Mask[i] < Size ? V1 : V2))
11982 return SDValue(); // Can only let one input through the mask.
11983
11984 VMaskOps[i] = AllOnes;
11985 }
11986 if (!V)
11987 return SDValue(); // No non-zeroable elements!
11988
11989 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11990 VMask = DAG.getBitcast(LogicVT, VMask);
11991 V = DAG.getBitcast(LogicVT, V);
11992 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11993 return DAG.getBitcast(VT, And);
11994}
11995
11996/// Try to emit a blend instruction for a shuffle using bit math.
11997///
11998/// This is used as a fallback approach when first class blend instructions are
11999/// unavailable. Currently it is only suitable for integer vectors, but could
12000/// be generalized for floating point vectors if desirable.
12001static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12002 SDValue V2, ArrayRef<int> Mask,
12003 SelectionDAG &DAG) {
12004 assert(VT.isInteger() && "Only supports integer vector types!");
12005 MVT EltVT = VT.getVectorElementType();
12006 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12007 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12008 SmallVector<SDValue, 16> MaskOps;
12009 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12010 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12011 return SDValue(); // Shuffled input!
12012 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12013 }
12014
12015 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12016 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12017 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12018 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12019}
12020
12021static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12022 SDValue PreservedSrc,
12023 const X86Subtarget &Subtarget,
12024 SelectionDAG &DAG);
12025
12026static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12027 MutableArrayRef<int> Mask,
12028 const APInt &Zeroable, bool &ForceV1Zero,
12029 bool &ForceV2Zero, uint64_t &BlendMask) {
12030 bool V1IsZeroOrUndef =
12031 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12032 bool V2IsZeroOrUndef =
12033 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12034
12035 BlendMask = 0;
12036 ForceV1Zero = false, ForceV2Zero = false;
12037 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12038
12039 // Attempt to generate the binary blend mask. If an input is zero then
12040 // we can use any lane.
12041 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12042 int M = Mask[i];
12043 if (M == SM_SentinelUndef)
12044 continue;
12045 if (M == i)
12046 continue;
12047 if (M == i + Size) {
12048 BlendMask |= 1ull << i;
12049 continue;
12050 }
12051 if (Zeroable[i]) {
12052 if (V1IsZeroOrUndef) {
12053 ForceV1Zero = true;
12054 Mask[i] = i;
12055 continue;
12056 }
12057 if (V2IsZeroOrUndef) {
12058 ForceV2Zero = true;
12059 BlendMask |= 1ull << i;
12060 Mask[i] = i + Size;
12061 continue;
12062 }
12063 }
12064 return false;
12065 }
12066 return true;
12067}
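// Illustrative example: for a v4i32 shuffle with Mask = {0, 5, 2, 7} and no
// zeroable elements, elements 1 and 3 are taken from V2, so the mask is
// accepted as a blend with BlendMask = 0b1010.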
12068
12069static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12070 int Scale) {
12071 uint64_t ScaledMask = 0;
12072 for (int i = 0; i != Size; ++i)
12073 if (BlendMask & (1ull << i))
12074 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12075 return ScaledMask;
12076}
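// Illustrative example: scaleVectorShuffleBlendMask(0b0101, /*Size=*/4,
// /*Scale=*/2) widens each selected element to two adjacent lanes and returns
// 0b00110011, as needed when a blend of wide elements is re-expressed as a
// blend of narrower ones.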
12077
12078/// Try to emit a blend instruction for a shuffle.
12079///
12080/// This doesn't do any checks for the availability of instructions for blending
12081/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12082/// be matched in the backend with the type given. What it does check for is
12083/// that the shuffle mask is a blend, or convertible into a blend with zero.
12084static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12085 SDValue V2, ArrayRef<int> Original,
12086 const APInt &Zeroable,
12087 const X86Subtarget &Subtarget,
12088 SelectionDAG &DAG) {
12089 uint64_t BlendMask = 0;
12090 bool ForceV1Zero = false, ForceV2Zero = false;
12091 SmallVector<int, 64> Mask(Original.begin(), Original.end());
12092 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12093 BlendMask))
12094 return SDValue();
12095
12096 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12097 if (ForceV1Zero)
12098 V1 = getZeroVector(VT, Subtarget, DAG, DL);
12099 if (ForceV2Zero)
12100 V2 = getZeroVector(VT, Subtarget, DAG, DL);
12101
12102 switch (VT.SimpleTy) {
12103 case MVT::v4i64:
12104 case MVT::v8i32:
12105 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12106 LLVM_FALLTHROUGH;
12107 case MVT::v4f64:
12108 case MVT::v8f32:
12109 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12110 LLVM_FALLTHROUGH;
12111 case MVT::v2f64:
12112 case MVT::v2i64:
12113 case MVT::v4f32:
12114 case MVT::v4i32:
12115 case MVT::v8i16:
12116 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12117 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12118 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12119 case MVT::v16i16: {
12120 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12121 SmallVector<int, 8> RepeatedMask;
12122 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12123 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12124 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12125 BlendMask = 0;
12126 for (int i = 0; i < 8; ++i)
12127 if (RepeatedMask[i] >= 8)
12128 BlendMask |= 1ull << i;
12129 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12130 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12131 }
12132 // Use PBLENDW for lower/upper lanes and then blend lanes.
12133 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12134 // merge to VSELECT where useful.
12135 uint64_t LoMask = BlendMask & 0xFF;
12136 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12137 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12138 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12139 DAG.getTargetConstant(LoMask, DL, MVT::i8));
12140 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12141 DAG.getTargetConstant(HiMask, DL, MVT::i8));
12142 return DAG.getVectorShuffle(
12143 MVT::v16i16, DL, Lo, Hi,
12144 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12145 }
12146 LLVM_FALLTHROUGH;
12147 }
12148 case MVT::v32i8:
12149 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12150 LLVM_FALLTHROUGH;
12151 case MVT::v16i8: {
12152 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12153
12154 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12155 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12156 Subtarget, DAG))
12157 return Masked;
12158
12159 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12160 MVT IntegerType =
12161 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12162 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12163 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12164 }
12165
12166 // If we have VPTERNLOG, we can use that as a bit blend.
12167 if (Subtarget.hasVLX())
12168 if (SDValue BitBlend =
12169 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12170 return BitBlend;
12171
12172 // Scale the blend by the number of bytes per element.
12173 int Scale = VT.getScalarSizeInBits() / 8;
12174
12175 // This form of blend is always done on bytes. Compute the byte vector
12176 // type.
12177 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12178
12179 // x86 allows load folding with blendvb from the 2nd source operand. But
12180 // we are still using LLVM select here (see comment below), so that's V1.
12181 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12182 // allow that load-folding possibility.
12183 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12184 ShuffleVectorSDNode::commuteMask(Mask);
12185 std::swap(V1, V2);
12186 }
12187
12188 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12189 // mix of LLVM's code generator and the x86 backend. We tell the code
12190 // generator that boolean values in the elements of an x86 vector register
12191 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12192 // mapping a select to operand #1, and 'false' mapping to operand #2. The
12193 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12194 // of the element (the remaining are ignored) and 0 in that high bit would
12195 // mean operand #1 while 1 in the high bit would mean operand #2. So while
12196 // the LLVM model for boolean values in vector elements gets the relevant
12197 // bit set, it is set backwards and over constrained relative to x86's
12198 // actual model.
12199 SmallVector<SDValue, 32> VSELECTMask;
12200 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12201 for (int j = 0; j < Scale; ++j)
12202 VSELECTMask.push_back(
12203 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12204 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12205 MVT::i8));
12206
12207 V1 = DAG.getBitcast(BlendVT, V1);
12208 V2 = DAG.getBitcast(BlendVT, V2);
12209 return DAG.getBitcast(
12210 VT,
12211 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12212 V1, V2));
12213 }
12214 case MVT::v16f32:
12215 case MVT::v8f64:
12216 case MVT::v8i64:
12217 case MVT::v16i32:
12218 case MVT::v32i16:
12219 case MVT::v64i8: {
12220 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12221 bool OptForSize = DAG.shouldOptForSize();
12222 if (!OptForSize) {
12223 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12224 Subtarget, DAG))
12225 return Masked;
12226 }
12227
12228 // Otherwise load an immediate into a GPR, cast to k-register, and use a
12229 // masked move.
12230 MVT IntegerType =
12231 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12232 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12233 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12234 }
12235 default:
12236 llvm_unreachable("Not a supported integer vector type!");
12237 }
12238}
12239
12240/// Try to lower as a blend of elements from two inputs followed by
12241/// a single-input permutation.
12242///
12243/// This matches the pattern where we can blend elements from two inputs and
12244/// then reduce the shuffle to a single-input permutation.
12245static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12246 SDValue V1, SDValue V2,
12247 ArrayRef<int> Mask,
12248 SelectionDAG &DAG,
12249 bool ImmBlends = false) {
12250 // We build up the blend mask while checking whether a blend is a viable way
12251 // to reduce the shuffle.
12252 SmallVector<int, 32> BlendMask(Mask.size(), -1);
12253 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12254
12255 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12256 if (Mask[i] < 0)
12257 continue;
12258
12259 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12260
12261 if (BlendMask[Mask[i] % Size] < 0)
12262 BlendMask[Mask[i] % Size] = Mask[i];
12263 else if (BlendMask[Mask[i] % Size] != Mask[i])
12264 return SDValue(); // Can't blend in the needed input!
12265
12266 PermuteMask[i] = Mask[i] % Size;
12267 }
12268
12269 // If only immediate blends, then bail if the blend mask can't be widened to
12270 // i16.
12271 unsigned EltSize = VT.getScalarSizeInBits();
12272 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12273 return SDValue();
12274
12275 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12276 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12277}
12278
12279/// Try to lower as an unpack of elements from two inputs followed by
12280/// a single-input permutation.
12281///
12282/// This matches the pattern where we can unpack elements from two inputs and
12283/// then reduce the shuffle to a single-input (wider) permutation.
12284static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12285 SDValue V1, SDValue V2,
12286 ArrayRef<int> Mask,
12287 SelectionDAG &DAG) {
12288 int NumElts = Mask.size();
12289 int NumLanes = VT.getSizeInBits() / 128;
12290 int NumLaneElts = NumElts / NumLanes;
12291 int NumHalfLaneElts = NumLaneElts / 2;
12292
12293 bool MatchLo = true, MatchHi = true;
12294 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12295
12296 // Determine UNPCKL/UNPCKH type and operand order.
12297 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12298 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12299 int M = Mask[Lane + Elt];
12300 if (M < 0)
12301 continue;
12302
12303 SDValue &Op = Ops[Elt & 1];
12304 if (M < NumElts && (Op.isUndef() || Op == V1))
12305 Op = V1;
12306 else if (NumElts <= M && (Op.isUndef() || Op == V2))
12307 Op = V2;
12308 else
12309 return SDValue();
12310
12311 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12312 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12313 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12314 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12315 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12316 if (!MatchLo && !MatchHi)
12317 return SDValue();
12318 }
12319 }
12320 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12321
12322 // Now check that each pair of elts come from the same unpack pair
12323 // and set the permute mask based on each pair.
12324 // TODO - Investigate cases where we permute individual elements.
12325 SmallVector<int, 32> PermuteMask(NumElts, -1);
12326 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12327 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12328 int M0 = Mask[Lane + Elt + 0];
12329 int M1 = Mask[Lane + Elt + 1];
12330 if (0 <= M0 && 0 <= M1 &&
12331 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12332 return SDValue();
12333 if (0 <= M0)
12334 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12335 if (0 <= M1)
12336 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12337 }
12338 }
12339
12340 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12341 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12342 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12343}
12344
12345/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12346/// permuting the elements of the result in place.
12347static SDValue lowerShuffleAsByteRotateAndPermute(
12348 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12349 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12350 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12351 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12352 (VT.is512BitVector() && !Subtarget.hasBWI()))
12353 return SDValue();
12354
12355 // We don't currently support lane crossing permutes.
12356 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12357 return SDValue();
12358
12359 int Scale = VT.getScalarSizeInBits() / 8;
12360 int NumLanes = VT.getSizeInBits() / 128;
12361 int NumElts = VT.getVectorNumElements();
12362 int NumEltsPerLane = NumElts / NumLanes;
12363
12364 // Determine range of mask elts.
12365 bool Blend1 = true;
12366 bool Blend2 = true;
12367 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12368 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12369 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12370 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12371 int M = Mask[Lane + Elt];
12372 if (M < 0)
12373 continue;
12374 if (M < NumElts) {
12375 Blend1 &= (M == (Lane + Elt));
12376 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12377 M = M % NumEltsPerLane;
12378 Range1.first = std::min(Range1.first, M);
12379 Range1.second = std::max(Range1.second, M);
12380 } else {
12381 M -= NumElts;
12382 Blend2 &= (M == (Lane + Elt));
12383 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12384 M = M % NumEltsPerLane;
12385 Range2.first = std::min(Range2.first, M);
12386 Range2.second = std::max(Range2.second, M);
12387 }
12388 }
12389 }
12390
12391 // Bail if we don't need both elements.
12392 // TODO - it might be worth doing this for unary shuffles if the permute
12393 // can be widened.
12394 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12395 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12396 return SDValue();
12397
12398 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12399 return SDValue();
12400
12401 // Rotate the 2 ops so we can access both ranges, then permute the result.
12402 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12403 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12404 SDValue Rotate = DAG.getBitcast(
12405 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12406 DAG.getBitcast(ByteVT, Lo),
12407 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12408 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12409 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12410 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12411 int M = Mask[Lane + Elt];
12412 if (M < 0)
12413 continue;
12414 if (M < NumElts)
12415 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12416 else
12417 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12418 }
12419 }
12420 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12421 };
12422
12423 // Check if the ranges are small enough to rotate from either direction.
12424 if (Range2.second < Range1.first)
12425 return RotateAndPermute(V1, V2, Range1.first, 0);
12426 if (Range1.second < Range2.first)
12427 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12428 return SDValue();
12429}
12430
12431/// Generic routine to decompose a shuffle and blend into independent
12432/// blends and permutes.
12433///
12434/// This matches the extremely common pattern for handling combined
12435/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12436/// operations. It will try to pick the best arrangement of shuffles and
12437/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12438static SDValue lowerShuffleAsDecomposedShuffleMerge(
12439 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12440 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12441 int NumElts = Mask.size();
12442 int NumLanes = VT.getSizeInBits() / 128;
12443 int NumEltsPerLane = NumElts / NumLanes;
12444
12445 // Shuffle the input elements into the desired positions in V1 and V2 and
12446 // unpack/blend them together.
12447 bool IsAlternating = true;
12448 SmallVector<int, 32> V1Mask(NumElts, -1);
12449 SmallVector<int, 32> V2Mask(NumElts, -1);
12450 SmallVector<int, 32> FinalMask(NumElts, -1);
12451 for (int i = 0; i < NumElts; ++i) {
12452 int M = Mask[i];
12453 if (M >= 0 && M < NumElts) {
12454 V1Mask[i] = M;
12455 FinalMask[i] = i;
12456 IsAlternating &= (i & 1) == 0;
12457 } else if (M >= NumElts) {
12458 V2Mask[i] = M - NumElts;
12459 FinalMask[i] = i + NumElts;
12460 IsAlternating &= (i & 1) == 1;
12461 }
12462 }
12463
12464 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12465 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12466 // the shuffle may be able to fold with a load or other benefit. However, when
12467 // we would have to do twice as many shuffles to achieve this, doing a 2-input
12468 // pre-shuffle first is the better strategy.
12469 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12470 // Only prefer immediate blends to unpack/rotate.
12471 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12472 DAG, true))
12473 return BlendPerm;
12474 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12475 DAG))
12476 return UnpackPerm;
12477 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12478 DL, VT, V1, V2, Mask, Subtarget, DAG))
12479 return RotatePerm;
12480 // Unpack/rotate failed - try again with variable blends.
12481 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12482 DAG))
12483 return BlendPerm;
12484 }
12485
12486 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12487 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12488 // TODO: It doesn't have to be alternating - but each lane mustn't have more
12489 // than half the elements coming from each source.
12490 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12491 V1Mask.assign(NumElts, -1);
12492 V2Mask.assign(NumElts, -1);
12493 FinalMask.assign(NumElts, -1);
12494 for (int i = 0; i != NumElts; i += NumEltsPerLane)
12495 for (int j = 0; j != NumEltsPerLane; ++j) {
12496 int M = Mask[i + j];
12497 if (M >= 0 && M < NumElts) {
12498 V1Mask[i + (j / 2)] = M;
12499 FinalMask[i + j] = i + (j / 2);
12500 } else if (M >= NumElts) {
12501 V2Mask[i + (j / 2)] = M - NumElts;
12502 FinalMask[i + j] = i + (j / 2) + NumElts;
12503 }
12504 }
12505 }
12506
12507 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12508 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12509 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12510}
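
As a rough standalone illustration (assumed example mask, not taken from the analysis), the first loop above splits a two-input mask into two single-input masks plus a final two-input mask:

// Illustrative only: models the first loop of lowerShuffleAsDecomposedShuffleMerge.
#include <cstdio>
#include <vector>

int main() {
  // Eight-element two-input mask; values >= 8 come from V2.
  std::vector<int> Mask = {0, 9, 2, 11, 4, 13, 6, 15};
  int NumElts = (int)Mask.size();
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1), Final(NumElts, -1);
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;           // shuffle V1's element into place...
      Final[i] = i;            // ...then take it from the shuffled V1.
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts; // same for V2.
      Final[i] = i + NumElts;
    }
  }
  // Both single-input masks are no-ops here and Final simply alternates
  // sources, so the final two-input shuffle (a blend) does all the work.
  for (int M : Final) std::printf("%d ", M);
  std::printf("\n");
}
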
12511
12512/// Try to lower a vector shuffle as a bit rotation.
12513///
12514/// Look for a repeated rotation pattern in each sub group.
12515 /// Returns an ISD::ROTL element rotation amount, or -1 on failure.
12516static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12517 int NumElts = Mask.size();
12518 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
12519
12520 int RotateAmt = -1;
12521 for (int i = 0; i != NumElts; i += NumSubElts) {
12522 for (int j = 0; j != NumSubElts; ++j) {
12523 int M = Mask[i + j];
12524 if (M < 0)
12525 continue;
12526 if (!isInRange(M, i, i + NumSubElts))
12527 return -1;
12528 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12529 if (0 <= RotateAmt && Offset != RotateAmt)
12530 return -1;
12531 RotateAmt = Offset;
12532 }
12533 }
12534 return RotateAmt;
12535}
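
A self-contained sketch of the same matching idea on an assumed v8i16-style mask, showing how a per-subgroup element rotation is detected; it mirrors the loop above but is not the analyzed code itself:

// Illustrative only: standalone copy of the rotation-matching idea above.
#include <cstdio>
#include <vector>

// Returns the per-subgroup ROTL amount (in elements), or -1 if no single
// rotation explains the mask.
static int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int NumElts = (int)Mask.size();
  int RotateAmt = -1;
  for (int i = 0; i != NumElts; i += NumSubElts) {
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1; // Element crosses its sub group.
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1; // Inconsistent rotation amount.
      RotateAmt = Offset;
    }
  }
  return RotateAmt;
}

int main() {
  // v8i16-style mask swapping each pair of elements: {1,0, 3,2, 5,4, 7,6}.
  std::vector<int> Mask = {1, 0, 3, 2, 5, 4, 7, 6};
  // With NumSubElts = 2 this is a rotate by 1 element; scaled by the 16-bit
  // element width it corresponds to a 16-bit ROTL of each 32-bit group.
  std::printf("rotate amount = %d elements\n", matchBitRotate(Mask, 2));
}
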
12536
12537static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12538 const X86Subtarget &Subtarget,
12539 ArrayRef<int> Mask) {
12540 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12541 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12542
12543 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12544 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12545 int MaxSubElts = 64 / EltSizeInBits;
12546 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12547 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12548 if (RotateAmt < 0)
12549 continue;
12550
12551 int NumElts = Mask.size();
12552 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12553 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12554 return RotateAmt * EltSizeInBits;
12555 }
12556
12557 return -1;
12558}
12559
12560/// Lower shuffle using X86ISD::VROTLI rotations.
12561static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12562 ArrayRef<int> Mask,
12563 const X86Subtarget &Subtarget,
12564 SelectionDAG &DAG) {
12565 // Only XOP + AVX512 targets have bit rotation instructions.
12566 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12567 bool IsLegal =
12568 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12569 if (!IsLegal && Subtarget.hasSSE3())
12570 return SDValue();
12571
12572 MVT RotateVT;
12573 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12574 Subtarget, Mask);
12575 if (RotateAmt < 0)
12576 return SDValue();
12577
12578 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12579 // expanded to OR(SRL,SHL), will be more efficient, but if they can
12580 // widen to vXi16 or more then the existing lowering will be better.
12581 if (!IsLegal) {
12582 if ((RotateAmt % 16) == 0)
12583 return SDValue();
12584 // TODO: Use getTargetVShiftByConstNode.
12585 unsigned ShlAmt = RotateAmt;
12586 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12587 V1 = DAG.getBitcast(RotateVT, V1);
12588 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12589 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12590 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12591 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12592 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12593 return DAG.getBitcast(VT, Rot);
12594 }
12595
12596 SDValue Rot =
12597 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12598 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12599 return DAG.getBitcast(VT, Rot);
12600}
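
The !IsLegal fallback above relies on the usual rotate identity; a scalar sketch with hypothetical values (not the analyzed code):

// Illustrative only: the scalar identity behind the VSHLI/VSRLI/OR fallback.
#include <cstdint>
#include <cstdio>

static uint16_t rotl16(uint16_t X, unsigned R) {
  // For 0 < R < 16, ROTL(X, R) == (X << R) | (X >> (16 - R)).
  return (uint16_t)((X << R) | (X >> (16 - R)));
}

int main() {
  std::printf("0x%04x\n", rotl16(0x1234, 8)); // prints 0x3412
}
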
12601
12602/// Try to match a vector shuffle as an element rotation.
12603///
12604 /// This is used to support PALIGNR for SSSE3 and VALIGND/Q for AVX512.
12605static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12606 ArrayRef<int> Mask) {
12607 int NumElts = Mask.size();
12608
12609 // We need to detect various ways of spelling a rotation:
12610 // [11, 12, 13, 14, 15, 0, 1, 2]
12611 // [-1, 12, 13, 14, -1, -1, 1, -1]
12612 // [-1, -1, -1, -1, -1, -1, 1, 2]
12613 // [ 3, 4, 5, 6, 7, 8, 9, 10]
12614 // [-1, 4, 5, 6, -1, -1, 9, -1]
12615 // [-1, 4, 5, 6, -1, -1, -1, -1]
12616 int Rotation = 0;
12617 SDValue Lo, Hi;
12618 for (int i = 0; i < NumElts; ++i) {
12619 int M = Mask[i];
12620 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12621 "Unexpected mask index.");
12622 if (M < 0)
12623 continue;
12624
12625 // Determine where a rotated vector would have started.
12626 int StartIdx = i - (M % NumElts);
12627 if (StartIdx == 0)
12628 // The identity rotation isn't interesting, stop.
12629 return -1;
12630
12631 // If we found the tail of a vector the rotation must be the missing
12632 // front. If we found the head of a vector, it must be how much of the
12633 // head.
12634 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12635
12636 if (Rotation == 0)
12637 Rotation = CandidateRotation;
12638 else if (Rotation != CandidateRotation)
12639 // The rotations don't match, so we can't match this mask.
12640 return -1;
12641
12642 // Compute which value this mask is pointing at.
12643 SDValue MaskV = M < NumElts ? V1 : V2;
12644
12645 // Compute which of the two target values this index should be assigned
12646 // to. This reflects whether the high elements are remaining or the low
12647 // elements are remaining.
12648 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12649
12650 // Either set up this value if we've not encountered it before, or check
12651 // that it remains consistent.
12652 if (!TargetV)
12653 TargetV = MaskV;
12654 else if (TargetV != MaskV)
12655 // This may be a rotation, but it pulls from the inputs in some
12656 // unsupported interleaving.
12657 return -1;
12658 }
12659
12660 // Check that we successfully analyzed the mask, and normalize the results.
12661 assert(Rotation != 0 && "Failed to locate a viable rotation!");
12662 assert((Lo || Hi) && "Failed to find a rotated input vector!");
12663 if (!Lo)
12664 Lo = Hi;
12665 else if (!Hi)
12666 Hi = Lo;
12667
12668 V1 = Lo;
12669 V2 = Hi;
12670
12671 return Rotation;
12672}
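
A simplified standalone model of the rotation detection, applied to the masks listed in the comment above; it drops the Lo/Hi source bookkeeping and only reports the rotation amount (assumed example, not the analyzed code):

// Illustrative only: the rotation detection above applied to the masks from
// the comment, ignoring which input supplies each part.
#include <cstdio>
#include <vector>

static int elementRotation(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // Identity rotation - not interesting.
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // Inconsistent rotation amounts.
  }
  return Rotation ? Rotation : -1;
}

int main() {
  // Both masks from the comment above describe a rotate by 3 elements.
  std::printf("%d\n", elementRotation({11, 12, 13, 14, 15, 0, 1, 2})); // 3
  std::printf("%d\n", elementRotation({-1, 4, 5, 6, -1, -1, 9, -1}));  // 3
}
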
12673
12674/// Try to lower a vector shuffle as a byte rotation.
12675///
12676/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12677/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12678/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12679 /// try to generically lower a vector shuffle through such a pattern. It
12680/// does not check for the profitability of lowering either as PALIGNR or
12681/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12682/// This matches shuffle vectors that look like:
12683///
12684/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12685///
12686/// Essentially it concatenates V1 and V2, shifts right by some number of
12687/// elements, and takes the low elements as the result. Note that while this is
12688/// specified as a *right shift* because x86 is little-endian, it is a *left
12689/// rotate* of the vector lanes.
12690static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12691 ArrayRef<int> Mask) {
12692 // Don't accept any shuffles with zero elements.
12693 if (isAnyZero(Mask))
12694 return -1;
12695
12696 // PALIGNR works on 128-bit lanes.
12697 SmallVector<int, 16> RepeatedMask;
12698 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12699 return -1;
12700
12701 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12702 if (Rotation <= 0)
12703 return -1;
12704
12705 // PALIGNR rotates bytes, so we need to scale the
12706 // rotation based on how many bytes are in the vector lane.
12707 int NumElts = RepeatedMask.size();
12708 int Scale = 16 / NumElts;
12709 return Rotation * Scale;
12710}
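
A tiny worked example of the byte scaling at the end of the function, using the assumed v8i16 rotate-by-3 from the sketch above:

// Illustrative only: element rotation -> PALIGNR byte immediate.
#include <cstdio>
int main() {
  int NumElts = 8, Rotation = 3;        // v8i16 rotate-by-3 (assumed)
  int Scale = 16 / NumElts;             // bytes per element in a 128-bit lane
  std::printf("palignr imm = %d\n", Rotation * Scale); // 6
}
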
12711
12712static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12713 SDValue V2, ArrayRef<int> Mask,
12714 const X86Subtarget &Subtarget,
12715 SelectionDAG &DAG) {
12716 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12717
12718 SDValue Lo = V1, Hi = V2;
12719 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12720 if (ByteRotation <= 0)
12721 return SDValue();
12722
12723 // Cast the inputs to i8 vector of correct length to match PALIGNR or
12724 // PSLLDQ/PSRLDQ.
12725 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12726 Lo = DAG.getBitcast(ByteVT, Lo);
12727 Hi = DAG.getBitcast(ByteVT, Hi);
12728
12729 // SSSE3 targets can use the palignr instruction.
12730 if (Subtarget.hasSSSE3()) {
12731 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12732 "512-bit PALIGNR requires BWI instructions");
12733 return DAG.getBitcast(
12734 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12735 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12736 }
12737
12738 assert(VT.is128BitVector() &&
12739 "Rotate-based lowering only supports 128-bit lowering!");
12740 assert(Mask.size() <= 16 &&
12741 "Can shuffle at most 16 bytes in a 128-bit vector!");
12742 assert(ByteVT == MVT::v16i8 &&
12743 "SSE2 rotate lowering only needed for v16i8!");
12744
12745 // Default SSE2 implementation
12746 int LoByteShift = 16 - ByteRotation;
12747 int HiByteShift = ByteRotation;
12748
12749 SDValue LoShift =
12750 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12751 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12752 SDValue HiShift =
12753 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12754 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12755 return DAG.getBitcast(VT,
12756 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12757}
12758
12759/// Try to lower a vector shuffle as a dword/qword rotation.
12760///
12761 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12762 /// rotation of the concatenation of two vectors; this routine will
12763 /// try to generically lower a vector shuffle through such a pattern.
12764///
12765/// Essentially it concatenates V1 and V2, shifts right by some number of
12766/// elements, and takes the low elements as the result. Note that while this is
12767/// specified as a *right shift* because x86 is little-endian, it is a *left
12768/// rotate* of the vector lanes.
12769static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12770 SDValue V2, ArrayRef<int> Mask,
12771 const X86Subtarget &Subtarget,
12772 SelectionDAG &DAG) {
12773 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12774 "Only 32-bit and 64-bit elements are supported!");
12775
12776 // 128/256-bit vectors are only supported with VLX.
12777 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12778 && "VLX required for 128/256-bit vectors");
12779
12780 SDValue Lo = V1, Hi = V2;
12781 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12782 if (Rotation <= 0)
12783 return SDValue();
12784
12785 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12786 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12787}
12788
12789/// Try to lower a vector shuffle as a byte shift sequence.
12790static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12791 SDValue V2, ArrayRef<int> Mask,
12792 const APInt &Zeroable,
12793 const X86Subtarget &Subtarget,
12794 SelectionDAG &DAG) {
12795 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12796 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12797
12798 // We need a shuffle that has zeros at one/both ends and a sequential
12799 // shuffle from one source within.
12800 unsigned ZeroLo = Zeroable.countTrailingOnes();
12801 unsigned ZeroHi = Zeroable.countLeadingOnes();
12802 if (!ZeroLo && !ZeroHi)
12803 return SDValue();
12804
12805 unsigned NumElts = Mask.size();
12806 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12807 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12808 return SDValue();
12809
12810 unsigned Scale = VT.getScalarSizeInBits() / 8;
12811 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12812 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12813 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12814 return SDValue();
12815
12816 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12817 Res = DAG.getBitcast(MVT::v16i8, Res);
12818
12819 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12820 // inner sequential set of elements, possibly offset:
12821 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12822 // 01234567 --> 4567zzzz --> zzzzz456
12823 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12824 if (ZeroLo == 0) {
12825 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12826 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12827 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12828 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12829 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12830 } else if (ZeroHi == 0) {
12831 unsigned Shift = Mask[ZeroLo] % NumElts;
12832 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12833 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12834 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12835 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12836 } else if (!Subtarget.hasSSSE3()) {
12837 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12838 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12839 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12840 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12841 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12842 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12843 Shift += Mask[ZeroLo] % NumElts;
12844 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12845 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12846 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12847 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12848 } else
12849 return SDValue();
12850
12851 return DAG.getBitcast(VT, Res);
12852}
12853
12854/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12855///
12856/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12857/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12858/// matches elements from one of the input vectors shuffled to the left or
12859/// right with zeroable elements 'shifted in'. It handles both the strictly
12860/// bit-wise element shifts and the byte shift across an entire 128-bit double
12861/// quad word lane.
12862///
12863/// PSHL : (little-endian) left bit shift.
12864/// [ zz, 0, zz, 2 ]
12865/// [ -1, 4, zz, -1 ]
12866/// PSRL : (little-endian) right bit shift.
12867/// [ 1, zz, 3, zz]
12868/// [ -1, -1, 7, zz]
12869/// PSLLDQ : (little-endian) left byte shift
12870/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12871/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12872/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12873/// PSRLDQ : (little-endian) right byte shift
12874/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12875/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12876/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12877static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12878 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12879 int MaskOffset, const APInt &Zeroable,
12880 const X86Subtarget &Subtarget) {
12881 int Size = Mask.size();
12882 unsigned SizeInBits = Size * ScalarSizeInBits;
12883
12884 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12885 for (int i = 0; i < Size; i += Scale)
12886 for (int j = 0; j < Shift; ++j)
12887 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12888 return false;
12889
12890 return true;
12891 };
12892
12893 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12894 for (int i = 0; i != Size; i += Scale) {
12895 unsigned Pos = Left ? i + Shift : i;
12896 unsigned Low = Left ? i : i + Shift;
12897 unsigned Len = Scale - Shift;
12898 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12899 return -1;
12900 }
12901
12902 int ShiftEltBits = ScalarSizeInBits * Scale;
12903 bool ByteShift = ShiftEltBits > 64;
12904 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12905 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12906 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12907
12908 // Normalize the scale for byte shifts to still produce an i64 element
12909 // type.
12910 Scale = ByteShift ? Scale / 2 : Scale;
12911
12912 // We need to round trip through the appropriate type for the shift.
12913 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12914 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12915 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12916 return (int)ShiftAmt;
12917 };
12918
12919 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12920 // keep doubling the size of the integer elements up to that. We can
12921 // then shift the elements of the integer vector by whole multiples of
12922 // their width within the elements of the larger integer vector. Test each
12923 // multiple to see if we can find a match with the moved element indices
12924 // and that the shifted in elements are all zeroable.
12925 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12926 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12927 for (int Shift = 1; Shift != Scale; ++Shift)
12928 for (bool Left : {true, false})
12929 if (CheckZeros(Shift, Scale, Left)) {
12930 int ShiftAmt = MatchShift(Shift, Scale, Left);
12931 if (0 < ShiftAmt)
12932 return ShiftAmt;
12933 }
12934
12935 // no match
12936 return -1;
12937}
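
A scalar sketch of why the documented PSHL mask [zz, 0, zz, 2] for a v4i32 shuffle is matched as a 64-bit-element left shift by 32 (hypothetical data, little-endian packing assumed; not the analyzed code):

// Illustrative only: each 64-bit element keeps its low 32-bit half, shifted
// into the high position, with zeros shifted in below.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Src[4] = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
  uint64_t Lanes[2];
  // Little-endian packing: element 0 is the low half of lane 0, etc.
  Lanes[0] = (uint64_t)Src[1] << 32 | Src[0];
  Lanes[1] = (uint64_t)Src[3] << 32 | Src[2];
  for (uint64_t &L : Lanes)
    L <<= 32; // the VSHLI-by-32 step
  uint32_t Dst[4] = {(uint32_t)Lanes[0], (uint32_t)(Lanes[0] >> 32),
                     (uint32_t)Lanes[1], (uint32_t)(Lanes[1] >> 32)};
  // Dst is { 0, Src[0], 0, Src[2] }, i.e. the mask [zz, 0, zz, 2].
  for (uint32_t V : Dst)
    std::printf("0x%08x ", V);
  std::printf("\n");
}
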
12938
12939static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12940 SDValue V2, ArrayRef<int> Mask,
12941 const APInt &Zeroable,
12942 const X86Subtarget &Subtarget,
12943 SelectionDAG &DAG) {
12944 int Size = Mask.size();
12945 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12946
12947 MVT ShiftVT;
12948 SDValue V = V1;
12949 unsigned Opcode;
12950
12951 // Try to match shuffle against V1 shift.
12952 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12953 Mask, 0, Zeroable, Subtarget);
12954
12955 // If V1 failed, try to match shuffle against V2 shift.
12956 if (ShiftAmt < 0) {
12957 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12958 Mask, Size, Zeroable, Subtarget);
12959 V = V2;
12960 }
12961
12962 if (ShiftAmt < 0)
12963 return SDValue();
12964
12965 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12966 "Illegal integer vector type");
12967 V = DAG.getBitcast(ShiftVT, V);
12968 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12969 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12970 return DAG.getBitcast(VT, V);
12971}
12972
12973// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12974// Remainder of lower half result is zero and upper half is all undef.
12975static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12976 ArrayRef<int> Mask, uint64_t &BitLen,
12977 uint64_t &BitIdx, const APInt &Zeroable) {
12978 int Size = Mask.size();
12979 int HalfSize = Size / 2;
12980 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12981 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
12982
12983 // Upper half must be undefined.
12984 if (!isUndefUpperHalf(Mask))
12985 return false;
12986
12987 // Determine the extraction length from the part of the
12988 // lower half that isn't zeroable.
12989 int Len = HalfSize;
12990 for (; Len > 0; --Len)
12991 if (!Zeroable[Len - 1])
12992 break;
12993 assert(Len > 0 && "Zeroable shuffle mask");
12994
12995 // Attempt to match first Len sequential elements from the lower half.
12996 SDValue Src;
12997 int Idx = -1;
12998 for (int i = 0; i != Len; ++i) {
12999 int M = Mask[i];
13000 if (M == SM_SentinelUndef)
13001 continue;
13002 SDValue &V = (M < Size ? V1 : V2);
13003 M = M % Size;
13004
13005 // The extracted elements must start at a valid index and all mask
13006 // elements must be in the lower half.
13007 if (i > M || M >= HalfSize)
13008 return false;
13009
13010 if (Idx < 0 || (Src == V && Idx == (M - i))) {
13011 Src = V;
13012 Idx = M - i;
13013 continue;
13014 }
13015 return false;
13016 }
13017
13018 if (!Src || Idx < 0)
13019 return false;
13020
13021 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13022 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13023 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13024 V1 = Src;
13025 return true;
13026}
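
Worked numbers for the BitLen/BitIdx computation above, for an assumed v8i16 mask that fits the EXTRQ pattern (not taken from the analyzed run):

// Illustrative only: BitLen/BitIdx for an assumed v8i16 EXTRQ-able mask.
#include <cassert>
#include <cstdio>

int main() {
  // Mask = { 2, 3, zz, zz, u, u, u, u }: upper half undef, lower-half
  // elements 2 and 3 zeroable, live elements sequential from source index 2.
  const int ScalarBits = 16, HalfSize = 4;
  bool Zeroable[4] = {false, false, true, true};
  int Mask[4] = {2, 3, -1, -1};

  int Len = HalfSize;
  while (Len > 0 && Zeroable[Len - 1])
    --Len;                      // Len == 2
  int Idx = Mask[0];            // source index of the first matched element
  assert(Mask[1] - 1 == Idx);   // the live elements are sequential

  unsigned BitLen = (Len * ScalarBits) & 0x3f; // 32
  unsigned BitIdx = (Idx * ScalarBits) & 0x3f; // 32
  std::printf("EXTRQI BitLen=%u BitIdx=%u\n", BitLen, BitIdx);
}
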
13027
13028// INSERTQ: Extract lowest Len elements from lower half of second source and
13029// insert over first source, starting at Idx.
13030// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
13031static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13032 ArrayRef<int> Mask, uint64_t &BitLen,
13033 uint64_t &BitIdx) {
13034 int Size = Mask.size();
13035 int HalfSize = Size / 2;
13036 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13037
13038 // Upper half must be undefined.
13039 if (!isUndefUpperHalf(Mask))
13040 return false;
13041
13042 for (int Idx = 0; Idx != HalfSize; ++Idx) {
13043 SDValue Base;
13044
13045 // Attempt to match first source from mask before insertion point.
13046 if (isUndefInRange(Mask, 0, Idx)) {
13047 /* EMPTY */
13048 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13049 Base = V1;
13050 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13051 Base = V2;
13052 } else {
13053 continue;
13054 }
13055
13056 // Extend the extraction length looking to match both the insertion of
13057 // the second source and the remaining elements of the first.
13058 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13059 SDValue Insert;
13060 int Len = Hi - Idx;
13061
13062 // Match insertion.
13063 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13064 Insert = V1;
13065 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13066 Insert = V2;
13067 } else {
13068 continue;
13069 }
13070
13071 // Match the remaining elements of the lower half.
13072 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13073 /* EMPTY */
13074 } else if ((!Base || (Base == V1)) &&
13075 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13076 Base = V1;
13077 } else if ((!Base || (Base == V2)) &&
13078 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13079 Size + Hi)) {
13080 Base = V2;
13081 } else {
13082 continue;
13083 }
13084
13085 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13086 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13087 V1 = Base;
13088 V2 = Insert;
13089 return true;
13090 }
13091 }
13092
13093 return false;
13094}
13095
13096/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13097static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13098 SDValue V2, ArrayRef<int> Mask,
13099 const APInt &Zeroable, SelectionDAG &DAG) {
13100 uint64_t BitLen, BitIdx;
13101 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13102 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13103 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13104 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13105
13106 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13107 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13108 V2 ? V2 : DAG.getUNDEF(VT),
13109 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13110 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13111
13112 return SDValue();
13113}
13114
13115/// Lower a vector shuffle as a zero or any extension.
13116///
13117/// Given a specific number of elements, element bit width, and extension
13118/// stride, produce either a zero or any extension based on the available
13119/// features of the subtarget. The extended elements are consecutive and
13120 /// can start from an offset element index in the input; to
13121 /// avoid excess shuffling the offset must either be in the bottom lane
13122/// or at the start of a higher lane. All extended elements must be from
13123/// the same lane.
13124static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13125 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13126 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13127 assert(Scale > 1 && "Need a scale to extend.");
13128 int EltBits = VT.getScalarSizeInBits();
13129 int NumElements = VT.getVectorNumElements();
13130 int NumEltsPerLane = 128 / EltBits;
13131 int OffsetLane = Offset / NumEltsPerLane;
13132 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13133 "Only 8, 16, and 32 bit elements can be extended.");
13134 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13135 assert(0 <= Offset && "Extension offset must be positive.");
13136 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13137 "Extension offset must be in the first lane or start an upper lane.");
13138
13139 // Check that an index is in same lane as the base offset.
13140 auto SafeOffset = [&](int Idx) {
13141 return OffsetLane == (Idx / NumEltsPerLane);
13142 };
13143
13144 // Shift along an input so that the offset base moves to the first element.
13145 auto ShuffleOffset = [&](SDValue V) {
13146 if (!Offset)
13147 return V;
13148
13149 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13150 for (int i = 0; i * Scale < NumElements; ++i) {
13151 int SrcIdx = i + Offset;
13152 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13153 }
13154 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13155 };
13156
13157 // Found a valid a/zext mask! Try various lowering strategies based on the
13158 // input type and available ISA extensions.
13159 if (Subtarget.hasSSE41()) {
13160 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13161 // PUNPCK will catch this in a later shuffle match.
13162 if (Offset && Scale == 2 && VT.is128BitVector())
13163 return SDValue();
13164 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13165 NumElements / Scale);
13166 InputV = ShuffleOffset(InputV);
13167 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13168 DL, ExtVT, InputV, DAG);
13169 return DAG.getBitcast(VT, InputV);
13170 }
13171
13172 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13173
13174 // For any extends we can cheat for larger element sizes and use shuffle
13175 // instructions that can fold with a load and/or copy.
13176 if (AnyExt && EltBits == 32) {
13177 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13178 -1};
13179 return DAG.getBitcast(
13180 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13181 DAG.getBitcast(MVT::v4i32, InputV),
13182 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13183 }
13184 if (AnyExt && EltBits == 16 && Scale > 2) {
13185 int PSHUFDMask[4] = {Offset / 2, -1,
13186 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13187 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13188 DAG.getBitcast(MVT::v4i32, InputV),
13189 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13190 int PSHUFWMask[4] = {1, -1, -1, -1};
13191 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13192 return DAG.getBitcast(
13193 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13194 DAG.getBitcast(MVT::v8i16, InputV),
13195 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13196 }
13197
13198 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13199 // to 64-bits.
13200 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13201 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13202 assert(VT.is128BitVector() && "Unexpected vector width!");
13203
13204 int LoIdx = Offset * EltBits;
13205 SDValue Lo = DAG.getBitcast(
13206 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13207 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13208 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13209
13210 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13211 return DAG.getBitcast(VT, Lo);
13212
13213 int HiIdx = (Offset + 1) * EltBits;
13214 SDValue Hi = DAG.getBitcast(
13215 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13216 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13217 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13218 return DAG.getBitcast(VT,
13219 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13220 }
13221
13222 // If this would require more than 2 unpack instructions to expand, use
13223 // pshufb when available. We can only use more than 2 unpack instructions
13224 // when zero extending i8 elements which also makes it easier to use pshufb.
13225 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13226 assert(NumElements == 16 && "Unexpected byte vector width!");
13227 SDValue PSHUFBMask[16];
13228 for (int i = 0; i < 16; ++i) {
13229 int Idx = Offset + (i / Scale);
13230 if ((i % Scale == 0 && SafeOffset(Idx))) {
13231 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13232 continue;
13233 }
13234 PSHUFBMask[i] =
13235 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13236 }
13237 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13238 return DAG.getBitcast(
13239 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13240 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13241 }
13242
13243 // If we are extending from an offset, ensure we start on a boundary that
13244 // we can unpack from.
13245 int AlignToUnpack = Offset % (NumElements / Scale);
13246 if (AlignToUnpack) {
13247 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13248 for (int i = AlignToUnpack; i < NumElements; ++i)
13249 ShMask[i - AlignToUnpack] = i;
13250 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13251 Offset -= AlignToUnpack;
13252 }
13253
13254 // Otherwise emit a sequence of unpacks.
13255 do {
13256 unsigned UnpackLoHi = X86ISD::UNPCKL;
13257 if (Offset >= (NumElements / 2)) {
13258 UnpackLoHi = X86ISD::UNPCKH;
13259 Offset -= (NumElements / 2);
13260 }
13261
13262 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13263 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13264 : getZeroVector(InputVT, Subtarget, DAG, DL);
13265 InputV = DAG.getBitcast(InputVT, InputV);
13266 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13267 Scale /= 2;
13268 EltBits *= 2;
13269 NumElements /= 2;
13270 } while (Scale > 1);
13271 return DAG.getBitcast(VT, InputV);
13272}
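
The final do/while loop above widens by interleaving with zeros; a standalone little-endian sketch of one such unpack-lo step (assumed values, not the analyzed code):

// Illustrative only: one UNPCKL-with-zeros step, which widens 16-bit elements
// to zero-extended 32-bit values; the loop above repeats this until Scale==1.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint16_t In[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t Zero[8] = {0};
  uint16_t Interleaved[8];
  // unpacklo: a0,b0,a1,b1,a2,b2,a3,b3 from the low halves of the two inputs.
  for (int i = 0; i < 4; ++i) {
    Interleaved[2 * i] = In[i];
    Interleaved[2 * i + 1] = Zero[i];
  }
  // On a little-endian target each (In[i], 0) pair reads back as zext(In[i]).
  uint32_t Out[4];
  std::memcpy(Out, Interleaved, sizeof(Out));
  for (uint32_t V : Out)
    std::printf("%u ", V); // 1 2 3 4
  std::printf("\n");
}
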
13273
13274/// Try to lower a vector shuffle as a zero extension on any microarch.
13275///
13276/// This routine will try to do everything in its power to cleverly lower
13277/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13278/// check for the profitability of this lowering, it tries to aggressively
13279/// match this pattern. It will use all of the micro-architectural details it
13280/// can to emit an efficient lowering. It handles both blends with all-zero
13281/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13282/// masking out later).
13283///
13284/// The reason we have dedicated lowering for zext-style shuffles is that they
13285/// are both incredibly common and often quite performance sensitive.
13286static SDValue lowerShuffleAsZeroOrAnyExtend(
13287 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13288 const APInt &Zeroable, const X86Subtarget &Subtarget,
13289 SelectionDAG &DAG) {
13290 int Bits = VT.getSizeInBits();
13291 int NumLanes = Bits / 128;
13292 int NumElements = VT.getVectorNumElements();
13293 int NumEltsPerLane = NumElements / NumLanes;
13294 assert(VT.getScalarSizeInBits() <= 32 &&
13295 "Exceeds 32-bit integer zero extension limit");
13296 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13297
13298 // Define a helper function to check a particular ext-scale and lower to it if
13299 // valid.
13300 auto Lower = [&](int Scale) -> SDValue {
13301 SDValue InputV;
13302 bool AnyExt = true;
13303 int Offset = 0;
13304 int Matches = 0;
13305 for (int i = 0; i < NumElements; ++i) {
13306 int M = Mask[i];
13307 if (M < 0)
13308 continue; // Valid anywhere but doesn't tell us anything.
13309 if (i % Scale != 0) {
13310 // Each of the extended elements need to be zeroable.
13311 if (!Zeroable[i])
13312 return SDValue();
13313
13314 // We no longer are in the anyext case.
13315 AnyExt = false;
13316 continue;
13317 }
13318
13319 // Each of the base elements needs to be consecutive indices into the
13320 // same input vector.
13321 SDValue V = M < NumElements ? V1 : V2;
13322 M = M % NumElements;
13323 if (!InputV) {
13324 InputV = V;
13325 Offset = M - (i / Scale);
13326 } else if (InputV != V)
13327 return SDValue(); // Flip-flopping inputs.
13328
13329 // Offset must start in the lowest 128-bit lane or at the start of an
13330 // upper lane.
13331 // FIXME: Is it ever worth allowing a negative base offset?
13332 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13333 (Offset % NumEltsPerLane) == 0))
13334 return SDValue();
13335
13336 // If we are offsetting, all referenced entries must come from the same
13337 // lane.
13338 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13339 return SDValue();
13340
13341 if ((M % NumElements) != (Offset + (i / Scale)))
13342 return SDValue(); // Non-consecutive strided elements.
13343 Matches++;
13344 }
13345
13346 // If we fail to find an input, we have a zero-shuffle which should always
13347 // have already been handled.
13348 // FIXME: Maybe handle this here in case during blending we end up with one?
13349 if (!InputV)
13350 return SDValue();
13351
13352 // If we are offsetting, don't extend if we only match a single input, we
13353 // can always do better by using a basic PSHUF or PUNPCK.
13354 if (Offset != 0 && Matches < 2)
13355 return SDValue();
13356
13357 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13358 InputV, Mask, Subtarget, DAG);
13359 };
13360
13361 // The widest scale possible for extending is to a 64-bit integer.
13362 assert(Bits % 64 == 0 &&
13363 "The number of bits in a vector must be divisible by 64 on x86!");
13364 int NumExtElements = Bits / 64;
13365
13366 // Each iteration, try extending the elements half as much, but into twice as
13367 // many elements.
13368 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13369 assert(NumElements % NumExtElements == 0 &&
13370 "The input vector size must be divisible by the extended size.");
13371 if (SDValue V = Lower(NumElements / NumExtElements))
13372 return V;
13373 }
13374
13375 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13376 if (Bits != 128)
13377 return SDValue();
13378
13379 // Returns one of the source operands if the shuffle can be reduced to a
13380 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13381 auto CanZExtLowHalf = [&]() {
13382 for (int i = NumElements / 2; i != NumElements; ++i)
13383 if (!Zeroable[i])
13384 return SDValue();
13385 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13386 return V1;
13387 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13388 return V2;
13389 return SDValue();
13390 };
13391
13392 if (SDValue V = CanZExtLowHalf()) {
13393 V = DAG.getBitcast(MVT::v2i64, V);
13394 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13395 return DAG.getBitcast(VT, V);
13396 }
13397
13398 // No viable ext lowering found.
13399 return SDValue();
13400}
13401
13402/// Try to get a scalar value for a specific element of a vector.
13403///
13404/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13405static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13406 SelectionDAG &DAG) {
13407 MVT VT = V.getSimpleValueType();
13408 MVT EltVT = VT.getVectorElementType();
13409 V = peekThroughBitcasts(V);
13410
13411 // If the bitcasts shift the element size, we can't extract an equivalent
13412 // element from it.
13413 MVT NewVT = V.getSimpleValueType();
13414 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13415 return SDValue();
13416
13417 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13418 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13419 // Ensure the scalar operand is the same size as the destination.
13420 // FIXME: Add support for scalar truncation where possible.
13421 SDValue S = V.getOperand(Idx);
13422 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13423 return DAG.getBitcast(EltVT, S);
13424 }
13425
13426 return SDValue();
13427}
13428
13429/// Helper to test for a load that can be folded with x86 shuffles.
13430///
13431/// This is particularly important because the set of instructions varies
13432/// significantly based on whether the operand is a load or not.
13433static bool isShuffleFoldableLoad(SDValue V) {
13434 V = peekThroughBitcasts(V);
13435 return ISD::isNON_EXTLoad(V.getNode());
13436}
13437
13438/// Try to lower insertion of a single element into a zero vector.
13439///
13440 /// This is a common pattern for which we have especially efficient lowerings
13441 /// across all subtarget feature sets.
13442static SDValue lowerShuffleAsElementInsertion(
13443 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13444 const APInt &Zeroable, const X86Subtarget &Subtarget,
13445 SelectionDAG &DAG) {
13446 MVT ExtVT = VT;
13447 MVT EltVT = VT.getVectorElementType();
13448
13449 int V2Index =
13450 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13451 Mask.begin();
13452 bool IsV1Zeroable = true;
13453 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13454 if (i != V2Index && !Zeroable[i]) {
13455 IsV1Zeroable = false;
13456 break;
13457 }
13458
13459 // Check for a single input from a SCALAR_TO_VECTOR node.
13460 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13461 // all the smarts here sunk into that routine. However, the current
13462 // lowering of BUILD_VECTOR makes that nearly impossible until the old
13463 // vector shuffle lowering is dead.
13464 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13465 DAG);
13466 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13467 // We need to zext the scalar if it is smaller than an i32.
13468 V2S = DAG.getBitcast(EltVT, V2S);
13469 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
13470 // Using zext to expand a narrow element won't work for non-zero
13471 // insertions.
13472 if (!IsV1Zeroable)
13473 return SDValue();
13474
13475 // Zero-extend directly to i32.
13476 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13477 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13478 }
13479 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13480 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13481 EltVT == MVT::i16) {
13482 // Either not inserting from the low element of the input or the input
13483 // element size is too small to use VZEXT_MOVL to clear the high bits.
13484 return SDValue();
13485 }
13486
13487 if (!IsV1Zeroable) {
13488 // If V1 can't be treated as a zero vector we have fewer options to lower
13489 // this. We can't support integer vectors or non-zero targets cheaply, and
13490 // the V1 elements can't be permuted in any way.
13491 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13492 if (!VT.isFloatingPoint() || V2Index != 0)
13493 return SDValue();
13494 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
13495 V1Mask[V2Index] = -1;
13496 if (!isNoopShuffleMask(V1Mask))
13497 return SDValue();
13498 if (!VT.is128BitVector())
13499 return SDValue();
13500
13501 // Otherwise, use MOVSD or MOVSS.
13502 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
13503 "Only two types of floating point element types to handle!");
13504 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
13505 ExtVT, V1, V2);
13506 }
13507
13508 // This lowering only works for the low element with floating point vectors.
13509 if (VT.isFloatingPoint() && V2Index != 0)
13510 return SDValue();
13511
13512 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13513 if (ExtVT != VT)
13514 V2 = DAG.getBitcast(VT, V2);
13515
13516 if (V2Index != 0) {
13517 // If we have 4 or fewer lanes we can cheaply shuffle the element into
13518 // the desired position. Otherwise it is more efficient to do a vector
13519 // shift left. We know that we can do a vector shift left because all
13520 // the inputs are zero.
13521 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
13522 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13523 V2Shuffle[V2Index] = 0;
13524 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13525 } else {
13526 V2 = DAG.getBitcast(MVT::v16i8, V2);
13527 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13528 DAG.getTargetConstant(
13529 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
13530 V2 = DAG.getBitcast(VT, V2);
13531 }
13532 }
13533 return V2;
13534}
13535
13536/// Try to lower broadcast of a single - truncated - integer element,
13537/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13538///
13539/// This assumes we have AVX2.
13540static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13541 int BroadcastIdx,
13542 const X86Subtarget &Subtarget,
13543 SelectionDAG &DAG) {
13544 assert(Subtarget.hasAVX2() &&
13545 "We can only lower integer broadcasts with AVX2!");
13546
13547 MVT EltVT = VT.getVectorElementType();
13548 MVT V0VT = V0.getSimpleValueType();
13549
13550 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13551 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13552
13553 MVT V0EltVT = V0VT.getVectorElementType();
13554 if (!V0EltVT.isInteger())
13555 return SDValue();
13556
13557 const unsigned EltSize = EltVT.getSizeInBits();
13558 const unsigned V0EltSize = V0EltVT.getSizeInBits();
13559
13560 // This is only a truncation if the original element type is larger.
13561 if (V0EltSize <= EltSize)
13562 return SDValue();
13563
13564 assert(((V0EltSize % EltSize) == 0) &&
13565 "Scalar type sizes must all be powers of 2 on x86!");
13566
13567 const unsigned V0Opc = V0.getOpcode();
13568 const unsigned Scale = V0EltSize / EltSize;
13569 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13570
13571 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13572 V0Opc != ISD::BUILD_VECTOR)
13573 return SDValue();
13574
13575 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13576
13577 // If we're extracting non-least-significant bits, shift so we can truncate.
13578 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13579 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13580 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13581 if (const int OffsetIdx = BroadcastIdx % Scale)
13582 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13583 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13584
13585 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13586 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13587}
13588
13589/// Test whether this can be lowered with a single SHUFPS instruction.
13590///
13591/// This is used to disable more specialized lowerings when the shufps lowering
13592/// will happen to be efficient.
13593static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13594 // This routine only handles 128-bit shufps.
13595 assert(Mask.size() == 4 && "Unsupported mask size!");
13596 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13597 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13598 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13599 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13600
13601 // To lower with a single SHUFPS we need to have the low half and high half
13602 // each requiring a single input.
13603 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13604 return false;
13605 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13606 return false;
13607
13608 return true;
13609}
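
A standalone restatement of the check above with two assumed masks, one SHUFPS-able and one not (illustrative only, not the analyzed code):

// A single SHUFPS picks its low two result elements from one input and its
// high two from one input (possibly the other), so each half of the mask may
// reference only one source.
#include <cstdio>
#include <vector>

static bool singleShufps(const std::vector<int> &M) {
  if (M[0] >= 0 && M[1] >= 0 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] >= 0 && M[3] >= 0 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}

int main() {
  std::printf("%d\n", singleShufps({0, 1, 4, 5})); // 1: low half V1, high half V2
  std::printf("%d\n", singleShufps({0, 5, 2, 6})); // 0: both halves mix sources
}
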
13610
13611/// If we are extracting two 128-bit halves of a vector and shuffling the
13612/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13613/// multi-shuffle lowering.
13614static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13615 SDValue N1, ArrayRef<int> Mask,
13616 SelectionDAG &DAG) {
13617 MVT VT = N0.getSimpleValueType();
13618 assert((VT.is128BitVector() &&
13619 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13620 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13621
13622 // Check that both sources are extracts of the same source vector.
13623 if (!N0.hasOneUse() || !N1.hasOneUse() ||
13624 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13625 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13626 N0.getOperand(0) != N1.getOperand(0))
13627 return SDValue();
13628
13629 SDValue WideVec = N0.getOperand(0);
13630 MVT WideVT = WideVec.getSimpleValueType();
13631 if (!WideVT.is256BitVector())
13632 return SDValue();
13633
13634 // Match extracts of each half of the wide source vector. Commute the shuffle
13635 // if the extract of the low half is N1.
13636 unsigned NumElts = VT.getVectorNumElements();
13637 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13638 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13639 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13640 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13641 ShuffleVectorSDNode::commuteMask(NewMask);
13642 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13643 return SDValue();
13644
13645 // Final bailout: if the mask is simple, we are better off using an extract
13646 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13647 // because that avoids a constant load from memory.
13648 if (NumElts == 4 &&
13649 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
13650 return SDValue();
13651
13652 // Extend the shuffle mask with undef elements.
13653 NewMask.append(NumElts, -1);
13654
13655 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13656 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13657 NewMask);
13658 // This is free: ymm -> xmm.
13659 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13660 DAG.getIntPtrConstant(0, DL));
13661}
13662
13663/// Try to lower broadcast of a single element.
13664///
13665/// For convenience, this code also bundles all of the subtarget feature set
13666/// filtering. While a little annoying to re-dispatch on type here, there isn't
13667/// a convenient way to factor it out.
13668static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13669 SDValue V2, ArrayRef<int> Mask,
13670 const X86Subtarget &Subtarget,
13671 SelectionDAG &DAG) {
13672 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13673 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
13674 (Subtarget.hasAVX2() && VT.isInteger())))
13675 return SDValue();
13676
13677 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13678 // we can only broadcast from a register with AVX2.
13679 unsigned NumEltBits = VT.getScalarSizeInBits();
13680 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13681 ? X86ISD::MOVDDUP
13682 : X86ISD::VBROADCAST;
13683 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13684
13685 // Check that the mask is a broadcast.
13686 int BroadcastIdx = getSplatIndex(Mask);
13687 if (BroadcastIdx < 0)
13688 return SDValue();
13689 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13690 "a sorted mask where the broadcast "
13691 "comes from V1.");
13692
13693 // Go up the chain of (vector) values to find a scalar load that we can
13694 // combine with the broadcast.
13695 // TODO: Combine this logic with findEltLoadSrc() used by
13696 // EltsFromConsecutiveLoads().
13697 int BitOffset = BroadcastIdx * NumEltBits;
13698 SDValue V = V1;
13699 for (;;) {
13700 switch (V.getOpcode()) {
13701 case ISD::BITCAST: {
13702 V = V.getOperand(0);
13703 continue;
13704 }
13705 case ISD::CONCAT_VECTORS: {
13706 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13707 int OpIdx = BitOffset / OpBitWidth;
13708 V = V.getOperand(OpIdx);
13709 BitOffset %= OpBitWidth;
13710 continue;
13711 }
13712 case ISD::EXTRACT_SUBVECTOR: {
13713 // The extraction index adds to the existing offset.
13714 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13715 unsigned Idx = V.getConstantOperandVal(1);
13716 unsigned BeginOffset = Idx * EltBitWidth;
13717 BitOffset += BeginOffset;
13718 V = V.getOperand(0);
13719 continue;
13720 }
13721 case ISD::INSERT_SUBVECTOR: {
13722 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13723 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13724 int Idx = (int)V.getConstantOperandVal(2);
13725 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13726 int BeginOffset = Idx * EltBitWidth;
13727 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13728 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13729 BitOffset -= BeginOffset;
13730 V = VInner;
13731 } else {
13732 V = VOuter;
13733 }
13734 continue;
13735 }
13736 }
13737 break;
13738 }
13739 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset")((void)0);
13740 BroadcastIdx = BitOffset / NumEltBits;
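// Worked example (illustrative): broadcasting element 5 of a v8i32 built as
// concat_vectors(A, B) with A, B v4i32: BitOffset starts at 5 * 32 = 160; the
// CONCAT_VECTORS step picks operand 160 / 128 = 1 (B) and leaves
// BitOffset = 160 % 128 = 32, so the rebased BroadcastIdx is 32 / 32 = 1
// (element 1 of B).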
13741
13742 // Do we need to bitcast the source to retrieve the original broadcast index?
13743 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13744
13745 // Check if this is a broadcast of a scalar. We special case lowering
13746 // for scalars so that we can more effectively fold with loads.
13747 // If the original value has a larger element type than the shuffle, the
13748 // broadcast element is in essence truncated. Make that explicit to ease
13749 // folding.
13750 if (BitCastSrc && VT.isInteger())
13751 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13752 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13753 return TruncBroadcast;
13754
13755 // Also check the simpler case, where we can directly reuse the scalar.
13756 if (!BitCastSrc &&
13757 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13758 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13759 V = V.getOperand(BroadcastIdx);
13760
13761 // If we can't broadcast from a register, check that the input is a load.
13762 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13763 return SDValue();
13764 } else if (ISD::isNormalLoad(V.getNode()) &&
13765 cast<LoadSDNode>(V)->isSimple()) {
13766 // We do not check for one-use of the vector load because a broadcast load
13767 // is expected to be a win for code size, register pressure, and possibly
13768 // uops even if the original vector load is not eliminated.
13769
13770 // Reduce the vector load and shuffle to a broadcasted scalar load.
13771 LoadSDNode *Ld = cast<LoadSDNode>(V);
13772 SDValue BaseAddr = Ld->getOperand(1);
13773 MVT SVT = VT.getScalarType();
13774 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13775 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset")((void)0);
13776 SDValue NewAddr =
13777 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
13778
13779 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13780 // than MOVDDUP.
13781 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13782 if (Opcode == X86ISD::VBROADCAST) {
13783 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13784 SDValue Ops[] = {Ld->getChain(), NewAddr};
13785 V = DAG.getMemIntrinsicNode(
13786 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13787 DAG.getMachineFunction().getMachineMemOperand(
13788 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13789 DAG.makeEquivalentMemoryOrdering(Ld, V);
13790 return DAG.getBitcast(VT, V);
13791 }
13792 assert(SVT == MVT::f64 && "Unexpected VT!")((void)0);
13793 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13794 DAG.getMachineFunction().getMachineMemOperand(
13795 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13796 DAG.makeEquivalentMemoryOrdering(Ld, V);
13797 } else if (!BroadcastFromReg) {
13798 // We can't broadcast from a vector register.
13799 return SDValue();
13800 } else if (BitOffset != 0) {
13801 // We can only broadcast from the zero-element of a vector register,
13802 // but it can be advantageous to broadcast from the zero-element of a
13803 // subvector.
13804 if (!VT.is256BitVector() && !VT.is512BitVector())
13805 return SDValue();
13806
13807 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13808 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13809 return SDValue();
13810
13811 // Only broadcast the zero-element of a 128-bit subvector.
13812 if ((BitOffset % 128) != 0)
13813 return SDValue();
13814
13815 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&((void)0)
13816 "Unexpected bit-offset")((void)0);
13817 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&((void)0)
13818 "Unexpected vector size")((void)0);
13819 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13820 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13821 }
13822
13823 // On AVX we can use VBROADCAST directly for scalar sources.
13824 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13825 V = DAG.getBitcast(MVT::f64, V);
13826 if (Subtarget.hasAVX()) {
13827 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13828 return DAG.getBitcast(VT, V);
13829 }
13830 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13831 }
13832
13833 // If this is a scalar, do the broadcast on this type and bitcast.
13834 if (!V.getValueType().isVector()) {
13835 assert(V.getScalarValueSizeInBits() == NumEltBits &&((void)0)
13836 "Unexpected scalar size")((void)0);
13837 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13838 VT.getVectorNumElements());
13839 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13840 }
13841
13842 // We only support broadcasting from 128-bit vectors to minimize the
13843 // number of patterns we need to deal with in isel. So extract down to
13844 // 128-bits, removing as many bitcasts as possible.
13845 if (V.getValueSizeInBits() > 128)
13846 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13847
13848 // Otherwise cast V to a vector with the same element type as VT, but
13849 // possibly narrower than VT. Then perform the broadcast.
13850 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13851 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13852 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13853}
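// Worked example (illustrative): with AVX, splatting element 2 of a v4f32 that
// comes from a plain 16-byte load takes the VBROADCAST_LOAD path above with
// Offset = 2 * 4 = 8, so the whole shuffle folds to roughly
// `vbroadcastss 8(addr)` instead of a full vector load plus a shuffle.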
13854
13855// Check for whether we can use INSERTPS to perform the shuffle. We only use
13856// INSERTPS when the V1 elements are already in the correct locations
13857// because otherwise we can just always use two SHUFPS instructions which
13858// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13859// perform INSERTPS if a single V1 element is out of place and all V2
13860// elements are zeroable.
13861static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13862 unsigned &InsertPSMask,
13863 const APInt &Zeroable,
13864 ArrayRef<int> Mask, SelectionDAG &DAG) {
13865 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!")((void)0);
13866 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!")((void)0);
13867 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
13868
13869 // Attempt to match INSERTPS with one element from VA or VB being
13870 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13871 // are updated.
13872 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13873 ArrayRef<int> CandidateMask) {
13874 unsigned ZMask = 0;
13875 int VADstIndex = -1;
13876 int VBDstIndex = -1;
13877 bool VAUsedInPlace = false;
13878
13879 for (int i = 0; i < 4; ++i) {
13880 // Synthesize a zero mask from the zeroable elements (includes undefs).
13881 if (Zeroable[i]) {
13882 ZMask |= 1 << i;
13883 continue;
13884 }
13885
13886 // Flag if we use any VA inputs in place.
13887 if (i == CandidateMask[i]) {
13888 VAUsedInPlace = true;
13889 continue;
13890 }
13891
13892 // We can only insert a single non-zeroable element.
13893 if (VADstIndex >= 0 || VBDstIndex >= 0)
13894 return false;
13895
13896 if (CandidateMask[i] < 4) {
13897 // VA input out of place for insertion.
13898 VADstIndex = i;
13899 } else {
13900 // VB input for insertion.
13901 VBDstIndex = i;
13902 }
13903 }
13904
13905 // Don't bother if we have no (non-zeroable) element for insertion.
13906 if (VADstIndex < 0 && VBDstIndex < 0)
13907 return false;
13908
13909 // Determine element insertion src/dst indices. The src index is from the
13910 // start of the inserted vector, not the start of the concatenated vector.
13911 unsigned VBSrcIndex = 0;
13912 if (VADstIndex >= 0) {
13913 // If we have a VA input out of place, we use VA as the V2 element
13914 // insertion and don't use the original V2 at all.
13915 VBSrcIndex = CandidateMask[VADstIndex];
13916 VBDstIndex = VADstIndex;
13917 VB = VA;
13918 } else {
13919 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13920 }
13921
13922 // If no V1 inputs are used in place, then the result is created only from
13923 // the zero mask and the V2 insertion - so remove V1 dependency.
13924 if (!VAUsedInPlace)
13925 VA = DAG.getUNDEF(MVT::v4f32);
13926
13927 // Update V1, V2 and InsertPSMask accordingly.
13928 V1 = VA;
13929 V2 = VB;
13930
13931 // Insert the V2 element into the desired position.
13932 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13933 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!")((void)0);
13934 return true;
13935 };
13936
13937 if (matchAsInsertPS(V1, V2, Mask))
13938 return true;
13939
13940 // Commute and try again.
13941 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13942 ShuffleVectorSDNode::commuteMask(CommutedMask);
13943 if (matchAsInsertPS(V2, V1, CommutedMask))
13944 return true;
13945
13946 return false;
13947}
13948
13949static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13950 ArrayRef<int> Mask, const APInt &Zeroable,
13951 SelectionDAG &DAG) {
13952 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((void)0);
13953 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((void)0);
13954
13955 // Attempt to match the insertps pattern.
13956 unsigned InsertPSMask = 0;
13957 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13958 return SDValue();
13959
13960 // Insert the V2 element into the desired position.
13961 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13962 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13963}
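// Worked example (illustrative): Mask = <0,6,2,3> with nothing zeroable keeps
// V1 lanes 0/2/3 in place and inserts element 2 of V2 into lane 1, so
// InsertPSMask = (2 << 6) | (1 << 4) | 0 = 0x90, i.e. `insertps $0x90`.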
13964
13965/// Try to lower a shuffle as a permute of the inputs followed by an
13966/// UNPCK instruction.
13967///
13968/// This specifically targets cases where we end up with alternating between
13969/// the two inputs, and so can permute them into something that feeds a single
13970/// UNPCK instruction. Note that this routine only targets integer vectors
13971/// because for floating point vectors we have a generalized SHUFPS lowering
13972/// strategy that handles everything that doesn't *exactly* match an unpack,
13973/// making this clever lowering unnecessary.
13974static SDValue lowerShuffleAsPermuteAndUnpack(
13975 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13976 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13977 assert(!VT.isFloatingPoint() &&((void)0)
13978 "This routine only supports integer vectors.")((void)0);
13979 assert(VT.is128BitVector() &&((void)0)
13980 "This routine only works on 128-bit vectors.")((void)0);
13981 assert(!V2.isUndef() &&((void)0)
13982 "This routine should only be used when blending two inputs.")((void)0);
13983 assert(Mask.size() >= 2 && "Single element masks are invalid.")((void)0);
13984
13985 int Size = Mask.size();
13986
13987 int NumLoInputs =
13988 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13989 int NumHiInputs =
13990 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13991
13992 bool UnpackLo = NumLoInputs >= NumHiInputs;
13993
13994 auto TryUnpack = [&](int ScalarSize, int Scale) {
13995 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13996 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13997
13998 for (int i = 0; i < Size; ++i) {
13999 if (Mask[i] < 0)
14000 continue;
14001
14002 // Each element of the unpack contains Scale elements from this mask.
14003 int UnpackIdx = i / Scale;
14004
14005 // We only handle the case where V1 feeds the first slots of the unpack.
14006 // We rely on canonicalization to ensure this is the case.
14007 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14008 return SDValue();
14009
14010 // Setup the mask for this input. The indexing is tricky as we have to
14011 // handle the unpack stride.
14012 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14013 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14014 Mask[i] % Size;
14015 }
14016
14017 // If we will have to shuffle both inputs to use the unpack, check whether
14018 // we can just unpack first and shuffle the result. If so, skip this unpack.
14019 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14020 !isNoopShuffleMask(V2Mask))
14021 return SDValue();
14022
14023 // Shuffle the inputs into place.
14024 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14025 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14026
14027 // Cast the inputs to the type we will use to unpack them.
14028 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14029 V1 = DAG.getBitcast(UnpackVT, V1);
14030 V2 = DAG.getBitcast(UnpackVT, V2);
14031
14032 // Unpack the inputs and cast the result back to the desired type.
14033 return DAG.getBitcast(
14034 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14035 UnpackVT, V1, V2));
14036 };
14037
14038 // We try each unpack from the largest to the smallest to try and find one
14039 // that fits this mask.
14040 int OrigScalarSize = VT.getScalarSizeInBits();
14041 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14042 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14043 return Unpack;
14044
14045 // If we're shuffling with a zero vector then we're better off not doing
14046 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14047 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14048 ISD::isBuildVectorAllZeros(V2.getNode()))
14049 return SDValue();
14050
14051 // If none of the unpack-rooted lowerings worked (or were profitable) try an
14052 // initial unpack.
14053 if (NumLoInputs == 0 || NumHiInputs == 0) {
14054 assert((NumLoInputs > 0 || NumHiInputs > 0) &&((void)0)
14055 "We have to have *some* inputs!")((void)0);
14056 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14057
14058 // FIXME: We could consider the total complexity of the permute of each
14059 // possible unpacking. Or at the least we should consider how many
14060 // half-crossings are created.
14061 // FIXME: We could consider commuting the unpacks.
14062
14063 SmallVector<int, 32> PermMask((unsigned)Size, -1);
14064 for (int i = 0; i < Size; ++i) {
14065 if (Mask[i] < 0)
14066 continue;
14067
14068 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!")((void)0);
14069
14070 PermMask[i] =
14071 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14072 }
14073 return DAG.getVectorShuffle(
14074 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14075 DL, VT, V1, V2),
14076 DAG.getUNDEF(VT), PermMask);
14077 }
14078
14079 return SDValue();
14080}
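// Worked example (illustrative): the v4i32 mask <0,4,2,6> alternates inputs
// but is not an unpack; TryUnpack(32, 1) builds V1Mask = V2Mask = <0,2,u,u>,
// so each input gets one PSHUFD and the blend is a single PUNPCKLDQ.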
14081
14082/// Handle lowering of 2-lane 64-bit floating point shuffles.
14083///
14084/// This is the basis function for the 2-lane 64-bit shuffles as we have full
14085/// support for floating point shuffles but not integer shuffles. These
14086/// instructions will incur a domain crossing penalty on some chips though so
14087/// it is better to avoid lowering through this for integer vectors where
14088/// possible.
14089static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14090 const APInt &Zeroable, SDValue V1, SDValue V2,
14091 const X86Subtarget &Subtarget,
14092 SelectionDAG &DAG) {
14093 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!")((void)0);
14094 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!")((void)0);
14095 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!")((void)0);
14096
14097 if (V2.isUndef()) {
14098 // Check for being able to broadcast a single element.
14099 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14100 Mask, Subtarget, DAG))
14101 return Broadcast;
14102
14103 // Straight shuffle of a single input vector. Simulate this by using the
14104 // single input as both of the "inputs" to this instruction.
14105 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
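// For example (illustrative): Mask = <1,1> gives SHUFPDMask = 0x3 (splat the
// high element) and Mask = <1,0> gives 0x1 (swap the two elements).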
14106
14107 if (Subtarget.hasAVX()) {
14108 // If we have AVX, we can use VPERMILPD which will allow folding a load
14109 // into the shuffle.
14110 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14111 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14112 }
14113
14114 return DAG.getNode(
14115 X86ISD::SHUFP, DL, MVT::v2f64,
14116 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14117 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14118 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14119 }
14120 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!")((void)0);
14121 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!")((void)0);
14122 assert(Mask[0] < 2 && "We sort V1 to be the first input.")((void)0);
14123 assert(Mask[1] >= 2 && "We sort V2 to be the second input.")((void)0);
14124
14125 if (Subtarget.hasAVX2())
14126 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14127 return Extract;
14128
14129 // When loading a scalar and then shuffling it into a vector we can often do
14130 // the insertion cheaply.
14131 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14132 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14133 return Insertion;
14134 // Try inverting the insertion since for v2 masks it is easy to do and we
14135 // can't reliably sort the mask one way or the other.
14136 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14137 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14138 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14139 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14140 return Insertion;
14141
14142 // Try to use one of the special instruction patterns to handle two common
14143 // blend patterns if a zero-blend above didn't work.
14144 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14145 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14146 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14147 // We can either use a special instruction to load over the low double or
14148 // to move just the low double.
14149 return DAG.getNode(
14150 X86ISD::MOVSD, DL, MVT::v2f64, V2,
14151 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14152
14153 if (Subtarget.hasSSE41())
14154 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14155 Zeroable, Subtarget, DAG))
14156 return Blend;
14157
14158 // Use dedicated unpack instructions for masks that match their pattern.
14159 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14160 return V;
14161
14162 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14163 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14164 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14165}
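// Worked example (illustrative): for the two-input tail above, Mask = <0,3>
// yields SHUFPDMask = 0x2, i.e. `shufpd $2` producing <V1[0], V2[1]>.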
14166
14167/// Handle lowering of 2-lane 64-bit integer shuffles.
14168///
14169/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14170/// the integer unit to minimize domain crossing penalties. However, for blends
14171/// it falls back to the floating point shuffle operation with appropriate bit
14172/// casting.
14173static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14174 const APInt &Zeroable, SDValue V1, SDValue V2,
14175 const X86Subtarget &Subtarget,
14176 SelectionDAG &DAG) {
14177 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!")((void)0);
14178 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!")((void)0);
14179 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!")((void)0);
14180
14181 if (V2.isUndef()) {
14182 // Check for being able to broadcast a single element.
14183 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14184 Mask, Subtarget, DAG))
14185 return Broadcast;
14186
14187 // Straight shuffle of a single input vector. For everything from SSE2
14188 // onward this has a single fast instruction with no scary immediates.
14189 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14190 V1 = DAG.getBitcast(MVT::v4i32, V1);
14191 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14192 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14193 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14194 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
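// For example (illustrative): Mask = <1,0> widens to <2,3,0,1>, which encodes
// as the classic qword-swapping `pshufd $0x4e`.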
14195 return DAG.getBitcast(
14196 MVT::v2i64,
14197 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14198 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14199 }
14200 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!")((void)0);
14201 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!")((void)0);
14202 assert(Mask[0] < 2 && "We sort V1 to be the first input.")((void)0);
14203 assert(Mask[1] >= 2 && "We sort V2 to be the second input.")((void)0);
14204
14205 if (Subtarget.hasAVX2())
14206 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14207 return Extract;
14208
14209 // Try to use shift instructions.
14210 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14211 Zeroable, Subtarget, DAG))
14212 return Shift;
14213
14214 // When loading a scalar and then shuffling it into a vector we can often do
14215 // the insertion cheaply.
14216 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14217 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14218 return Insertion;
14219 // Try inverting the insertion since for v2 masks it is easy to do and we
14220 // can't reliably sort the mask one way or the other.
14221 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14222 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14223 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14224 return Insertion;
14225
14226 // We have different paths for blend lowering, but they all must use the
14227 // *exact* same predicate.
14228 bool IsBlendSupported = Subtarget.hasSSE41();
14229 if (IsBlendSupported)
14230 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14231 Zeroable, Subtarget, DAG))
14232 return Blend;
14233
14234 // Use dedicated unpack instructions for masks that match their pattern.
14235 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14236 return V;
14237
14238 // Try to use byte rotation instructions.
14239 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14240 if (Subtarget.hasSSSE3()) {
14241 if (Subtarget.hasVLX())
14242 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14243 Subtarget, DAG))
14244 return Rotate;
14245
14246 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14247 Subtarget, DAG))
14248 return Rotate;
14249 }
14250
14251 // If we have direct support for blends, we should lower by decomposing into
14252 // a permute. That will be faster than the domain cross.
14253 if (IsBlendSupported)
14254 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14255 Subtarget, DAG);
14256
14257 // We implement this with SHUFPD which is pretty lame because it will likely
14258 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14259 // However, all the alternatives are still more cycles and newer chips don't
14260 // have this problem. It would be really nice if x86 had better shuffles here.
14261 V1 = DAG.getBitcast(MVT::v2f64, V1);
14262 V2 = DAG.getBitcast(MVT::v2f64, V2);
14263 return DAG.getBitcast(MVT::v2i64,
14264 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14265}
14266
14267/// Lower a vector shuffle using the SHUFPS instruction.
14268///
14269/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14270/// It makes no assumptions about whether this is the *best* lowering, it simply
14271/// uses it.
14272static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14273 ArrayRef<int> Mask, SDValue V1,
14274 SDValue V2, SelectionDAG &DAG) {
14275 SDValue LowV = V1, HighV = V2;
14276 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14277 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14278
14279 if (NumV2Elements == 1) {
14280 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14281
14282 // Compute the index adjacent to V2Index and in the same half by toggling
14283 // the low bit.
14284 int V2AdjIndex = V2Index ^ 1;
14285
14286 if (Mask[V2AdjIndex] < 0) {
14287 // Handles all the cases where we have a single V2 element and an undef.
14288 // This will only ever happen in the high lanes because we commute the
14289 // vector otherwise.
14290 if (V2Index < 2)
14291 std::swap(LowV, HighV);
14292 NewMask[V2Index] -= 4;
14293 } else {
14294 // Handle the case where the V2 element ends up adjacent to a V1 element.
14295 // To make this work, blend them together as the first step.
14296 int V1Index = V2AdjIndex;
14297 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14298 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14299 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14300
14301 // Now proceed to reconstruct the final blend as we have the necessary
14302 // high or low half formed.
14303 if (V2Index < 2) {
14304 LowV = V2;
14305 HighV = V1;
14306 } else {
14307 HighV = V2;
14308 }
14309 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14310 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14311 }
14312 } else if (NumV2Elements == 2) {
14313 if (Mask[0] < 4 && Mask[1] < 4) {
14314 // Handle the easy case where we have V1 in the low lanes and V2 in the
14315 // high lanes.
14316 NewMask[2] -= 4;
14317 NewMask[3] -= 4;
14318 } else if (Mask[2] < 4 && Mask[3] < 4) {
14319 // We also handle the reversed case because this utility may get called
14320 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14321 // arrange things in the right direction.
14322 NewMask[0] -= 4;
14323 NewMask[1] -= 4;
14324 HighV = V1;
14325 LowV = V2;
14326 } else {
14327 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14328 // trying to place elements directly, just blend them and set up the final
14329 // shuffle to place them.
14330
14331 // The first two blend mask elements are for V1, the second two are for
14332 // V2.
14333 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14334 Mask[2] < 4 ? Mask[2] : Mask[3],
14335 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14336 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14337 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14338 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14339
14340 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14341 // a blend.
14342 LowV = HighV = V1;
14343 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14344 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14345 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14346 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14347 }
14348 } else if (NumV2Elements == 3) {
14349 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14350 // we can get here due to other paths (e.g. repeated mask matching) that we
14351 // don't want to do another round of lowerVECTOR_SHUFFLE.
14352 ShuffleVectorSDNode::commuteMask(NewMask);
14353 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14354 }
14355 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14356 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14357}
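// Worked example (illustrative): Mask = <0,2,4,6> hits the easy
// NumV2Elements == 2 case above, NewMask becomes <0,2,0,2>, and the result is
// a single `shufps $0x88`, i.e. <V1[0], V1[2], V2[0], V2[2]>.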
14358
14359/// Lower 4-lane 32-bit floating point shuffles.
14360///
14361/// Uses instructions exclusively from the floating point unit to minimize
14362/// domain crossing penalties, as these are sufficient to implement all v4f32
14363/// shuffles.
14364static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14365 const APInt &Zeroable, SDValue V1, SDValue V2,
14366 const X86Subtarget &Subtarget,
14367 SelectionDAG &DAG) {
14368 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((void)0);
14369 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!")((void)0);
14370 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
14371
14372 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14373
14374 if (NumV2Elements == 0) {
14375 // Check for being able to broadcast a single element.
14376 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14377 Mask, Subtarget, DAG))
14378 return Broadcast;
14379
14380 // Use even/odd duplicate instructions for masks that match their pattern.
14381 if (Subtarget.hasSSE3()) {
14382 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14383 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14384 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14385 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14386 }
14387
14388 if (Subtarget.hasAVX()) {
14389 // If we have AVX, we can use VPERMILPS which will allow folding a load
14390 // into the shuffle.
14391 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14392 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14393 }
14394
14395 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14396 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14397 if (!Subtarget.hasSSE2()) {
14398 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14399 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14400 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14401 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14402 }
14403
14404 // Otherwise, use a straight shuffle of a single input vector. We pass the
14405 // input vector to both operands to simulate this with a SHUFPS.
14406 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14407 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14408 }
14409
14410 if (Subtarget.hasAVX2())
14411 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14412 return Extract;
14413
14414 // There are special ways we can lower some single-element blends. However, we
14415 // have custom ways we can lower more complex single-element blends below that
14416 // we defer to if both this and BLENDPS fail to match, so restrict this to
14417 // when the V2 input is targeting element 0 of the mask -- that is the fast
14418 // case here.
14419 if (NumV2Elements == 1 && Mask[0] >= 4)
14420 if (SDValue V = lowerShuffleAsElementInsertion(
14421 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14422 return V;
14423
14424 if (Subtarget.hasSSE41()) {
14425 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14426 Zeroable, Subtarget, DAG))
14427 return Blend;
14428
14429 // Use INSERTPS if we can complete the shuffle efficiently.
14430 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14431 return V;
14432
14433 if (!isSingleSHUFPSMask(Mask))
14434 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14435 V2, Mask, DAG))
14436 return BlendPerm;
14437 }
14438
14439 // Use low/high mov instructions. These are only valid in SSE1 because
14440 // otherwise they are widened to v2f64 and never get here.
14441 if (!Subtarget.hasSSE2()) {
14442 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14443 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14444 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14445 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14446 }
14447
14448 // Use dedicated unpack instructions for masks that match their pattern.
14449 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14450 return V;
14451
14452 // Otherwise fall back to a SHUFPS lowering strategy.
14453 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14454}
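// For example (illustrative): with SSE3, the unary masks <0,0,2,2> and
// <1,1,3,3> are matched above directly to MOVSLDUP and MOVSHDUP, avoiding a
// SHUFPS immediate entirely.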
14455
14456/// Lower 4-lane i32 vector shuffles.
14457///
14458/// We try to handle these with integer-domain shuffles where we can, but for
14459/// blends we use the floating point domain blend instructions.
14460static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14461 const APInt &Zeroable, SDValue V1, SDValue V2,
14462 const X86Subtarget &Subtarget,
14463 SelectionDAG &DAG) {
14464 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!")((void)0);
14465 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!")((void)0);
14466 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
14467
14468 // Whenever we can lower this as a zext, that instruction is strictly faster
14469 // than any alternative. It also allows us to fold memory operands into the
14470 // shuffle in many cases.
14471 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14472 Zeroable, Subtarget, DAG))
14473 return ZExt;
14474
14475 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14476
14477 if (NumV2Elements == 0) {
14478 // Try to use broadcast unless the mask only has one non-undef element.
14479 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14480 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14481 Mask, Subtarget, DAG))
14482 return Broadcast;
14483 }
14484
14485 // Straight shuffle of a single input vector. For everything from SSE2
14486 // onward this has a single fast instruction with no scary immediates.
14487 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14488 // but we aren't actually going to use the UNPCK instruction because doing
14489 // so prevents folding a load into this instruction or making a copy.
14490 const int UnpackLoMask[] = {0, 0, 1, 1};
14491 const int UnpackHiMask[] = {2, 2, 3, 3};
14492 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14493 Mask = UnpackLoMask;
14494 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14495 Mask = UnpackHiMask;
14496
14497 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14498 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14499 }
14500
14501 if (Subtarget.hasAVX2())
14502 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14503 return Extract;
14504
14505 // Try to use shift instructions.
14506 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14507 Zeroable, Subtarget, DAG))
14508 return Shift;
14509
14510 // There are special ways we can lower some single-element blends.
14511 if (NumV2Elements == 1)
14512 if (SDValue V = lowerShuffleAsElementInsertion(
14513 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14514 return V;
14515
14516 // We have different paths for blend lowering, but they all must use the
14517 // *exact* same predicate.
14518 bool IsBlendSupported = Subtarget.hasSSE41();
14519 if (IsBlendSupported)
14520 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14521 Zeroable, Subtarget, DAG))
14522 return Blend;
14523
14524 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14525 Zeroable, Subtarget, DAG))
14526 return Masked;
14527
14528 // Use dedicated unpack instructions for masks that match their pattern.
14529 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14530 return V;
14531
14532 // Try to use byte rotation instructions.
14533 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14534 if (Subtarget.hasSSSE3()) {
14535 if (Subtarget.hasVLX())
14536 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14537 Subtarget, DAG))
14538 return Rotate;
14539
14540 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14541 Subtarget, DAG))
14542 return Rotate;
14543 }
14544
14545 // Assume that a single SHUFPS is faster than an alternative sequence of
14546 // multiple instructions (even if the CPU has a domain penalty).
14547 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14548 if (!isSingleSHUFPSMask(Mask)) {
14549 // If we have direct support for blends, we should lower by decomposing into
14550 // a permute. That will be faster than the domain cross.
14551 if (IsBlendSupported)
14552 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14553 Subtarget, DAG);
14554
14555 // Try to lower by permuting the inputs into an unpack instruction.
14556 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14557 Mask, Subtarget, DAG))
14558 return Unpack;
14559 }
14560
14561 // We implement this with SHUFPS because it can blend from two vectors.
14562 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14563 // up the inputs, bypassing domain shift penalties that we would incur if we
14564 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14565 // relevant.
14566 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14567 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14568 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14569 return DAG.getBitcast(MVT::v4i32, ShufPS);
14570}
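// For example (illustrative): a mask like <0,Z,1,Z> (Z = known-zero lane) is
// really a zero-extension of the two low i32 elements and is picked up by the
// lowerShuffleAsZeroOrAnyExtend call above (roughly PMOVZXDQ on SSE4.1, or an
// unpack against a zero vector without it).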
14571
14572/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14573/// shuffle lowering, and the most complex part.
14574///
14575/// The lowering strategy is to try to form pairs of input lanes which are
14576/// targeted at the same half of the final vector, and then use a dword shuffle
14577/// to place them onto the right half, and finally unpack the paired lanes into
14578/// their final position.
14579///
14580/// The exact breakdown of how to form these dword pairs and align them on the
14581/// correct sides is really tricky. See the comments within the function for
14582/// more of the details.
14583///
14584/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14585/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14586/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14587/// vector, form the analogous 128-bit 8-element Mask.
14588static SDValue lowerV8I16GeneralSingleInputShuffle(
14589 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14590 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14591 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!")((void)0);
14592 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14593
14594 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!")((void)0);
14595 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14596 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14597
14598 // Attempt to directly match PSHUFLW or PSHUFHW.
14599 if (isUndefOrInRange(LoMask, 0, 4) &&
14600 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14601 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14602 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14603 }
14604 if (isUndefOrInRange(HiMask, 4, 8) &&
14605 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14606 for (int i = 0; i != 4; ++i)
14607 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14608 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14609 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14610 }
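// Worked example (illustrative): Mask = <2,1,0,3,4,5,6,7> leaves the high
// half as the identity, so it matches the first case and becomes
// `pshuflw $0xc6`; the mirror image <0,1,2,3,7,6,5,4> becomes `pshufhw $0x1b`.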
14611
14612 SmallVector<int, 4> LoInputs;
14613 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14614 array_pod_sort(LoInputs.begin(), LoInputs.end());
14615 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
14616 SmallVector<int, 4> HiInputs;
14617 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14618 array_pod_sort(HiInputs.begin(), HiInputs.end());
14619 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
14620 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14621 int NumHToL = LoInputs.size() - NumLToL;
14622 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14623 int NumHToH = HiInputs.size() - NumLToH;
14624 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14625 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14626 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14627 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14628
14629 // If we are shuffling values from one half - check how many different DWORD
14630 // pairs we need to create. If only 1 or 2 then we can perform this as a
14631 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14632 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14633 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14634 V = DAG.getNode(ShufWOp, DL, VT, V,
14635 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14636 V = DAG.getBitcast(PSHUFDVT, V);
14637 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14638 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14639 return DAG.getBitcast(VT, V);
14640 };
14641
14642 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14643 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14644 SmallVector<std::pair<int, int>, 4> DWordPairs;
14645 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14646
14647 // Collect the different DWORD pairs.
14648 for (int DWord = 0; DWord != 4; ++DWord) {
14649 int M0 = Mask[2 * DWord + 0];
14650 int M1 = Mask[2 * DWord + 1];
14651 M0 = (M0 >= 0 ? M0 % 4 : M0);
14652 M1 = (M1 >= 0 ? M1 % 4 : M1);
14653 if (M0 < 0 && M1 < 0)
14654 continue;
14655
14656 bool Match = false;
14657 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14658 auto &DWordPair = DWordPairs[j];
14659 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14660 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14661 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14662 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14663 PSHUFDMask[DWord] = DOffset + j;
14664 Match = true;
14665 break;
14666 }
14667 }
14668 if (!Match) {
14669 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14670 DWordPairs.push_back(std::make_pair(M0, M1));
14671 }
14672 }
14673
14674 if (DWordPairs.size() <= 2) {
14675 DWordPairs.resize(2, std::make_pair(-1, -1));
14676 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14677 DWordPairs[1].first, DWordPairs[1].second};
14678 if ((NumHToL + NumHToH) == 0)
14679 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14680 if ((NumLToL + NumLToH) == 0)
14681 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14682 }
14683 }
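// Worked example (illustrative): Mask = <3,2,1,0,3,2,1,0> draws everything
// from the low half and needs only the two dword pairs (3,2) and (1,0), so it
// is emitted as `pshuflw $0x1b` followed by `pshufd $0x44` instead of the
// longer PSHUFD+PSHUFLW+PSHUFHW chain below.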
14684
14685 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14686 // such inputs we can swap two of the dwords across the half mark and end up
14687 // with <=2 inputs to each half in each half. Once there, we can fall through
14688 // to the generic code below. For example:
14689 //
14690 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14691 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14692 //
14693 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14694 // and an existing 2-into-2 on the other half. In this case we may have to
14695 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14696 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14697 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14698 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14699 // half than the one we target for fixing) will be fixed when we re-enter this
14700 // path. We will also combine away any sequence of PSHUFD instructions that
14701 // result into a single instruction. Here is an example of the tricky case:
14702 //
14703 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14704 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14705 //
14706 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14707 //
14708 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14709 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14710 //
14711 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14712 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14713 //
14714 // The result is fine to be handled by the generic logic.
14715 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14716 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14717 int AOffset, int BOffset) {
14718 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&((void)0)
14719 "Must call this with A having 3 or 1 inputs from the A half.")((void)0);
14720 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&((void)0)
14721 "Must call this with B having 1 or 3 inputs from the B half.")((void)0);
14722 assert(AToAInputs.size() + BToAInputs.size() == 4 &&((void)0)
14723 "Must call this with either 3:1 or 1:3 inputs (summing to 4).")((void)0);
14724
14725 bool ThreeAInputs = AToAInputs.size() == 3;
14726
14727 // Compute the index of dword with only one word among the three inputs in
14728 // a half by taking the sum of the half with three inputs and subtracting
14729 // the sum of the actual three inputs. The difference is the remaining
14730 // slot.
14731 int ADWord = 0, BDWord = 0;
14732 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14733 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14734 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14735 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14736 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14737 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14738 int TripleNonInputIdx =
14739 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14740 TripleDWord = TripleNonInputIdx / 2;
14741
14742 // We use xor with one to compute the adjacent DWord to whichever one the
14743 // OneInput is in.
14744 OneInputDWord = (OneInput / 2) ^ 1;
14745
14746 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14747 // and BToA inputs. If there is also such a problem with the BToB and AToB
14748 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14749 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14750 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14751 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14752 // Compute how many inputs will be flipped by swapping these DWords.
14753 // We need to balance this to ensure we don't form a 3-1 shuffle in
14754 // the other half.
14755
14756 int NumFlippedAToBInputs =
14757 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
14758 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
14759 int NumFlippedBToBInputs =
14760 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
14761 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
14762 if ((NumFlippedAToBInputs == 1 &&
14763 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14764 (NumFlippedBToBInputs == 1 &&
14765 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14766 // We choose whether to fix the A half or B half based on whether that
14767 // half has zero flipped inputs. At zero, we may not be able to fix it
14768 // with that half. We also bias towards fixing the B half because that
14769 // will more commonly be the high half, and we have to bias one way.
14770 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14771 ArrayRef<int> Inputs) {
14772 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14773 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14774 // Determine whether the free index is in the flipped dword or the
14775 // unflipped dword based on where the pinned index is. We use this bit
14776 // in an xor to conditionally select the adjacent dword.
14777 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14778 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14779 if (IsFixIdxInput == IsFixFreeIdxInput)
14780 FixFreeIdx += 1;
14781 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14782 assert(IsFixIdxInput != IsFixFreeIdxInput &&((void)0)
14783 "We need to be changing the number of flipped inputs!")((void)0);
14784 int PSHUFHalfMask[] = {0, 1, 2, 3};
14785 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14786 V = DAG.getNode(
14787 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14788 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14789 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14790
14791 for (int &M : Mask)
14792 if (M >= 0 && M == FixIdx)
14793 M = FixFreeIdx;
14794 else if (M >= 0 && M == FixFreeIdx)
14795 M = FixIdx;
14796 };
14797 if (NumFlippedBToBInputs != 0) {
14798 int BPinnedIdx =
14799 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14800 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14801 } else {
14802 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!")((void)0);
14803 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14804 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14805 }
14806 }
14807 }
14808
14809 int PSHUFDMask[] = {0, 1, 2, 3};
14810 PSHUFDMask[ADWord] = BDWord;
14811 PSHUFDMask[BDWord] = ADWord;
14812 V = DAG.getBitcast(
14813 VT,
14814 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14815 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14816
14817 // Adjust the mask to match the new locations of A and B.
14818 for (int &M : Mask)
14819 if (M >= 0 && M/2 == ADWord)
14820 M = 2 * BDWord + M % 2;
14821 else if (M >= 0 && M/2 == BDWord)
14822 M = 2 * ADWord + M % 2;
14823
14824 // Recurse back into this routine to re-compute state now that this isn't
14825 // a 3 and 1 problem.
14826 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14827 };
14828 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14829 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14830 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14831 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14832
14833 // At this point there are at most two inputs to the low and high halves from
14834 // each half. That means the inputs can always be grouped into dwords and
14835 // those dwords can then be moved to the correct half with a dword shuffle.
14836 // We use at most one low and one high word shuffle to collect these paired
14837 // inputs into dwords, and finally a dword shuffle to place them.
14838 int PSHUFLMask[4] = {-1, -1, -1, -1};
14839 int PSHUFHMask[4] = {-1, -1, -1, -1};
14840 int PSHUFDMask[4] = {-1, -1, -1, -1};
14841
14842 // First fix the masks for all the inputs that are staying in their
14843 // original halves. This will then dictate the targets of the cross-half
14844 // shuffles.
14845 auto fixInPlaceInputs =
14846 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14847 MutableArrayRef<int> SourceHalfMask,
14848 MutableArrayRef<int> HalfMask, int HalfOffset) {
14849 if (InPlaceInputs.empty())
14850 return;
14851 if (InPlaceInputs.size() == 1) {
14852 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14853 InPlaceInputs[0] - HalfOffset;
14854 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14855 return;
14856 }
14857 if (IncomingInputs.empty()) {
14858 // Just fix all of the in place inputs.
14859 for (int Input : InPlaceInputs) {
14860 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14861 PSHUFDMask[Input / 2] = Input / 2;
14862 }
14863 return;
14864 }
14865
14866 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!")((void)0);
14867 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14868 InPlaceInputs[0] - HalfOffset;
14869 // Put the second input next to the first so that they are packed into
14870 // a dword. We find the adjacent index by toggling the low bit.
14871 int AdjIndex = InPlaceInputs[0] ^ 1;
14872 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14873 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
14874 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14875 };
14876 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14877 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14878
14879 // Now gather the cross-half inputs and place them into a free dword of
14880 // their target half.
14881 // FIXME: This operation could almost certainly be simplified dramatically to
14882 // look more like the 3-1 fixing operation.
14883 auto moveInputsToRightHalf = [&PSHUFDMask](
14884 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14885 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14886 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14887 int DestOffset) {
14888 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14889 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14890 };
14891 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14892 int Word) {
14893 int LowWord = Word & ~1;
14894 int HighWord = Word | 1;
14895 return isWordClobbered(SourceHalfMask, LowWord) ||
14896 isWordClobbered(SourceHalfMask, HighWord);
14897 };
14898
14899 if (IncomingInputs.empty())
14900 return;
14901
14902 if (ExistingInputs.empty()) {
14903 // Map any dwords with inputs from them into the right half.
14904 for (int Input : IncomingInputs) {
14905 // If the source half mask maps over the inputs, turn those into
14906 // swaps and use the swapped lane.
14907 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14908 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14909 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14910 Input - SourceOffset;
14911 // We have to swap the uses in our half mask in one sweep.
14912 for (int &M : HalfMask)
14913 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14914 M = Input;
14915 else if (M == Input)
14916 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14917 } else {
14918 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==((void)0)
14919 Input - SourceOffset &&((void)0)
14920 "Previous placement doesn't match!")((void)0);
14921 }
14922 // Note that this correctly re-maps both when we do a swap and when
14923 // we observe the other side of the swap above. We rely on that to
14924 // avoid swapping the members of the input list directly.
14925 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14926 }
14927
14928 // Map the input's dword into the correct half.
14929 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14930 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14931 else
14932 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==((void)0)
14933 Input / 2 &&((void)0)
14934 "Previous placement doesn't match!")((void)0);
14935 }
14936
14937 // And just directly shift any other-half mask elements to be same-half
14938 // as we will have mirrored the dword containing the element into the
14939 // same position within that half.
14940 for (int &M : HalfMask)
14941 if (M >= SourceOffset && M < SourceOffset + 4) {
14942 M = M - SourceOffset + DestOffset;
14943 assert(M >= 0 && "This should never wrap below zero!")((void)0);
14944 }
14945 return;
14946 }
14947
14948 // Ensure we have the input in a viable dword of its current half. This
14949 // is particularly tricky because the original position may be clobbered
14950 // by inputs being moved and *staying* in that half.
14951 if (IncomingInputs.size() == 1) {
14952 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14953 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14954 SourceOffset;
14955 SourceHalfMask[InputFixed - SourceOffset] =
14956 IncomingInputs[0] - SourceOffset;
14957 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
14958 InputFixed);
14959 IncomingInputs[0] = InputFixed;
14960 }
14961 } else if (IncomingInputs.size() == 2) {
14962 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14963 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14964 // We have two non-adjacent or clobbered inputs we need to extract from
14965 // the source half. To do this, we need to map them into some adjacent
14966 // dword slot in the source mask.
14967 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14968 IncomingInputs[1] - SourceOffset};
14969
14970 // If there is a free slot in the source half mask adjacent to one of
14971 // the inputs, place the other input in it. We use (Index XOR 1) to
14972 // compute an adjacent index.
14973 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14974 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14975 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14976 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14977 InputsFixed[1] = InputsFixed[0] ^ 1;
14978 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14979 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14980 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14981 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14982 InputsFixed[0] = InputsFixed[1] ^ 1;
14983 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14984 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14985 // The two inputs are in the same DWord but it is clobbered and the
14986 // adjacent DWord isn't used at all. Move both inputs to the free
14987 // slot.
14988 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14989 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14990 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14991 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14992 } else {
14993 // The only way we hit this point is if there is no clobbering
14994 // (because there are no off-half inputs to this half) and there is no
14995 // free slot adjacent to one of the inputs. In this case, we have to
14996 // swap an input with a non-input.
14997 for (int i = 0; i < 4; ++i)
14998 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&((void)0)
14999 "We can't handle any clobbers here!")((void)0);
15000 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&((void)0)
15001 "Cannot have adjacent inputs here!")((void)0);
15002
15003 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15004 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15005
15006 // We also have to update the final source mask in this case because
15007 // it may need to undo the above swap.
15008 for (int &M : FinalSourceHalfMask)
15009 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15010 M = InputsFixed[1] + SourceOffset;
15011 else if (M == InputsFixed[1] + SourceOffset)
15012 M = (InputsFixed[0] ^ 1) + SourceOffset;
15013
15014 InputsFixed[1] = InputsFixed[0] ^ 1;
15015 }
15016
15017 // Point everything at the fixed inputs.
15018 for (int &M : HalfMask)
15019 if (M == IncomingInputs[0])
15020 M = InputsFixed[0] + SourceOffset;
15021 else if (M == IncomingInputs[1])
15022 M = InputsFixed[1] + SourceOffset;
15023
15024 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15025 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15026 }
15027 } else {
15028 llvm_unreachable("Unhandled input size!")__builtin_unreachable();
15029 }
15030
15031 // Now hoist the DWord down to the right half.
15032 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15033 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free")((void)0);
15034 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15035 for (int &M : HalfMask)
15036 for (int Input : IncomingInputs)
15037 if (M == Input)
15038 M = FreeDWord * 2 + Input % 2;
15039 };
15040 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15041 /*SourceOffset*/ 4, /*DestOffset*/ 0);
15042 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15043 /*SourceOffset*/ 0, /*DestOffset*/ 4);
15044
15045 // Now enact all the shuffles we've computed to move the inputs into their
15046 // target half.
15047 if (!isNoopShuffleMask(PSHUFLMask))
15048 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15049 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15050 if (!isNoopShuffleMask(PSHUFHMask))
15051 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15052 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15053 if (!isNoopShuffleMask(PSHUFDMask))
15054 V = DAG.getBitcast(
15055 VT,
15056 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15057 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15058
15059 // At this point, each half should contain all its inputs, and we can then
15060 // just shuffle them into their final position.
15061 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&((void)0)
15062 "Failed to lift all the high half inputs to the low mask!")((void)0);
15063 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&((void)0)
15064 "Failed to lift all the low half inputs to the high mask!")((void)0);
15065
15066 // Do a half shuffle for the low mask.
15067 if (!isNoopShuffleMask(LoMask))
15068 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15069 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15070
15071 // Do a half shuffle with the high mask after shifting its values down.
15072 for (int &M : HiMask)
15073 if (M >= 0)
15074 M -= 4;
15075 if (!isNoopShuffleMask(HiMask))
15076 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15077 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15078
15079 return V;
15080}
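As a reading aid for the final stage of the routine above: once every input has been moved into its own half, the word mask is split into a PSHUFLW part and a PSHUFHW part, and the high part is rebased by 4 before the immediates are formed. A minimal standalone sketch of just that step (plain C++, not LLVM code; the example mask is hypothetical):

#include <array>
#include <cassert>
#include <cstdio>

int main() {
  // Hypothetical 8 x i16 mask after the half-fixing above; -1 means undef.
  std::array<int, 8> Mask = {2, 0, 3, 1, 6, 6, -1, 4};

  std::array<int, 4> LoMask, HiMask;
  for (int i = 0; i < 4; ++i) {
    LoMask[i] = Mask[i];
    HiMask[i] = Mask[i + 4];
  }

  // Invariant the code above establishes: no cross-half references remain.
  for (int M : LoMask)
    assert(M < 4 && "low half must only read the low half");
  for (int M : HiMask)
    assert((M < 0 || M >= 4) && "high half must only read the high half");

  // PSHUFHW indexes within the high half, so shift its values down by 4.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;

  for (int M : LoMask)
    std::printf("%d ", M); // PSHUFLW mask: 2 0 3 1
  std::printf("| ");
  for (int M : HiMask)
    std::printf("%d ", M); // PSHUFHW mask: 2 2 -1 0
  std::printf("\n");
}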
15081
15082/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15083/// blend if only one input is used.
15084static SDValue lowerShuffleAsBlendOfPSHUFBs(
15085 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15086 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15087 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&((void)0)
15088 "Lane crossing shuffle masks not supported")((void)0);
15089
15090 int NumBytes = VT.getSizeInBits() / 8;
15091 int Size = Mask.size();
15092 int Scale = NumBytes / Size;
15093
15094 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15095 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15096 V1InUse = false;
15097 V2InUse = false;
15098
15099 for (int i = 0; i < NumBytes; ++i) {
15100 int M = Mask[i / Scale];
15101 if (M < 0)
15102 continue;
15103
15104 const int ZeroMask = 0x80;
15105 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15106 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15107 if (Zeroable[i / Scale])
15108 V1Idx = V2Idx = ZeroMask;
15109
15110 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15111 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15112 V1InUse |= (ZeroMask != V1Idx);
15113 V2InUse |= (ZeroMask != V2Idx);
15114 }
15115
15116 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15117 if (V1InUse)
15118 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15119 DAG.getBuildVector(ShufVT, DL, V1Mask));
15120 if (V2InUse)
15121 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15122 DAG.getBuildVector(ShufVT, DL, V2Mask));
15123
15124 // If we need shuffled inputs from both, blend the two.
15125 SDValue V;
15126 if (V1InUse && V2InUse)
15127 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15128 else
15129 V = V1InUse ? V1 : V2;
15130
15131 // Cast the result back to the correct type.
15132 return DAG.getBitcast(VT, V);
15133}
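To make the control-byte computation above concrete, here is a minimal standalone sketch (plain C++, not LLVM code) that derives V1Idx/V2Idx for a hypothetical v8i16-style mask scaled to 16 bytes. The Zeroable handling is omitted, and -1 stands in for an undef control byte:

#include <cstdio>
#include <vector>

int main() {
  const int Size = 8, NumBytes = 16, Scale = NumBytes / Size;
  const int ZeroMask = 0x80; // PSHUFB zeroes a byte when bit 7 of its control is set
  // Hypothetical element mask: values >= Size select from V2, -1 is undef.
  std::vector<int> Mask = {0, 9, 2, 11, -1, 13, 6, 15};

  std::vector<int> V1Mask(NumBytes, -1), V2Mask(NumBytes, -1);
  bool V1InUse = false, V2InUse = false;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Scale];
    if (M < 0)
      continue; // undef element: leave both control bytes undef
    int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
    int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
    V1Mask[i] = V1Idx;
    V2Mask[i] = V2Idx;
    V1InUse |= (V1Idx != ZeroMask);
    V2InUse |= (V2Idx != ZeroMask);
  }

  // Both inputs contribute here, so the real lowering would OR the two
  // PSHUFB results together.
  std::printf("V1InUse=%d V2InUse=%d\n", V1InUse, V2InUse);
  for (int i = 0; i < NumBytes; ++i)
    std::printf("byte %2d: V1=%3d V2=%3d\n", i, V1Mask[i], V2Mask[i]);
}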
15134
15135/// Generic lowering of 8-lane i16 shuffles.
15136///
15137/// This handles both single-input shuffles and combined shuffle/blends with
15138/// two inputs. The single input shuffles are immediately delegated to
15139/// a dedicated lowering routine.
15140///
15141/// The blends are lowered in one of three fundamental ways. If there are few
15142/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15143/// of the input is significantly cheaper when lowered as an interleaving of
15144/// the two inputs, try to interleave them. Otherwise, blend the low and high
15145/// halves of the inputs separately (making them have relatively few inputs)
15146/// and then concatenate them.
15147static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15148 const APInt &Zeroable, SDValue V1, SDValue V2,
15149 const X86Subtarget &Subtarget,
15150 SelectionDAG &DAG) {
15151 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")((void)0);
15152 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!")((void)0);
15153 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
15154
15155 // Whenever we can lower this as a zext, that instruction is strictly faster
15156 // than any alternative.
15157 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15158 Zeroable, Subtarget, DAG))
15159 return ZExt;
15160
15161 // Try to lower using a truncation.
15162 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15163 Subtarget, DAG))
15164 return V;
15165
15166 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15167
15168 if (NumV2Inputs == 0) {
15169 // Try to use shift instructions.
15170 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15171 Zeroable, Subtarget, DAG))
15172 return Shift;
15173
15174 // Check for being able to broadcast a single element.
15175 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15176 Mask, Subtarget, DAG))
15177 return Broadcast;
15178
15179 // Try to use bit rotation instructions.
15180 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15181 Subtarget, DAG))
15182 return Rotate;
15183
15184 // Use dedicated unpack instructions for masks that match their pattern.
15185 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15186 return V;
15187
15188 // Use dedicated pack instructions for masks that match their pattern.
15189 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15190 Subtarget))
15191 return V;
15192
15193 // Try to use byte rotation instructions.
15194 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15195 Subtarget, DAG))
15196 return Rotate;
15197
15198 // Make a copy of the mask so it can be modified.
15199 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15200 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15201 Subtarget, DAG);
15202 }
15203
15204 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&((void)0)
15205 "All single-input shuffles should be canonicalized to be V1-input "((void)0)
15206 "shuffles.")((void)0);
15207
15208 // Try to use shift instructions.
15209 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15210 Zeroable, Subtarget, DAG))
15211 return Shift;
15212
15213 // See if we can use SSE4A Extraction / Insertion.
15214 if (Subtarget.hasSSE4A())
15215 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15216 Zeroable, DAG))
15217 return V;
15218
15219 // There are special ways we can lower some single-element blends.
15220 if (NumV2Inputs == 1)
15221 if (SDValue V = lowerShuffleAsElementInsertion(
15222 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15223 return V;
15224
15225 // We have different paths for blend lowering, but they all must use the
15226 // *exact* same predicate.
15227 bool IsBlendSupported = Subtarget.hasSSE41();
15228 if (IsBlendSupported)
15229 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15230 Zeroable, Subtarget, DAG))
15231 return Blend;
15232
15233 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15234 Zeroable, Subtarget, DAG))
15235 return Masked;
15236
15237 // Use dedicated unpack instructions for masks that match their pattern.
15238 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15239 return V;
15240
15241 // Use dedicated pack instructions for masks that match their pattern.
15242 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15243 Subtarget))
15244 return V;
15245
15246 // Try to lower using a truncation.
15247 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15248 Subtarget, DAG))
15249 return V;
15250
15251 // Try to use byte rotation instructions.
15252 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15253 Subtarget, DAG))
15254 return Rotate;
15255
15256 if (SDValue BitBlend =
15257 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15258 return BitBlend;
15259
15260 // Try to use byte shift instructions to mask.
15261 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15262 Zeroable, Subtarget, DAG))
15263 return V;
15264
15265 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15266 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15267 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15268 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15269 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15270 !Subtarget.hasVLX()) {
15271 SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15272 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15273 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15274 SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15275 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15276 DWordClearMask);
15277 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15278 DWordClearMask);
15279 // Now pack things back together.
15280 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15281 if (NumEvenDrops == 2) {
15282 Result = DAG.getBitcast(MVT::v4i32, Result);
15283 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15284 }
15285 return Result;
15286 }
15287
15288 // Try to lower by permuting the inputs into an unpack instruction.
15289 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15290 Mask, Subtarget, DAG))
15291 return Unpack;
15292
15293 // If we can't directly blend but can use PSHUFB, that will be better as it
15294 // can both shuffle and set up the inefficient blend.
15295 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15296 bool V1InUse, V2InUse;
15297 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15298 Zeroable, DAG, V1InUse, V2InUse);
15299 }
15300
15301 // We can always bit-blend if we have to so the fallback strategy is to
15302 // decompose into single-input permutes and blends/unpacks.
15303 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15304 Mask, Subtarget, DAG);
15305}
15306
15307 // Lowers unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
15308 // sub-512-bit shuffles are padded to 512-bits for the shuffle and then
15309// the active subvector is extracted.
15310static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15311 ArrayRef<int> Mask, SDValue V1, SDValue V2,
15312 const X86Subtarget &Subtarget,
15313 SelectionDAG &DAG) {
15314 MVT MaskVT = VT.changeTypeToInteger();
15315 SDValue MaskNode;
15316 MVT ShuffleVT = VT;
15317 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15318 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15319 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15320 ShuffleVT = V1.getSimpleValueType();
15321
15322 // Adjust mask to correct indices for the second input.
15323 int NumElts = VT.getVectorNumElements();
15324 unsigned Scale = 512 / VT.getSizeInBits();
15325 SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15326 for (int &M : AdjustedMask)
15327 if (NumElts <= M)
15328 M += (Scale - 1) * NumElts;
15329 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15330 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15331 } else {
15332 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15333 }
15334
15335 SDValue Result;
15336 if (V2.isUndef())
15337 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15338 else
15339 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15340
15341 if (VT != ShuffleVT)
15342 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15343
15344 return Result;
15345}
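The mask adjustment above is easy to misread, so here is a minimal standalone sketch (plain C++, not LLVM code) of the index fix-up when a sub-512-bit VPERMV3 is padded to 512 bits. It assumes a hypothetical 256-bit shuffle with 8 elements, so Scale = 512 / 256 = 2:

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8;    // elements in the original VT
  const unsigned Scale = 2; // 512 / VT.getSizeInBits()
  // Hypothetical two-input mask: values >= NumElts come from V2.
  std::vector<int> Mask = {0, 8, 1, 9, 2, 10, 3, 11};

  // After widening, V2's live elements start at Scale * NumElts in the
  // concatenated index space, so references into V2 are shifted up.
  for (int &M : Mask)
    if (NumElts <= M)
      M += (Scale - 1) * NumElts;

  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 16 1 17 2 18 3 19
  std::printf("\n");
}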
15346
15347/// Generic lowering of v16i8 shuffles.
15348///
15349/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15350/// detect any complexity reducing interleaving. If that doesn't help, it uses
15351/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15352/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15353/// back together.
15354static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15355 const APInt &Zeroable, SDValue V1, SDValue V2,
15356 const X86Subtarget &Subtarget,
15357 SelectionDAG &DAG) {
15358 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!")((void)0);
15359 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!")((void)0);
15360 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((void)0);
15361
15362 // Try to use shift instructions.
15363 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15364 Zeroable, Subtarget, DAG))
15365 return Shift;
15366
15367 // Try to use byte rotation instructions.
15368 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15369 Subtarget, DAG))
15370 return Rotate;
15371
15372 // Use dedicated pack instructions for masks that match their pattern.
15373 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15374 Subtarget))
15375 return V;
15376
15377 // Try to use a zext lowering.
15378 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15379 Zeroable, Subtarget, DAG))
15380 return ZExt;
15381
15382 // Try to lower using a truncation.
15383 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15384 Subtarget, DAG))
15385 return V;
15386
15387 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15388 Subtarget, DAG))
15389 return V;
15390
15391 // See if we can use SSE4A Extraction / Insertion.
15392 if (Subtarget.hasSSE4A())
15393 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15394 Zeroable, DAG))
15395 return V;
15396
15397 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15398
15399 // For single-input shuffles, there are some nicer lowering tricks we can use.
15400 if (NumV2Elements == 0) {
15401 // Check for being able to broadcast a single element.
15402 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15403 Mask, Subtarget, DAG))
15404 return Broadcast;
15405
15406 // Try to use bit rotation instructions.
15407 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15408 Subtarget, DAG))
15409 return Rotate;
15410
15411 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15412 return V;
15413
15414 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15415 // Notably, this handles splat and partial-splat shuffles more efficiently.
15416 // However, it only makes sense if the pre-duplication shuffle simplifies
15417 // things significantly. Currently, this means we need to be able to
15418 // express the pre-duplication shuffle as an i16 shuffle.
15419 //
15420 // FIXME: We should check for other patterns which can be widened into an
15421 // i16 shuffle as well.
15422 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15423 for (int i = 0; i < 16; i += 2)
15424 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15425 return false;
15426
15427 return true;
15428 };
15429 auto tryToWidenViaDuplication = [&]() -> SDValue {
15430 if (!canWidenViaDuplication(Mask))
15431 return SDValue();
15432 SmallVector<int, 4> LoInputs;
15433 copy_if(Mask, std::back_inserter(LoInputs),
15434 [](int M) { return M >= 0 && M < 8; });
15435 array_pod_sort(LoInputs.begin(), LoInputs.end());
15436 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
15437 LoInputs.end());
15438 SmallVector<int, 4> HiInputs;
15439 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15440 array_pod_sort(HiInputs.begin(), HiInputs.end());
15441 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
15442 HiInputs.end());
15443
15444 bool TargetLo = LoInputs.size() >= HiInputs.size();
15445 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15446 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15447
15448 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15449 SmallDenseMap<int, int, 8> LaneMap;
15450 for (int I : InPlaceInputs) {
15451 PreDupI16Shuffle[I/2] = I/2;
15452 LaneMap[I] = I;
15453 }
15454 int j = TargetLo ? 0 : 4, je = j + 4;
15455 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15456 // Check if j is already a shuffle of this input. This happens when
15457 // there are two adjacent bytes after we move the low one.
15458 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15459 // If we haven't yet mapped the input, search for a slot into which
15460 // we can map it.
15461 while (j < je && PreDupI16Shuffle[j] >= 0)
15462 ++j;
15463
15464 if (j == je)
15465 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15466 return SDValue();
15467
15468 // Map this input with the i16 shuffle.
15469 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15470 }
15471
15472 // Update the lane map based on the mapping we ended up with.
15473 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15474 }
15475 V1 = DAG.getBitcast(
15476 MVT::v16i8,
15477 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15478 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15479
15480 // Unpack the bytes to form the i16s that will be shuffled into place.
15481 bool EvenInUse = false, OddInUse = false;
15482 for (int i = 0; i < 16; i += 2) {
15483 EvenInUse |= (Mask[i + 0] >= 0);
15484 OddInUse |= (Mask[i + 1] >= 0);
15485 if (EvenInUse && OddInUse)
15486 break;
15487 }
15488 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15489 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15490 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15491
15492 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15493 for (int i = 0; i < 16; ++i)
15494 if (Mask[i] >= 0) {
15495 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15496 assert(MappedMask < 8 && "Invalid v8 shuffle mask!")((void)0);
15497 if (PostDupI16Shuffle[i / 2] < 0)
15498 PostDupI16Shuffle[i / 2] = MappedMask;
15499 else
15500 assert(PostDupI16Shuffle[i / 2] == MappedMask &&((void)0)
15501 "Conflicting entries in the original shuffle!")((void)0);
15502 }
15503 return DAG.getBitcast(
15504 MVT::v16i8,
15505 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15506 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15507 };
15508 if (SDValue V = tryToWidenViaDuplication())
15509 return V;
15510 }
15511
15512 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15513 Zeroable, Subtarget, DAG))
15514 return Masked;
15515
15516 // Use dedicated unpack instructions for masks that match their pattern.
15517 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15518 return V;
15519
15520 // Try to use byte shift instructions to mask.
15521 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15522 Zeroable, Subtarget, DAG))
15523 return V;
15524
15525 // Check for compaction patterns.
15526 bool IsSingleInput = V2.isUndef();
15527 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
15528
15529 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15530 // with PSHUFB. It is important to do this before we attempt to generate any
15531 // blends but after all of the single-input lowerings. If the single input
15532 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15533 // want to preserve that and we can DAG combine any longer sequences into
15534 // a PSHUFB in the end. But once we start blending from multiple inputs,
15535 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15536 // and there are *very* few patterns that would actually be faster than the
15537 // PSHUFB approach because of its ability to zero lanes.
15538 //
15539 // If the mask is a binary compaction, we can more efficiently perform this
15540 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15541 //
15542 // FIXME: The only exceptions to the above are blends which are exact
15543 // interleavings with direct instructions supporting them. We currently don't
15544 // handle those well here.
15545 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15546 bool V1InUse = false;
15547 bool V2InUse = false;
15548
15549 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15550 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15551
15552 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15553 // do so. This avoids using them to handle blends-with-zero which is
15554 // important as a single pshufb is significantly faster for that.
15555 if (V1InUse && V2InUse) {
15556 if (Subtarget.hasSSE41())
15557 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15558 Zeroable, Subtarget, DAG))
15559 return Blend;
15560
15561 // We can use an unpack to do the blending rather than an or in some
15562 // cases. Even though the or may be (very minorly) more efficient, we
15563 // preference this lowering because there are common cases where part of
15564 // the complexity of the shuffles goes away when we do the final blend as
15565 // an unpack.
15566 // FIXME: It might be worth trying to detect if the unpack-feeding
15567 // shuffles will both be pshufb, in which case we shouldn't bother with
15568 // this.
15569 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15570 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15571 return Unpack;
15572
15573 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15574 if (Subtarget.hasVBMI())
15575 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15576 DAG);
15577
15578 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15579 if (Subtarget.hasXOP()) {
15580 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15581 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15582 }
15583
15584 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15585 // PALIGNR will be cheaper than the second PSHUFB+OR.
15586 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15587 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15588 return V;
15589 }
15590
15591 return PSHUFB;
15592 }
15593
15594 // There are special ways we can lower some single-element blends.
15595 if (NumV2Elements == 1)
15596 if (SDValue V = lowerShuffleAsElementInsertion(
15597 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15598 return V;
15599
15600 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15601 return Blend;
15602
15603 // Check whether a compaction lowering can be done. This handles shuffles
15604 // which take every Nth element for some even N. See the helper function for
15605 // details.
15606 //
15607 // We special case these as they can be particularly efficiently handled with
15608 // the PACKUSWB instruction on x86 and they show up in common patterns of
15609 // rearranging bytes to truncate wide elements.
15610 if (NumEvenDrops) {
15611 // NumEvenDrops is the power of two stride of the elements. Another way of
15612 // thinking about it is that we need to drop the even elements this many
15613 // times to get the original input.
15614
15615 // First we need to zero all the dropped bytes.
15616 assert(NumEvenDrops <= 3 &&((void)0)
15617 "No support for dropping even elements more than 3 times.")((void)0);
15618 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15619 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15620 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15621 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15622 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15623 WordClearMask);
15624 if (!IsSingleInput)
15625 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15626 WordClearMask);
15627
15628 // Now pack things back together.
15629 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15630 IsSingleInput ? V1 : V2);
15631 for (int i = 1; i < NumEvenDrops; ++i) {
15632 Result = DAG.getBitcast(MVT::v8i16, Result);
15633 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15634 }
15635 return Result;
15636 }
15637
15638 // Handle multi-input cases by blending/unpacking single-input shuffles.
15639 if (NumV2Elements > 0)
15640 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15641 Subtarget, DAG);
15642
15643 // The fallback path for single-input shuffles widens this into two v8i16
15644 // vectors with unpacks, shuffles those, and then pulls them back together
15645 // with a pack.
15646 SDValue V = V1;
15647
15648 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15649 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15650 for (int i = 0; i < 16; ++i)
15651 if (Mask[i] >= 0)
15652 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15653
15654 SDValue VLoHalf, VHiHalf;
15655 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15656 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15657 // i16s.
15658 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15659 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15660 // Use a mask to drop the high bytes.
15661 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15662 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15663 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15664
15665 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15666 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15667
15668 // Squash the masks to point directly into VLoHalf.
15669 for (int &M : LoBlendMask)
15670 if (M >= 0)
15671 M /= 2;
15672 for (int &M : HiBlendMask)
15673 if (M >= 0)
15674 M /= 2;
15675 } else {
15676 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15677 // VHiHalf so that we can blend them as i16s.
15678 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15679
15680 VLoHalf = DAG.getBitcast(
15681 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15682 VHiHalf = DAG.getBitcast(
15683 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15684 }
15685
15686 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15687 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15688
15689 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15690}
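A scalar model of the NumEvenDrops compaction used above may help: for NumEvenDrops == 1 the lowering clears the high byte of every 16-bit word and then relies on a PACKUSWB-style pack to keep one byte per word. This is a minimal standalone sketch (plain C++, not LLVM code) of that idea on made-up data, modeling only one operand and ignoring saturation cases that the cleared input cannot hit:

#include <cstdint>
#include <cstdio>

// Scalar model of PACKUSWB on a single word: saturate a 16-bit value to an
// unsigned byte. With the high bytes already cleared, this simply selects
// the even-indexed bytes of the original v16i8 value.
static uint8_t packus_word(uint16_t W) {
  return W > 0xFF ? 0xFF : static_cast<uint8_t>(W);
}

int main() {
  // Hypothetical v8i16 view of a v16i8 input; the even bytes (low byte of
  // each word) are the ones the shuffle wants to keep.
  uint16_t Words[8] = {0x1100, 0x3322, 0x5544, 0x7766,
                       0x9988, 0xBBAA, 0xDDCC, 0xFFEE};

  for (uint16_t &W : Words)
    W &= 0x00FF; // the WordClearMask step above

  for (uint16_t W : Words)
    std::printf("%02x ", packus_word(W)); // prints: 00 22 44 66 88 aa cc ee
  std::printf("\n");
}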
15691
15692/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15693///
15694/// This routine breaks down the specific type of 128-bit shuffle and
15695/// dispatches to the lowering routines accordingly.
15696static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15697 MVT VT, SDValue V1, SDValue V2,
15698 const APInt &Zeroable,
15699 const X86Subtarget &Subtarget,
15700 SelectionDAG &DAG) {
15701 switch (VT.SimpleTy) {
15702 case MVT::v2i64:
15703 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15704 case MVT::v2f64:
15705 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15706 case MVT::v4i32:
15707 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15708 case MVT::v4f32:
15709 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15710 case MVT::v8i16:
15711 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15712 case MVT::v16i8:
15713 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15714
15715 default:
15716 llvm_unreachable("Unimplemented!")__builtin_unreachable();
15717 }
15718}
15719
15720/// Generic routine to split vector shuffle into half-sized shuffles.
15721///
15722/// This routine just extracts two subvectors, shuffles them independently, and
15723/// then concatenates them back together. This should work effectively with all
15724/// AVX vector shuffle types.
15725static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15726 SDValue V2, ArrayRef<int> Mask,
15727 SelectionDAG &DAG) {
15728 assert(VT.getSizeInBits() >= 256 &&((void)0)
15729 "Only for 256-bit or wider vector shuffles!")((void)0);
15730 assert(V1.getSimpleValueType() == VT && "Bad operand type!")((void)0);
15731 assert(V2.getSimpleValueType() == VT && "Bad operand type!")((void)0);
15732
15733 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15734 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15735
15736 int NumElements = VT.getVectorNumElements();
15737 int SplitNumElements = NumElements / 2;
15738 MVT ScalarVT = VT.getVectorElementType();
15739 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15740
15741 // Use splitVector/extractSubVector so that split build-vectors just build two
15742 // narrower build vectors. This helps shuffling with splats and zeros.
15743 auto SplitVector = [&](SDValue V) {
15744 SDValue LoV, HiV;
15745 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15746 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15747 DAG.getBitcast(SplitVT, HiV));
15748 };
15749
15750 SDValue LoV1, HiV1, LoV2, HiV2;
15751 std::tie(LoV1, HiV1) = SplitVector(V1);
15752 std::tie(LoV2, HiV2) = SplitVector(V2);
15753
15754 // Now create two 4-way blends of these half-width vectors.
15755 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15756 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
15757 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15758 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15759 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15760 for (int i = 0; i < SplitNumElements; ++i) {
15761 int M = HalfMask[i];
15762 if (M >= NumElements) {
15763 if (M >= NumElements + SplitNumElements)
15764 UseHiV2 = true;
15765 else
15766 UseLoV2 = true;
15767 V2BlendMask[i] = M - NumElements;
15768 BlendMask[i] = SplitNumElements + i;
15769 } else if (M >= 0) {
15770 if (M >= SplitNumElements)
15771 UseHiV1 = true;
15772 else
15773 UseLoV1 = true;
15774 V1BlendMask[i] = M;
15775 BlendMask[i] = i;
15776 }
15777 }
15778
15779 // Because the lowering happens after all combining takes place, we need to
15780 // manually combine these blend masks as much as possible so that we create
15781 // a minimal number of high-level vector shuffle nodes.
15782
15783 // First try just blending the halves of V1 or V2.
15784 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15785 return DAG.getUNDEF(SplitVT);
15786 if (!UseLoV2 && !UseHiV2)
15787 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15788 if (!UseLoV1 && !UseHiV1)
15789 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15790
15791 SDValue V1Blend, V2Blend;
15792 if (UseLoV1 && UseHiV1) {
15793 V1Blend =
15794 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15795 } else {
15796 // We only use half of V1 so map the usage down into the final blend mask.
15797 V1Blend = UseLoV1 ? LoV1 : HiV1;
15798 for (int i = 0; i < SplitNumElements; ++i)
15799 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15800 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15801 }
15802 if (UseLoV2 && UseHiV2) {
15803 V2Blend =
15804 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15805 } else {
15806 // We only use half of V2 so map the usage down into the final blend mask.
15807 V2Blend = UseLoV2 ? LoV2 : HiV2;
15808 for (int i = 0; i < SplitNumElements; ++i)
15809 if (BlendMask[i] >= SplitNumElements)
15810 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15811 }
15812 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15813 };
15814 SDValue Lo = HalfBlend(LoMask);
15815 SDValue Hi = HalfBlend(HiMask);
15816 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15817}
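To illustrate the per-half decomposition above, the following minimal standalone sketch (plain C++, not LLVM code) builds the two per-input blend masks and the final blend mask for one hypothetical half mask, with NumElements = 8 and SplitNumElements = 4:

#include <cstdio>
#include <vector>

int main() {
  const int NumElements = 8, SplitNumElements = NumElements / 2;
  // Hypothetical half mask: values >= NumElements come from V2.
  std::vector<int> HalfMask = {1, 9, 6, 12};

  std::vector<int> V1BlendMask(SplitNumElements, -1);
  std::vector<int> V2BlendMask(SplitNumElements, -1);
  std::vector<int> BlendMask(SplitNumElements, -1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {
      V2BlendMask[i] = M - NumElements;    // index into the (LoV2, HiV2) pair
      BlendMask[i] = SplitNumElements + i; // take lane i of the V2 blend
    } else if (M >= 0) {
      V1BlendMask[i] = M;                  // index into the (LoV1, HiV1) pair
      BlendMask[i] = i;                    // take lane i of the V1 blend
    }
  }

  auto dump = [](const char *Name, const std::vector<int> &Mask) {
    std::printf("%s:", Name);
    for (int M : Mask)
      std::printf(" %d", M);
    std::printf("\n");
  };
  dump("V1BlendMask", V1BlendMask); // 1 -1 6 -1
  dump("V2BlendMask", V2BlendMask); // -1 1 -1 4
  dump("BlendMask", BlendMask);     // 0 5 2 7
}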
15818
15819/// Either split a vector in halves or decompose the shuffles and the
15820/// blend/unpack.
15821///
15822/// This is provided as a good fallback for many lowerings of non-single-input
15823/// shuffles with more than one 128-bit lane. In those cases, we want to select
15824/// between splitting the shuffle into 128-bit components and stitching those
15825/// back together vs. extracting the single-input shuffles and blending those
15826/// results.
15827static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15828 SDValue V2, ArrayRef<int> Mask,
15829 const X86Subtarget &Subtarget,
15830 SelectionDAG &DAG) {
15831 assert(!V2.isUndef() && "This routine must not be used to lower single-input "((void)0)
15832 "shuffles as it could then recurse on itself.")((void)0);
15833 int Size = Mask.size();
15834
15835 // If this can be modeled as a broadcast of two elements followed by a blend,
15836 // prefer that lowering. This is especially important because broadcasts can
15837 // often fold with memory operands.
15838 auto DoBothBroadcast = [&] {
15839 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15840 for (int M : Mask)
15841 if (M >= Size) {
15842 if (V2BroadcastIdx < 0)
15843 V2BroadcastIdx = M - Size;
15844 else if (M - Size != V2BroadcastIdx)
15845 return false;
15846 } else if (M >= 0) {
15847 if (V1BroadcastIdx < 0)
15848 V1BroadcastIdx = M;
15849 else if (M != V1BroadcastIdx)
15850 return false;
15851 }
15852 return true;
15853 };
15854 if (DoBothBroadcast())
15855 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15856 DAG);
15857
15858 // If the inputs all stem from a single 128-bit lane of each input, then we
15859 // split them rather than blending because the split will decompose to
15860 // unusually few instructions.
15861 int LaneCount = VT.getSizeInBits() / 128;
15862 int LaneSize = Size / LaneCount;
15863 SmallBitVector LaneInputs[2];
15864 LaneInputs[0].resize(LaneCount, false);
15865 LaneInputs[1].resize(LaneCount, false);
15866 for (int i = 0; i < Size; ++i)
15867 if (Mask[i] >= 0)
15868 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15869 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15870 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15871
15872 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15873 // requires that the decomposed single-input shuffles don't end up here.
15874 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
15875 DAG);
15876}
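The DoBothBroadcast test above transcribes directly into a small standalone predicate. A minimal sketch (plain C++, not LLVM code; the masks are made up) showing which masks it accepts:

#include <cstdio>
#include <vector>

// Returns true when every in-range V1 reference hits one V1 element and
// every V2 reference hits one V2 element, i.e. the shuffle is a blend of
// two broadcasts.
static bool doBothBroadcast(const std::vector<int> &Mask, int Size) {
  int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
  for (int M : Mask) {
    if (M >= Size) {
      if (V2BroadcastIdx < 0)
        V2BroadcastIdx = M - Size;
      else if (M - Size != V2BroadcastIdx)
        return false;
    } else if (M >= 0) {
      if (V1BroadcastIdx < 0)
        V1BroadcastIdx = M;
      else if (M != V1BroadcastIdx)
        return false;
    }
  }
  return true;
}

int main() {
  std::printf("%d\n", doBothBroadcast({3, 11, 3, 11, -1, 11, 3, 11}, 8)); // 1
  std::printf("%d\n", doBothBroadcast({3, 11, 2, 11, -1, 11, 3, 11}, 8)); // 0
}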
15877
15878// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15879// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15880static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15881 SDValue V1, SDValue V2,
15882 ArrayRef<int> Mask,
15883 SelectionDAG &DAG) {
15884 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles")((void)0);
15885
15886 int LHSMask[4] = {-1, -1, -1, -1};
15887 int RHSMask[4] = {-1, -1, -1, -1};
15888 unsigned SHUFPMask = 0;
15889
15890 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15891 // perform the shuffle once the lanes have been shuffled in place.
15892 for (int i = 0; i != 4; ++i) {
15893 int M = Mask[i];
15894 if (M < 0)
15895 continue;
15896 int LaneBase = i & ~1;
15897 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15898 LaneMask[LaneBase + (M & 1)] = M;
15899 SHUFPMask |= (M & 1) << i;
15900 }
15901
15902 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15903 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15904 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15905 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15906}
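For reference, here is a minimal standalone sketch (plain C++, not LLVM code) of how the two operand masks and the SHUFPD immediate above are assembled for one hypothetical v4f64 mask; values >= 4 come from V2:

#include <cstdio>

int main() {
  int Mask[4] = {1, 6, 3, 4};
  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;

  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1; // 128-bit lane this result element lives in
    auto &LaneMask = (i & 1) ? RHSMask : LHSMask; // even elements read LHS, odd read RHS
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPMask |= (M & 1) << i; // SHUFPD picks the low or high double per result element
  }

  std::printf("LHS:");
  for (int M : LHSMask)
    std::printf(" %d", M); // -1 1 -1 3
  std::printf("\nRHS:");
  for (int M : RHSMask)
    std::printf(" %d", M); // 6 -1 4 -1
  std::printf("\nSHUFPD imm = 0x%x\n", SHUFPMask); // 0b0101 -> 0x5
}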
15907
15908/// Lower a vector shuffle crossing multiple 128-bit lanes as
15909/// a lane permutation followed by a per-lane permutation.
15910///
15911/// This is mainly for cases where we can have non-repeating permutes
15912/// in each lane.
15913///
15914/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15915/// we should investigate merging them.
15916static SDValue lowerShuffleAsLanePermuteAndPermute(
15917 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15918 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15919 int NumElts = VT.getVectorNumElements();
15920 int NumLanes = VT.getSizeInBits() / 128;
15921 int NumEltsPerLane = NumElts / NumLanes;
15922 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15923
15924 /// Attempts to find a sublane permute with the given size
15925 /// that gets all elements into their target lanes.
15926 ///
15927 /// If successful, fills CrossLaneMask and InLaneMask and returns the new shuffle.
15928 /// If unsuccessful, returns a null SDValue and may overwrite InLaneMask.
15929 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15930 int NumSublanesPerLane = NumSublanes / NumLanes;
15931 int NumEltsPerSublane = NumElts / NumSublanes;
15932
15933 SmallVector<int, 16> CrossLaneMask;
15934 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15935 // CrossLaneMask but one entry == one sublane.
15936 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15937
15938 for (int i = 0; i != NumElts; ++i) {
15939 int M = Mask[i];
15940 if (M < 0)
15941 continue;
15942
15943 int SrcSublane = M / NumEltsPerSublane;
15944 int DstLane = i / NumEltsPerLane;
15945
15946 // We only need to get the elements into the right lane, not sublane.
15947 // So search all sublanes that make up the destination lane.
15948 bool Found = false;
15949 int DstSubStart = DstLane * NumSublanesPerLane;
15950 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15951 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15952 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15953 continue;
15954
15955 Found = true;
15956 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15957 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15958 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15959 break;
15960 }
15961 if (!Found)
15962 return SDValue();
15963 }
15964
15965 // Fill CrossLaneMask using CrossLaneMaskLarge.
15966 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15967
15968 if (!CanUseSublanes) {
15969 // If we're only shuffling a single lowest lane and the rest are identity
15970 // then don't bother.
15971 // TODO - isShuffleMaskInputInPlace could be extended to something like
15972 // this.
15973 int NumIdentityLanes = 0;
15974 bool OnlyShuffleLowestLane = true;
15975 for (int i = 0; i != NumLanes; ++i) {
15976 int LaneOffset = i * NumEltsPerLane;
15977 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15978 i * NumEltsPerLane))
15979 NumIdentityLanes++;
15980 else if (CrossLaneMask[LaneOffset] != 0)
15981 OnlyShuffleLowestLane = false;
15982 }
15983 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15984 return SDValue();
15985 }
15986
15987 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15988 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15989 InLaneMask);
15990 };
15991
15992 // First attempt a solution with full lanes.
15993 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15994 return V;
15995
15996 // The rest of the solutions use sublanes.
15997 if (!CanUseSublanes)
15998 return SDValue();
15999
16000 // Then attempt a solution with 64-bit sublanes (vpermq).
16001 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16002 return V;
16003
16004 // If that doesn't work and we have fast variable cross-lane shuffle,
16005 // attempt 32-bit sublanes (vpermd).
16006 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16007 return SDValue();
16008
16009 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16010}
16011
16012/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16013/// source with a lane permutation.
16014///
16015/// This lowering strategy results in four instructions in the worst case for a
16016/// single-input cross lane shuffle which is lower than any other fully general
16017/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16018/// shuffle pattern should be handled prior to trying this lowering.
16019static SDValue lowerShuffleAsLanePermuteAndShuffle(
16020 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16021 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16022 // FIXME: This should probably be generalized for 512-bit vectors as well.
16023 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!")((void)0);
16024 int Size = Mask.size();
16025 int LaneSize = Size / 2;
16026
16027 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16028 // Only do this if the elements aren't all from the lower lane,
16029 // otherwise we're (probably) better off doing a split.
16030 if (VT == MVT::v4f64 &&
16031 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16032 if (SDValue V =
16033 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16034 return V;
16035
16036 // If there are only inputs from one 128-bit lane, splitting will in fact be
16037 // less expensive. The flags track whether the given lane contains an element
16038 // that crosses to another lane.
16039 if (!Subtarget.hasAVX2()) {
16040 bool LaneCrossing[2] = {false, false};
16041 for (int i = 0; i < Size; ++i)
16042 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16043 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16044 if (!LaneCrossing[0] || !LaneCrossing[1])
16045 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16046 } else {
16047 bool LaneUsed[2] = {false, false};
16048 for (int i = 0; i < Size; ++i)
16049 if (Mask[i] >= 0)
16050 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16051 if (!LaneUsed[0] || !LaneUsed[1])
16052 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16053 }
16054
16055 // TODO - we could support shuffling V2 in the Flipped input.
16056 assert(V2.isUndef() &&((void)0)
16057 "This last part of this routine only works on single input shuffles")((void)0);
16058
16059 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16060 for (int i = 0; i < Size; ++i) {
16061 int &M = InLaneMask[i];
16062 if (M < 0)
16063 continue;
16064 if (((M % Size) / LaneSize) != (i / LaneSize))
16065 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16066 }
16067 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&((void)0)
16068 "In-lane shuffle mask expected")((void)0);
16069
16070 // Flip the lanes, and shuffle the results which should now be in-lane.
16071 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16072 SDValue Flipped = DAG.getBitcast(PVT, V1);
16073 Flipped =
16074 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16075 Flipped = DAG.getBitcast(VT, Flipped);
16076 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16077}
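The in-lane rewrite above redirects cross-lane references into the lane-swapped copy. A minimal standalone sketch (plain C++, not LLVM code) for a hypothetical v8f32-style single-input mask, with Size = 8 and LaneSize = 4, where indices in [Size, 2 * Size) stand for the flipped operand:

#include <cstdio>
#include <vector>

int main() {
  const int Size = 8, LaneSize = Size / 2;
  // Hypothetical single-input mask.
  std::vector<int> Mask = {4, 5, 1, 2, 7, 0, 3, 6};

  std::vector<int> InLaneMask(Mask);
  for (int i = 0; i < Size; ++i) {
    int &M = InLaneMask[i];
    if (M < 0)
      continue;
    if (((M % Size) / LaneSize) != (i / LaneSize))
      // Element comes from the other 128-bit lane: read it from the
      // lane-swapped copy instead, which lives at indices [Size, 2 * Size).
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }

  for (int M : InLaneMask)
    std::printf("%d ", M); // prints: 8 9 1 2 7 12 15 6
  std::printf("\n");
}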
16078
16079/// Handle lowering 2-lane 128-bit shuffles.
16080static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16081 SDValue V2, ArrayRef<int> Mask,
16082 const APInt &Zeroable,
16083 const X86Subtarget &Subtarget,
16084 SelectionDAG &DAG) {
16085 if (V2.isUndef()) {
16086 // Attempt to match VBROADCAST*128 subvector broadcast load.
16087 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16088 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16089 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16090 MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
16091 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16092 if (!Ld->isNonTemporal()) {
16093 MVT MemVT = VT.getHalfNumVectorElementsVT();
16094 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16095 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
16096 SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
16097 TypeSize::Fixed(Ofs), DL);
16098 SDValue Ops[] = {Ld->getChain(), Ptr};
16099 SDValue BcastLd = DAG.getMemIntrinsicNode(
16100 X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
16101 DAG.getMachineFunction().getMachineMemOperand(
16102 Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
16103 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
16104 return BcastLd;
16105 }
16106 }
16107
16108 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16109 if (Subtarget.hasAVX2())
16110 return SDValue();
16111 }
16112
16113 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16114
16115 SmallVector<int, 4> WidenedMask;
16116 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16117 return SDValue();
16118
16119 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16120 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16121
16122 // Try to use an insert into a zero vector.
16123 if (WidenedMask[0] == 0 && IsHighZero) {
16124 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16125 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16126 DAG.getIntPtrConstant(0, DL));
16127 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16128 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16129 DAG.getIntPtrConstant(0, DL));
16130 }
16131
16132 // TODO: If minimizing size and one of the inputs is a zero vector and the
16133 // zero vector has only one use, we could use a VPERM2X128 to save the
16134 // instruction bytes needed to explicitly generate the zero vector.
16135
16136 // Blends are faster and handle all the non-lane-crossing cases.
16137 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16138 Subtarget, DAG))
16139 return Blend;
16140
16141 // If either input operand is a zero vector, use VPERM2X128 because its mask
16142 // allows us to replace the zero input with an implicit zero.
16143 if (!IsLowZero && !IsHighZero) {
16144 // Check for patterns which can be matched with a single insert of a 128-bit
16145 // subvector.
16146 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16147 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16148
16149 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16150 // this will likely become vinsertf128 which can't fold a 256-bit memop.
16151 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16152 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16153 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16154 OnlyUsesV1 ? V1 : V2,
16155 DAG.getIntPtrConstant(0, DL));
16156 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16157 DAG.getIntPtrConstant(2, DL));
16158 }
16159 }
16160
16161 // Try to use SHUF128 if possible.
16162 if (Subtarget.hasVLX()) {
16163 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16164 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16165 ((WidenedMask[1] % 2) << 1);
16166 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16167 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16168 }
16169 }
16170 }
16171
16172 // Otherwise form a 128-bit permutation. After accounting for undefs,
16173 // convert the 64-bit shuffle mask selection values into 128-bit
16174 // selection bits by dividing the indexes by 2 and shifting into positions
16175 // defined by a vperm2*128 instruction's immediate control byte.
16176
16177 // The immediate permute control byte looks like this:
16178 // [1:0] - select 128 bits from sources for low half of destination
16179 // [2] - ignore
16180 // [3] - zero low half of destination
16181 // [5:4] - select 128 bits from sources for high half of destination
16182 // [6] - ignore
16183 // [7] - zero high half of destination
16184
16185 assert((WidenedMask[0] >= 0 || IsLowZero) &&((void)0)
16186 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?")((void)0);
16187
16188 unsigned PermMask = 0;
16189 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16190 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16191
16192 // Check the immediate mask and replace unused sources with undef.
16193 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16194 V1 = DAG.getUNDEF(VT);
16195 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16196 V2 = DAG.getUNDEF(VT);
16197
16198 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16199 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16200}
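To tie the control-byte description above to numbers, here is a minimal standalone sketch (plain C++, not LLVM code) that assembles the vperm2f128/vperm2i128 immediate for hypothetical inputs:

#include <cstdio>

int main() {
  // Widened 128-bit selectors: 0..1 pick halves of V1, 2..3 pick halves of V2.
  int WidenedMask[2] = {1, 2};
  bool IsLowZero = false, IsHighZero = false;

  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);  // [1:0] low-half source, [3] zero low half
  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); // [5:4] high-half source, [7] zero high half

  std::printf("imm = 0x%02x\n", PermMask); // 0x21: low = high half of V1, high = low half of V2
}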
16201
16202/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16203/// shuffling each lane.
16204///
16205/// This attempts to create a repeated lane shuffle where each lane uses one
16206/// or two of the lanes of the inputs. The lanes of the input vectors are
16207/// shuffled in one or two independent shuffles to get the lanes into the
16208/// position needed by the final shuffle.
16209static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16210 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16211 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16212 assert(!V2.isUndef() && "This is only useful with multiple inputs.")((void)0);
16213
16214 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
13. Assuming the condition is false
14. Taking false branch
16215 return SDValue();
16216
16217 int NumElts = Mask.size();
16218 int NumLanes = VT.getSizeInBits() / 128;
16219 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15. 'NumLaneElts' initialized here
16220 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16221 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16222
16223 // First pass will try to fill in the RepeatMask from lanes that need two
16224 // sources.
16225 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16. Assuming 'Lane' is not equal to 'NumLanes'
17. Loop condition is true. Entering loop body
22. Assuming 'Lane' is equal to 'NumLanes'
23. Loop condition is false. Execution continues on line 16297
16226 int Srcs[2] = {-1, -1};
16227 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16228 for (int i = 0; i != NumLaneElts; ++i) {
18. Assuming 'i' is equal to 'NumLaneElts'
19. Loop condition is false. Execution continues on line 16250
16229 int M = Mask[(Lane * NumLaneElts) + i];
16230 if (M < 0)
16231 continue;
16232 // Determine which of the possible input lanes (NumLanes from each source)
16233 // this element comes from. Assign that as one of the sources for this
16234 // lane. We can assign up to 2 sources for this lane. If we run out of
16235 // sources, we can't do anything.
16236 int LaneSrc = M / NumLaneElts;
16237 int Src;
16238 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16239 Src = 0;
16240 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16241 Src = 1;
16242 else
16243 return SDValue();
16244
16245 Srcs[Src] = LaneSrc;
16246 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16247 }
16248
16249 // If this lane has two sources, see if it fits with the repeat mask so far.
16250 if (Srcs[1] < 0)
20. Taking true branch
16251 continue;
21. Execution continues on line 16225
16252
16253 LaneSrcs[Lane][0] = Srcs[0];
16254 LaneSrcs[Lane][1] = Srcs[1];
16255
16256 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16257 assert(M1.size() == M2.size() && "Unexpected mask size")((void)0);
16258 for (int i = 0, e = M1.size(); i != e; ++i)
16259 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16260 return false;
16261 return true;
16262 };
16263
16264 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16265 assert(Mask.size() == MergedMask.size() && "Unexpected mask size")((void)0);
16266 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16267 int M = Mask[i];
16268 if (M < 0)
16269 continue;
16270 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&((void)0)
16271 "Unexpected mask element")((void)0);
16272 MergedMask[i] = M;
16273 }
16274 };
16275
16276 if (MatchMasks(InLaneMask, RepeatMask)) {
16277 // Merge this lane mask into the final repeat mask.
16278 MergeMasks(InLaneMask, RepeatMask);
16279 continue;
16280 }
16281
16282 // Didn't find a match. Swap the operands and try again.
16283 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16284 ShuffleVectorSDNode::commuteMask(InLaneMask);
16285
16286 if (MatchMasks(InLaneMask, RepeatMask)) {
16287 // Merge this lane mask into the final repeat mask.
16288 MergeMasks(InLaneMask, RepeatMask);
16289 continue;
16290 }
16291
16292 // Couldn't find a match with the operands in either order.
16293 return SDValue();
16294 }
16295
16296 // Now handle any lanes with only one source.
16297 for (int Lane = 0; Lane != NumLanes; ++Lane) {
24. Loop condition is true. Entering loop body
28. Loop condition is false. Execution continues on line 16326
16298 // If this lane has already been processed, skip it.
16299 if (LaneSrcs[Lane][0] >= 0)
25. Assuming the condition is true
26. Taking true branch
16300 continue;
27. Execution continues on line 16297
16301
16302 for (int i = 0; i != NumLaneElts; ++i) {
16303 int M = Mask[(Lane * NumLaneElts) + i];
16304 if (M < 0)
16305 continue;
16306
16307 // If RepeatMask isn't defined yet, we can define it ourselves.
16308 if (RepeatMask[i] < 0)
16309 RepeatMask[i] = M % NumLaneElts;
16310
16311 if (RepeatMask[i] < NumElts) {
16312 if (RepeatMask[i] != M % NumLaneElts)
16313 return SDValue();
16314 LaneSrcs[Lane][0] = M / NumLaneElts;
16315 } else {
16316 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16317 return SDValue();
16318 LaneSrcs[Lane][1] = M / NumLaneElts;
16319 }
16320 }
16321
16322 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16323 return SDValue();
16324 }
16325
16326 SmallVector<int, 16> NewMask(NumElts, -1);
16327 for (int Lane = 0; Lane != NumLanes; ++Lane) {
29. Loop condition is true. Entering loop body
31. Loop condition is false. Execution continues on line 16336
16328 int Src = LaneSrcs[Lane][0];
16329 for (int i = 0; i != NumLaneElts; ++i) {
30. Loop condition is false. Execution continues on line 16327
16330 int M = -1;
16331 if (Src >= 0)
16332 M = Src * NumLaneElts + i;
16333 NewMask[Lane * NumLaneElts + i] = M;
16334 }
16335 }
16336 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16337 // Ensure we didn't get back the shuffle we started with.
16338 // FIXME: This is a hack to make up for some splat handling code in
16339 // getVectorShuffle.
16340 if (isa<ShuffleVectorSDNode>(NewV1) &&
32. Assuming 'NewV1' is not a 'ShuffleVectorSDNode'
33. Taking false branch
16341 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16342 return SDValue();
16343
16344 for (int Lane = 0; Lane != NumLanes; ++Lane) {
34. Loop condition is true. Entering loop body
36. Loop condition is false. Execution continues on line 16353
16345 int Src = LaneSrcs[Lane][1];
16346 for (int i = 0; i != NumLaneElts; ++i) {
35. Loop condition is false. Execution continues on line 16344
16347 int M = -1;
16348 if (Src >= 0)
16349 M = Src * NumLaneElts + i;
16350 NewMask[Lane * NumLaneElts + i] = M;
16351 }
16352 }
16353 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16354 // Ensure we didn't get back the shuffle we started with.
16355 // FIXME: This is a hack to make up for some splat handling code in
16356 // getVectorShuffle.
16357 if (isa<ShuffleVectorSDNode>(NewV2) &&
37. Assuming 'NewV2' is not a 'ShuffleVectorSDNode'
38. Taking false branch
16358 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16359 return SDValue();
16360
16361 for (int i = 0; i != NumElts; ++i) {
39. Assuming 'i' is not equal to 'NumElts'
40. Loop condition is true. Entering loop body
16362 NewMask[i] = RepeatMask[i % NumLaneElts];
41. Division by zero
16363 if (NewMask[i] < 0)
16364 continue;
16365
16366 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16367 }
16368 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16369}
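The division-by-zero warning (step 41, source line 16362) comes from the analyzer's assumptions rather than from a path the visible callers can take: NumLaneElts is initialized at line 16219 as 128 / VT.getScalarSizeInBits(), and the path notes above treat every loop bounded by NumLaneElts as running zero times, which is consistent only with NumLaneElts == 0, i.e. a scalar type wider than 128 bits. The callers in this report pass 8- to 64-bit element types, so NumLaneElts is at least 2 and the modulo at line 16362 is well defined; the analyzer simply cannot prove that from this function alone. A minimal sketch of the arithmetic behind that reasoning (the 256-bit width is a hypothetical value standing in for the analyzer's assumption):

#include <cassert>

// Mirrors: int NumLaneElts = 128 / VT.getScalarSizeInBits();
static int laneElts(int ScalarSizeInBits) { return 128 / ScalarSizeInBits; }

int main() {
  assert(laneElts(64) == 2);  // v4i64 / v4f64: i % NumLaneElts is well defined
  assert(laneElts(32) == 4);  // v8i32 / v8f32
  assert(laneElts(8) == 16);  // v32i8
  assert(laneElts(256) == 0); // hypothetical >128-bit scalar: i % 0 would be the
                              // undefined behaviour reported at line 16362
  return 0;
}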
16370
16371/// If the input shuffle mask results in a vector that is undefined in all upper
16372/// or lower half elements and that mask accesses only 2 halves of the
16373/// shuffle's operands, return true. A mask of half the width with mask indexes
16374/// adjusted to access the extracted halves of the original shuffle operands is
16375/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16376/// lower half of each input operand is accessed.
16377static bool
16378getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16379 int &HalfIdx1, int &HalfIdx2) {
16380 assert((Mask.size() == HalfMask.size() * 2) &&((void)0)
16381 "Expected input mask to be twice as long as output")((void)0);
16382
16383 // Exactly one half of the result must be undef to allow narrowing.
16384 bool UndefLower = isUndefLowerHalf(Mask);
16385 bool UndefUpper = isUndefUpperHalf(Mask);
16386 if (UndefLower == UndefUpper)
16387 return false;
16388
16389 unsigned HalfNumElts = HalfMask.size();
16390 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16391 HalfIdx1 = -1;
16392 HalfIdx2 = -1;
16393 for (unsigned i = 0; i != HalfNumElts; ++i) {
16394 int M = Mask[i + MaskIndexOffset];
16395 if (M < 0) {
16396 HalfMask[i] = M;
16397 continue;
16398 }
16399
16400 // Determine which of the 4 half vectors this element is from.
16401 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16402 int HalfIdx = M / HalfNumElts;
16403
16404 // Determine the element index into its half vector source.
16405 int HalfElt = M % HalfNumElts;
16406
16407 // We can shuffle with up to 2 half vectors, set the new 'half'
16408 // shuffle mask accordingly.
16409 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16410 HalfMask[i] = HalfElt;
16411 HalfIdx1 = HalfIdx;
16412 continue;
16413 }
16414 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16415 HalfMask[i] = HalfElt + HalfNumElts;
16416 HalfIdx2 = HalfIdx;
16417 continue;
16418 }
16419
16420 // Too many half vectors referenced.
16421 return false;
16422 }
16423
16424 return true;
16425}
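A worked example of the index math in getHalfShuffleMask (the mask is illustrative, not taken from this report): for a v8 shuffle mask <u, u, u, u, 0, 12, 1, 13>, the lower half of the result is undef, HalfNumElts is 4, and the defined elements come from half vectors 0 (lower V1) and 3 (upper V2), yielding HalfIdx1 = 0, HalfIdx2 = 3 and HalfMask = <0, 4, 1, 5>. A standalone check of that arithmetic:

#include <cassert>

int main() {
  const int HalfNumElts = 4;
  // Upper-half elements of the example mask <u,u,u,u, 0,12,1,13>.
  int UpperMask[4] = {0, 12, 1, 13};

  for (int i = 0; i != 4; ++i) {
    int M = UpperMask[i];
    int HalfIdx = M / HalfNumElts; // 0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2
    int HalfElt = M % HalfNumElts; // element index within that half
    assert(HalfIdx == (i % 2 == 0 ? 0 : 3));
    assert(HalfElt == i / 2);
  }
  // So HalfIdx1 = 0, HalfIdx2 = 3, and HalfMask = {0, 4, 1, 5}: elements taken
  // from the second half vector get HalfNumElts added, as in the code above.
  return 0;
}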
16426
16427/// Given the output values from getHalfShuffleMask(), create a half width
16428/// shuffle of extracted vectors followed by an insert back to full width.
16429static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16430 ArrayRef<int> HalfMask, int HalfIdx1,
16431 int HalfIdx2, bool UndefLower,
16432 SelectionDAG &DAG, bool UseConcat = false) {
16433 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?")((void)0);
16434 assert(V1.getValueType().isSimple() && "Expecting only simple types")((void)0);
16435
16436 MVT VT = V1.getSimpleValueType();
16437 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16438 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16439
16440 auto getHalfVector = [&](int HalfIdx) {
16441 if (HalfIdx < 0)
16442 return DAG.getUNDEF(HalfVT);
16443 SDValue V = (HalfIdx < 2 ? V1 : V2);
16444 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16445 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16446 DAG.getIntPtrConstant(HalfIdx, DL));
16447 };
16448
16449 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16450 SDValue Half1 = getHalfVector(HalfIdx1);
16451 SDValue Half2 = getHalfVector(HalfIdx2);
16452 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16453 if (UseConcat) {
16454 SDValue Op0 = V;
16455 SDValue Op1 = DAG.getUNDEF(HalfVT);
16456 if (UndefLower)
16457 std::swap(Op0, Op1);
16458 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16459 }
16460
16461 unsigned Offset = UndefLower ? HalfNumElts : 0;
16462 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16463 DAG.getIntPtrConstant(Offset, DL));
16464}
16465
16466/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16467/// This allows for fast cases such as subvector extraction/insertion
16468/// or shuffling smaller vector types which can lower more efficiently.
16469static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16470 SDValue V2, ArrayRef<int> Mask,
16471 const X86Subtarget &Subtarget,
16472 SelectionDAG &DAG) {
16473 assert((VT.is256BitVector() || VT.is512BitVector()) &&((void)0)
16474 "Expected 256-bit or 512-bit vector")((void)0);
16475
16476 bool UndefLower = isUndefLowerHalf(Mask);
16477 if (!UndefLower && !isUndefUpperHalf(Mask))
16478 return SDValue();
16479
16480 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&((void)0)
16481 "Completely undef shuffle mask should have been simplified already")((void)0);
16482
16483 // Upper half is undef and lower half is whole upper subvector.
16484 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16485 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16486 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16487 if (!UndefLower &&
16488 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16489 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16490 DAG.getIntPtrConstant(HalfNumElts, DL));
16491 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16492 DAG.getIntPtrConstant(0, DL));
16493 }
16494
16495 // Lower half is undef and upper half is whole lower subvector.
16496 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16497 if (UndefLower &&
16498 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16499 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16500 DAG.getIntPtrConstant(0, DL));
16501 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16502 DAG.getIntPtrConstant(HalfNumElts, DL));
16503 }
16504
16505 int HalfIdx1, HalfIdx2;
16506 SmallVector<int, 8> HalfMask(HalfNumElts);
16507 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16508 return SDValue();
16509
16510 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length")((void)0);
16511
16512 // Only shuffle the halves of the inputs when useful.
16513 unsigned NumLowerHalves =
16514 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16515 unsigned NumUpperHalves =
16516 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16517 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed")((void)0);
16518
16519 // Determine the larger pattern of undef/halves, then decide if it's worth
16520 // splitting the shuffle based on subtarget capabilities and types.
16521 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16522 if (!UndefLower) {
16523 // XXXXuuuu: no insert is needed.
16524 // Always extract lowers when setting lower - these are all free subreg ops.
16525 if (NumUpperHalves == 0)
16526 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16527 UndefLower, DAG);
16528
16529 if (NumUpperHalves == 1) {
16530 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16531 if (Subtarget.hasAVX2()) {
16532 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16533 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16534 !is128BitUnpackShuffleMask(HalfMask) &&
16535 (!isSingleSHUFPSMask(HalfMask) ||
16536 Subtarget.hasFastVariableCrossLaneShuffle()))
16537 return SDValue();
16538 // If this is a unary shuffle (assume that the 2nd operand is
16539 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16540 // are better off extracting the upper half of 1 operand and using a
16541 // narrow shuffle.
16542 if (EltWidth == 64 && V2.isUndef())
16543 return SDValue();
16544 }
16545 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16546 if (Subtarget.hasAVX512() && VT.is512BitVector())
16547 return SDValue();
16548 // Extract + narrow shuffle is better than the wide alternative.
16549 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16550 UndefLower, DAG);
16551 }
16552
16553 // Don't extract both uppers; instead shuffle and then extract.
16554 assert(NumUpperHalves == 2 && "Half vector count went wrong")((void)0);
16555 return SDValue();
16556 }
16557
16558 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16559 if (NumUpperHalves == 0) {
16560 // AVX2 has efficient 64-bit element cross-lane shuffles.
16561 // TODO: Refine to account for unary shuffle, splat, and other masks?
16562 if (Subtarget.hasAVX2() && EltWidth == 64)
16563 return SDValue();
16564 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16565 if (Subtarget.hasAVX512() && VT.is512BitVector())
16566 return SDValue();
16567 // Narrow shuffle + insert is better than the wide alternative.
16568 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16569 UndefLower, DAG);
16570 }
16571
16572 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16573 return SDValue();
16574}
16575
16576/// Test whether the specified input (0 or 1) is in-place blended by the
16577/// given mask.
16578///
16579/// This returns true if the elements from a particular input are already in the
16580/// slot required by the given mask and require no permutation.
16581static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16582 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.")((void)0);
16583 int Size = Mask.size();
16584 for (int i = 0; i < Size; ++i)
16585 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16586 return false;
16587
16588 return true;
16589}
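A quick illustration of the in-place test (example masks are made up): for a v4 two-input shuffle, mask <0, 5, 2, 7> leaves both inputs in place, while <1, 5, 2, 7> moves element 1 of input 0 and therefore fails for Input == 0. A standalone sketch of the same index arithmetic over a plain vector:

#include <cassert>
#include <vector>

// Same predicate as above: input 0 owns mask values [0, Size),
// input 1 owns [Size, 2 * Size).
static bool inputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  assert(inputInPlace(0, {0, 5, 2, 7}) && inputInPlace(1, {0, 5, 2, 7}));
  assert(!inputInPlace(0, {1, 5, 2, 7}));
  return 0;
}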
16590
16591/// Handle case where shuffle sources are coming from the same 128-bit lane and
16592/// every lane can be represented as the same repeating mask - allowing us to
16593/// shuffle the sources with the repeating shuffle and then permute the result
16594/// to the destination lanes.
16595static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16596 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16597 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16598 int NumElts = VT.getVectorNumElements();
16599 int NumLanes = VT.getSizeInBits() / 128;
16600 int NumLaneElts = NumElts / NumLanes;
16601
16602 // On AVX2 we may be able to just shuffle the lowest elements and then
16603 // broadcast the result.
16604 if (Subtarget.hasAVX2()) {
16605 for (unsigned BroadcastSize : {16, 32, 64}) {
16606 if (BroadcastSize <= VT.getScalarSizeInBits())
16607 continue;
16608 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16609
16610 // Attempt to match a repeating pattern every NumBroadcastElts,
16611 // accounting for UNDEFs, referencing only the lowest 128-bit
16612 // lane of the inputs.
16613 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16614 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16615 for (int j = 0; j != NumBroadcastElts; ++j) {
16616 int M = Mask[i + j];
16617 if (M < 0)
16618 continue;
16619 int &R = RepeatMask[j];
16620 if (0 != ((M % NumElts) / NumLaneElts))
16621 return false;
16622 if (0 <= R && R != M)
16623 return false;
16624 R = M;
16625 }
16626 return true;
16627 };
16628
16629 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16630 if (!FindRepeatingBroadcastMask(RepeatMask))
16631 continue;
16632
16633 // Shuffle the (lowest) repeated elements in place for broadcast.
16634 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16635
16636 // Shuffle the actual broadcast.
16637 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16638 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16639 for (int j = 0; j != NumBroadcastElts; ++j)
16640 BroadcastMask[i + j] = j;
16641 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16642 BroadcastMask);
16643 }
16644 }
16645
16646 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16647 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16648 return SDValue();
16649
16650 // Bail if we already have a repeated lane shuffle mask.
16651 SmallVector<int, 8> RepeatedShuffleMask;
16652 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
16653 return SDValue();
16654
16655 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16656 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
16657 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
16658 int NumSubLanes = NumLanes * SubLaneScale;
16659 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16660
16661 // Check that all the sources are coming from the same lane and see if we can
16662 // form a repeating shuffle mask (local to each sub-lane). At the same time,
16663 // determine the source sub-lane for each destination sub-lane.
16664 int TopSrcSubLane = -1;
16665 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16666 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
16667 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
16668 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
16669
16670 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16671 // Extract the sub-lane mask, check that it all comes from the same lane
16672 // and normalize the mask entries to come from the first lane.
16673 int SrcLane = -1;
16674 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16675 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16676 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16677 if (M < 0)
16678 continue;
16679 int Lane = (M % NumElts) / NumLaneElts;
16680 if ((0 <= SrcLane) && (SrcLane != Lane))
16681 return SDValue();
16682 SrcLane = Lane;
16683 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16684 SubLaneMask[Elt] = LocalM;
16685 }
16686
16687 // Whole sub-lane is UNDEF.
16688 if (SrcLane < 0)
16689 continue;
16690
16691 // Attempt to match against the candidate repeated sub-lane masks.
16692 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16693 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16694 for (int i = 0; i != NumSubLaneElts; ++i) {
16695 if (M1[i] < 0 || M2[i] < 0)
16696 continue;
16697 if (M1[i] != M2[i])
16698 return false;
16699 }
16700 return true;
16701 };
16702
16703 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16704 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16705 continue;
16706
16707 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16708 for (int i = 0; i != NumSubLaneElts; ++i) {
16709 int M = SubLaneMask[i];
16710 if (M < 0)
16711 continue;
16712 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&((void)0)
16713 "Unexpected mask element")((void)0);
16714 RepeatedSubLaneMask[i] = M;
16715 }
16716
16717 // Track the top most source sub-lane - by setting the remaining to UNDEF
16718 // we can greatly simplify shuffle matching.
16719 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16720 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16721 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16722 break;
16723 }
16724
16725 // Bail if we failed to find a matching repeated sub-lane mask.
16726 if (Dst2SrcSubLanes[DstSubLane] < 0)
16727 return SDValue();
16728 }
16729 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&((void)0)
16730 "Unexpected source lane")((void)0);
16731
16732 // Create a repeating shuffle mask for the entire vector.
16733 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16734 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16735 int Lane = SubLane / SubLaneScale;
16736 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16737 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16738 int M = RepeatedSubLaneMask[Elt];
16739 if (M < 0)
16740 continue;
16741 int Idx = (SubLane * NumSubLaneElts) + Elt;
16742 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16743 }
16744 }
16745 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16746
16747 // Shuffle each source sub-lane to its destination.
16748 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16749 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16750 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16751 if (SrcSubLane < 0)
16752 continue;
16753 for (int j = 0; j != NumSubLaneElts; ++j)
16754 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16755 }
16756
16757 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16758 SubLaneMask);
16759}
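A worked single-input example of the repeated-mask-and-lane-permute idea (mask values are illustrative): for a v8i32 mask <6, 7, 4, 5, 2, 3, 0, 1>, each destination 128-bit lane reads exactly one source lane (lane 1, then lane 0) and both lanes use the same in-lane pattern <2, 3, 0, 1>, so the shuffle decomposes into an in-lane repeated shuffle followed by a lane permute. A standalone check of that decomposition:

#include <cassert>

int main() {
  const int NumLaneElts = 4;
  int Mask[8] = {6, 7, 4, 5, 2, 3, 0, 1}; // hypothetical v8i32 single-input mask
  int Repeat[4] = {2, 3, 0, 1};           // shared in-lane pattern
  int SrcLane[2] = {-1, -1};

  for (int DstLane = 0; DstLane != 2; ++DstLane)
    for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
      int M = Mask[DstLane * NumLaneElts + Elt];
      int Lane = M / NumLaneElts;
      if (SrcLane[DstLane] < 0)
        SrcLane[DstLane] = Lane;
      assert(SrcLane[DstLane] == Lane);       // one source lane per destination lane
      assert(M % NumLaneElts == Repeat[Elt]); // same repeated in-lane pattern
    }
  assert(SrcLane[0] == 1 && SrcLane[1] == 0); // final step: permute lanes 1,0 -> 0,1
  return 0;
}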
16760
16761static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16762 bool &ForceV1Zero, bool &ForceV2Zero,
16763 unsigned &ShuffleImm, ArrayRef<int> Mask,
16764 const APInt &Zeroable) {
16765 int NumElts = VT.getVectorNumElements();
16766 assert(VT.getScalarSizeInBits() == 64 &&((void)0)
16767 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&((void)0)
16768 "Unexpected data type for VSHUFPD")((void)0);
16769 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&((void)0)
16770 "Illegal shuffle mask")((void)0);
16771
16772 bool ZeroLane[2] = { true, true };
16773 for (int i = 0; i < NumElts; ++i)
16774 ZeroLane[i & 1] &= Zeroable[i];
16775
16776 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16777 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
16778 ShuffleImm = 0;
16779 bool ShufpdMask = true;
16780 bool CommutableMask = true;
16781 for (int i = 0; i < NumElts; ++i) {
16782 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16783 continue;
16784 if (Mask[i] < 0)
16785 return false;
16786 int Val = (i & 6) + NumElts * (i & 1);
16787 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16788 if (Mask[i] < Val || Mask[i] > Val + 1)
16789 ShufpdMask = false;
16790 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16791 CommutableMask = false;
16792 ShuffleImm |= (Mask[i] % 2) << i;
16793 }
16794
16795 if (!ShufpdMask && !CommutableMask)
16796 return false;
16797
16798 if (!ShufpdMask && CommutableMask)
16799 std::swap(V1, V2);
16800
16801 ForceV1Zero = ZeroLane[0];
16802 ForceV2Zero = ZeroLane[1];
16803 return true;
16804}
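To make the SHUFPD immediate computation above concrete (the mask is an illustrative example): for v4f64 with mask <0, 5, 2, 7>, each even element reads V1 and each odd element reads V2, always from the expected 128-bit pair, so ShufpdMask stays true and the immediate accumulates (Mask[i] % 2) << i, giving 0b1010 = 0xA. A standalone check of that arithmetic:

#include <cassert>

int main() {
  const int NumElts = 4;
  int Mask[4] = {0, 5, 2, 7}; // hypothetical v4f64 mask: <V1[0], V2[1], V1[2], V2[3]>

  unsigned ShuffleImm = 0;
  bool ShufpdMask = true;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1); // lowest index this slot may read
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    ShuffleImm |= (Mask[i] % 2) << i;
  }
  assert(ShufpdMask && ShuffleImm == 0xA); // bits 1 and 3 set: take the high
                                           // element of each selected pair
  return 0;
}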
16805
16806static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16807 SDValue V2, ArrayRef<int> Mask,
16808 const APInt &Zeroable,
16809 const X86Subtarget &Subtarget,
16810 SelectionDAG &DAG) {
16811 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&((void)0)
16812 "Unexpected data type for VSHUFPD")((void)0);
16813
16814 unsigned Immediate = 0;
16815 bool ForceV1Zero = false, ForceV2Zero = false;
16816 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16817 Mask, Zeroable))
16818 return SDValue();
16819
16820 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16821 if (ForceV1Zero)
16822 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16823 if (ForceV2Zero)
16824 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16825
16826 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16827 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16828}
16829
16830 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16831 // by zeroable elements in the remaining 24 elements. Turn this into two
16832// vmovqb instructions shuffled together.
16833static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16834 SDValue V1, SDValue V2,
16835 ArrayRef<int> Mask,
16836 const APInt &Zeroable,
16837 SelectionDAG &DAG) {
16838 assert(VT == MVT::v32i8 && "Unexpected type!")((void)0);
16839
16840 // The first 8 indices should be every 8th element.
16841 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16842 return SDValue();
16843
16844 // Remaining elements need to be zeroable.
16845 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
16846 return SDValue();
16847
16848 V1 = DAG.getBitcast(MVT::v4i64, V1);
16849 V2 = DAG.getBitcast(MVT::v4i64, V2);
16850
16851 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16852 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16853
16854 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16855 // the upper bits of the result using an unpckldq.
16856 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16857 { 0, 1, 2, 3, 16, 17, 18, 19,
16858 4, 5, 6, 7, 20, 21, 22, 23 });
16859 // Insert the unpckldq into a zero vector to widen to v32i8.
16860 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16861 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16862 DAG.getIntPtrConstant(0, DL));
16863}
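The mask shape this routine accepts, spelled out: the first 8 of the 32 byte indices must be 0, 8, 16, ..., 56 (or undef), and the remaining 24 elements must be zeroable. A minimal sketch of the first check, assuming isSequentialOrUndefInRange(Mask, 0, 8, 0, 8) means "for i in [0, 8), Mask[i] is undef (-1) or equals 8 * i":

#include <cassert>

static bool firstEightAreEveryEighth(const int *Mask) {
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != 8 * i)
      return false;
  return true;
}

int main() {
  int Good[8] = {0, 8, -1, 24, 32, 40, -1, 56}; // undef (-1) entries are allowed
  int Bad[8]  = {0, 8, 17, 24, 32, 40, 48, 56}; // 17 breaks the every-8th stride
  assert(firstEightAreEveryEighth(Good));
  assert(!firstEightAreEveryEighth(Bad));
  return 0;
}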
16864
16865
16866/// Handle lowering of 4-lane 64-bit floating point shuffles.
16867///
16868/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16869/// isn't available.
16870static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16871 const APInt &Zeroable, SDValue V1, SDValue V2,
16872 const X86Subtarget &Subtarget,
16873 SelectionDAG &DAG) {
16874 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")((void)0);
16875 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!")((void)0);
16876 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
16877
16878 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16879 Subtarget, DAG))
16880 return V;
16881
16882 if (V2.isUndef()) {
16883 // Check for being able to broadcast a single element.
16884 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16885 Mask, Subtarget, DAG))
16886 return Broadcast;
16887
16888 // Use low duplicate instructions for masks that match their pattern.
16889 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16890 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16891
16892 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16893 // Non-half-crossing single input shuffles can be lowered with an
16894 // interleaved permutation.
16895 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16896 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16897 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16898 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16899 }
16900
16901 // With AVX2 we have direct support for this permutation.
16902 if (Subtarget.hasAVX2())
16903 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16904 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16905
16906 // Try to create an in-lane repeating shuffle mask and then shuffle the
16907 // results into the target lanes.
16908 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16909 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16910 return V;
16911
16912 // Try to permute the lanes and then use a per-lane permute.
16913 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16914 Mask, DAG, Subtarget))
16915 return V;
16916
16917 // Otherwise, fall back.
16918 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16919 DAG, Subtarget);
16920 }
16921
16922 // Use dedicated unpack instructions for masks that match their pattern.
16923 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
16924 return V;
16925
16926 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16927 Zeroable, Subtarget, DAG))
16928 return Blend;
16929
16930 // Check if the blend happens to exactly fit that of SHUFPD.
16931 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16932 Zeroable, Subtarget, DAG))
16933 return Op;
16934
16935 // If we have lane crossing shuffles AND they don't all come from the lower
16936 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16937 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16938 // canonicalize to a blend of splat which isn't necessary for this combine.
16939 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16940 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16941 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16942 (V2.getOpcode() != ISD::BUILD_VECTOR))
16943 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
16944 Mask, DAG))
16945 return Op;
16946
16947 // If we have one input in place, then we can permute the other input and
16948 // blend the result.
16949 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16950 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16951 Subtarget, DAG);
16952
16953 // Try to create an in-lane repeating shuffle mask and then shuffle the
16954 // results into the target lanes.
16955 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16956 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16957 return V;
16958
16959 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16960 // shuffle. However, if we have AVX2 and either input is already in place,
16961 // we will be able to shuffle the other input even across lanes in a single
16962 // instruction, so skip this pattern.
16963 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
16964 isShuffleMaskInputInPlace(1, Mask))))
16965 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16966 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16967 return V;
16968
16969 // If we have VLX support, we can use VEXPAND.
16970 if (Subtarget.hasVLX())
16971 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
16972 DAG, Subtarget))
16973 return V;
16974
16975 // If we have AVX2 then we always want to lower with a blend because at v4 we
16976 // can fully permute the elements.
16977 if (Subtarget.hasAVX2())
16978 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16979 Subtarget, DAG);
16980
16981 // Otherwise fall back on generic lowering.
16982 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
16983 Subtarget, DAG);
16984}
16985
16986/// Handle lowering of 4-lane 64-bit integer shuffles.
16987///
16988/// This routine is only called when we have AVX2 and thus a reasonable
16989/// instruction set for v4i64 shuffling.
16990static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16991 const APInt &Zeroable, SDValue V1, SDValue V2,
16992 const X86Subtarget &Subtarget,
16993 SelectionDAG &DAG) {
16994 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")((void)0);
16995 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!")((void)0);
16996 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!")((void)0);
16997 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!")((void)0);
16998
16999 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
1. Taking false branch
17000 Subtarget, DAG))
17001 return V;
17002
17003 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
2. Taking false branch
17004 Zeroable, Subtarget, DAG))
17005 return Blend;
17006
17007 // Check for being able to broadcast a single element.
17008 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
3. Taking false branch
17009 Subtarget, DAG))
17010 return Broadcast;
17011
17012 if (V2.isUndef()) {
4. Taking false branch
17013 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17014 // can use lower latency instructions that will operate on both lanes.
17015 SmallVector<int, 2> RepeatedMask;
17016 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17017 SmallVector<int, 4> PSHUFDMask;
17018 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17019 return DAG.getBitcast(
17020 MVT::v4i64,
17021 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17022 DAG.getBitcast(MVT::v8i32, V1),
17023 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17024 }
17025
17026 // AVX2 provides a direct instruction for permuting a single input across
17027 // lanes.
17028 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17029 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17030 }
17031
17032 // Try to use shift instructions.
17033 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
5. Taking false branch
17034 Zeroable, Subtarget, DAG))
17035 return Shift;
17036
17037 // If we have VLX support, we can use VALIGN or VEXPAND.
17038 if (Subtarget.hasVLX()) {
6. Taking false branch
17039 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17040 Subtarget, DAG))
17041 return Rotate;
17042
17043 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17044 DAG, Subtarget))
17045 return V;
17046 }
17047
17048 // Try to use PALIGNR.
17049 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
7. Taking false branch
17050 Subtarget, DAG))
17051 return Rotate;
17052
17053 // Use dedicated unpack instructions for masks that match their pattern.
17054 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
8. Taking false branch
17055 return V;
17056
17057 // If we have one input in place, then we can permute the other input and
17058 // blend the result.
17059 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
9. Taking false branch
17060 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17061 Subtarget, DAG);
17062
17063 // Try to create an in-lane repeating shuffle mask and then shuffle the
17064 // results into the target lanes.
17065 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
10. Taking false branch
17066 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17067 return V;
17068
17069 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17070 // shuffle. However, if we have AVX2 and either input is already in place,
17071 // we will be able to shuffle the other input even across lanes in a single
17072 // instruction, so skip this pattern.
17073 if (!isShuffleMaskInputInPlace(0, Mask) &&
11. Taking true branch
17074 !isShuffleMaskInputInPlace(1, Mask))
17075 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
12. Calling 'lowerShuffleAsLanePermuteAndRepeatedMask'
17076 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17077 return Result;
17078
17079 // Otherwise fall back on generic blend lowering.
17080 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17081 Subtarget, DAG);
17082}
17083
17084/// Handle lowering of 8-lane 32-bit floating point shuffles.
17085///
17086/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17087/// isn't available.
17088static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17089 const APInt &Zeroable, SDValue V1, SDValue V2,
17090 const X86Subtarget &Subtarget,
17091 SelectionDAG &DAG) {
17092 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")((void)0);
17093 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!")((void)0);
17094 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
17095
17096 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17097 Zeroable, Subtarget, DAG))
17098 return Blend;
17099
17100 // Check for being able to broadcast a single element.
17101 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17102 Subtarget, DAG))
17103 return Broadcast;
17104
17105 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17106 // options to efficiently lower the shuffle.
17107 SmallVector<int, 4> RepeatedMask;
17108 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17109 assert(RepeatedMask.size() == 4 &&((void)0)
17110 "Repeated masks must be half the mask width!")((void)0);
17111
17112 // Use even/odd duplicate instructions for masks that match their pattern.
17113 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17114 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17115 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17116 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17117
17118 if (V2.isUndef())
17119 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17120 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17121
17122 // Use dedicated unpack instructions for masks that match their pattern.
17123 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17124 return V;
17125
17126 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17127 // have already handled any direct blends.
17128 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17129 }
17130
17131 // Try to create an in-lane repeating shuffle mask and then shuffle the
17132 // results into the target lanes.
17133 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17134 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17135 return V;
17136
17137 // If we have a single input shuffle with different shuffle patterns in the
17138 // two 128-bit lanes use the variable mask to VPERMILPS.
17139 if (V2.isUndef()) {
17140 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17141 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17142 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17143 }
17144 if (Subtarget.hasAVX2()) {
17145 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17146 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17147 }
17148 // Otherwise, fall back.
17149 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17150 DAG, Subtarget);
17151 }
17152
17153 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17154 // shuffle.
17155 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17156 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17157 return Result;
17158
17159 // If we have VLX support, we can use VEXPAND.
17160 if (Subtarget.hasVLX())
17161 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17162 DAG, Subtarget))
17163 return V;
17164
17165 // For non-AVX512, if the mask works on 16-bit elements within each lane then
17166 // try to split, since after splitting we get more efficient code using
17167 // vpunpcklwd and vpunpckhwd instructions than with vblend.
17168 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17169 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17170 DAG);
17171
17172 // If we have AVX2 then we always want to lower with a blend because at v8 we
17173 // can fully permute the elements.
17174 if (Subtarget.hasAVX2())
17175 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17176 Subtarget, DAG);
17177
17178 // Otherwise fall back on generic lowering.
17179 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17180 Subtarget, DAG);
17181}
17182
17183/// Handle lowering of 8-lane 32-bit integer shuffles.
17184///
17185/// This routine is only called when we have AVX2 and thus a reasonable
17186/// instruction set for v8i32 shuffling.
17187static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17188 const APInt &Zeroable, SDValue V1, SDValue V2,
17189 const X86Subtarget &Subtarget,
17190 SelectionDAG &DAG) {
17191 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")((void)0);
17192 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!")((void)0);
17193 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
17194 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!")((void)0);
17195
17196 // Whenever we can lower this as a zext, that instruction is strictly faster
17197 // than any alternative. It also allows us to fold memory operands into the
17198 // shuffle in many cases.
17199 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17200 Zeroable, Subtarget, DAG))
17201 return ZExt;
17202
17203 // For non-AVX512, if the mask works on 16-bit elements within each lane then
17204 // try to split, since after splitting we get more efficient code than vblend
17205 // by using vpunpcklwd and vpunpckhwd instructions.
17206 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17207 !Subtarget.hasAVX512())
17208 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17209 DAG);
17210
17211 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17212 Zeroable, Subtarget, DAG))
17213 return Blend;
17214
17215 // Check for being able to broadcast a single element.
17216 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17217 Subtarget, DAG))
17218 return Broadcast;
17219
17220 // If the shuffle mask is repeated in each 128-bit lane we can use more
17221 // efficient instructions that mirror the shuffles across the two 128-bit
17222 // lanes.
17223 SmallVector<int, 4> RepeatedMask;
17224 bool Is128BitLaneRepeatedShuffle =
17225 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17226 if (Is128BitLaneRepeatedShuffle) {
17227 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((void)0);
17228 if (V2.isUndef())
17229 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17230 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17231
17232 // Use dedicated unpack instructions for masks that match their pattern.
17233 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17234 return V;
17235 }
17236
17237 // Try to use shift instructions.
17238 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17239 Zeroable, Subtarget, DAG))
17240 return Shift;
17241
17242 // If we have VLX support, we can use VALIGN or EXPAND.
17243 if (Subtarget.hasVLX()) {
17244 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17245 Subtarget, DAG))
17246 return Rotate;
17247
17248 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17249 DAG, Subtarget))
17250 return V;
17251 }
17252
17253 // Try to use byte rotation instructions.
17254 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17255 Subtarget, DAG))
17256 return Rotate;
17257
17258 // Try to create an in-lane repeating shuffle mask and then shuffle the
17259 // results into the target lanes.
17260 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17261 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17262 return V;
17263
17264 if (V2.isUndef()) {
17265 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17266 // because that should be faster than the variable permute alternatives.
17267 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17268 return V;
17269
17270 // If the shuffle patterns aren't repeated but it's a single input, directly
17271 // generate a cross-lane VPERMD instruction.
17272 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17273 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17274 }
17275
17276 // Assume that a single SHUFPS is faster than an alternative sequence of
17277 // multiple instructions (even if the CPU has a domain penalty).
17278 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17279 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17280 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17281 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17282 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17283 CastV1, CastV2, DAG);
17284 return DAG.getBitcast(MVT::v8i32, ShufPS);
17285 }
17286
17287 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17288 // shuffle.
17289 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17290 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17291 return Result;
17292
17293 // Otherwise fall back on generic blend lowering.
17294 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17295 Subtarget, DAG);
17296}
17297
17298/// Handle lowering of 16-lane 16-bit integer shuffles.
17299///
17300/// This routine is only called when we have AVX2 and thus a reasonable
17301/// instruction set for v16i16 shuffling.
17302static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17303 const APInt &Zeroable, SDValue V1, SDValue V2,
17304 const X86Subtarget &Subtarget,
17305 SelectionDAG &DAG) {
17306 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")((void)0);
17307 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!")((void)0);
17308 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((void)0);
17309 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!")((void)0);
17310
17311 // Whenever we can lower this as a zext, that instruction is strictly faster
17312 // than any alternative. It also allows us to fold memory operands into the
17313 // shuffle in many cases.
17314 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17315 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17316 return ZExt;
17317
17318 // Check for being able to broadcast a single element.
17319 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17320 Subtarget, DAG))
17321 return Broadcast;
17322
17323 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17324 Zeroable, Subtarget, DAG))
17325 return Blend;
17326
17327 // Use dedicated unpack instructions for masks that match their pattern.
17328 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17329 return V;
17330
17331 // Use dedicated pack instructions for masks that match their pattern.
17332 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17333 Subtarget))
17334 return V;
17335
17336 // Try to lower using a truncation.
17337 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17338 Subtarget, DAG))
17339 return V;
17340
17341 // Try to use shift instructions.
17342 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17343 Zeroable, Subtarget, DAG))
17344 return Shift;
17345
17346 // Try to use byte rotation instructions.
17347 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17348 Subtarget, DAG))
17349 return Rotate;
17350
17351 // Try to create an in-lane repeating shuffle mask and then shuffle the
17352 // results into the target lanes.
17353 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17354 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17355 return V;
17356
17357 if (V2.isUndef()) {
17358 // Try to use bit rotation instructions.
17359 if (SDValue Rotate =
17360 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17361 return Rotate;
17362
17363 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17364 // because that should be faster than the variable permute alternatives.
17365 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17366 return V;
17367
17368 // There are no generalized cross-lane shuffle operations available on i16
17369 // element types.
17370 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17371 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17372 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17373 return V;
17374
17375 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17376 DAG, Subtarget);
17377 }
17378
17379 SmallVector<int, 8> RepeatedMask;
17380 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17381 // As this is a single-input shuffle, the repeated mask should be
17382 // a strictly valid v8i16 mask that we can pass through to the v8i16
17383 // lowering to handle even the v16 case.
17384 return lowerV8I16GeneralSingleInputShuffle(
17385 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17386 }
17387 }
17388
17389 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17390 Zeroable, Subtarget, DAG))
17391 return PSHUFB;
17392
17393 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17394 if (Subtarget.hasBWI())
17395 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17396
17397 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17398 // shuffle.
17399 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17400 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17401 return Result;
17402
17403 // Try to permute the lanes and then use a per-lane permute.
17404 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17405 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17406 return V;
17407
17408 // Otherwise fall back on generic lowering.
17409 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17410 Subtarget, DAG);
17411}
17412
17413/// Handle lowering of 32-lane 8-bit integer shuffles.
17414///
17415/// This routine is only called when we have AVX2 and thus a reasonable
17416/// instruction set for v32i8 shuffling.
17417static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17418 const APInt &Zeroable, SDValue V1, SDValue V2,
17419 const X86Subtarget &Subtarget,
17420 SelectionDAG &DAG) {
17421 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")((void)0);
17422 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!")((void)0);
17423 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")((void)0);
17424 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!")((void)0);
17425
17426 // Whenever we can lower this as a zext, that instruction is strictly faster
17427 // than any alternative. It also allows us to fold memory operands into the
17428 // shuffle in many cases.
17429 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17430 Zeroable, Subtarget, DAG))
17431 return ZExt;
17432
17433 // Check for being able to broadcast a single element.
17434 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17435 Subtarget, DAG))
17436 return Broadcast;
17437
17438 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17439 Zeroable, Subtarget, DAG))
17440 return Blend;
17441
17442 // Use dedicated unpack instructions for masks that match their pattern.
17443 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
17444 return V;
17445
17446 // Use dedicated pack instructions for masks that match their pattern.
17447 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
17448 Subtarget))
17449 return V;
17450
17451 // Try to lower using a truncation.
17452 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17453 Subtarget, DAG))
17454 return V;
17455
17456 // Try to use shift instructions.
17457 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
17458 Zeroable, Subtarget, DAG))
17459 return Shift;
17460
17461 // Try to use byte rotation instructions.
17462 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17463 Subtarget, DAG))
17464 return Rotate;
17465
17466 // Try to use bit rotation instructions.
17467 if (V2.isUndef())
17468 if (SDValue Rotate =
17469 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17470 return Rotate;
17471
17472 // Try to create an in-lane repeating shuffle mask and then shuffle the
17473 // results into the target lanes.
17474 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17475 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17476 return V;
17477
17478 // There are no generalized cross-lane shuffle operations available on i8
17479 // element types.
17480 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17481 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17482 // because that should be faster than the variable permute alternatives.
17483 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
17484 return V;
17485
17486 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17487 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17488 return V;
17489
17490 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17491 DAG, Subtarget);
17492 }
17493
17494 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17495 Zeroable, Subtarget, DAG))
17496 return PSHUFB;
17497
17498 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17499 if (Subtarget.hasVBMI())
17500 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17501
17502 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17503 // shuffle.
17504 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17505 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17506 return Result;
17507
17508 // Try to permute the lanes and then use a per-lane permute.
17509 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17510 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17511 return V;
17512
17513 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17514 // by zeroable elements in the remaining 24 elements. Turn this into two
17515 // vmovqb instructions shuffled together.
17516 if (Subtarget.hasVLX())
17517 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17518 Mask, Zeroable, DAG))
17519 return V;
17520
17521 // Otherwise fall back on generic lowering.
17522 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
17523 Subtarget, DAG);
17524}
17525
17526/// High-level routine to lower various 256-bit x86 vector shuffles.
17527///
17528/// This routine either breaks down the specific type of a 256-bit x86 vector
17529/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17530/// together based on the available instructions.
17531static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17532 SDValue V1, SDValue V2, const APInt &Zeroable,
17533 const X86Subtarget &Subtarget,
17534 SelectionDAG &DAG) {
17535 // If we have a single input to the zero element, insert that into V1 if we
17536 // can do so cheaply.
17537 int NumElts = VT.getVectorNumElements();
17538 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17539
17540 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17541 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17542 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17543 return Insertion;
17544
17545 // Handle special cases where the lower or upper half is UNDEF.
17546 if (SDValue V =
17547 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17548 return V;
17549
17550 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17551 // can check for those subtargets here and avoid much of the subtarget
17552 // querying in the per-vector-type lowering routines. With AVX1 we have
17553 // essentially *zero* ability to manipulate a 256-bit vector with integer
17554 // types. Since we'll use floating point types there eventually, just
17555 // immediately cast everything to a float and operate entirely in that domain.
17556 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17557 int ElementBits = VT.getScalarSizeInBits();
17558 if (ElementBits < 32) {
17559 // No floating point type available, if we can't use the bit operations
17560 // for masking/blending then decompose into 128-bit vectors.
17561 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17562 Subtarget, DAG))
17563 return V;
17564 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17565 return V;
17566 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17567 }
17568
17569 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17570 VT.getVectorNumElements());
17571 V1 = DAG.getBitcast(FpVT, V1);
17572 V2 = DAG.getBitcast(FpVT, V2);
17573 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17574 }
17575
17576 switch (VT.SimpleTy) {
17577 case MVT::v4f64:
17578 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17579 case MVT::v4i64:
17580 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17581 case MVT::v8f32:
17582 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17583 case MVT::v8i32:
17584 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17585 case MVT::v16i16:
17586 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17587 case MVT::v32i8:
17588 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17589
17590 default:
17591 llvm_unreachable("Not a valid 256-bit x86 vector type!")__builtin_unreachable();
17592 }
17593}
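
// ---- Editorial sketch (not part of X86ISelLowering.cpp) ----
// The AVX1 fallback above, specialised to v8i32 so the bitcast dance is
// concrete: 32-bit integer elements have a float counterpart, so the shuffle
// is done in the FP domain and the result is cast back. The helper name is
// hypothetical and exists only for illustration.
static SDValue exampleLowerV8I32OnAVX1(const SDLoc &DL, SDValue V1, SDValue V2,
                                       ArrayRef<int> Mask, SelectionDAG &DAG) {
  SDValue F1 = DAG.getBitcast(MVT::v8f32, V1);   // reinterpret as floats
  SDValue F2 = DAG.getBitcast(MVT::v8f32, V2);
  SDValue Shuf = DAG.getVectorShuffle(MVT::v8f32, DL, F1, F2, Mask);
  return DAG.getBitcast(MVT::v8i32, Shuf);       // back to the integer type
}
// Sub-32-bit element types (v16i16, v32i8) have no FP counterpart, which is
// why the code above splits them into 128-bit halves instead.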
17594
17595 /// Try to lower a vector shuffle as 128-bit shuffles.
17596static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17597 const APInt &Zeroable, SDValue V1, SDValue V2,
17598 const X86Subtarget &Subtarget,
17599 SelectionDAG &DAG) {
17600 assert(VT.getScalarSizeInBits() == 64 &&((void)0)
17601 "Unexpected element type size for 128bit shuffle.")((void)0);
17602
17603 // Handling 256-bit vectors requires VLX, and for those the function
17604 // lowerV2X128VectorShuffle() is most probably the better solution.
17605 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.")((void)0);
17606
17607 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17608 SmallVector<int, 4> Widened128Mask;
17609 if (!canWidenShuffleElements(Mask, Widened128Mask))
17610 return SDValue();
17611 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch")((void)0);
17612
17613 // Try to use an insert into a zero vector.
17614 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17615 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17616 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17617 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17618 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17619 DAG.getIntPtrConstant(0, DL));
17620 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17621 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17622 DAG.getIntPtrConstant(0, DL));
17623 }
17624
17625 // Check for patterns which can be matched with a single insert of a 256-bit
17626 // subvector.
17627 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17628 if (OnlyUsesV1 ||
17629 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17630 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17631 SDValue SubVec =
17632 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17633 DAG.getIntPtrConstant(0, DL));
17634 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17635 DAG.getIntPtrConstant(4, DL));
17636 }
17637
17638 // See if this is an insertion of the lower 128-bits of V2 into V1.
17639 bool IsInsert = true;
17640 int V2Index = -1;
17641 for (int i = 0; i < 4; ++i) {
17642 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")((void)0);
17643 if (Widened128Mask[i] < 0)
17644 continue;
17645
17646 // Make sure all V1 subvectors are in place.
17647 if (Widened128Mask[i] < 4) {
17648 if (Widened128Mask[i] != i) {
17649 IsInsert = false;
17650 break;
17651 }
17652 } else {
17653 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17654 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17655 IsInsert = false;
17656 break;
17657 }
17658 V2Index = i;
17659 }
17660 }
17661 if (IsInsert && V2Index >= 0) {
17662 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17663 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17664 DAG.getIntPtrConstant(0, DL));
17665 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17666 }
17667
17668 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17669 // per-128-bit-lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17670 // widening where possible we at least ensure the lanes stay sequential to
17671 // help later combines.
17672 SmallVector<int, 2> Widened256Mask;
17673 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17674 Widened128Mask.clear();
17675 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17676 }
17677
17678 // Try to lower to vshuf64x2/vshuf32x4.
17679 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17680 unsigned PermMask = 0;
17681 // Ensure elements came from the same Op.
17682 for (int i = 0; i < 4; ++i) {
17683 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value")((void)0);
17684 if (Widened128Mask[i] < 0)
17685 continue;
17686
17687 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17688 unsigned OpIndex = i / 2;
17689 if (Ops[OpIndex].isUndef())
17690 Ops[OpIndex] = Op;
17691 else if (Ops[OpIndex] != Op)
17692 return SDValue();
17693
17694 // Convert the 128-bit shuffle mask selection values into 128-bit selection
17695 // bits defined by a vshuf64x2 instruction's immediate control byte.
17696 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
17697 }
17698
17699 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17700 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17701}
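
// ---- Editorial sketch (not part of X86ISelLowering.cpp) ----
// How the vshuf64x2/vshuf32x4 immediate is assembled from the widened 128-bit
// mask in the loop above: each destination 128-bit lane i gets a two-bit
// selector in bits [2*i+1 : 2*i]. The helper name is hypothetical.
static unsigned exampleBuildShuf128Imm(ArrayRef<int> Widened128Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    if (Widened128Mask[i] >= 0)
      Imm |= (Widened128Mask[i] % 4) << (i * 2); // lane index within its source
  return Imm;
}
// Example: {0, 1, 4, 5} (low half of V1, then low half of V2) encodes as
// 0 | (1 << 2) | (0 << 4) | (1 << 6) == 0x44.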
17702
17703/// Handle lowering of 8-lane 64-bit floating point shuffles.
17704static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17705 const APInt &Zeroable, SDValue V1, SDValue V2,
17706 const X86Subtarget &Subtarget,
17707 SelectionDAG &DAG) {
17708 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")((void)0);
17709 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!")((void)0);
17710 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
17711
17712 if (V2.isUndef()) {
17713 // Use low duplicate instructions for masks that match their pattern.
17714 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17715 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17716
17717 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17718 // Non-half-crossing single input shuffles can be lowered with an
17719 // interleaved permutation.
17720 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17721 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17722 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17723 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17724 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17725 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17726 }
17727
17728 SmallVector<int, 4> RepeatedMask;
17729 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17730 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17731 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17732 }
17733
17734 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17735 V2, Subtarget, DAG))
17736 return Shuf128;
17737
17738 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
17739 return Unpck;
17740
17741 // Check if the blend happens to exactly fit that of SHUFPD.
17742 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17743 Zeroable, Subtarget, DAG))
17744 return Op;
17745
17746 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
17747 DAG, Subtarget))
17748 return V;
17749
17750 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17751 Zeroable, Subtarget, DAG))
17752 return Blend;
17753
17754 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17755}
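
// ---- Editorial sketch (not part of X86ISelLowering.cpp) ----
// The VPERMILPD immediate built explicitly in lowerV8F64Shuffle above: for a
// non-lane-crossing unary v8f64 mask, bit i is set when destination element i
// takes the odd element of its 128-bit pair. The helper name is hypothetical
// and is equivalent to the bit-by-bit expression in the function.
static unsigned exampleBuildV8F64PermilImm(ArrayRef<int> Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 8; ++i)
    if (Mask[i] == 2 * (i / 2) + 1)  // picked the odd element of the pair
      Imm |= 1u << i;
  return Imm;
}
// Example: {1, 1, 2, 2, 4, 4, 7, 7} sets bits 0, 1, 6, 7 and yields 0xC3.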
17756
17757/// Handle lowering of 16-lane 32-bit floating point shuffles.
17758static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17759 const APInt &Zeroable, SDValue V1, SDValue V2,
17760 const X86Subtarget &Subtarget,
17761 SelectionDAG &DAG) {
17762 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")((void)0);
17763 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!")((void)0);
17764 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((void)0);
17765
17766 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17767 // options to efficiently lower the shuffle.
17768 SmallVector<int, 4> RepeatedMask;
17769 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17770 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((void)0);
17771
17772 // Use even/odd duplicate instructions for masks that match their pattern.
17773 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17774 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17775 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17776 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17777
17778 if (V2.isUndef())
17779 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17780 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17781
17782 // Use dedicated unpack instructions for masks that match their pattern.
17783 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
17784 return V;
17785
17786 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17787 Zeroable, Subtarget, DAG))
17788 return Blend;
17789
17790 // Otherwise, fall back to a SHUFPS sequence.
17791 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17792 }
17793
17794 // Try to create an in-lane repeating shuffle mask and then shuffle the
17795 // results into the target lanes.
17796 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17797 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17798 return V;
17799
17800 // If we have a single-input shuffle with different shuffle patterns in the
17801 // 128-bit lanes that doesn't cross lanes, use a variable-mask VPERMILPS.
17802 if (V2.isUndef() &&
17803 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17804 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17805 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17806 }
17807
17808 // If we have AVX512F support, we can use VEXPAND.
17809 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
17810 V1, V2, DAG, Subtarget))
17811 return V;
17812
17813 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17814}
17815
17816/// Handle lowering of 8-lane 64-bit integer shuffles.
17817static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17818 const APInt &Zeroable, SDValue V1, SDValue V2,
17819 const X86Subtarget &Subtarget,
17820 SelectionDAG &DAG) {
17821 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")((void)0);
17822 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!")((void)0);
17823 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!")((void)0);
17824
17825 if (V2.isUndef()) {
17826 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17827 // can use lower latency instructions that will operate on all four
17828 // 128-bit lanes.
17829 SmallVector<int, 2> Repeated128Mask;
17830 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17831 SmallVector<int, 4> PSHUFDMask;
17832 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17833 return DAG.getBitcast(
17834 MVT::v8i64,
17835 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17836 DAG.getBitcast(MVT::v16i32, V1),
17837 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17838 }
17839
17840 SmallVector<int, 4> Repeated256Mask;
17841 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17842 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17843 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17844 }
17845
17846 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17847 V2, Subtarget, DAG))
17848 return Shuf128;
17849
17850 // Try to use shift instructions.
17851 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
17852 Zeroable, Subtarget, DAG))
17853 return Shift;
17854
17855 // Try to use VALIGN.
17856 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17857 Subtarget, DAG))
17858 return Rotate;
17859
17860 // Try to use PALIGNR.
17861 if (Subtarget.hasBWI())
17862 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17863 Subtarget, DAG))
17864 return Rotate;
17865
17866 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
17867 return Unpck;
17868
17869 // If we have AVX512F support, we can use VEXPAND.
17870 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
17871 DAG, Subtarget))
17872 return V;
17873
17874 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17875 Zeroable, Subtarget, DAG))
17876 return Blend;
17877
17878 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17879}
17880
17881/// Handle lowering of 16-lane 32-bit integer shuffles.
17882static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17883 const APInt &Zeroable, SDValue V1, SDValue V2,
17884 const X86Subtarget &Subtarget,
17885 SelectionDAG &DAG) {
17886 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")((void)0);
17887 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!")((void)0);
17888 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!")((void)0);
17889
17890 // Whenever we can lower this as a zext, that instruction is strictly faster
17891 // than any alternative. It also allows us to fold memory operands into the
17892 // shuffle in many cases.
17893 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17894 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17895 return ZExt;
17896
17897 // If the shuffle mask is repeated in each 128-bit lane we can use more
17898 // efficient instructions that mirror the shuffles across the four 128-bit
17899 // lanes.
17900 SmallVector<int, 4> RepeatedMask;
17901 bool Is128BitLaneRepeatedShuffle =
17902 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17903 if (Is128BitLaneRepeatedShuffle) {
17904 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!")((void)0);
17905 if (V2.isUndef())
17906 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17907 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17908
17909 // Use dedicated unpack instructions for masks that match their pattern.
17910 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17911 return V;
17912 }
17913
17914 // Try to use shift instructions.
17915 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
17916 Zeroable, Subtarget, DAG))
17917 return Shift;
17918
17919 // Try to use VALIGN.
17920 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17921 Subtarget, DAG))
17922 return Rotate;
17923
17924 // Try to use byte rotation instructions.
17925 if (Subtarget.hasBWI())
17926 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17927 Subtarget, DAG))
17928 return Rotate;
17929
17930 // Assume that a single SHUFPS is faster than using a permv shuffle.
17931 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17932 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17933 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17934 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17935 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17936 CastV1, CastV2, DAG);
17937 return DAG.getBitcast(MVT::v16i32, ShufPS);
17938 }
17939
17940 // Try to create an in-lane repeating shuffle mask and then shuffle the
17941 // results into the target lanes.
17942 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17943 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17944 return V;
17945
17946 // If we have AVX512F support, we can use VEXPAND.
17947 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17948 DAG, Subtarget))
17949 return V;
17950
17951 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17952 Zeroable, Subtarget, DAG))
17953 return Blend;
17954
17955 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17956}
17957
17958/// Handle lowering of 32-lane 16-bit integer shuffles.
17959static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17960 const APInt &Zeroable, SDValue V1, SDValue V2,
17961 const X86Subtarget &Subtarget,
17962 SelectionDAG &DAG) {
17963 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")((void)0);
17964 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!")((void)0);
17965 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!")((void)0);
17966 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!")((void)0);
17967
17968 // Whenever we can lower this as a zext, that instruction is strictly faster
17969 // than any alternative. It also allows us to fold memory operands into the
17970 // shuffle in many cases.
17971 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17972 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17973 return ZExt;
17974
17975 // Use dedicated unpack instructions for masks that match their pattern.
17976 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17977 return V;
17978
17979 // Use dedicated pack instructions for masks that match their pattern.
17980 if (SDValue V =
17981 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17982 return V;
17983
17984 // Try to use shift instructions.
17985 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17986 Zeroable, Subtarget, DAG))
17987 return Shift;
17988
17989 // Try to use byte rotation instructions.
17990 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17991 Subtarget, DAG))
17992 return Rotate;
17993
17994 if (V2.isUndef()) {
17995 // Try to use bit rotation instructions.
17996 if (SDValue Rotate =
17997 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17998 return Rotate;
17999
18000 SmallVector<int, 8> RepeatedMask;
18001 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18002 // As this is a single-input shuffle, the repeated mask should be
18003 // a strictly valid v8i16 mask that we can pass through to the v8i16
18004 // lowering to handle even the v32 case.
18005 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18006 RepeatedMask, Subtarget, DAG);
18007 }
18008 }
18009
18010 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18011 Zeroable, Subtarget, DAG))
18012 return Blend;
18013
18014 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18015 Zeroable, Subtarget, DAG))
18016 return PSHUFB;
18017
18018 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18019}
18020
18021/// Handle lowering of 64-lane 8-bit integer shuffles.
18022static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18023 const APInt &Zeroable, SDValue V1, SDValue V2,
18024 const X86Subtarget &Subtarget,
18025 SelectionDAG &DAG) {
18026 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")((void)0);
18027 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!")((void)0);
18028 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!")((void)0);
18029 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!")((void)0);
18030
18031 // Whenever we can lower this as a zext, that instruction is strictly faster
18032 // than any alternative. It also allows us to fold memory operands into the
18033 // shuffle in many cases.
18034 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18035 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18036 return ZExt;
18037
18038 // Use dedicated unpack instructions for masks that match their pattern.
18039 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
18040 return V;
18041
18042 // Use dedicated pack instructions for masks that match their pattern.
18043 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
18044 Subtarget))
18045 return V;
18046
18047 // Try to use shift instructions.
18048 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
18049 Zeroable, Subtarget, DAG))
18050 return Shift;
18051
18052 // Try to use byte rotation instructions.
18053 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
18054 Subtarget, DAG))
18055 return Rotate;
18056
18057 // Try to use bit rotation instructions.
18058 if (V2.isUndef())
18059 if (SDValue Rotate =
18060 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18061 return Rotate;
18062
18063 // Lower as AND if possible.
18064 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
18065 Zeroable, Subtarget, DAG))
18066 return Masked;
18067
18068 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
18069 Zeroable, Subtarget, DAG))
18070 return PSHUFB;
18071
18072 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18073 if (Subtarget.hasVBMI())
18074 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18075
18076 // Try to create an in-lane repeating shuffle mask and then shuffle the
18077 // results into the target lanes.
18078 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18079 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18080 return V;
18081
18082 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
18083 Zeroable, Subtarget, DAG))
18084 return Blend;
18085
18086 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18087 // shuffle.
18088 if (!V2.isUndef())
18089 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18090 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18091 return Result;
18092
18093 // FIXME: Implement direct support for this type!
18094 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18095}
18096
18097/// High-level routine to lower various 512-bit x86 vector shuffles.
18098///
18099/// This routine either breaks down the specific type of a 512-bit x86 vector
18100/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18101/// together based on the available instructions.
18102static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18103 MVT VT, SDValue V1, SDValue V2,
18104 const APInt &Zeroable,
18105 const X86Subtarget &Subtarget,
18106 SelectionDAG &DAG) {
18107 assert(Subtarget.hasAVX512() &&((void)0)
18108 "Cannot lower 512-bit vectors w/ basic ISA!")((void)0);
18109
18110 // If we have a single input to the zero element, insert that into V1 if we
18111 // can do so cheaply.
18112 int NumElts = Mask.size();
18113 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18114
18115 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18116 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18117 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18118 return Insertion;
18119
18120 // Handle special cases where the lower or upper half is UNDEF.
18121 if (SDValue V =
18122 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18123 return V;
18124
18125 // Check for being able to broadcast a single element.
18126 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18127 Subtarget, DAG))
18128 return Broadcast;
18129
18130 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18131 // Try using bit ops for masking and blending before falling back to
18132 // splitting.
18133 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18134 Subtarget, DAG))
18135 return V;
18136 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18137 return V;
18138
18139 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18140 }
18141
18142 // Dispatch to each element type for lowering. If we don't have support for
18143 // specific element type shuffles at 512 bits, immediately split them and
18144 // lower them. Each lowering routine of a given type is allowed to assume that
18145 // the requisite ISA extensions for that element type are available.
18146 switch (VT.SimpleTy) {
18147 case MVT::v8f64:
18148 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18149 case MVT::v16f32:
18150 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18151 case MVT::v8i64:
18152 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18153 case MVT::v16i32:
18154 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18155 case MVT::v32i16:
18156 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18157 case MVT::v64i8:
18158 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18159
18160 default:
18161 llvm_unreachable("Not a valid 512-bit x86 vector type!")__builtin_unreachable();
18162 }
18163}
18164
18165static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18166 MVT VT, SDValue V1, SDValue V2,
18167 const X86Subtarget &Subtarget,
18168 SelectionDAG &DAG) {
18169 // Shuffle should be unary.
18170 if (!V2.isUndef())
18171 return SDValue();
18172
18173 int ShiftAmt = -1;
18174 int NumElts = Mask.size();
18175 for (int i = 0; i != NumElts; ++i) {
18176 int M = Mask[i];
18177 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&((void)0)
18178 "Unexpected mask index.")((void)0);
18179 if (M < 0)
18180 continue;
18181
18182 // The first non-undef element determines our shift amount.
18183 if (ShiftAmt < 0) {
18184 ShiftAmt = M - i;
18185 // Need to be shifting right.
18186 if (ShiftAmt <= 0)
18187 return SDValue();
18188 }
18189 // All non-undef elements must shift by the same amount.
18190 if (ShiftAmt != M - i)
18191 return SDValue();
18192 }
18193 assert(ShiftAmt >= 0 && "All undef?")((void)0);
18194
18195 // Great, we found a right shift.
18196 MVT WideVT = VT;
18197 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18198 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18199 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18200 DAG.getUNDEF(WideVT), V1,
18201 DAG.getIntPtrConstant(0, DL));
18202 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18203 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18204 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18205 DAG.getIntPtrConstant(0, DL));
18206}
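
// ---- Editorial sketch (not part of X86ISelLowering.cpp) ----
// The shift amount used by lower1BitShuffleAsKSHIFTR above is simply
// Mask[i] - i; it must be positive and identical for every non-undef element.
// This hypothetical standalone matcher returns -1 when no such shift exists.
static int exampleMatchMaskAsKShiftRight(ArrayRef<int> Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue;                                   // undef doesn't constrain
    int Amt = Mask[i] - i;
    if (Amt <= 0 || (ShiftAmt >= 0 && Amt != ShiftAmt))
      return -1;                                  // not a consistent right shift
    ShiftAmt = Amt;
  }
  return ShiftAmt;                                // -1 if the mask is all-undef
}
// Example: a v8i1 mask {2, 3, 4, 5, 6, 7, -1, -1} matches with ShiftAmt == 2.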
18207
18208// Determine if this shuffle can be implemented with a KSHIFT instruction.
18209// Returns the shift amount if possible or -1 if not. This is a simplified
18210// version of matchShuffleAsShift.
18211static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18212 int MaskOffset, const APInt &Zeroable) {
18213 int Size = Mask.size();
18214
18215 auto CheckZeros = [&](int Shift, bool Left) {
18216 for (int j = 0; j < Shift; ++j)
18217 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18218 return false;
18219
18220 return true;
18221 };
18222
18223 auto MatchShift = [&](int Shift, bool Left) {
18224 unsigned Pos = Left ? Shift : 0;
18225 unsigned Low = Left ? 0 : Shift;
18226 unsigned Len = Size - Shift;
18227 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18228 };
18229
18230 for (int Shift = 1; Shift != Size; ++Shift)
18231 for (bool Left : {true, false})
18232 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18233 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18234 return Shift;
18235 }
18236
18237 return -1;
18238}
18239
18240
18241// Lower vXi1 vector shuffles.
18242 // There is no dedicated instruction on AVX-512 that shuffles the masks.
18243 // The only way to shuffle the bits is to sign-extend the mask vector to a
18244 // SIMD vector, shuffle that, and then truncate it back.
18245static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18246 MVT VT, SDValue V1, SDValue V2,
18247 const APInt &Zeroable,
18248 const X86Subtarget &Subtarget,
18249 SelectionDAG &DAG) {
18250 assert(Subtarget.hasAVX512() &&((void)0)
18251 "Cannot lower 512-bit vectors w/o basic ISA!")((void)0);
18252
18253 int NumElts = Mask.size();
18254
18255 // Try to recognize shuffles that are just padding a subvector with zeros.
18256 int SubvecElts = 0;
18257 int Src = -1;
18258 for (int i = 0; i != NumElts; ++i) {
18259 if (Mask[i] >= 0) {
18260 // Grab the source from the first valid mask. All subsequent elements need
18261 // to use this same source.
18262 if (Src < 0)
18263 Src = Mask[i] / NumElts;
18264 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18265 break;
18266 }
18267
18268 ++SubvecElts;
18269 }
18270 assert(SubvecElts != NumElts && "Identity shuffle?")((void)0);
18271
18272 // Clip to a power of 2.
18273 SubvecElts = PowerOf2Floor(SubvecElts);
18274
18275 // Make sure the number of zeroable bits in the top at least covers the bits
18276 // not covered by the subvector.
18277 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18278 assert(Src >= 0 && "Expected a source!")((void)0);
18279 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18280 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18281 Src == 0 ? V1 : V2,
18282 DAG.getIntPtrConstant(0, DL));
18283 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18284 DAG.getConstant(0, DL, VT),
18285 Extract, DAG.getIntPtrConstant(0, DL));
18286 }
18287
18288 // Try a simple shift right with undef elements. Later we'll try with zeros.
18289 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18290 DAG))
18291 return Shift;
18292
18293 // Try to match KSHIFTs.
18294 unsigned Offset = 0;
18295 for (SDValue V : { V1, V2 }) {
18296 unsigned Opcode;
18297 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18298 if (ShiftAmt >= 0) {
18299 MVT WideVT = VT;
18300 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18301 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18302 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18303 DAG.getUNDEF(WideVT), V,
18304 DAG.getIntPtrConstant(0, DL));
18305 // Widened right shifts need two shifts to ensure we shift in zeroes.
18306 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18307 int WideElts = WideVT.getVectorNumElements();
18308 // Shift left to put the original vector in the MSBs of the new size.
18309 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18310 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18311 // Increase the shift amount to account for the left shift.
18312 ShiftAmt += WideElts - NumElts;
18313 }
18314
18315 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18316 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18317 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18318 DAG.getIntPtrConstant(0, DL));
18319 }
18320 Offset += NumElts; // Increment for next iteration.
18321 }
18322
18323
18324
18325 MVT ExtVT;
18326 switch (VT.SimpleTy) {
18327 default:
18328 llvm_unreachable("Expected a vector of i1 elements")__builtin_unreachable();
18329 case MVT::v2i1:
18330 ExtVT = MVT::v2i64;
18331 break;
18332 case MVT::v4i1:
18333 ExtVT = MVT::v4i32;
18334 break;
18335 case MVT::v8i1:
18336 // Take a 512-bit type, since more shuffles are available on KNL. If we have
18337 // VLX, use a 256-bit shuffle.
18338 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18339 break;
18340 case MVT::v16i1:
18341 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18342 // 256-bit operation available.
18343 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18344 break;
18345 case MVT::v32i1:
18346 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18347 // 256-bit operation available.
18348 assert(Subtarget.hasBWI() && "Expected AVX512BW support")((void)0);
18349 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18350 break;
18351 case MVT::v64i1:
18352 // Fall back to scalarization. FIXME: We can do better if the shuffle
18353 // can be partitioned cleanly.
18354 if (!Subtarget.useBWIRegs())
18355 return SDValue();
18356 ExtVT = MVT::v64i8;
18357 break;
18358 }
18359
18360 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18361 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18362
18363 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18364 // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
18365 int NumElems = VT.getVectorNumElements();
18366 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18367 (Subtarget.hasDQI() && (NumElems < 32)))
18368 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18369 Shuffle, ISD::SETGT);
18370
18371 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18372}
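
// ---- Editorial sketch (not part of X86ISelLowering.cpp) ----
// The sign-extend / shuffle / truncate round trip described at the top of
// lower1BitShuffle, specialised to v16i1 on plain AVX512F (no VLX/DQI) so the
// types are concrete. The helper name is hypothetical. With DQI or BWI the
// code above prefers a compare-against-zero over the final truncate.
static SDValue exampleLowerV16I1Shuffle(const SDLoc &DL, SDValue V1, SDValue V2,
                                        ArrayRef<int> Mask, SelectionDAG &DAG) {
  // Widen each mask bit to a full 32-bit lane (all-ones or all-zeros).
  SDValue E1 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v16i32, V1);
  SDValue E2 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v16i32, V2);
  SDValue Shuf = DAG.getVectorShuffle(MVT::v16i32, DL, E1, E2, Mask);
  // Truncating the lanes back to i1 recovers the shuffled mask.
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i1, Shuf);
}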
18373
18374/// Helper function that returns true if the shuffle mask should be
18375/// commuted to improve canonicalization.
18376static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18377 int NumElements = Mask.size();
18378
18379 int NumV1Elements = 0, NumV2Elements = 0;
18380 for (int M : Mask)
18381 if (M < 0)
18382 continue;
18383 else if (M < NumElements)
18384 ++NumV1Elements;
18385 else
18386 ++NumV2Elements;
18387
18388 // Commute the shuffle as needed such that more elements come from V1 than
18389 // V2. This allows us to match the shuffle pattern strictly on how many
18390 // elements come from V1 without handling the symmetric cases.
18391 if (NumV2Elements > NumV1Elements)
18392 return true;
18393
18394 assert(NumV1Elements > 0 && "No V1 indices")((void)0);
18395
18396 if (NumV2Elements == 0)
18397 return false;
18398
18399 // When the number of V1 and V2 elements are the same, try to minimize the
18400 // number of uses of V2 in the low half of the vector. When that is tied,
18401 // ensure that the sum of indices for V1 is equal to or lower than the sum
18402 // of indices for V2. When those are equal, try to ensure that the number of odd
18403 // indices for V1 is lower than the number of odd indices for V2.
18404 if (NumV1Elements == NumV2Elements) {
18405 int LowV1Elements = 0, LowV2Elements = 0;
18406 for (int M : Mask.slice(0, NumElements / 2))
18407 if (M >= NumElements)
18408 ++LowV2Elements;
18409 else if (M >= 0)
18410 ++LowV1Elements;
18411 if (LowV2Elements > LowV1Elements)
18412 return true;
18413 if (LowV2Elements == LowV1Elements) {
18414 int SumV1Indices = 0, SumV2Indices = 0;
18415 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18416 if (Mask[i] >= NumElements)
18417 SumV2Indices += i;
18418 else if (Mask[i] >= 0)
18419 SumV1Indices += i;
18420 if (SumV2Indices < SumV1Indices)
18421 return true;
18422 if (SumV2Indices == SumV1Indices) {
18423 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18424 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18425 if (Mask[i] >= NumElements)
18426 NumV2OddIndices += i % 2;
18427 else if (Mask[i] >= 0)
18428 NumV1OddIndices += i % 2;
18429 if (NumV2OddIndices < NumV1OddIndices)
18430 return true;
18431 }
18432 }
18433 }
18434
18435 return false;
18436}
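
// ---- Editorial sketch (not part of X86ISelLowering.cpp) ----
// A worked example of the commute heuristic above, written as a hypothetical
// test-style snippet. For {4, 5, 6, 3} three of the four indices refer to V2,
// so commuting wins; the caller swaps V1/V2 and commuteMask() rewrites the
// mask so the majority of indices refer to the (swapped) first operand.
static void exampleCanonicalizeCommute() {
  SmallVector<int, 4> Mask = {4, 5, 6, 3};
  if (canonicalizeShuffleMaskWithCommute(Mask))    // true: 3 V2 vs. 1 V1 index
    ShuffleVectorSDNode::commuteMask(Mask);        // Mask becomes {0, 1, 2, 7}
}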
18437
18438/// Top-level lowering for x86 vector shuffles.
18439///
18440/// This handles decomposition, canonicalization, and lowering of all x86
18441/// vector shuffles. Most of the specific lowering strategies are encapsulated
18442/// above in helper routines. The canonicalization attempts to widen shuffles
18443/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18444/// s.t. only one of the two inputs needs to be tested, etc.
18445static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18446 SelectionDAG &DAG) {
18447 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18448 ArrayRef<int> OrigMask = SVOp->getMask();
18449 SDValue V1 = Op.getOperand(0);
18450 SDValue V2 = Op.getOperand(1);
18451 MVT VT = Op.getSimpleValueType();
18452 int NumElements = VT.getVectorNumElements();
18453 SDLoc DL(Op);
18454 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18455
18456 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&((void)0)
18457 "Can't lower MMX shuffles")((void)0);
18458
18459 bool V1IsUndef = V1.isUndef();
18460 bool V2IsUndef = V2.isUndef();
18461 if (V1IsUndef && V2IsUndef)
18462 return DAG.getUNDEF(VT);
18463
18464 // When we create a shuffle node we put the UNDEF node in the second operand,
18465 // but in some cases the first operand may be transformed to UNDEF.
18466 // In that case we should just commute the node.
18467 if (V1IsUndef)
18468 return DAG.getCommutedVectorShuffle(*SVOp);
18469
18470 // Check for non-undef masks pointing at an undef vector and make the masks
18471 // undef as well. This makes it easier to match the shuffle based solely on
18472 // the mask.
18473 if (V2IsUndef &&
18474 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18475 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
18476 for (int &M : NewMask)
18477 if (M >= NumElements)
18478 M = -1;
18479 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18480 }
18481
18482 // Check for illegal shuffle mask element index values.
18483 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18484 (void)MaskUpperLimit;
18485 assert(llvm::all_of(OrigMask,((void)0)
18486 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&((void)0)
18487 "Out of bounds shuffle index")((void)0);
18488
18489 // We actually see shuffles that are entirely re-arrangements of a set of
18490 // zero inputs. This mostly happens while decomposing complex shuffles into
18491 // simple ones. Directly lower these as a buildvector of zeros.
18492 APInt KnownUndef, KnownZero;
18493 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18494
18495 APInt Zeroable = KnownUndef | KnownZero;
18496 if (Zeroable.isAllOnesValue())
18497 return getZeroVector(VT, Subtarget, DAG, DL);
18498
18499 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18500
18501 // Try to collapse shuffles into using a vector type with fewer elements but
18502 // wider element types. We cap this to not form integers or floating point
18503 // elements wider than 64 bits. It does not seem beneficial to form i128
18504 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18505 SmallVector<int, 16> WidenedMask;
18506 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18507 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18508 // Shuffle mask widening should not interfere with a broadcast opportunity
18509 // by obfuscating the operands with bitcasts.
18510 // TODO: Avoid lowering directly from this top-level function: make this
18511 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18512 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18513 Subtarget, DAG))
18514 return Broadcast;
18515
18516 MVT NewEltVT = VT.isFloatingPoint()
18517 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18518 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18519 int NewNumElts = NumElements / 2;
18520 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18521 // Make sure that the new vector type is legal. For example, v2f64 isn't
18522 // legal on SSE1.
18523 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18524 if (V2IsZero) {
18525 // Modify the new Mask to take all zeros from the all-zero vector.
18526 // Choose indices that are blend-friendly.
18527 bool UsedZeroVector = false;
18528 assert(is_contained(WidenedMask, SM_SentinelZero) &&((void)0)
18529 "V2's non-undef elements are used?!")((void)0);
18530 for (int i = 0; i != NewNumElts; ++i)
18531 if (WidenedMask[i] == SM_SentinelZero) {
18532 WidenedMask[i] = i + NewNumElts;
18533 UsedZeroVector = true;
18534 }
18535 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18536 // some elements to be undef.
18537 if (UsedZeroVector)
18538 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18539 }
18540 V1 = DAG.getBitcast(NewVT, V1);
18541 V2 = DAG.getBitcast(NewVT, V2);
18542 return DAG.getBitcast(
18543 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18544 }
18545 }
18546
18547 // Commute the shuffle if it will improve canonicalization.
18548 SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
18549 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18550 ShuffleVectorSDNode::commuteMask(Mask);
18551 std::swap(V1, V2);
18552 }
18553
18554 // For each vector width, delegate to a specialized lowering routine.
18555 if (VT.is128BitVector())
18556 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18557
18558 if (VT.is256BitVector())
18559 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18560
18561 if (VT.is512BitVector())
18562 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18563
18564 if (Is1BitVector)
18565 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18566
18567 llvm_unreachable("Unimplemented!")__builtin_unreachable();
18568}
18569
18570/// Try to lower a VSELECT instruction to a vector shuffle.
18571static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18572 const X86Subtarget &Subtarget,
18573 SelectionDAG &DAG) {
18574 SDValue Cond = Op.getOperand(0);
18575 SDValue LHS = Op.getOperand(1);
18576 SDValue RHS = Op.getOperand(2);
18577 MVT VT = Op.getSimpleValueType();
18578
18579 // Only non-legal VSELECTs reach this lowering; convert those into generic
18580 // shuffles and re-use the shuffle lowering path for blends.
18581 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18582 SmallVector<int, 32> Mask;
18583 if (createShuffleMaskFromVSELECT(Mask, Cond))
18584 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18585 }
18586
18587 return SDValue();
18588}
18589
18590SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18591 SDValue Cond = Op.getOperand(0);
18592 SDValue LHS = Op.getOperand(1);
18593 SDValue RHS = Op.getOperand(2);
18594
18595 // A vselect where all conditions and data are constants can be optimized into
18596 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18597 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18598 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18599 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18600 return SDValue();
18601
18602 // Try to lower this to a blend-style vector shuffle. This can handle all
18603 // constant condition cases.
18604 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18605 return BlendOp;
18606
18607 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18608 // with patterns on the mask registers on AVX-512.
18609 MVT CondVT = Cond.getSimpleValueType();
18610 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18611 if (CondEltSize == 1)
18612 return Op;
18613
18614 // Variable blends are only legal from SSE4.1 onward.
18615 if (!Subtarget.hasSSE41())
18616 return SDValue();
18617
18618 SDLoc dl(Op);
18619 MVT VT = Op.getSimpleValueType();
18620 unsigned EltSize = VT.getScalarSizeInBits();
18621 unsigned NumElts = VT.getVectorNumElements();
18622
18623 // Expand v32i16/v64i8 without BWI.
18624 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18625 return SDValue();
18626
18627 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18628 // into an i1 condition so that we can use the mask-based 512-bit blend
18629 // instructions.
18630 if (VT.getSizeInBits() == 512) {
18631 // Build a mask by testing the condition against zero.
18632 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18633 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18634 DAG.getConstant(0, dl, CondVT),
18635 ISD::SETNE);
18636 // Now return a new VSELECT using the mask.
18637 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18638 }
18639
18640 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18641 if (CondEltSize != EltSize) {
18642 // If we don't have a sign splat, rely on the expansion.
18643 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18644 return SDValue();
18645
18646 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18647 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18648 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18649 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18650 }
18651
18652 // Only some types will be legal on some subtargets. If we can emit a legal
18653 // VSELECT-matching blend, return Op, but if we need to expand, return
18654 // a null value.
18655 switch (VT.SimpleTy) {
18656 default:
18657 // Most of the vector types have blends past SSE4.1.
18658 return Op;
18659
18660 case MVT::v32i8:
18661 // The byte blends for AVX vectors were introduced only in AVX2.
18662 if (Subtarget.hasAVX2())
18663 return Op;
18664
18665 return SDValue();
18666
18667 case MVT::v8i16:
18668 case MVT::v16i16: {
18669 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18670 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18671 Cond = DAG.getBitcast(CastVT, Cond);
18672 LHS = DAG.getBitcast(CastVT, LHS);
18673 RHS = DAG.getBitcast(CastVT, RHS);
18674 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18675 return DAG.getBitcast(VT, Select);
18676 }
18677 }
18678}
18679
18680static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18681 MVT VT = Op.getSimpleValueType();
18682 SDValue Vec = Op.getOperand(0);
18683 SDValue Idx = Op.getOperand(1);
18684 assert(isa<ConstantSDNode>(Idx) && "Constant index expected")((void)0);
18685 SDLoc dl(Op);
18686
18687 if (!Vec.getSimpleValueType().is128BitVector())
18688 return SDValue();
18689
18690 if (VT.getSizeInBits() == 8) {
18691 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18692 // we're going to zero extend the register or fold the store.
18693 if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
18694 !MayFoldIntoStore(Op))
18695 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18696 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18697 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18698
18699 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
18700 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18701 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18702 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18703 }
18704
18705 if (VT == MVT::f32) {
18706 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18707 // the result back to FR32 register. It's only worth matching if the
18708 // result has a single use which is a store or a bitcast to i32. And in
18709 // the case of a store, it's not worth it if the index is a constant 0,
18710 // because a MOVSSmr can be used instead, which is smaller and faster.
18711 if (!Op.hasOneUse())
18712 return SDValue();
18713 SDNode *User = *Op.getNode()->use_begin();
18714 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18715 (User->getOpcode() != ISD::BITCAST ||
18716 User->getValueType(0) != MVT::i32))
18717 return SDValue();
18718 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18719 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18720 return DAG.getBitcast(MVT::f32, Extract);
18721 }
18722
18723 if (VT == MVT::i32 || VT == MVT::i64)
18724 return Op;
18725
18726 return SDValue();
18727}
18728
18729/// Extract one bit from mask vector, like v16i1 or v8i1.
18730/// AVX-512 feature.
18731static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18732 const X86Subtarget &Subtarget) {
18733 SDValue Vec = Op.getOperand(0);
18734 SDLoc dl(Vec);
18735 MVT VecVT = Vec.getSimpleValueType();
18736 SDValue Idx = Op.getOperand(1);
18737 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18738 MVT EltVT = Op.getSimpleValueType();
18739
18740 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&((void)0)
18741 "Unexpected vector type in ExtractBitFromMaskVector")((void)0);
18742
18743 // A variable index can't be handled in mask registers,
18744 // so extend the vector to VR512/VR128.
18745 if (!IdxC) {
18746 unsigned NumElts = VecVT.getVectorNumElements();
18747 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18748 // than extending to 128/256-bit.
18749 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18750 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18751 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18752 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18753 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18754 }
18755
18756 unsigned IdxVal = IdxC->getZExtValue();
18757 if (IdxVal == 0) // the operation is legal
18758 return Op;
18759
18760 // Extend to natively supported kshift.
18761 unsigned NumElems = VecVT.getVectorNumElements();
18762 MVT WideVecVT = VecVT;
18763 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18764 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18765 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18766 DAG.getUNDEF(WideVecVT), Vec,
18767 DAG.getIntPtrConstant(0, dl));
18768 }
18769
18770 // Use kshiftr instruction to move to the lower element.
18771 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18772 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18773
18774 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18775 DAG.getIntPtrConstant(0, dl));
18776}
18777
18778SDValue
18779X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18780 SelectionDAG &DAG) const {
18781 SDLoc dl(Op);
18782 SDValue Vec = Op.getOperand(0);
18783 MVT VecVT = Vec.getSimpleValueType();
18784 SDValue Idx = Op.getOperand(1);
18785 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18786
18787 if (VecVT.getVectorElementType() == MVT::i1)
18788 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18789
18790 if (!IdxC) {
18791 // It's more profitable to go through memory (1 cycle throughput)
18792 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18793 // IACA tool was used to get performance estimation
18794 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18795 //
18796 // example : extractelement <16 x i8> %a, i32 %i
18797 //
18798 // Block Throughput: 3.00 Cycles
18799 // Throughput Bottleneck: Port5
18800 //
18801 // | Num Of | Ports pressure in cycles | |
18802 // | Uops | 0 - DV | 5 | 6 | 7 | |
18803 // ---------------------------------------------
18804 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18805 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18806 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18807 // Total Num Of Uops: 4
18808 //
18809 //
18810 // Block Throughput: 1.00 Cycles
18811 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18812 //
18813 // | | Ports pressure in cycles | |
18814 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18815 // ---------------------------------------------------------
18816 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18817 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18818 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18819 // Total Num Of Uops: 4
18820
18821 return SDValue();
18822 }
18823
18824 unsigned IdxVal = IdxC->getZExtValue();
18825
18826 // If this is a 256-bit vector result, first extract the 128-bit vector and
18827 // then extract the element from the 128-bit vector.
18828 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18829 // Get the 128-bit vector.
18830 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18831 MVT EltVT = VecVT.getVectorElementType();
18832
18833 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18834 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2")((void)0);
18835
18836 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18837 // this can be done with a mask.
18838 IdxVal &= ElemsPerChunk - 1;
18839 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18840 DAG.getIntPtrConstant(IdxVal, dl));
18841 }
18842
18843 assert(VecVT.is128BitVector() && "Unexpected vector length")((void)0);
18844
18845 MVT VT = Op.getSimpleValueType();
18846
18847 if (VT.getSizeInBits() == 16) {
18848 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18849 // we're going to zero extend the register or fold the store (SSE41 only).
18850 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
18851 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
18852 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18853 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18854 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18855
18856 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18857 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18858 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18859 }
18860
18861 if (Subtarget.hasSSE41())
18862 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18863 return Res;
18864
18865 // TODO: We only extract a single element from v16i8, we can probably afford
18866 // to be more aggressive here before using the default approach of spilling to
18867 // stack.
18868 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
18869 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18870 int DWordIdx = IdxVal / 4;
18871 if (DWordIdx == 0) {
18872 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18873 DAG.getBitcast(MVT::v4i32, Vec),
18874 DAG.getIntPtrConstant(DWordIdx, dl));
18875 int ShiftVal = (IdxVal % 4) * 8;
18876 if (ShiftVal != 0)
18877 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18878 DAG.getConstant(ShiftVal, dl, MVT::i8));
18879 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18880 }
18881
18882 int WordIdx = IdxVal / 2;
18883 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18884 DAG.getBitcast(MVT::v8i16, Vec),
18885 DAG.getIntPtrConstant(WordIdx, dl));
18886 int ShiftVal = (IdxVal % 2) * 8;
18887 if (ShiftVal != 0)
18888 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18889 DAG.getConstant(ShiftVal, dl, MVT::i8));
18890 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18891 }
18892
18893 if (VT.getSizeInBits() == 32) {
18894 if (IdxVal == 0)
18895 return Op;
18896
18897 // SHUFPS the element to the lowest double word, then movss.
18898 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
18899 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18900 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18901 DAG.getIntPtrConstant(0, dl));
18902 }
18903
18904 if (VT.getSizeInBits() == 64) {
18905 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18906 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18907 // to match extract_elt for f64.
18908 if (IdxVal == 0)
18909 return Op;
18910
18911 // UNPCKHPD the element to the lowest double word, then movsd.
18912 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18913 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18914 int Mask[2] = { 1, -1 };
18915 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18916 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18917 DAG.getIntPtrConstant(0, dl));
18918 }
18919
18920 return SDValue();
18921}
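
// ---- Editorial sketch (not part of X86ISelLowering.cpp) ----
// The index arithmetic behind the v16i8 sub-byte extract above, written as a
// hypothetical scalar helper over the little-endian 128-bit value: byte IdxVal
// lives in dword IdxVal / 4 at bit offset (IdxVal % 4) * 8.
static uint8_t exampleExtractByteFromDWords(const uint32_t DWords[4],
                                            unsigned IdxVal) {
  unsigned DWordIdx = IdxVal / 4;            // which 32-bit chunk holds the byte
  unsigned ShiftVal = (IdxVal % 4) * 8;      // bit offset of the byte inside it
  return uint8_t(DWords[DWordIdx] >> ShiftVal);
}
// E.g. IdxVal == 6 is DWords[1] >> 16; since DWordIdx != 0 the DAG code above
// uses the equivalent word path instead: word 6 / 2 == 3, shift (6 % 2) * 8 == 0.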
18922
18923/// Insert one bit to mask vector, like v16i1 or v8i1.
18924/// AVX-512 feature.
18925static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18926 const X86Subtarget &Subtarget) {
18927 SDLoc dl(Op);
18928 SDValue Vec = Op.getOperand(0);
18929 SDValue Elt = Op.getOperand(1);
18930 SDValue Idx = Op.getOperand(2);
18931 MVT VecVT = Vec.getSimpleValueType();
18932
18933 if (!isa<ConstantSDNode>(Idx)) {
18934 // Non-constant index. Extend the source and destination,
18935 // insert the element, and then truncate the result.
18936 unsigned NumElts = VecVT.getVectorNumElements();
18937 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18938 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18939 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18940 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18941 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18942 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18943 }
18944
18945 // Copy into a k-register, extract to v1i1 and insert_subvector.
18946 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18947 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18948}
18949
18950SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18951 SelectionDAG &DAG) const {
18952 MVT VT = Op.getSimpleValueType();
18953 MVT EltVT = VT.getVectorElementType();
18954 unsigned NumElts = VT.getVectorNumElements();
18955 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18956
18957 if (EltVT == MVT::i1)
18958 return InsertBitToMaskVector(Op, DAG, Subtarget);
18959
18960 SDLoc dl(Op);
18961 SDValue N0 = Op.getOperand(0);
18962 SDValue N1 = Op.getOperand(1);
18963 SDValue N2 = Op.getOperand(2);
18964 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18965
18966 if (!N2C) {
18967 // For variable insertion indices we're usually better off spilling to the
18968 // stack, but AVX512 can use a variable compare+select by comparing against
18969 // all possible vector indices, and FP insertion has less gpr->simd traffic.
18970 if (!(Subtarget.hasBWI() ||
18971 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18972 (Subtarget.hasSSE41() && VT.isFloatingPoint())))
18973 return SDValue();
18974
18975 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18976 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18977 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18978 return SDValue();
18979
18980 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18981 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18982 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18983
18984 SmallVector<SDValue, 16> RawIndices;
18985 for (unsigned I = 0; I != NumElts; ++I)
18986 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18987 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18988
18989 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18990 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18991 ISD::CondCode::SETEQ);
18992 }
18993
18994 if (N2C->getAPIntValue().uge(NumElts))
18995 return SDValue();
18996 uint64_t IdxVal = N2C->getZExtValue();
18997
18998 bool IsZeroElt = X86::isZeroNode(N1);
18999 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19000
19001 // If we are inserting an element, see if we can do this more efficiently with
19002 // a blend shuffle with a rematerializable vector than a costly integer
19003 // insertion.
19004 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
19005 (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
19006 SmallVector<int, 8> BlendMask;
19007 for (unsigned i = 0; i != NumElts; ++i)
19008 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19009 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19010 : getOnesVector(VT, DAG, dl);
19011 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19012 }
19013
19014 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19015 // into that, and then insert the subvector back into the result.
19016 if (VT.is256BitVector() || VT.is512BitVector()) {
19017 // With a 256-bit vector, we can insert into the zero element efficiently
19018 // using a blend if we have AVX or AVX2 and the right data type.
19019 if (VT.is256BitVector() && IdxVal == 0) {
19020 // TODO: It is worthwhile to cast integer to floating point and back
19021 // and incur a domain crossing penalty if that's what we'll end up
19022 // doing anyway after extracting to a 128-bit vector.
19023 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19024 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19025 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19026 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19027 DAG.getTargetConstant(1, dl, MVT::i8));
19028 }
19029 }
19030
19031 // Get the desired 128-bit vector chunk.
19032 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19033
19034 // Insert the element into the desired chunk.
19035 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19036 assert(isPowerOf2_32(NumEltsIn128))((void)0);
19037 // Since NumEltsIn128 is a power of 2 we can use a mask instead of modulo.
19038 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19039
19040 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19041 DAG.getIntPtrConstant(IdxIn128, dl));
19042
19043 // Insert the changed part back into the bigger vector
19044 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19045 }
19046 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!")((void)0);
19047
19048 // This will be just movd/movq/movss/movsd.
19049 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19050 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19051 EltVT == MVT::i64) {
19052 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19053 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19054 }
19055
19056 // We can't directly insert an i8 or i16 into a vector, so zero extend
19057 // it to i32 first.
19058 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19059 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19060 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19061 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19062 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19063 return DAG.getBitcast(VT, N1);
19064 }
19065 }
19066
19067 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19068 // argument. SSE41 required for pinsrb.
19069 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19070 unsigned Opc;
19071 if (VT == MVT::v8i16) {
19072 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW")((void)0);
19073 Opc = X86ISD::PINSRW;
19074 } else {
19075 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector")((void)0);
19076 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB")((void)0);
19077 Opc = X86ISD::PINSRB;
19078 }
19079
19080 assert(N1.getValueType() != MVT::i32 && "Unexpected VT")((void)0);
19081 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19082 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19083 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19084 }
19085
19086 if (Subtarget.hasSSE41()) {
19087 if (EltVT == MVT::f32) {
19088 // Bits [7:6] of the constant are the source select. This will always be
19089 // zero here. The DAG Combiner may combine an extract_elt index into
19090 // these bits. For example (insert (extract, 3), 2) could be matched by
19091 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19092 // Bits [5:4] of the constant are the destination select. This is the
19093 // value of the incoming immediate.
19094 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19095 // combine either bitwise AND or insert of float 0.0 to set these bits.
19096
19097 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19098 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
19099 // If this is an insertion of 32-bits into the low 32-bits of
19100 // a vector, we prefer to generate a blend with immediate rather
19101 // than an insertps. Blends are simpler operations in hardware and so
19102 // will always have equal or better performance than insertps.
19103 // But if optimizing for size and there's a load folding opportunity,
19104 // generate insertps because blendps does not have a 32-bit memory
19105 // operand form.
19106 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19107 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19108 DAG.getTargetConstant(1, dl, MVT::i8));
19109 }
19110 // Create this as a scalar to vector.
19111 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19112 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19113 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19114 }
19115
19116 // PINSR* works with constant index.
19117 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19118 return Op;
19119 }
19120
19121 return SDValue();
19122}
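[Editor's note] For the non-constant index case above, the insert is rewritten as a lane-wise compare of splat(index) against the constant vector {0,1,2,...} feeding a select. A minimal scalar model of that rewrite, assuming an array stands in for the vector (names are illustrative, not the real SelectionDAG code):

#include <cstdint>
#include <cstdio>

// inselt Vec, Elt, Idx --> per lane: (splat(Idx) == {0,1,2,...}) ? splat(Elt) : Vec
static void insertViaSelect(uint32_t Vec[], unsigned NumElts, uint32_t Elt,
                            uint32_t Idx) {
  for (unsigned I = 0; I != NumElts; ++I)
    Vec[I] = (Idx == I) ? Elt : Vec[I];
}

int main() {
  uint32_t V[4] = {10, 11, 12, 13};
  insertViaSelect(V, 4, 99, 2);
  printf("%u %u %u %u\n", V[0], V[1], V[2], V[3]); // prints: 10 11 99 13
}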
19123
19124static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19125 SelectionDAG &DAG) {
19126 SDLoc dl(Op);
19127 MVT OpVT = Op.getSimpleValueType();
19128
19129 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
19130 // further combines.
19131 if (X86::isZeroNode(Op.getOperand(0)))
19132 return getZeroVector(OpVT, Subtarget, DAG, dl);
19133
19134 // If this is a 256-bit vector result, first insert into a 128-bit
19135 // vector and then insert into the 256-bit vector.
19136 if (!OpVT.is128BitVector()) {
19137 // Insert into a 128-bit vector.
19138 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19139 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19140 OpVT.getVectorNumElements() / SizeFactor);
19141
19142 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19143
19144 // Insert the 128-bit vector.
19145 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19146 }
19147 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&((void)0)
19148 "Expected an SSE type!")((void)0);
19149
19150 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
19151 if (OpVT == MVT::v4i32)
19152 return Op;
19153
19154 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19155 return DAG.getBitcast(
19156 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19157}
19158
19159// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19160// simple superregister reference or explicit instructions to insert
19161// the upper bits of a vector.
19162static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19163 SelectionDAG &DAG) {
19164 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1)((void)0);
19165
19166 return insert1BitVector(Op, DAG, Subtarget);
19167}
19168
19169static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19170 SelectionDAG &DAG) {
19171 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&((void)0)
19172 "Only vXi1 extract_subvectors need custom lowering")((void)0);
19173
19174 SDLoc dl(Op);
19175 SDValue Vec = Op.getOperand(0);
19176 uint64_t IdxVal = Op.getConstantOperandVal(1);
19177
19178 if (IdxVal == 0) // the operation is legal
19179 return Op;
19180
19181 MVT VecVT = Vec.getSimpleValueType();
19182 unsigned NumElems = VecVT.getVectorNumElements();
19183
19184 // Extend to natively supported kshift.
19185 MVT WideVecVT = VecVT;
19186 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19187 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19188 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19189 DAG.getUNDEF(WideVecVT), Vec,
19190 DAG.getIntPtrConstant(0, dl));
19191 }
19192
19193 // Shift to the LSB.
19194 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19195 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19196
19197 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19198 DAG.getIntPtrConstant(0, dl));
19199}
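[Editor's note] The vXi1 extract above is just a right shift of the whole mask register so the requested element lands at bit 0. A minimal scalar sketch with a k-register modeled as a 16-bit integer (the helper name is an editor's assumption):

#include <cstdint>
#include <cstdio>

// KSHIFTR brings element IdxVal to the LSB; keeping the low NumSubElts bits
// is the EXTRACT_SUBVECTOR at index 0.
static uint16_t extractSubMask(uint16_t Mask, unsigned IdxVal,
                               unsigned NumSubElts) {
  uint16_t Shifted = (uint16_t)(Mask >> IdxVal);
  return (uint16_t)(Shifted & ((1u << NumSubElts) - 1));
}

int main() {
  // Extract a v4i1 starting at element 8 of a v16i1 mask.
  printf("%x\n", (unsigned)extractSubMask(0xABCD, 8, 4)); // prints: b
}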
19200
19201// Returns the appropriate wrapper opcode for a global reference.
19202unsigned X86TargetLowering::getGlobalWrapperKind(
19203 const GlobalValue *GV, const unsigned char OpFlags) const {
19204 // References to absolute symbols are never PC-relative.
19205 if (GV && GV->isAbsoluteSymbolRef())
19206 return X86ISD::Wrapper;
19207
19208 CodeModel::Model M = getTargetMachine().getCodeModel();
19209 if (Subtarget.isPICStyleRIPRel() &&
19210 (M == CodeModel::Small || M == CodeModel::Kernel))
19211 return X86ISD::WrapperRIP;
19212
19213 // GOTPCREL references must always use RIP.
19214 if (OpFlags == X86II::MO_GOTPCREL)
19215 return X86ISD::WrapperRIP;
19216
19217 return X86ISD::Wrapper;
19218}
19219
19220// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19221 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
19222 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19223 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19224 // be used to form an addressing mode. These wrapped nodes will be selected
19225// into MOV32ri.
19226SDValue
19227X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19228 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19229
19230 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19231 // global base reg.
19232 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19233
19234 auto PtrVT = getPointerTy(DAG.getDataLayout());
19235 SDValue Result = DAG.getTargetConstantPool(
19236 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19237 SDLoc DL(CP);
19238 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19239 // With PIC, the address is actually $g + Offset.
19240 if (OpFlag) {
19241 Result =
19242 DAG.getNode(ISD::ADD, DL, PtrVT,
19243 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19244 }
19245
19246 return Result;
19247}
19248
19249SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19250 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19251
19252 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19253 // global base reg.
19254 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19255
19256 auto PtrVT = getPointerTy(DAG.getDataLayout());
19257 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19258 SDLoc DL(JT);
19259 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19260
19261 // With PIC, the address is actually $g + Offset.
19262 if (OpFlag)
19263 Result =
19264 DAG.getNode(ISD::ADD, DL, PtrVT,
19265 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19266
19267 return Result;
19268}
19269
19270SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19271 SelectionDAG &DAG) const {
19272 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19273}
19274
19275SDValue
19276X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19277 // Create the TargetBlockAddressAddress node.
19278 unsigned char OpFlags =
19279 Subtarget.classifyBlockAddressReference();
19280 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19281 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19282 SDLoc dl(Op);
19283 auto PtrVT = getPointerTy(DAG.getDataLayout());
19284 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19285 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19286
19287 // With PIC, the address is actually $g + Offset.
19288 if (isGlobalRelativeToPICBase(OpFlags)) {
19289 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19290 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19291 }
19292
19293 return Result;
19294}
19295
19296/// Creates target global address or external symbol nodes for calls or
19297/// other uses.
19298SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19299 bool ForCall) const {
19300 // Unpack the global address or external symbol.
19301 const SDLoc &dl = SDLoc(Op);
19302 const GlobalValue *GV = nullptr;
19303 int64_t Offset = 0;
19304 const char *ExternalSym = nullptr;
19305 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19306 GV = G->getGlobal();
19307 Offset = G->getOffset();
19308 } else {
19309 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19310 ExternalSym = ES->getSymbol();
19311 }
19312
19313 // Calculate some flags for address lowering.
19314 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19315 unsigned char OpFlags;
19316 if (ForCall)
19317 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19318 else
19319 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19320 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19321 bool NeedsLoad = isGlobalStubReference(OpFlags);
19322
19323 CodeModel::Model M = DAG.getTarget().getCodeModel();
19324 auto PtrVT = getPointerTy(DAG.getDataLayout());
19325 SDValue Result;
19326
19327 if (GV) {
19328 // Create a target global address if this is a global. If possible, fold the
19329 // offset into the global address reference. Otherwise, ADD it on later.
19330 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19331 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19332 // relocation will compute to a negative value, which is invalid.
19333 int64_t GlobalOffset = 0;
19334 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19335 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19336 std::swap(GlobalOffset, Offset);
19337 }
19338 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19339 } else {
19340 // If this is not a global address, this must be an external symbol.
19341 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19342 }
19343
19344 // If this is a direct call, avoid the wrapper if we don't need to do any
19345 // loads or adds. This allows SDAG ISel to match direct calls.
19346 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19347 return Result;
19348
19349 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19350
19351 // With PIC, the address is actually $g + Offset.
19352 if (HasPICReg) {
19353 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19354 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19355 }
19356
19357 // For globals that require a load from a stub to get the address, emit the
19358 // load.
19359 if (NeedsLoad)
19360 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19361 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19362
19363 // If there was a non-zero offset that we didn't fold, create an explicit
19364 // addition for it.
19365 if (Offset != 0)
19366 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19367 DAG.getConstant(Offset, dl, PtrVT));
19368
19369 return Result;
19370}
19371
19372SDValue
19373X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19374 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19375}
19376
19377static SDValue
19378GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
19379 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
19380 unsigned char OperandFlags, bool LocalDynamic = false) {
19381 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19382 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19383 SDLoc dl(GA);
19384 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19385 GA->getValueType(0),
19386 GA->getOffset(),
19387 OperandFlags);
19388
19389 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
19390 : X86ISD::TLSADDR;
19391
19392 if (InFlag) {
19393 SDValue Ops[] = { Chain, TGA, *InFlag };
19394 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19395 } else {
19396 SDValue Ops[] = { Chain, TGA };
19397 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19398 }
19399
19400 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
19401 MFI.setAdjustsStack(true);
19402 MFI.setHasCalls(true);
19403
19404 SDValue Flag = Chain.getValue(1);
19405 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
19406}
19407
19408// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19409static SDValue
19410LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19411 const EVT PtrVT) {
19412 SDValue InFlag;
19413 SDLoc dl(GA); // ? function entry point might be better
19414 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19415 DAG.getNode(X86ISD::GlobalBaseReg,
19416 SDLoc(), PtrVT), InFlag);
19417 InFlag = Chain.getValue(1);
19418
19419 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
19420}
19421
19422// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19423static SDValue
19424LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19425 const EVT PtrVT) {
19426 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19427 X86::RAX, X86II::MO_TLSGD);
19428}
19429
19430// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19431static SDValue
19432LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19433 const EVT PtrVT) {
19434 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19435 X86::EAX, X86II::MO_TLSGD);
19436}
19437
19438static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19439 SelectionDAG &DAG, const EVT PtrVT,
19440 bool Is64Bit, bool Is64BitLP64) {
19441 SDLoc dl(GA);
19442
19443 // Get the start address of the TLS block for this module.
19444 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19445 .getInfo<X86MachineFunctionInfo>();
19446 MFI->incNumLocalDynamicTLSAccesses();
19447
19448 SDValue Base;
19449 if (Is64Bit) {
19450 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19451 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
19452 X86II::MO_TLSLD, /*LocalDynamic=*/true);
19453 } else {
19454 SDValue InFlag;
19455 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19456 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
19457 InFlag = Chain.getValue(1);
19458 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
19459 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
19460 }
19461
19462 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19463 // of Base.
19464
19465 // Build x@dtpoff.
19466 unsigned char OperandFlags = X86II::MO_DTPOFF;
19467 unsigned WrapperKind = X86ISD::Wrapper;
19468 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19469 GA->getValueType(0),
19470 GA->getOffset(), OperandFlags);
19471 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19472
19473 // Add x@dtpoff with the base.
19474 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19475}
19476
19477// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19478static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19479 const EVT PtrVT, TLSModel::Model model,
19480 bool is64Bit, bool isPIC) {
19481 SDLoc dl(GA);
19482
19483 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19484 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
19485 is64Bit ? 257 : 256));
19486
19487 SDValue ThreadPointer =
19488 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19489 MachinePointerInfo(Ptr));
19490
19491 unsigned char OperandFlags = 0;
19492 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19493 // initial exec.
19494 unsigned WrapperKind = X86ISD::Wrapper;
19495 if (model == TLSModel::LocalExec) {
19496 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19497 } else if (model == TLSModel::InitialExec) {
19498 if (is64Bit) {
19499 OperandFlags = X86II::MO_GOTTPOFF;
19500 WrapperKind = X86ISD::WrapperRIP;
19501 } else {
19502 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19503 }
19504 } else {
19505 llvm_unreachable("Unexpected model")__builtin_unreachable();
19506 }
19507
19508 // emit "addl x@ntpoff,%eax" (local exec)
19509 // or "addl x@indntpoff,%eax" (initial exec)
19510 // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
19511 SDValue TGA =
19512 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19513 GA->getOffset(), OperandFlags);
19514 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19515
19516 if (model == TLSModel::InitialExec) {
19517 if (isPIC && !is64Bit) {
19518 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19519 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19520 Offset);
19521 }
19522
19523 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19524 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19525 }
19526
19527 // The address of the thread local variable is the add of the thread
19528 // pointer with the offset of the variable.
19529 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19530}
19531
19532SDValue
19533X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19534
19535 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19536
19537 if (DAG.getTarget().useEmulatedTLS())
19538 return LowerToTLSEmulatedModel(GA, DAG);
19539
19540 const GlobalValue *GV = GA->getGlobal();
19541 auto PtrVT = getPointerTy(DAG.getDataLayout());
19542 bool PositionIndependent = isPositionIndependent();
19543
19544 if (Subtarget.isTargetELF()) {
19545 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19546 switch (model) {
19547 case TLSModel::GeneralDynamic:
19548 if (Subtarget.is64Bit()) {
19549 if (Subtarget.isTarget64BitLP64())
19550 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19551 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19552 }
19553 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19554 case TLSModel::LocalDynamic:
19555 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19556 Subtarget.isTarget64BitLP64());
19557 case TLSModel::InitialExec:
19558 case TLSModel::LocalExec:
19559 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19560 PositionIndependent);
19561 }
19562 llvm_unreachable("Unknown TLS model.")__builtin_unreachable();
19563 }
19564
19565 if (Subtarget.isTargetDarwin()) {
19566 // Darwin only has one model of TLS. Lower to that.
19567 unsigned char OpFlag = 0;
19568 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
19569 X86ISD::WrapperRIP : X86ISD::Wrapper;
19570
19571 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19572 // global base reg.
19573 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19574 if (PIC32)
19575 OpFlag = X86II::MO_TLVP_PIC_BASE;
19576 else
19577 OpFlag = X86II::MO_TLVP;
19578 SDLoc DL(Op);
19579 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19580 GA->getValueType(0),
19581 GA->getOffset(), OpFlag);
19582 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19583
19584 // With PIC32, the address is actually $g + Offset.
19585 if (PIC32)
19586 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19587 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19588 Offset);
19589
19590 // Lowering the machine ISD node will make sure everything is in the right
19591 // location.
19592 SDValue Chain = DAG.getEntryNode();
19593 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19594 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19595 SDValue Args[] = { Chain, Offset };
19596 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19597 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
19598 DAG.getIntPtrConstant(0, DL, true),
19599 Chain.getValue(1), DL);
19600
19601 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
19602 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19603 MFI.setAdjustsStack(true);
19604
19605 // And our return value (tls address) is in the standard call return value
19606 // location.
19607 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19608 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19609 }
19610
19611 if (Subtarget.isOSWindows()) {
19612 // Just use the implicit TLS architecture
19613 // Need to generate something similar to:
19614 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19615 // ; from TEB
19616 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19617 // mov rcx, qword [rdx+rcx*8]
19618 // mov eax, .tls$:tlsvar
19619 // [rax+rcx] contains the address
19620 // Windows 64bit: gs:0x58
19621 // Windows 32bit: fs:__tls_array
19622
19623 SDLoc dl(GA);
19624 SDValue Chain = DAG.getEntryNode();
19625
19626 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19627 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19628 // use its literal value of 0x2C.
19629 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
19630 ? Type::getInt8PtrTy(*DAG.getContext(),
19631 256)
19632 : Type::getInt32PtrTy(*DAG.getContext(),
19633 257));
19634
19635 SDValue TlsArray = Subtarget.is64Bit()
19636 ? DAG.getIntPtrConstant(0x58, dl)
19637 : (Subtarget.isTargetWindowsGNU()
19638 ? DAG.getIntPtrConstant(0x2C, dl)
19639 : DAG.getExternalSymbol("_tls_array", PtrVT));
19640
19641 SDValue ThreadPointer =
19642 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19643
19644 SDValue res;
19645 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19646 res = ThreadPointer;
19647 } else {
19648 // Load the _tls_index variable
19649 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19650 if (Subtarget.is64Bit())
19651 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19652 MachinePointerInfo(), MVT::i32);
19653 else
19654 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19655
19656 const DataLayout &DL = DAG.getDataLayout();
19657 SDValue Scale =
19658 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19659 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19660
19661 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19662 }
19663
19664 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19665
19666 // Get the offset of start of .tls section
19667 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19668 GA->getValueType(0),
19669 GA->getOffset(), X86II::MO_SECREL);
19670 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19671
19672 // The address of the thread local variable is the add of the thread
19673 // pointer with the offset of the variable.
19674 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19675 }
19676
19677 llvm_unreachable("TLS not implemented for this target.")__builtin_unreachable();
19678}
19679
19680/// Lower SRA_PARTS and friends, which return two i32 values
19681/// and take a 2 x i32 value to shift plus a shift amount.
19682/// TODO: Can this be moved to general expansion code?
19683static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19684 SDValue Lo, Hi;
19685 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19686 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19687}
19688
19689static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
19690 SelectionDAG &DAG) {
19691 MVT VT = Op.getSimpleValueType();
19692 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&((void)0)
19693 "Unexpected funnel shift opcode!")((void)0);
19694
19695 SDLoc DL(Op);
19696 SDValue Op0 = Op.getOperand(0);
19697 SDValue Op1 = Op.getOperand(1);
19698 SDValue Amt = Op.getOperand(2);
19699
19700 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
19701
19702 if (VT.isVector()) {
19703 assert(Subtarget.hasVBMI2() && "Expected VBMI2")((void)0);
19704
19705 if (IsFSHR)
19706 std::swap(Op0, Op1);
19707
19708 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19709 if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
19710 Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
19711 Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
19712 }
19713
19714 SDValue Funnel;
19715 APInt APIntShiftAmt;
19716 MVT ResultVT = Op0.getSimpleValueType();
19717 if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
19718 uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
19719 Funnel =
19720 DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
19721 Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19722 } else {
19723 if (!Subtarget.hasVLX() && !VT.is512BitVector())
19724 Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
19725 Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
19726 ResultVT, Op0, Op1, Amt);
19727 }
19728 if (!Subtarget.hasVLX() && !VT.is512BitVector())
19729 Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
19730 return Funnel;
19731 }
19732 assert(((void)0)
19733 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&((void)0)
19734 "Unexpected funnel shift type!")((void)0);
19735
19736 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
19737 bool OptForSize = DAG.shouldOptForSize();
19738 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
19739
19740 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
19741 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
19742 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
19743 !isa<ConstantSDNode>(Amt)) {
19744 unsigned EltSizeInBits = VT.getScalarSizeInBits();
19745 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
19746 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
19747 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
19748 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
19749 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
19750 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
19751 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
19752 if (IsFSHR) {
19753 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
19754 } else {
19755 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
19756 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
19757 }
19758 return DAG.getZExtOrTrunc(Res, DL, VT);
19759 }
19760
19761 if (VT == MVT::i8 || ExpandFunnel)
19762 return SDValue();
19763
19764 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
19765 if (VT == MVT::i16) {
19766 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
19767 DAG.getConstant(15, DL, Amt.getValueType()));
19768 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
19769 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
19770 }
19771
19772 return Op;
19773}
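[Editor's note] The scalar i8/i16 expansion above concatenates the two operands into a wider register, shifts once, and keeps the right byte, exactly the fshl/fshr formulas in the comment. A standalone i8 model (function names are illustrative):

#include <cstdint>
#include <cstdio>

// fshl(x,y,z) -> (((aext(x) << 8) | zext(y)) << (z & 7)) >> 8
static uint8_t fshl8(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t Concat = ((uint32_t)X << 8) | Y;
  return (uint8_t)((Concat << (Z & 7)) >> 8);
}

// fshr(x,y,z) -> (((aext(x) << 8) | zext(y)) >> (z & 7))
static uint8_t fshr8(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t Concat = ((uint32_t)X << 8) | Y;
  return (uint8_t)(Concat >> (Z & 7));
}

int main() {
  printf("%x %x\n", (unsigned)fshl8(0xAB, 0xCD, 3),
         (unsigned)fshr8(0xAB, 0xCD, 3)); // prints: 5e 79
}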
19774
19775// Try to use a packed vector operation to handle i64 on 32-bit targets when
19776// AVX512DQ is enabled.
19777static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
19778 const X86Subtarget &Subtarget) {
19779 assert((Op.getOpcode() == ISD::SINT_TO_FP ||((void)0)
19780 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||((void)0)
19781 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||((void)0)
19782 Op.getOpcode() == ISD::UINT_TO_FP) &&((void)0)
19783 "Unexpected opcode!")((void)0);
19784 bool IsStrict = Op->isStrictFPOpcode();
19785 unsigned OpNo = IsStrict ? 1 : 0;
19786 SDValue Src = Op.getOperand(OpNo);
19787 MVT SrcVT = Src.getSimpleValueType();
19788 MVT VT = Op.getSimpleValueType();
19789
19790 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19791 (VT != MVT::f32 && VT != MVT::f64))
19792 return SDValue();
19793
19794 // Pack the i64 into a vector, do the operation and extract.
19795
19796 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
19797 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19798 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19799 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19800
19801 SDLoc dl(Op);
19802 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19803 if (IsStrict) {
19804 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19805 {Op.getOperand(0), InVec});
19806 SDValue Chain = CvtVec.getValue(1);
19807 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19808 DAG.getIntPtrConstant(0, dl));
19809 return DAG.getMergeValues({Value, Chain}, dl);
19810 }
19811
19812 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19813
19814 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19815 DAG.getIntPtrConstant(0, dl));
19816}
19817
19818static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19819 const X86Subtarget &Subtarget) {
19820 switch (Opcode) {
19821 case ISD::SINT_TO_FP:
19822 // TODO: Handle wider types with AVX/AVX512.
19823 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19824 return false;
19825 // CVTDQ2PS or (V)CVTDQ2PD
19826 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19827
19828 case ISD::UINT_TO_FP:
19829 // TODO: Handle wider types and i64 elements.
19830 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19831 return false;
19832 // VCVTUDQ2PS or VCVTUDQ2PD
19833 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19834
19835 default:
19836 return false;
19837 }
19838}
19839
19840/// Given a scalar cast operation that is extracted from a vector, try to
19841/// vectorize the cast op followed by extraction. This will avoid an expensive
19842/// round-trip between XMM and GPR.
19843static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
19844 const X86Subtarget &Subtarget) {
19845 // TODO: This could be enhanced to handle smaller integer types by peeking
19846 // through an extend.
19847 SDValue Extract = Cast.getOperand(0);
19848 MVT DestVT = Cast.getSimpleValueType();
19849 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19850 !isa<ConstantSDNode>(Extract.getOperand(1)))
19851 return SDValue();
19852
19853 // See if we have a 128-bit vector cast op for this type of cast.
19854 SDValue VecOp = Extract.getOperand(0);
19855 MVT FromVT = VecOp.getSimpleValueType();
19856 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19857 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19858 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19859 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19860 return SDValue();
19861
19862 // If we are extracting from a non-zero element, first shuffle the source
19863 // vector to allow extracting from element zero.
19864 SDLoc DL(Cast);
19865 if (!isNullConstant(Extract.getOperand(1))) {
19866 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19867 Mask[0] = Extract.getConstantOperandVal(1);
19868 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19869 }
19870 // If the source vector is wider than 128-bits, extract the low part. Do not
19871 // create an unnecessarily wide vector cast op.
19872 if (FromVT != Vec128VT)
19873 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19874
19875 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19876 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19877 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19878 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19879 DAG.getIntPtrConstant(0, DL));
19880}
19881
19882/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19883/// try to vectorize the cast ops. This will avoid an expensive round-trip
19884/// between XMM and GPR.
19885static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
19886 const X86Subtarget &Subtarget) {
19887 // TODO: Allow FP_TO_UINT.
19888 SDValue CastToInt = CastToFP.getOperand(0);
19889 MVT VT = CastToFP.getSimpleValueType();
19890 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19891 return SDValue();
19892
19893 MVT IntVT = CastToInt.getSimpleValueType();
19894 SDValue X = CastToInt.getOperand(0);
19895 MVT SrcVT = X.getSimpleValueType();
19896 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19897 return SDValue();
19898
19899 // See if we have 128-bit vector cast instructions for this type of cast.
19900 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19901 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19902 IntVT != MVT::i32)
19903 return SDValue();
19904
19905 unsigned SrcSize = SrcVT.getSizeInBits();
19906 unsigned IntSize = IntVT.getSizeInBits();
19907 unsigned VTSize = VT.getSizeInBits();
19908 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19909 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19910 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19911
19912 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19913 unsigned ToIntOpcode =
19914 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19915 unsigned ToFPOpcode =
19916 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19917
19918 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19919 //
19920 // We are not defining the high elements (for example, by zeroing them) because
19921 // that could nullify any performance advantage that we hoped to gain from
19922 // this vector op hack. We do not expect any adverse effects (like denorm
19923 // penalties) with cast ops.
19924 SDLoc DL(CastToFP);
19925 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19926 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19927 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19928 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19929 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19930}
19931
19932static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
19933 const X86Subtarget &Subtarget) {
19934 SDLoc DL(Op);
19935 bool IsStrict = Op->isStrictFPOpcode();
19936 MVT VT = Op->getSimpleValueType(0);
19937 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19938
19939 if (Subtarget.hasDQI()) {
19940 assert(!Subtarget.hasVLX() && "Unexpected features")((void)0);
19941
19942 assert((Src.getSimpleValueType() == MVT::v2i64 ||((void)0)
19943 Src.getSimpleValueType() == MVT::v4i64) &&((void)0)
19944 "Unsupported custom type")((void)0);
19945
19946 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19947 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&((void)0)
19948 "Unexpected VT!")((void)0);
19949 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19950
19951 // Need to concat with zero vector for strict fp to avoid spurious
19952 // exceptions.
19953 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19954 : DAG.getUNDEF(MVT::v8i64);
19955 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19956 DAG.getIntPtrConstant(0, DL));
19957 SDValue Res, Chain;
19958 if (IsStrict) {
19959 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19960 {Op->getOperand(0), Src});
19961 Chain = Res.getValue(1);
19962 } else {
19963 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19964 }
19965
19966 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19967 DAG.getIntPtrConstant(0, DL));
19968
19969 if (IsStrict)
19970 return DAG.getMergeValues({Res, Chain}, DL);
19971 return Res;
19972 }
19973
19974 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19975 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19976 if (VT != MVT::v4f32 || IsSigned)
19977 return SDValue();
19978
19979 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19980 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19981 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19982 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19983 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19984 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19985 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19986 SmallVector<SDValue, 4> SignCvts(4);
19987 SmallVector<SDValue, 4> Chains(4);
19988 for (int i = 0; i != 4; ++i) {
19989 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19990 DAG.getIntPtrConstant(i, DL));
19991 if (IsStrict) {
19992 SignCvts[i] =
19993 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19994 {Op.getOperand(0), Elt});
19995 Chains[i] = SignCvts[i].getValue(1);
19996 } else {
19997 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19998 }
19999 }
20000 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20001
20002 SDValue Slow, Chain;
20003 if (IsStrict) {
20004 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20005 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20006 {Chain, SignCvt, SignCvt});
20007 Chain = Slow.getValue(1);
20008 } else {
20009 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20010 }
20011
20012 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20013 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20014
20015 if (IsStrict)
20016 return DAG.getMergeValues({Cvt, Chain}, DL);
20017
20018 return Cvt;
20019}
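[Editor's note] The unsigned vXi64 to vXf32 fallback above halves any value whose sign bit is set (keeping the dropped bit so the final rounding is unchanged), converts it as a signed value, and doubles the result with an add. A scalar model of one lane (the helper is an editor's sketch, not this file's code):

#include <cstdint>
#include <cstdio>

static float u64ToF32(uint64_t X) {
  if ((int64_t)X >= 0)                  // IsNeg false: plain signed convert
    return (float)(int64_t)X;
  uint64_t Sign = (X >> 1) | (X & 1);   // srl by 1, OR back the lost bit
  float Cvt = (float)(int64_t)Sign;     // signed convert of the halved value
  return Cvt + Cvt;                     // the FADD doubles it back up
}

int main() {
  printf("%a\n", u64ToF32(~0ull)); // prints 0x1p+64: 2^64-1 rounds up to 2^64
}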
20020
20021SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20022 SelectionDAG &DAG) const {
20023 bool IsStrict = Op->isStrictFPOpcode();
20024 unsigned OpNo = IsStrict ? 1 : 0;
20025 SDValue Src = Op.getOperand(OpNo);
20026 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20027 MVT SrcVT = Src.getSimpleValueType();
20028 MVT VT = Op.getSimpleValueType();
20029 SDLoc dl(Op);
20030
20031 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20032 return Extract;
20033
20034 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20035 return R;
20036
20037 if (SrcVT.isVector()) {
20038 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20039 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20040 // source for strict FP.
20041 if (IsStrict)
20042 return DAG.getNode(
20043 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20044 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20045 DAG.getUNDEF(SrcVT))});
20046 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20047 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20048 DAG.getUNDEF(SrcVT)));
20049 }
20050 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20051 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20052
20053 return SDValue();
20054 }
20055
20056 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&((void)0)
20057 "Unknown SINT_TO_FP to lower!")((void)0);
20058
20059 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20060
20061 // These are really Legal; return the operand so the caller accepts it as
20062 // Legal.
20063 if (SrcVT == MVT::i32 && UseSSEReg)
20064 return Op;
20065 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20066 return Op;
20067
20068 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20069 return V;
20070
20071 // SSE doesn't have an i16 conversion so we need to promote.
20072 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20073 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20074 if (IsStrict)
20075 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20076 {Chain, Ext});
20077
20078 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20079 }
20080
20081 if (VT == MVT::f128)
20082 return SDValue();
20083
20084 SDValue ValueToStore = Src;
20085 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20086 // Bitcasting to f64 here allows us to do a single 64-bit store from
20087 // an SSE register, avoiding the store forwarding penalty that would come
20088 // with two 32-bit stores.
20089 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20090
20091 unsigned Size = SrcVT.getStoreSize();
20092 Align Alignment(Size);
20093 MachineFunction &MF = DAG.getMachineFunction();
20094 auto PtrVT = getPointerTy(MF.getDataLayout());
20095 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20096 MachinePointerInfo MPI =
20097 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20098 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20099 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20100 std::pair<SDValue, SDValue> Tmp =
20101 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20102
20103 if (IsStrict)
20104 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20105
20106 return Tmp.first;
20107}
20108
20109std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20110 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20111 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20112 // Build the FILD
20113 SDVTList Tys;
20114 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20115 if (useSSE)
20116 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20117 else
20118 Tys = DAG.getVTList(DstVT, MVT::Other);
20119
20120 SDValue FILDOps[] = {Chain, Pointer};
20121 SDValue Result =
20122 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20123 Alignment, MachineMemOperand::MOLoad);
20124 Chain = Result.getValue(1);
20125
20126 if (useSSE) {
20127 MachineFunction &MF = DAG.getMachineFunction();
20128 unsigned SSFISize = DstVT.getStoreSize();
20129 int SSFI =
20130 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20131 auto PtrVT = getPointerTy(MF.getDataLayout());
20132 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20133 Tys = DAG.getVTList(MVT::Other);
20134 SDValue FSTOps[] = {Chain, Result, StackSlot};
20135 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20136 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20137 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20138
20139 Chain =
20140 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20141 Result = DAG.getLoad(
20142 DstVT, DL, Chain, StackSlot,
20143 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20144 Chain = Result.getValue(1);
20145 }
20146
20147 return { Result, Chain };
20148}
20149
20150/// Horizontal vector math instructions may be slower than normal math with
20151/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20152/// implementation, and likely shuffle complexity of the alternate sequence.
20153static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20154 const X86Subtarget &Subtarget) {
20155 bool IsOptimizingSize = DAG.shouldOptForSize();
20156 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20157 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20158}
20159
20160/// 64-bit unsigned integer to double expansion.
20161static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20162 const X86Subtarget &Subtarget) {
20163 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20164 // when converting 0 while rounding toward negative infinity. The caller will
20165 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20166 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!")((void)0);
20167 // This algorithm is not obvious. Here is what we're trying to output:
20168 /*
20169 movq %rax, %xmm0
20170 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20171 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20172 #ifdef __SSE3__
20173 haddpd %xmm0, %xmm0
20174 #else
20175 pshufd $0x4e, %xmm0, %xmm1
20176 addpd %xmm1, %xmm0
20177 #endif
20178 */
20179
20180 SDLoc dl(Op);
20181 LLVMContext *Context = DAG.getContext();
20182
20183 // Build some magic constants.
20184 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20185 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20186 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20187 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20188
20189 SmallVector<Constant*,2> CV1;
20190 CV1.push_back(
20191 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20192 APInt(64, 0x4330000000000000ULL))));
20193 CV1.push_back(
20194 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20195 APInt(64, 0x4530000000000000ULL))));
20196 Constant *C1 = ConstantVector::get(CV1);
20197 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20198
20199 // Load the 64-bit value into an XMM register.
20200 SDValue XR1 =
20201 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20202 SDValue CLod0 = DAG.getLoad(
20203 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20204 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20205 SDValue Unpck1 =
20206 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20207
20208 SDValue CLod1 = DAG.getLoad(
20209 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20210 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20211 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20212 // TODO: Are there any fast-math-flags to propagate here?
20213 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20214 SDValue Result;
20215
20216 if (Subtarget.hasSSE3() &&
20217 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20218 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20219 } else {
20220 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20221 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20222 }
20223 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20224 DAG.getIntPtrConstant(0, dl));
20225 return Result;
20226}
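[Editor's note] The expansion above glues each 32-bit half of the input onto a magic exponent (2^52 for the low half, 2^84 for the high half), subtracts those magic values, and adds the two partial results. A standalone scalar model of the same arithmetic (helper names are illustrative):

#include <cstdint>
#include <cstring>
#include <cstdio>

static double bitsToDouble(uint64_t Bits) {
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

static double u64ToF64(uint64_t X) {
  // punpckldq with c0: lo half becomes 2^52 + lo, hi half becomes 2^84 + hi*2^32.
  double Lo = bitsToDouble((0x43300000ull << 32) | (uint32_t)X);
  double Hi = bitsToDouble((0x45300000ull << 32) | (X >> 32));
  // subpd with c1 removes the 2^52 and 2^84 biases.
  Lo -= bitsToDouble(0x4330000000000000ull);
  Hi -= bitsToDouble(0x4530000000000000ull);
  return Hi + Lo; // haddpd (or shuffle + addpd) combines the halves
}

int main() {
  printf("%a %a\n", u64ToF64(3), u64ToF64(1ull << 63)); // prints: 0x1.8p+1 0x1p+63
}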
20227
20228/// 32-bit unsigned integer to float expansion.
20229static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20230 const X86Subtarget &Subtarget) {
20231 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20232 SDLoc dl(Op);
20233 // FP constant to bias correct the final result.
20234 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20235 MVT::f64);
20236
20237 // Load the 32-bit value into an XMM register.
20238 SDValue Load =
20239 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20240
20241 // Zero out the upper parts of the register.
20242 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20243
20244 // Or the load with the bias.
20245 SDValue Or = DAG.getNode(
20246 ISD::OR, dl, MVT::v2i64,
20247 DAG.getBitcast(MVT::v2i64, Load),
20248 DAG.getBitcast(MVT::v2i64,
20249 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20250 Or =
20251 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20252 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20253
20254 if (Op.getNode()->isStrictFPOpcode()) {
20255 // Subtract the bias.
20256 // TODO: Are there any fast-math-flags to propagate here?
20257 SDValue Chain = Op.getOperand(0);
20258 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20259 {Chain, Or, Bias});
20260
20261 if (Op.getValueType() == Sub.getValueType())
20262 return Sub;
20263
20264 // Handle final rounding.
20265 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20266 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20267
20268 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20269 }
20270
20271 // Subtract the bias.
20272 // TODO: Are there any fast-math-flags to propagate here?
20273 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20274
20275 // Handle final rounding.
20276 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20277}
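[Editor's note] The 32-bit path above is the single-constant variant of the same idea: OR the integer into the mantissa of 2^52, reinterpret the bits as a double, and subtract the bias. A scalar sketch (the helper name is an editor's assumption):

#include <cstdint>
#include <cstring>
#include <cstdio>

static double u32ToF64(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ull | X; // 2^52 with X in the low mantissa bits
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 4503599627370496.0;             // subtract the 2^52 bias
}

int main() {
  printf("%.1f\n", u32ToF64(0xFFFFFFFFu)); // prints: 4294967295.0
}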
20278
20279static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20280 const X86Subtarget &Subtarget,
20281 const SDLoc &DL) {
20282 if (Op.getSimpleValueType() != MVT::v2f64)
20283 return SDValue();
20284
20285 bool IsStrict = Op->isStrictFPOpcode();
20286
20287 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20288 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type")((void)0);
20289
20290 if (Subtarget.hasAVX512()) {
20291 if (!Subtarget.hasVLX()) {
20292 // Let generic type legalization widen this.
20293 if (!IsStrict)
20294 return SDValue();
20295 // Otherwise pad the integer input with 0s and widen the operation.
20296 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20297 DAG.getConstant(0, DL, MVT::v2i32));
20298 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20299 {Op.getOperand(0), N0});
20300 SDValue Chain = Res.getValue(1);
20301 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20302 DAG.getIntPtrConstant(0, DL));
20303 return DAG.getMergeValues({Res, Chain}, DL);
20304 }
20305
20306 // Legalize to v4i32 type.
20307 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20308 DAG.getUNDEF(MVT::v2i32));
20309 if (IsStrict)
20310 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20311 {Op.getOperand(0), N0});
20312 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20313 }
20314
20315 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20316 // This gives us the floating point equivalent of 2^52 + the i32 integer
20317 // since double has 52 bits of mantissa. Then subtract 2^52 in floating
20318 // point, leaving just our i32 integers in double format.
20319 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20320 SDValue VBias =
20321 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20322 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20323 DAG.getBitcast(MVT::v2i64, VBias));
20324 Or = DAG.getBitcast(MVT::v2f64, Or);
20325
20326 if (IsStrict)
20327 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20328 {Op.getOperand(0), Or, VBias});
20329 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20330}
20331
20332static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20333 const X86Subtarget &Subtarget) {
20334 SDLoc DL(Op);
20335 bool IsStrict = Op->isStrictFPOpcode();
20336 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20337 MVT VecIntVT = V.getSimpleValueType();
20338 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&((void)0)
20339 "Unsupported custom type")((void)0);
20340
20341 if (Subtarget.hasAVX512()) {
20342 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20343 assert(!Subtarget.hasVLX() && "Unexpected features")((void)0);
20344 MVT VT = Op->getSimpleValueType(0);
20345
20346 // v8i32->v8f64 is legal with AVX512 so just return it.
20347 if (VT == MVT::v8f64)
20348 return Op;
20349
20350 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20351 "Unexpected VT!");
20352 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20353 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20354 // Need to concat with zero vector for strict fp to avoid spurious
20355 // exceptions.
20356 SDValue Tmp =
20357 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20358 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20359 DAG.getIntPtrConstant(0, DL));
20360 SDValue Res, Chain;
20361 if (IsStrict) {
20362 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20363 {Op->getOperand(0), V});
20364 Chain = Res.getValue(1);
20365 } else {
20366 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20367 }
20368
20369 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20370 DAG.getIntPtrConstant(0, DL));
20371
20372 if (IsStrict)
20373 return DAG.getMergeValues({Res, Chain}, DL);
20374 return Res;
20375 }
20376
20377 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20378 Op->getSimpleValueType(0) == MVT::v4f64) {
20379 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20380 Constant *Bias = ConstantFP::get(
20381 *DAG.getContext(),
20382 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20383 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20384 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20385 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20386 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20387 SDValue VBias = DAG.getMemIntrinsicNode(
20388 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20389 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20390 MachineMemOperand::MOLoad);
20391
20392 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20393 DAG.getBitcast(MVT::v4i64, VBias));
20394 Or = DAG.getBitcast(MVT::v4f64, Or);
20395
20396 if (IsStrict)
20397 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20398 {Op.getOperand(0), Or, VBias});
20399 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20400 }
20401
20402 // The algorithm is the following:
20403 // #ifdef __SSE4_1__
20404 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20405 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20406 // (uint4) 0x53000000, 0xaa);
20407 // #else
20408 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20409 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20410 // #endif
20411 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20412 // return (float4) lo + fhi;
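// Why the constants work (added note; not in the original source):
// 0x4b000000 is the f32 bit pattern of 2^23 and 0x53000000 that of 2^39, so
// 'lo' becomes the float 2^23 + (v & 0xffff) and 'hi' becomes
// 2^39 + (v >> 16) * 2^16. The constant 0x53000080 used below encodes
// 2^39 + 2^23, so fhi = (v >> 16) * 2^16 - 2^23 and lo + fhi reconstructs v
// (up to the rounding of the final add).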
20413
20414 bool Is128 = VecIntVT == MVT::v4i32;
20415 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20416 // If we convert to something other than the supported type, e.g., to v4f64,
20417 // abort early.
20418 if (VecFloatVT != Op->getSimpleValueType(0))
20419 return SDValue();
20420
20421 // In the #ifdef/#else code, we have in common:
20422 // - The vector of constants:
20423 // -- 0x4b000000
20424 // -- 0x53000000
20425 // - A shift:
20426 // -- v >> 16
20427
20428 // Create the splat vector for 0x4b000000.
20429 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20430 // Create the splat vector for 0x53000000.
20431 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20432
20433 // Create the right shift.
20434 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20435 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20436
20437 SDValue Low, High;
20438 if (Subtarget.hasSSE41()) {
20439 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20440 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20441 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20442 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20443 // Low will be bitcasted right away, so do not bother bitcasting back to its
20444 // original type.
20445 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20446 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20447 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20448 // (uint4) 0x53000000, 0xaa);
20449 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20450 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20451 // High will be bitcasted right away, so do not bother bitcasting back to
20452 // its original type.
20453 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20454 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20455 } else {
20456 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20457 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20458 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20459 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20460
20461 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20462 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20463 }
20464
20465 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20466 SDValue VecCstFSub = DAG.getConstantFP(
20467 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20468
20469 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20470 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20471 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20472 // enabled. See PR24512.
20473 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20474 // TODO: Are there any fast-math-flags to propagate here?
20475 // (float4) lo;
20476 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20477 // return (float4) lo + fhi;
20478 if (IsStrict) {
20479 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20480 {Op.getOperand(0), HighBitcast, VecCstFSub});
20481 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20482 {FHigh.getValue(1), LowBitcast, FHigh});
20483 }
20484
20485 SDValue FHigh =
20486 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20487 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20488}
20489
20490static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20491 const X86Subtarget &Subtarget) {
20492 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20493 SDValue N0 = Op.getOperand(OpNo);
20494 MVT SrcVT = N0.getSimpleValueType();
20495 SDLoc dl(Op);
20496
20497 switch (SrcVT.SimpleTy) {
20498 default:
20499 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20500 case MVT::v2i32:
20501 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20502 case MVT::v4i32:
20503 case MVT::v8i32:
20504 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20505 case MVT::v2i64:
20506 case MVT::v4i64:
20507 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20508 }
20509}
20510
20511SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20512 SelectionDAG &DAG) const {
20513 bool IsStrict = Op->isStrictFPOpcode();
20514 unsigned OpNo = IsStrict ? 1 : 0;
20515 SDValue Src = Op.getOperand(OpNo);
20516 SDLoc dl(Op);
20517 auto PtrVT = getPointerTy(DAG.getDataLayout());
20518 MVT SrcVT = Src.getSimpleValueType();
20519 MVT DstVT = Op->getSimpleValueType(0);
20520 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20521
20522 if (DstVT == MVT::f128)
20523 return SDValue();
20524
20525 if (DstVT.isVector())
20526 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20527
20528 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20529 return Extract;
20530
20531 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20532 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20533 // Conversions from unsigned i32 to f32/f64 are legal,
20534 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20535 return Op;
20536 }
20537
20538 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20539 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20540 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20541 if (IsStrict)
20542 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20543 {Chain, Src});
20544 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20545 }
20546
20547 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20548 return V;
20549
20550 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20551 // infinity. It produces -0.0, so disable under strictfp.
20552 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20553 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20554 if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20555 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20556 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20557 (DstVT == MVT::f32 || DstVT == MVT::f64))
20558 return SDValue();
20559
20560 // Make a 64-bit buffer, and use it to build an FILD.
20561 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20562 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20563 Align SlotAlign(8);
20564 MachinePointerInfo MPI =
20565 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20566 if (SrcVT == MVT::i32) {
20567 SDValue OffsetSlot =
20568 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20569 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20570 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20571 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20572 std::pair<SDValue, SDValue> Tmp =
20573 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20574 if (IsStrict)
20575 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20576
20577 return Tmp.first;
20578 }
20579
20580 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20581 SDValue ValueToStore = Src;
20582 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20583 // Bitcasting to f64 here allows us to do a single 64-bit store from
20584 // an SSE register, avoiding the store forwarding penalty that would come
20585 // with two 32-bit stores.
20586 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20587 }
20588 SDValue Store =
20589 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20590 // For i64 source, we need to add the appropriate power of 2 if the input
20591 // was negative. We must be careful to do the computation in x87 extended
20592 // precision, not in SSE.
20593 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20594 SDValue Ops[] = { Store, StackSlot };
20595 SDValue Fild =
20596 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20597 SlotAlign, MachineMemOperand::MOLoad);
20598 Chain = Fild.getValue(1);
20599
20600
20601 // Check whether the sign bit is set.
20602 SDValue SignSet = DAG.getSetCC(
20603 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20604 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20605
20606 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20607 APInt FF(64, 0x5F80000000000000ULL);
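// Added note (not in the original source): the high word 0x5F800000 is the
// IEEE-754 single-precision bit pattern of 2^64; adding it below compensates
// for FILD having interpreted the 64-bit value as signed.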
20608 SDValue FudgePtr = DAG.getConstantPool(
20609 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20610 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20611
20612 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20613 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20614 SDValue Four = DAG.getIntPtrConstant(4, dl);
20615 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20616 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20617
20618 // Load the value out, extending it from f32 to f80.
20619 SDValue Fudge = DAG.getExtLoad(
20620 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20621 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20622 CPAlignment);
20623 Chain = Fudge.getValue(1);
20624 // Extend everything to 80 bits to force it to be done on x87.
20625 // TODO: Are there any fast-math-flags to propagate here?
20626 if (IsStrict) {
20627 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20628 {Chain, Fild, Fudge});
20629 // STRICT_FP_ROUND can't handle equal types.
20630 if (DstVT == MVT::f80)
20631 return Add;
20632 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20633 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
20634 }
20635 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
20636 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20637 DAG.getIntPtrConstant(0, dl));
20638}
20639
20640// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20641// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20642// just return an SDValue().
20643// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20644// to i16, i32 or i64, and we lower it to a legal sequence and return the
20645// result.
20646SDValue
20647X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20648 bool IsSigned, SDValue &Chain) const {
20649 bool IsStrict = Op->isStrictFPOpcode();
20650 SDLoc DL(Op);
20651
20652 EVT DstTy = Op.getValueType();
20653 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20654 EVT TheVT = Value.getValueType();
20655 auto PtrVT = getPointerTy(DAG.getDataLayout());
20656
20657 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20658 // f16 must be promoted before using the lowering in this routine.
20659 // fp128 does not use this lowering.
20660 return SDValue();
20661 }
20662
20663 // If using FIST to compute an unsigned i64, we'll need some fixup
20664 // to handle values above the maximum signed i64. A FIST is always
20665 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20666 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20667
20668 // FIXME: This does not generate an invalid exception if the input does not
20669 // fit in i32. PR44019
20670 if (!IsSigned && DstTy != MVT::i64) {
20671 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20672 // The low 32 bits of the fist result will have the correct uint32 result.
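// Added example (not in the original source): converting 4.0e9 as signed
// i64 yields 4000000000 (0x00000000EE6B2800); its low 32 bits are exactly
// the desired uint32 result.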
20673 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20674 DstTy = MVT::i64;
20675 }
20676
20677 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20678 DstTy.getSimpleVT() >= MVT::i16 &&
20679 "Unknown FP_TO_INT to lower!");
20680
20681 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20682 // stack slot.
20683 MachineFunction &MF = DAG.getMachineFunction();
20684 unsigned MemSize = DstTy.getStoreSize();
20685 int SSFI =
20686 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20687 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20688
20689 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20690
20691 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20692
20693 if (UnsignedFixup) {
20694 //
20695 // Conversion to unsigned i64 is implemented with a select,
20696 // depending on whether the source value fits in the range
20697 // of a signed i64. Let Thresh be the FP equivalent of
20698 // 0x8000000000000000ULL.
20699 //
20700 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20701 // FltOfs = (Value >= Thresh) ? Thresh : 0.0;
20702 // FistSrc = (Value - FltOfs);
20703 // Fist-to-mem64 FistSrc
20704 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20705 // to XOR'ing the high 32 bits with Adjust.
20706 //
20707 // Being a power of 2, Thresh is exactly representable in all FP formats.
20708 // For X87 we'd like to use the smallest FP type for this constant, but
20709 // for DAG type consistency we have to match the FP operand type.
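// Worked example (added; not in the original source): for an f80 input equal
// to 2^63 + 5, Cmp is true, so FltOfs = Thresh = 2^63 and FistSrc = 5.0; the
// FIST stores 5, Adjust = 1 << 63, and the final XOR restores
// 0x8000000000000005, i.e. 2^63 + 5 as an unsigned i64. (The 0x5f000000
// below is the f32 bit pattern of 2^63 used to seed Thresh.)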
20710
20711 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20712 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
20713 bool LosesInfo = false;
20714 if (TheVT == MVT::f64)
20715 // The rounding mode is irrelevant as the conversion should be exact.
20716 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20717 &LosesInfo);
20718 else if (TheVT == MVT::f80)
20719 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20720 APFloat::rmNearestTiesToEven, &LosesInfo);
20721
20722 assert(Status == APFloat::opOK && !LosesInfo &&
20723 "FP conversion should have been exact");
20724
20725 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20726
20727 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20728 *DAG.getContext(), TheVT);
20729 SDValue Cmp;
20730 if (IsStrict) {
20731 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20732 /*IsSignaling*/ true);
20733 Chain = Cmp.getValue(1);
20734 } else {
20735 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20736 }
20737
20738 // Our preferred lowering of
20739 //
20740 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20741 //
20742 // is
20743 //
20744 // (Value >= Thresh) << 63
20745 //
20746 // but since we can get here after LegalOperations, DAGCombine might do the
20747 // wrong thing if we create a select. So, directly create the preferred
20748 // version.
20749 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20750 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20751 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20752
20753 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20754 DAG.getConstantFP(0.0, DL, TheVT));
20755
20756 if (IsStrict) {
20757 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20758 { Chain, Value, FltOfs });
20759 Chain = Value.getValue(1);
20760 } else
20761 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20762 }
20763
20764 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20765
20766 // FIXME This causes a redundant load/store if the SSE-class value is already
20767 // in memory, such as if it is on the callstack.
20768 if (isScalarFPTypeInSSEReg(TheVT)) {
20769 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20770 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20771 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20772 SDValue Ops[] = { Chain, StackSlot };
20773
20774 unsigned FLDSize = TheVT.getStoreSize();
20775 assert(FLDSize <= MemSize && "Stack slot not big enough");
20776 MachineMemOperand *MMO = MF.getMachineMemOperand(
20777 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20778 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20779 Chain = Value.getValue(1);
20780 }
20781
20782 // Build the FP_TO_INT*_IN_MEM
20783 MachineMemOperand *MMO = MF.getMachineMemOperand(
20784 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20785 SDValue Ops[] = { Chain, Value, StackSlot };
20786 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20787 DAG.getVTList(MVT::Other),
20788 Ops, DstTy, MMO);
20789
20790 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20791 Chain = Res.getValue(1);
20792
20793 // If we need an unsigned fixup, XOR the result with adjust.
20794 if (UnsignedFixup)
20795 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20796
20797 return Res;
20798}
20799
20800static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20801 const X86Subtarget &Subtarget) {
20802 MVT VT = Op.getSimpleValueType();
20803 SDValue In = Op.getOperand(0);
20804 MVT InVT = In.getSimpleValueType();
20805 SDLoc dl(Op);
20806 unsigned Opc = Op.getOpcode();
20807
20808 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20809 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20810 "Unexpected extension opcode");
20811 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20812 "Expected same number of elements");
20813 assert((VT.getVectorElementType() == MVT::i16 ||
20814 VT.getVectorElementType() == MVT::i32 ||
20815 VT.getVectorElementType() == MVT::i64) &&
20816 "Unexpected element type");
20817 assert((InVT.getVectorElementType() == MVT::i8 ||
20818 InVT.getVectorElementType() == MVT::i16 ||
20819 InVT.getVectorElementType() == MVT::i32) &&
20820 "Unexpected element type");
20821
20822 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
20823
20824 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20825 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20826 return splitVectorIntUnary(Op, DAG);
20827 }
20828
20829 if (Subtarget.hasInt256())
20830 return Op;
20831
20832 // Optimize vectors in AVX mode:
20833 //
20834 // v8i16 -> v8i32
20835 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20836 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20837 // Concat upper and lower parts.
20838 //
20839 // v4i32 -> v4i64
20840 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20841 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20842 // Concat upper and lower parts.
20843 //
20844 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20845 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20846
20847 // Short-circuit if we can determine that each 128-bit half is the same value.
20848 // Otherwise, this is difficult to match and optimize.
20849 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20850 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20851 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20852
20853 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20854 SDValue Undef = DAG.getUNDEF(InVT);
20855 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20856 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20857 OpHi = DAG.getBitcast(HalfVT, OpHi);
20858
20859 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20860}
20861
20862// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20863static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20864 const SDLoc &dl, SelectionDAG &DAG) {
20865 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20866 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20867 DAG.getIntPtrConstant(0, dl));
20868 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20869 DAG.getIntPtrConstant(8, dl));
20870 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20871 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20872 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20873 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20874}
20875
20876static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
20877 const X86Subtarget &Subtarget,
20878 SelectionDAG &DAG) {
20879 MVT VT = Op->getSimpleValueType(0);
20880 SDValue In = Op->getOperand(0);
20881 MVT InVT = In.getSimpleValueType();
20882 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20883 SDLoc DL(Op);
20884 unsigned NumElts = VT.getVectorNumElements();
20885
20886 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20887 // avoids a constant pool load.
20888 if (VT.getVectorElementType() != MVT::i8) {
20889 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20890 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20891 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20892 }
20893
20894 // Extend VT if BWI is not supported.
20895 MVT ExtVT = VT;
20896 if (!Subtarget.hasBWI()) {
20897 // If v16i32 is to be avoided, we'll need to split and concatenate.
20898 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20899 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20900
20901 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20902 }
20903
20904 // Widen to 512-bits if VLX is not supported.
20905 MVT WideVT = ExtVT;
20906 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20907 NumElts *= 512 / ExtVT.getSizeInBits();
20908 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20909 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20910 In, DAG.getIntPtrConstant(0, DL));
20911 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20912 NumElts);
20913 }
20914
20915 SDValue One = DAG.getConstant(1, DL, WideVT);
20916 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20917
20918 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20919
20920 // Truncate if we had to extend above.
20921 if (VT != ExtVT) {
20922 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20923 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20924 }
20925
20926 // Extract back to 128/256-bit if we widened.
20927 if (WideVT != VT)
20928 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20929 DAG.getIntPtrConstant(0, DL));
20930
20931 return SelectedVal;
20932}
20933
20934static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20935 SelectionDAG &DAG) {
20936 SDValue In = Op.getOperand(0);
20937 MVT SVT = In.getSimpleValueType();
20938
20939 if (SVT.getVectorElementType() == MVT::i1)
20940 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20941
20942 assert(Subtarget.hasAVX() && "Expected AVX support");
20943 return LowerAVXExtend(Op, DAG, Subtarget);
20944}
20945
20946/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20947/// It makes use of the fact that vectors with enough leading sign/zero bits
20948/// prevent the PACKSS/PACKUS from saturating the results.
20949/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20950/// within each 128-bit lane.
20951static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20952 const SDLoc &DL, SelectionDAG &DAG,
20953 const X86Subtarget &Subtarget) {
20954 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20955 "Unexpected PACK opcode");
20956 assert(DstVT.isVector() && "VT not a vector?");
20957
20958 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20959 if (!Subtarget.hasSSE2())
20960 return SDValue();
20961
20962 EVT SrcVT = In.getValueType();
20963
20964 // No truncation required, we might get here due to recursive calls.
20965 if (SrcVT == DstVT)
20966 return In;
20967
20968 // We only support vector truncation to 64bits or greater from a
20969 // 128bits or greater source.
20970 unsigned DstSizeInBits = DstVT.getSizeInBits();
20971 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20972 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
20973 return SDValue();
20974
20975 unsigned NumElems = SrcVT.getVectorNumElements();
20976 if (!isPowerOf2_32(NumElems))
20977 return SDValue();
20978
20979 LLVMContext &Ctx = *DAG.getContext();
20980 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20981 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20982
20983 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20984
20985 // Pack to the largest type possible:
20986 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20987 EVT InVT = MVT::i16, OutVT = MVT::i8;
20988 if (SrcVT.getScalarSizeInBits() > 16 &&
20989 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20990 InVT = MVT::i32;
20991 OutVT = MVT::i16;
20992 }
20993
20994 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
20995 if (SrcVT.is128BitVector()) {
20996 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20997 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20998 In = DAG.getBitcast(InVT, In);
20999 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21000 Res = extractSubVector(Res, 0, DAG, DL, 64);
21001 return DAG.getBitcast(DstVT, Res);
21002 }
21003
21004 // Split lower/upper subvectors.
21005 SDValue Lo, Hi;
21006 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21007
21008 unsigned SubSizeInBits = SrcSizeInBits / 2;
21009 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21010 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21011
21012 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21013 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21014 Lo = DAG.getBitcast(InVT, Lo);
21015 Hi = DAG.getBitcast(InVT, Hi);
21016 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21017 return DAG.getBitcast(DstVT, Res);
21018 }
21019
21020 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21021 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21022 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21023 Lo = DAG.getBitcast(InVT, Lo);
21024 Hi = DAG.getBitcast(InVT, Hi);
21025 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21026
21027 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21028 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21029 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21030 SmallVector<int, 64> Mask;
21031 int Scale = 64 / OutVT.getScalarSizeInBits();
21032 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21033 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21034
21035 if (DstVT.is256BitVector())
21036 return DAG.getBitcast(DstVT, Res);
21037
21038 // If 512bit -> 128bit truncate another stage.
21039 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21040 Res = DAG.getBitcast(PackedVT, Res);
21041 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21042 }
21043
21044 // Recursively pack lower/upper subvectors, concat result and pack again.
21045 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21046 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21047 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21048 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21049
21050 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21051 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21052 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21053}
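// Added note (not in the original source): for example, a v8i32 whose lanes
// all fit in 16 bits is truncated to v8i16 by bitcasting its two 128-bit
// halves and emitting PACKUSDW(Lo, Hi); the 512-bit AVX2 path above needs
// the extra {0, 2, 1, 3} shuffle because vpackusdw packs within each 128-bit
// lane.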
21054
21055static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21056 const X86Subtarget &Subtarget) {
21057
21058 SDLoc DL(Op);
21059 MVT VT = Op.getSimpleValueType();
21060 SDValue In = Op.getOperand(0);
21061 MVT InVT = In.getSimpleValueType();
21062
21063 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21064
21065 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21066 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21067 if (InVT.getScalarSizeInBits() <= 16) {
21068 if (Subtarget.hasBWI()) {
21069 // legal, will go to VPMOVB2M, VPMOVW2M
21070 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21071 // We need to shift to get the lsb into sign position.
21072 // Shift packed bytes not supported natively, bitcast to word
21073 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21074 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21075 DAG.getBitcast(ExtVT, In),
21076 DAG.getConstant(ShiftInx, DL, ExtVT));
21077 In = DAG.getBitcast(InVT, In);
21078 }
21079 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21080 In, ISD::SETGT);
21081 }
21082 // Use TESTD/Q, extended vector to packed dword/qword.
21083 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21084 "Unexpected vector type.");
21085 unsigned NumElts = InVT.getVectorNumElements();
21086 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21087 // We need to change to a wider element type that we have support for.
21088 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21089 // For 16 element vectors we extend to v16i32 unless we are explicitly
21090 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21091 // we need to split into two 8 element vectors which we can extend to v8i32,
21092 // truncate and concat the results. There's an additional complication if
21093 // the original type is v16i8. In that case we can't split the v16i8
21094 // directly, so we need to shuffle high elements to low and use
21095 // sign_extend_vector_inreg.
21096 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21097 SDValue Lo, Hi;
21098 if (InVT == MVT::v16i8) {
21099 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21100 Hi = DAG.getVectorShuffle(
21101 InVT, DL, In, In,
21102 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21103 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21104 } else {
21105 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21106 Lo = extract128BitVector(In, 0, DAG, DL);
21107 Hi = extract128BitVector(In, 8, DAG, DL);
21108 }
21109 // We're split now, just emit two truncates and a concat. The two
21110 // truncates will trigger legalization to come back to this function.
21111 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21112 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21113 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21114 }
21115 // We either have 8 elements or we're allowed to use 512-bit vectors.
21116 // If we have VLX, we want to use the narrowest vector that can get the
21117 // job done so we use vXi32.
21118 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21119 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21120 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21121 InVT = ExtVT;
21122 ShiftInx = InVT.getScalarSizeInBits() - 1;
21123 }
21124
21125 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21126 // We need to shift to get the lsb into sign position.
21127 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21128 DAG.getConstant(ShiftInx, DL, InVT));
21129 }
21130 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21131 if (Subtarget.hasDQI())
21132 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21133 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21134}
21135
21136SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21137 SDLoc DL(Op);
21138 MVT VT = Op.getSimpleValueType();
21139 SDValue In = Op.getOperand(0);
21140 MVT InVT = In.getSimpleValueType();
21141 unsigned InNumEltBits = InVT.getScalarSizeInBits();
21142
21143 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21144 "Invalid TRUNCATE operation");
21145
21146 // If we're called by the type legalizer, handle a few cases.
21147 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21148 if (!TLI.isTypeLegal(InVT)) {
21149 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21150 VT.is128BitVector()) {
21151 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21152 "Unexpected subtarget!");
21153 // The default behavior is to truncate one step, concatenate, and then
21154 // truncate the remainder. We'd rather produce two 64-bit results and
21155 // concatenate those.
21156 SDValue Lo, Hi;
21157 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21158
21159 EVT LoVT, HiVT;
21160 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21161
21162 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21163 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21164 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21165 }
21166
21167 // Otherwise let default legalization handle it.
21168 return SDValue();
21169 }
21170
21171 if (VT.getVectorElementType() == MVT::i1)
21172 return LowerTruncateVecI1(Op, DAG, Subtarget);
21173
21174 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21175 if (Subtarget.hasAVX512()) {
21176 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21177 assert(VT == MVT::v32i8 && "Unexpected VT!");
21178 return splitVectorIntUnary(Op, DAG);
21179 }
21180
21181 // word to byte only under BWI. Otherwise we have to promote to v16i32
21182 // and then truncate that. But we should only do that if we haven't been
21183 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21184 // handled by isel patterns.
21185 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21186 Subtarget.canExtendTo512DQ())
21187 return Op;
21188 }
21189
21190 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21191 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21192
21193 // Truncate with PACKUS if we are truncating a vector with leading zero bits
21194 // that extend all the way to the packed/truncated value.
21195 // Pre-SSE41 we can only use PACKUSWB.
21196 KnownBits Known = DAG.computeKnownBits(In);
21197 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21198 if (SDValue V =
21199 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21200 return V;
21201
21202 // Truncate with PACKSS if we are truncating a vector with sign-bits that
21203 // extend all the way to the packed/truncated value.
21204 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21205 if (SDValue V =
21206 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21207 return V;
21208
21209 // Handle truncation of V256 to V128 using shuffles.
21210 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21211
21212 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21213 In = DAG.getBitcast(MVT::v8i32, In);
21214
21215 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21216 if (Subtarget.hasInt256()) {
21217 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21218 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21219 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21220 DAG.getIntPtrConstant(0, DL));
21221 }
21222
21223 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21224 DAG.getIntPtrConstant(0, DL));
21225 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21226 DAG.getIntPtrConstant(4, DL));
21227 static const int ShufMask[] = {0, 2, 4, 6};
21228 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21229 }
21230
21231 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21232 In = DAG.getBitcast(MVT::v32i8, In);
21233
21234 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21235 if (Subtarget.hasInt256()) {
21236 // The PSHUFB mask:
21237 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21238 -1, -1, -1, -1, -1, -1, -1, -1,
21239 16, 17, 20, 21, 24, 25, 28, 29,
21240 -1, -1, -1, -1, -1, -1, -1, -1 };
21241 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21242 In = DAG.getBitcast(MVT::v4i64, In);
21243
21244 static const int ShufMask2[] = {0, 2, -1, -1};
21245 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21246 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21247 DAG.getBitcast(MVT::v16i16, In),
21248 DAG.getIntPtrConstant(0, DL));
21249 }
21250
21251 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21252 DAG.getIntPtrConstant(0, DL));
21253 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21254 DAG.getIntPtrConstant(16, DL));
21255
21256 // The PSHUFB mask:
21257 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
21258 -1, -1, -1, -1, -1, -1, -1, -1};
21259
21260 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21261 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21262
21263 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21264 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21265
21266 // The MOVLHPS Mask:
21267 static const int ShufMask2[] = {0, 1, 4, 5};
21268 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21269 return DAG.getBitcast(MVT::v8i16, res);
21270 }
21271
21272 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21273 // Use an AND to zero upper bits for PACKUS.
21274 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21275
21276 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21277 DAG.getIntPtrConstant(0, DL));
21278 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21279 DAG.getIntPtrConstant(8, DL));
21280 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21281 }
21282
21283 llvm_unreachable("All 256->128 cases should have been handled above!");
21284}
21285
21286// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21287// behaves on out of range inputs to generate optimized conversions.
21288static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21289 SelectionDAG &DAG,
21290 const X86Subtarget &Subtarget) {
21291 MVT SrcVT = Src.getSimpleValueType();
21292 unsigned DstBits = VT.getScalarSizeInBits();
21293 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21294
21295 // Calculate the converted result for values in the range 0 to
21296 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21297 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21298 SDValue Big =
21299 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21300 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21301 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21302
21303 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21304 // and only if the value was out of range. So we can use that
21305 // as our indicator that we rather use "Big" instead of "Small".
21306 //
21307 // Use "Small" if "IsOverflown" has all bits cleared
21308 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21309
21310 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21311 // use the slightly slower blendv select instead.
21312 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21313 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21314 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21315 }
21316
21317 SDValue IsOverflown =
21318 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21319 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21320 return DAG.getNode(ISD::OR, dl, VT, Small,
21321 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21322}
21323
21324SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21325 bool IsStrict = Op->isStrictFPOpcode();
21326 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21327 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21328 MVT VT = Op->getSimpleValueType(0);
21329 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21330 MVT SrcVT = Src.getSimpleValueType();
21331 SDLoc dl(Op);
21332
21333 if (VT.isVector()) {
21334 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21335 MVT ResVT = MVT::v4i32;
21336 MVT TruncVT = MVT::v4i1;
21337 unsigned Opc;
21338 if (IsStrict)
21339 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21340 else
21341 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21342
21343 if (!IsSigned && !Subtarget.hasVLX()) {
21344 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21345 // Widen to 512-bits.
21346 ResVT = MVT::v8i32;
21347 TruncVT = MVT::v8i1;
21348 Opc = Op.getOpcode();
21349 // Need to concat with zero vector for strict fp to avoid spurious
21350 // exceptions.
21351 // TODO: Should we just do this for non-strict as well?
21352 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21353 : DAG.getUNDEF(MVT::v8f64);
21354 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21355 DAG.getIntPtrConstant(0, dl));
21356 }
21357 SDValue Res, Chain;
21358 if (IsStrict) {
21359 Res =
21360 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
21361 Chain = Res.getValue(1);
21362 } else {
21363 Res = DAG.getNode(Opc, dl, ResVT, Src);
21364 }
21365
21366 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21367 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21368 DAG.getIntPtrConstant(0, dl));
21369 if (IsStrict)
21370 return DAG.getMergeValues({Res, Chain}, dl);
21371 return Res;
21372 }
21373
21374 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21375 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21376 assert(!IsSigned && "Expected unsigned conversion!");
21377 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21378 return Op;
21379 }
21380
21381 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21382 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21383 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21384 Subtarget.useAVX512Regs()) {
21385 assert(!IsSigned && "Expected unsigned conversion!");
21386 assert(!Subtarget.hasVLX() && "Unexpected features!");
21387 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21388 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21389 // Need to concat with zero vector for strict fp to avoid spurious
21390 // exceptions.
21391 // TODO: Should we just do this for non-strict as well?
21392 SDValue Tmp =
21393 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21394 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21395 DAG.getIntPtrConstant(0, dl));
21396
21397 SDValue Res, Chain;
21398 if (IsStrict) {
21399 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21400 {Op->getOperand(0), Src});
21401 Chain = Res.getValue(1);
21402 } else {
21403 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21404 }
21405
21406 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21407 DAG.getIntPtrConstant(0, dl));
21408
21409 if (IsStrict)
21410 return DAG.getMergeValues({Res, Chain}, dl);
21411 return Res;
21412 }
21413
21414 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21415 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21416 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21417 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21418 assert(!Subtarget.hasVLX() && "Unexpected features!");
21419 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21420 // Need to concat with zero vector for strict fp to avoid spurious
21421 // exceptions.
21422 // TODO: Should we just do this for non-strict as well?
21423 SDValue Tmp =
21424 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21425 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21426 DAG.getIntPtrConstant(0, dl));
21427
21428 SDValue Res, Chain;
21429 if (IsStrict) {
21430 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21431 {Op->getOperand(0), Src});
21432 Chain = Res.getValue(1);
21433 } else {
21434 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21435 }
21436
21437 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21438 DAG.getIntPtrConstant(0, dl));
21439
21440 if (IsStrict)
21441 return DAG.getMergeValues({Res, Chain}, dl);
21442 return Res;
21443 }
21444
21445 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21446 if (!Subtarget.hasVLX()) {
21447 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21448 // legalizer and then widened again by vector op legalization.
21449 if (!IsStrict)
21450 return SDValue();
21451
21452 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21453 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21454 {Src, Zero, Zero, Zero});
21455 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21456 {Op->getOperand(0), Tmp});
21457 SDValue Chain = Tmp.getValue(1);
21458 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21459 DAG.getIntPtrConstant(0, dl));
21460 return DAG.getMergeValues({Tmp, Chain}, dl);
21461 }
21462
21463 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21464 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21465 DAG.getUNDEF(MVT::v2f32));
21466 if (IsStrict) {
21467 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21468 : X86ISD::STRICT_CVTTP2UI;
21469 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21470 }
21471 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21472 return DAG.getNode(Opc, dl, VT, Tmp);
21473 }
21474
21475 // Generate optimized instructions for pre AVX512 unsigned conversions from
21476 // vXf32 to vXi32.
21477 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21478 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21479 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21480 assert(!IsSigned && "Expected unsigned conversion!");
21481 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21482 }
21483
21484 return SDValue();
21485 }
21486
21487 assert(!VT.isVector());
21488
21489 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21490
21491 if (!IsSigned && UseSSEReg) {
21492 // Conversions from f32/f64 with AVX512 should be legal.
21493 if (Subtarget.hasAVX512())
21494 return Op;
21495
21496 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21497 // behaves on out of range inputs to generate optimized conversions.
21498 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21499 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21500 unsigned DstBits = VT.getScalarSizeInBits();
21501 APInt UIntLimit = APInt::getSignMask(DstBits);
21502 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21503 DAG.getConstant(UIntLimit, dl, VT));
21504 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21505
21506 // Calculate the converted result for values in the range:
21507 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21508 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21509 SDValue Small =
21510 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21511 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21512 SDValue Big = DAG.getNode(
21513 X86ISD::CVTTS2SI, dl, VT,
21514 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21515 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21516
21517 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21518 // and only if the value was out of range. So we can use that
21519 // as our indicator that we rather use "Big" instead of "Small".
21520 //
21521 // Use "Small" if "IsOverflown" has all bits cleared
21522 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21523 SDValue IsOverflown = DAG.getNode(
21524 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21525 return DAG.getNode(ISD::OR, dl, VT, Small,
21526 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21527 }
21528
21529 // Use default expansion for i64.
21530 if (VT == MVT::i64)
21531 return SDValue();
21532
21533 assert(VT == MVT::i32 && "Unexpected VT!");
21534
21535 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21536 // FIXME: This does not generate an invalid exception if the input does not
21537 // fit in i32. PR44019
21538 if (Subtarget.is64Bit()) {
21539 SDValue Res, Chain;
21540 if (IsStrict) {
21541 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
21542 { Op.getOperand(0), Src });
21543 Chain = Res.getValue(1);
21544 } else
21545 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21546
21547 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21548 if (IsStrict)
21549 return DAG.getMergeValues({ Res, Chain }, dl);
21550 return Res;
21551 }
21552
21553 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21554 // use fisttp which will be handled later.
21555 if (!Subtarget.hasSSE3())
21556 return SDValue();
21557 }
21558
21559 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21560 // FIXME: This does not generate an invalid exception if the input does not
21561 // fit in i16. PR44019
21562 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21563 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21564 SDValue Res, Chain;
21565 if (IsStrict) {
21566 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
21567 { Op.getOperand(0), Src });
21568 Chain = Res.getValue(1);
21569 } else
21570 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21571
21572 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21573 if (IsStrict)
21574 return DAG.getMergeValues({ Res, Chain }, dl);
21575 return Res;
21576 }
21577
21578 // If this is a FP_TO_SINT using SSEReg we're done.
21579 if (UseSSEReg && IsSigned)
21580 return Op;
21581
21582 // fp128 needs to use a libcall.
21583 if (SrcVT == MVT::f128) {
21584 RTLIB::Libcall LC;
21585 if (IsSigned)
21586 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21587 else
21588 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21589
21590 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21591 MakeLibCallOptions CallOptions;
21592 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21593 SDLoc(Op), Chain);
21594
21595 if (IsStrict)
21596 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21597
21598 return Tmp.first;
21599 }
21600
21601 // Fall back to X87.
21602 SDValue Chain;
21603 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21604 if (IsStrict)
21605 return DAG.getMergeValues({V, Chain}, dl);
21606 return V;
21607 }
21608
21609 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21610}
21611
21612SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21613 SelectionDAG &DAG) const {
21614 SDValue Src = Op.getOperand(0);
21615 MVT SrcVT = Src.getSimpleValueType();
21616
21617 // If the source is in an SSE register, the node is Legal.
21618 if (isScalarFPTypeInSSEReg(SrcVT))
21619 return Op;
21620
21621 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21622}
21623
21624SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21625 SelectionDAG &DAG) const {
21626 EVT DstVT = N->getValueType(0);
21627 SDValue Src = N->getOperand(0);
21628 EVT SrcVT = Src.getValueType();
21629
21630 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21631 // f16 must be promoted before using the lowering in this routine.
21632 // fp128 does not use this lowering.
21633 return SDValue();
21634 }
21635
21636 SDLoc DL(N);
21637 SDValue Chain = DAG.getEntryNode();
21638
21639 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21640
21641 // If we're converting from SSE, the stack slot needs to hold both types.
21642 // Otherwise it only needs to hold the DstVT.
21643 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21644 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21645 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21646 MachinePointerInfo MPI =
21647 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21648
21649 if (UseSSE) {
21650 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21651 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21652 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21653 SDValue Ops[] = { Chain, StackPtr };
21654
21655 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21656 /*Align*/ None, MachineMemOperand::MOLoad);
21657 Chain = Src.getValue(1);
21658 }
21659
21660 SDValue StoreOps[] = { Chain, Src, StackPtr };
21661 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21662 StoreOps, DstVT, MPI, /*Align*/ None,
21663 MachineMemOperand::MOStore);
21664
21665 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21666}
21667
21668SDValue
21669X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21670 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21671 // but making use of X86 specifics to produce better instruction sequences.
21672 SDNode *Node = Op.getNode();
21673 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21674 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21675 SDLoc dl(SDValue(Node, 0));
21676 SDValue Src = Node->getOperand(0);
21677
21678 // There are three types involved here: SrcVT is the source floating point
21679 // type, DstVT is the type of the result, and TmpVT is the result of the
21680 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21681 // DstVT).
21682 EVT SrcVT = Src.getValueType();
21683 EVT DstVT = Node->getValueType(0);
21684 EVT TmpVT = DstVT;
21685
21686 // This code is only for floats and doubles. Fall back to generic code for
21687 // anything else.
21688 if (!isScalarFPTypeInSSEReg(SrcVT))
21689 return SDValue();
21690
21691 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21692 unsigned SatWidth = SatVT.getScalarSizeInBits();
21693 unsigned DstWidth = DstVT.getScalarSizeInBits();
21694 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21695 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21696 "Expected saturation width smaller than result width");
21697
21698 // Promote result of FP_TO_*INT to at least 32 bits.
21699 if (TmpWidth < 32) {
21700 TmpVT = MVT::i32;
21701 TmpWidth = 32;
21702 }
21703
21704 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21705 // us to use a native signed conversion instead.
21706 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21707 TmpVT = MVT::i64;
21708 TmpWidth = 64;
21709 }
21710
21711 // If the saturation width is smaller than the size of the temporary result,
21712 // we can always use signed conversion, which is native.
21713 if (SatWidth < TmpWidth)
21714 FpToIntOpcode = ISD::FP_TO_SINT;
21715
21716 // Determine minimum and maximum integer values and their corresponding
21717 // floating-point values.
21718 APInt MinInt, MaxInt;
21719 if (IsSigned) {
21720 MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
21721 MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
21722 } else {
21723 MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
21724 MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
21725 }
21726
21727 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21728 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21729
21730 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21731 MinInt, IsSigned, APFloat::rmTowardZero);
21732 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21733 MaxInt, IsSigned, APFloat::rmTowardZero);
21734 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21735 && !(MaxStatus & APFloat::opStatus::opInexact);
21736
21737 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21738 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21739
21740 // If the integer bounds are exactly representable as floats, emit a
21741 // min+max+fptoi sequence. Otherwise use comparisons and selects.
21742 if (AreExactFloatBounds) {
21743 if (DstVT != TmpVT) {
21744 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21745 SDValue MinClamped = DAG.getNode(
21746 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21747 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21748 SDValue BothClamped = DAG.getNode(
21749 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21750 // Convert clamped value to integer.
21751 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21752
21753 // NaN will become INDVAL, with the top bit set and the rest zero.
21754 // Truncation will discard the top bit, resulting in zero.
21755 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21756 }
21757
21758 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21759 SDValue MinClamped = DAG.getNode(
21760 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21761 // Clamp by MaxFloat from above. NaN cannot occur.
21762 SDValue BothClamped = DAG.getNode(
21763 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21764 // Convert clamped value to integer.
21765 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21766
21767 if (!IsSigned) {
21768 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21769 // which is zero.
21770 return FpToInt;
21771 }
21772
21773 // Otherwise, select zero if Src is NaN.
21774 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21775 return DAG.getSelectCC(
21776 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21777 }
21778
21779 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21780 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21781
21782 // Result of direct conversion, which may be selected away.
21783 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21784
21785 if (DstVT != TmpVT) {
21786 // NaN will become INDVAL, with the top bit set and the rest zero.
21787 // Truncation will discard the top bit, resulting in zero.
21788 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21789 }
21790
21791 SDValue Select = FpToInt;
21792 // For signed conversions where we saturate to the same size as the
21793 // result type of the fptoi instructions, INDVAL coincides with integer
21794 // minimum, so we don't need to explicitly check it.
21795 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21796 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21797 // MinInt if Src is NaN.
21798 Select = DAG.getSelectCC(
21799 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21800 }
21801
21802 // If Src OGT MaxFloat, select MaxInt.
21803 Select = DAG.getSelectCC(
21804 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21805
21806 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21807 // is already zero. The promoted case was already handled above.
21808 if (!IsSigned || DstVT != TmpVT) {
21809 return Select;
21810 }
21811
21812 // Otherwise, select 0 if Src is NaN.
21813 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21814 return DAG.getSelectCC(
21815 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21816}
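A scalar sketch of the saturation semantics implemented above, for the f64 -> i32 signed case (illustration only; it models the result, not the exact FMAX/FMIN/select sequence emitted):

#include <cmath>
#include <cstdint>

// Sketch: FP_TO_SINT_SAT(f64 -> i32). Out-of-range inputs clamp to
// INT32_MIN/INT32_MAX, in-range inputs truncate toward zero, and NaN maps to
// zero, matching the select-zero-on-SETUO step above.
static int32_t fptosi_sat_i32(double Src) {
  if (std::isnan(Src))
    return 0;
  if (Src < -2147483648.0)
    return INT32_MIN;
  if (Src > 2147483647.0)
    return INT32_MAX;
  return (int32_t)Src;
}
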
21817
21818SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21819 bool IsStrict = Op->isStrictFPOpcode();
21820
21821 SDLoc DL(Op);
21822 MVT VT = Op.getSimpleValueType();
21823 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21824 MVT SVT = In.getSimpleValueType();
21825
21826 if (VT == MVT::f128)
21827 return SDValue();
21828
21829 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21830
21831 SDValue Res =
21832 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21833 if (IsStrict)
21834 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21835 {Op->getOperand(0), Res});
21836 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21837}
21838
21839SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21840 bool IsStrict = Op->isStrictFPOpcode();
21841 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21842 // It's legal except when f128 is involved
21843 if (In.getSimpleValueType() != MVT::f128)
21844 return Op;
21845
21846 return SDValue();
21847}
21848
21849static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21850 bool IsStrict = Op->isStrictFPOpcode();
21851 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21852 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21853 "Unexpected VT!");
21854
21855 SDLoc dl(Op);
21856 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21857 DAG.getConstant(0, dl, MVT::v8i16), Src,
21858 DAG.getIntPtrConstant(0, dl));
21859
21860 SDValue Chain;
21861 if (IsStrict) {
21862 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21863 {Op.getOperand(0), Res});
21864 Chain = Res.getValue(1);
21865 } else {
21866 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21867 }
21868
21869 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21870 DAG.getIntPtrConstant(0, dl));
21871
21872 if (IsStrict)
21873 return DAG.getMergeValues({Res, Chain}, dl);
21874
21875 return Res;
21876}
21877
21878static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21879 bool IsStrict = Op->isStrictFPOpcode();
21880 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21881 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21882 "Unexpected VT!");
21883
21884 SDLoc dl(Op);
21885 SDValue Res, Chain;
21886 if (IsStrict) {
21887 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21888 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21889 DAG.getIntPtrConstant(0, dl));
21890 Res = DAG.getNode(
21891 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21892 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21893 Chain = Res.getValue(1);
21894 } else {
21895 // FIXME: Should we use zeros for upper elements for non-strict?
21896 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21897 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21898 DAG.getTargetConstant(4, dl, MVT::i32));
21899 }
21900
21901 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21902 DAG.getIntPtrConstant(0, dl));
21903
21904 if (IsStrict)
21905 return DAG.getMergeValues({Res, Chain}, dl);
21906
21907 return Res;
21908}
21909
21910/// Depending on uarch and/or optimizing for size, we might prefer to use a
21911/// vector operation in place of the typical scalar operation.
21912static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
21913 const X86Subtarget &Subtarget) {
21914 // If both operands have other uses, this is probably not profitable.
21915 SDValue LHS = Op.getOperand(0);
21916 SDValue RHS = Op.getOperand(1);
21917 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21918 return Op;
21919
21920 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21921 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21922 if (IsFP && !Subtarget.hasSSE3())
21923 return Op;
21924 if (!IsFP && !Subtarget.hasSSSE3())
21925 return Op;
21926
21927 // Extract from a common vector.
21928 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21929 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21930 LHS.getOperand(0) != RHS.getOperand(0) ||
21931 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21932 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21933 !shouldUseHorizontalOp(true, DAG, Subtarget))
21934 return Op;
21935
21936 // Allow commuted 'hadd' ops.
21937 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21938 unsigned HOpcode;
21939 switch (Op.getOpcode()) {
21940 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21941 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21942 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21943 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21944 default:
21945 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21946 }
21947 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21948 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21949 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21950 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21951 std::swap(LExtIndex, RExtIndex);
21952
21953 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21954 return Op;
21955
21956 SDValue X = LHS.getOperand(0);
21957 EVT VecVT = X.getValueType();
21958 unsigned BitWidth = VecVT.getSizeInBits();
21959 unsigned NumLanes = BitWidth / 128;
21960 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21961 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21962 "Not expecting illegal vector widths here");
21963
21964 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21965 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21966 SDLoc DL(Op);
21967 if (BitWidth == 256 || BitWidth == 512) {
21968 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21969 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21970 LExtIndex %= NumEltsPerLane;
21971 }
21972
21973 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21974 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21975 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21976 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21977 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21978 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21979 DAG.getIntPtrConstant(LExtIndex / 2, DL));
21980}
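The rewrites listed in the comments above rely on the lane-wise behavior of the 128-bit horizontal add; with both operands equal to X it reduces to the following scalar sketch (illustration only):

#include <array>

// Sketch: haddps(X, X) for a 4 x float vector. The pair sum requested by the
// scalar add ends up in element LExtIndex / 2, which is what gets extracted.
static std::array<float, 4> haddps_xx(const std::array<float, 4> &X) {
  std::array<float, 4> R = {X[0] + X[1], X[2] + X[3], X[0] + X[1], X[2] + X[3]};
  return R;
}
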
21981
21982/// Depending on uarch and/or optimizing for size, we might prefer to use a
21983/// vector operation in place of the typical scalar operation.
21984SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21985 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21986 "Only expecting float/double");
21987 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21988}
21989
21990/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21991/// This mode isn't supported in hardware on X86. But as long as we aren't
21992/// compiling with trapping math, we can emulate this with
21993/// floor(X + copysign(nextafter(0.5, 0.0), X)).
21994static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21995 SDValue N0 = Op.getOperand(0);
21996 SDLoc dl(Op);
21997 MVT VT = Op.getSimpleValueType();
21998
21999 // N0 += copysign(nextafter(0.5, 0.0), N0)
22000 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22001 bool Ignored;
22002 APFloat Point5Pred = APFloat(0.5f);
22003 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22004 Point5Pred.next(/*nextDown*/true);
22005
22006 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22007 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22008 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22009
22010 // Truncate the result to remove fraction.
22011 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22012}
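The nextafter/copysign recipe from the doc comment above, as a scalar sketch (illustration only, assuming non-trapping math):

#include <cmath>

// Sketch: round half away from zero without a native FROUND.
// nextafter(0.5, 0.0) is the largest value strictly below 0.5; adding it with
// the sign of X and then truncating rounds ties away from zero.
static double round_half_away(double X) {
  double Pred = std::nextafter(0.5, 0.0);
  return std::trunc(X + std::copysign(Pred, X));
}
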
22013
22014/// The only differences between FABS and FNEG are the mask and the logic op.
22015/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22016static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22017 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22018 "Wrong opcode for lowering FABS or FNEG.");
22019
22020 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22021
22022 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22023 // into an FNABS. We'll lower the FABS after that if it is still in use.
22024 if (IsFABS)
22025 for (SDNode *User : Op->uses())
22026 if (User->getOpcode() == ISD::FNEG)
22027 return Op;
22028
22029 SDLoc dl(Op);
22030 MVT VT = Op.getSimpleValueType();
22031
22032 bool IsF128 = (VT == MVT::f128);
22033 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
22034 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
22035 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
22036 "Unexpected type in LowerFABSorFNEG");
22037
22038 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22039 // decide if we should generate a 16-byte constant mask when we only need 4 or
22040 // 8 bytes for the scalar case.
22041
22042 // There are no scalar bitwise logical SSE/AVX instructions, so we
22043 // generate a 16-byte vector constant and logic op even for the scalar case.
22044 // Using a 16-byte mask allows folding the load of the mask with
22045 // the logic op, so it can save (~4 bytes) on code size.
22046 bool IsFakeVector = !VT.isVector() && !IsF128;
22047 MVT LogicVT = VT;
22048 if (IsFakeVector)
22049 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
22050
22051 unsigned EltBits = VT.getScalarSizeInBits();
22052 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22053 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22054 APInt::getSignMask(EltBits);
22055 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22056 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22057
22058 SDValue Op0 = Op.getOperand(0);
22059 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22060 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22061 IsFNABS ? X86ISD::FOR :
22062 X86ISD::FXOR;
22063 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22064
22065 if (VT.isVector() || IsF128)
22066 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22067
22068 // For the scalar case extend to a 128-bit vector, perform the logic op,
22069 // and extract the scalar result back out.
22070 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22071 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22072 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22073 DAG.getIntPtrConstant(0, dl));
22074}
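The masks built above reduce to plain integer bit operations on the float's representation; a scalar sketch for f32 (illustration only; the lowering itself uses a 16-byte vector constant):

#include <cstdint>
#include <cstring>

// Sketch: FABS ANDs with the signed-max mask, FNEG XORs with the sign mask.
// std::memcpy stands in for a bit cast.
static float fabs_via_mask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu;             // APInt::getSignedMaxValue(32)
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}
static float fneg_via_mask(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;             // APInt::getSignMask(32)
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}
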
22075
22076static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22077 SDValue Mag = Op.getOperand(0);
22078 SDValue Sign = Op.getOperand(1);
22079 SDLoc dl(Op);
22080
22081 // If the sign operand is smaller, extend it first.
22082 MVT VT = Op.getSimpleValueType();
22083 if (Sign.getSimpleValueType().bitsLT(VT))
22084 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22085
22086 // And if it is bigger, shrink it first.
22087 if (Sign.getSimpleValueType().bitsGT(VT))
22088 Sign =
22089 DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22090
22091 // At this point the operands and the result should have the same
22092 // type, and that won't be f80 since that is not custom lowered.
22093 bool IsF128 = (VT == MVT::f128);
22094 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
22095 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
22096 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
22097 "Unexpected type in LowerFCOPYSIGN");
22098
22099 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22100
22101 // Perform all scalar logic operations as 16-byte vectors because there are no
22102 // scalar FP logic instructions in SSE.
22103 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22104 // unnecessary splats, but we might miss load folding opportunities. Should
22105 // this decision be based on OptimizeForSize?
22106 bool IsFakeVector = !VT.isVector() && !IsF128;
22107 MVT LogicVT = VT;
22108 if (IsFakeVector)
22109 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
22110
22111 // The mask constants are automatically splatted for vector types.
22112 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22113 SDValue SignMask = DAG.getConstantFP(
22114 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22115 SDValue MagMask = DAG.getConstantFP(
22116 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22117
22118 // First, clear all bits but the sign bit from the second operand (sign).
22119 if (IsFakeVector)
22120 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22121 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22122
22123 // Next, clear the sign bit from the first operand (magnitude).
22124 // TODO: If we had general constant folding for FP logic ops, this check
22125 // wouldn't be necessary.
22126 SDValue MagBits;
22127 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22128 APFloat APF = Op0CN->getValueAPF();
22129 APF.clearSign();
22130 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22131 } else {
22132 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22133 if (IsFakeVector)
22134 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22135 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22136 }
22137
22138 // OR the magnitude value with the sign bit.
22139 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22140 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22141 DAG.getIntPtrConstant(0, dl));
22142}
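The FAND/FOR sequence above is the standard copysign bit recipe; a scalar sketch for f32 (illustration only):

#include <cstdint>
#include <cstring>

// Sketch: copysign(Mag, Sign) keeps the magnitude bits of Mag and the sign
// bit of Sign: (Mag & 0x7fffffff) | (Sign & 0x80000000).
static float copysign_via_masks(float Mag, float Sign) {
  uint32_t MagBits, SignBits;
  std::memcpy(&MagBits, &Mag, sizeof(MagBits));
  std::memcpy(&SignBits, &Sign, sizeof(SignBits));
  uint32_t Bits = (MagBits & 0x7fffffffu) | (SignBits & 0x80000000u);
  float Res;
  std::memcpy(&Res, &Bits, sizeof(Res));
  return Res;
}
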
22143
22144static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22145 SDValue N0 = Op.getOperand(0);
22146 SDLoc dl(Op);
22147 MVT VT = Op.getSimpleValueType();
22148
22149 MVT OpVT = N0.getSimpleValueType();
22150 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22151 "Unexpected type for FGETSIGN");
22152
22153 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22154 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22155 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22156 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22157 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22158 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22159 return Res;
22160}
22161
22162/// Helper for creating a X86ISD::SETCC node.
22163static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22164 SelectionDAG &DAG) {
22165 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22166 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22167}
22168
22169/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22170/// style scalarized (associative) reduction patterns. Partial reductions
22171/// are supported when the pointer SrcMask is non-null.
22172/// TODO - move this to SelectionDAG?
22173static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22174 SmallVectorImpl<SDValue> &SrcOps,
22175 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22176 SmallVector<SDValue, 8> Opnds;
22177 DenseMap<SDValue, APInt> SrcOpMap;
22178 EVT VT = MVT::Other;
22179
22180 // Recognize a special case where a vector is cast into a wide integer to
22181 // test all 0s.
22182 assert(Op.getOpcode() == unsigned(BinOp) &&
22183 "Unexpected bit reduction opcode");
22184 Opnds.push_back(Op.getOperand(0));
22185 Opnds.push_back(Op.getOperand(1));
22186
22187 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22188 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22189 // BFS traverse all BinOp operands.
22190 if (I->getOpcode() == unsigned(BinOp)) {
22191 Opnds.push_back(I->getOperand(0));
22192 Opnds.push_back(I->getOperand(1));
22193 // Re-evaluate the number of nodes to be traversed.
22194 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22195 continue;
22196 }
22197
22198 // Quit if a non-EXTRACT_VECTOR_ELT
22199 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22200 return false;
22201
22202 // Quit if without a constant index.
22203 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22204 if (!Idx)
22205 return false;
22206
22207 SDValue Src = I->getOperand(0);
22208 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22209 if (M == SrcOpMap.end()) {
22210 VT = Src.getValueType();
22211 // Quit if not the same type.
22212 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22213 return false;
22214 unsigned NumElts = VT.getVectorNumElements();
22215 APInt EltCount = APInt::getNullValue(NumElts);
22216 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22217 SrcOps.push_back(Src);
22218 }
22219
22220 // Quit if element already used.
22221 unsigned CIdx = Idx->getZExtValue();
22222 if (M->second[CIdx])
22223 return false;
22224 M->second.setBit(CIdx);
22225 }
22226
22227 if (SrcMask) {
22228 // Collect the source partial masks.
22229 for (SDValue &SrcOp : SrcOps)
22230 SrcMask->push_back(SrcOpMap[SrcOp]);
22231 } else {
22232 // Quit if not all elements are used.
22233 for (const auto &I : SrcOpMap)
22234 if (!I.second.isAllOnesValue())
22235 return false;
22236 }
22237
22238 return true;
22239}
22240
22241// Helper function for comparing all bits of a vector against zero.
22242static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
22243 const APInt &Mask,
22244 const X86Subtarget &Subtarget,
22245 SelectionDAG &DAG, X86::CondCode &X86CC) {
22246 EVT VT = V.getValueType();
22247 unsigned ScalarSize = VT.getScalarSizeInBits();
22248 if (Mask.getBitWidth() != ScalarSize) {
22249 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22250 return SDValue();
22251 }
22252
22253 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22254 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22255
22256 auto MaskBits = [&](SDValue Src) {
22257 if (Mask.isAllOnesValue())
22258 return Src;
22259 EVT SrcVT = Src.getValueType();
22260 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22261 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22262 };
22263
22264 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22265 if (VT.getSizeInBits() < 128) {
22266 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22267 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
22268 return SDValue();
22269 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22270 DAG.getBitcast(IntVT, MaskBits(V)),
22271 DAG.getConstant(0, DL, IntVT));
22272 }
22273
22274 // Quit if not splittable to 128/256-bit vector.
22275 if (!isPowerOf2_32(VT.getSizeInBits()))
22276 return SDValue();
22277
22278 // Split down to 128/256-bit vector.
22279 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
22280 while (VT.getSizeInBits() > TestSize) {
22281 auto Split = DAG.SplitVector(V, DL);
22282 VT = Split.first.getValueType();
22283 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22284 }
22285
22286 bool UsePTEST = Subtarget.hasSSE41();
22287 if (UsePTEST) {
22288 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
22289 V = DAG.getBitcast(TestVT, MaskBits(V));
22290 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22291 }
22292
22293 // Without PTEST, a masked v2i64 or-reduction is not faster than
22294 // scalarization.
22295 if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
22296 return SDValue();
22297
22298 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
22299 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
22300 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
22301 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22302 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22303 DAG.getConstant(0xFFFF, DL, MVT::i32));
22304}
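The SSE2 fallback at the end of this helper can be summarized by a scalar sketch (illustration only): PCMPEQB against zero produces a per-byte mask, MOVMSK packs the 16 sign bits, and the vector is all zero exactly when the packed mask equals 0xFFFF.

#include <cstdint>

// Sketch: scalar model of PCMPEQB + MOVMSK + CMP 0xFFFF on a 16-byte vector.
static bool all_bytes_zero(const uint8_t (&V)[16]) {
  unsigned Mask = 0;
  for (int i = 0; i != 16; ++i)
    Mask |= (V[i] == 0 ? 1u : 0u) << i;   // PCMPEQB then MOVMSK
  return Mask == 0xFFFF;                  // CMP against 0xFFFF
}
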
22305
22306// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to
22307// CMP(MOVMSK(PCMPEQB(X,0))).
22308static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
22309 const SDLoc &DL,
22310 const X86Subtarget &Subtarget,
22311 SelectionDAG &DAG, SDValue &X86CC) {
22312 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22313
22314 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22315 return SDValue();
22316
22317 // Check whether we're masking/truncating an OR-reduction result, in which
22318 // case track the masked bits.
22319 APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
22320 switch (Op.getOpcode()) {
22321 case ISD::TRUNCATE: {
22322 SDValue Src = Op.getOperand(0);
22323 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22324 Op.getScalarValueSizeInBits());
22325 Op = Src;
22326 break;
22327 }
22328 case ISD::AND: {
22329 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22330 Mask = Cst->getAPIntValue();
22331 Op = Op.getOperand(0);
22332 }
22333 break;
22334 }
22335 }
22336
22337 SmallVector<SDValue, 8> VecIns;
22338 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
22339 EVT VT = VecIns[0].getValueType();
22340 assert(llvm::all_of(VecIns,
22341 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22342 "Reduction source vector mismatch");
22343
22344 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
22345 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
22346 return SDValue();
22347
22348 // If more than one full vector is evaluated, OR them first before PTEST.
22349 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22350 Slot += 2, e += 1) {
22351 // Each iteration will OR 2 nodes and append the result until there is
22352 // only 1 node left, i.e. the final OR'd value of all vectors.
22353 SDValue LHS = VecIns[Slot];
22354 SDValue RHS = VecIns[Slot + 1];
22355 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
22356 }
22357
22358 X86::CondCode CCode;
22359 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
22360 DAG, CCode)) {
22361 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22362 return V;
22363 }
22364 }
22365
22366 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22367 ISD::NodeType BinOp;
22368 if (SDValue Match =
22369 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
22370 X86::CondCode CCode;
22371 if (SDValue V =
22372 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
22373 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22374 return V;
22375 }
22376 }
22377 }
22378
22379 return SDValue();
22380}
22381
22382/// return true if \c Op has a use that doesn't just read flags.
22383static bool hasNonFlagsUse(SDValue Op) {
22384 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22385 ++UI) {
22386 SDNode *User = *UI;
22387 unsigned UOpNo = UI.getOperandNo();
22388 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22389 // Look past the truncate.
22390 UOpNo = User->use_begin().getOperandNo();
22391 User = *User->use_begin();
22392 }
22393
22394 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22395 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22396 return true;
22397 }
22398 return false;
22399}
22400
22401// Transform to an x86-specific ALU node with flags if there is a chance of
22402// using an RMW op or only the flags are used. Otherwise, leave
22403// the node alone and emit a 'cmp' or 'test' instruction.
22404static bool isProfitableToUseFlagOp(SDValue Op) {
22405 for (SDNode *U : Op->uses())
22406 if (U->getOpcode() != ISD::CopyToReg &&
22407 U->getOpcode() != ISD::SETCC &&
22408 U->getOpcode() != ISD::STORE)
22409 return false;
22410
22411 return true;
22412}
22413
22414/// Emit nodes that will be selected as "test Op0,Op0", or something
22415/// equivalent.
22416static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22417 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22418 // CF and OF aren't always set the way we want. Determine which
22419 // of these we need.
22420 bool NeedCF = false;
22421 bool NeedOF = false;
22422 switch (X86CC) {
22423 default: break;
22424 case X86::COND_A: case X86::COND_AE:
22425 case X86::COND_B: case X86::COND_BE:
22426 NeedCF = true;
22427 break;
22428 case X86::COND_G: case X86::COND_GE:
22429 case X86::COND_L: case X86::COND_LE:
22430 case X86::COND_O: case X86::COND_NO: {
22431 // Check if we really need to set the
22432 // Overflow flag. If NoSignedWrap is present
22433 // that is not actually needed.
22434 switch (Op->getOpcode()) {
22435 case ISD::ADD:
22436 case ISD::SUB:
22437 case ISD::MUL:
22438 case ISD::SHL:
22439 if (Op.getNode()->getFlags().hasNoSignedWrap())
22440 break;
22441 LLVM_FALLTHROUGH;
22442 default:
22443 NeedOF = true;
22444 break;
22445 }
22446 break;
22447 }
22448 }
22449 // See if we can use the EFLAGS value from the operand instead of
22450 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22451 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22452 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22453 // Emit a CMP with 0, which is the TEST pattern.
22454 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22455 DAG.getConstant(0, dl, Op.getValueType()));
22456 }
22457 unsigned Opcode = 0;
22458 unsigned NumOperands = 0;
22459
22460 SDValue ArithOp = Op;
22461
22462 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22463 // which may be the result of a CAST. We use the variable 'Op', which is the
22464 // non-casted variable when we check for possible users.
22465 switch (ArithOp.getOpcode()) {
22466 case ISD::AND:
22467 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22468 // because a TEST instruction will be better.
22469 if (!hasNonFlagsUse(Op))
22470 break;
22471
22472 LLVM_FALLTHROUGH;
22473 case ISD::ADD:
22474 case ISD::SUB:
22475 case ISD::OR:
22476 case ISD::XOR:
22477 if (!isProfitableToUseFlagOp(Op))
22478 break;
22479
22480 // Otherwise use a regular EFLAGS-setting instruction.
22481 switch (ArithOp.getOpcode()) {
22482 default: llvm_unreachable("unexpected operator!");
22483 case ISD::ADD: Opcode = X86ISD::ADD; break;
22484 case ISD::SUB: Opcode = X86ISD::SUB; break;
22485 case ISD::XOR: Opcode = X86ISD::XOR; break;
22486 case ISD::AND: Opcode = X86ISD::AND; break;
22487 case ISD::OR: Opcode = X86ISD::OR; break;
22488 }
22489
22490 NumOperands = 2;
22491 break;
22492 case X86ISD::ADD:
22493 case X86ISD::SUB:
22494 case X86ISD::OR:
22495 case X86ISD::XOR:
22496 case X86ISD::AND:
22497 return SDValue(Op.getNode(), 1);
22498 case ISD::SSUBO:
22499 case ISD::USUBO: {
22500 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22501 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22502 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22503 Op->getOperand(1)).getValue(1);
22504 }
22505 default:
22506 break;
22507 }
22508
22509 if (Opcode == 0) {
22510 // Emit a CMP with 0, which is the TEST pattern.
22511 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22512 DAG.getConstant(0, dl, Op.getValueType()));
22513 }
22514 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22515 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22516
22517 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22518 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22519 return SDValue(New.getNode(), 1);
22520}
22521
22522/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22523/// equivalent.
22524static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22525 const SDLoc &dl, SelectionDAG &DAG,
22526 const X86Subtarget &Subtarget) {
22527 if (isNullConstant(Op1))
22528 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22529
22530 EVT CmpVT = Op0.getValueType();
22531
22532 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22533 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22534
22535 // Only promote the compare up to i32 if it is a 16-bit operation
22536 // with an immediate. 16-bit immediates are to be avoided.
22537 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22538 !DAG.getMachineFunction().getFunction().hasMinSize()) {
22539 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22540 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22541 // Don't do this if the immediate can fit in 8-bits.
22542 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22543 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22544 unsigned ExtendOp =
22545 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22546 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22547 // For equality comparisons try to use SIGN_EXTEND if the input was
22548 // truncate from something with enough sign bits.
22549 if (Op0.getOpcode() == ISD::TRUNCATE) {
22550 SDValue In = Op0.getOperand(0);
22551 unsigned EffBits =
22552 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22553 if (EffBits <= 16)
22554 ExtendOp = ISD::SIGN_EXTEND;
22555 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22556 SDValue In = Op1.getOperand(0);
22557 unsigned EffBits =
22558 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
22559 if (EffBits <= 16)
22560 ExtendOp = ISD::SIGN_EXTEND;
22561 }
22562 }
22563
22564 CmpVT = MVT::i32;
22565 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22566 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22567 }
22568 }
22569
22570 // Try to shrink i64 compares if the input has enough zero bits.
22571 // FIXME: Do this for non-constant compares for constant on LHS?
22572 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22573 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22574 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
22575 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22576 CmpVT = MVT::i32;
22577 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22578 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22579 }
22580
22581 // 0-x == y --> x+y == 0
22582 // 0-x != y --> x+y != 0
22583 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22584 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22585 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22586 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22587 return Add.getValue(1);
22588 }
22589
22590 // x == 0-y --> x+y == 0
22591 // x != 0-y --> x+y != 0
22592 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22593 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22594 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22595 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22596 return Add.getValue(1);
22597 }
22598
22599 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22600 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22601 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22602 return Sub.getValue(1);
22603}
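The two negation rewrites above rest on a wrap-around identity: (0 - x) == y exactly when (x + y) == 0, which lets the compare be replaced by a flag-producing ADD. A minimal statement of that identity (illustration only):

#include <cstdint>

// Sketch: in modular (wrapping) arithmetic these two predicates agree for all
// inputs, so EFLAGS from X86ISD::ADD can stand in for the original CMP.
static bool neg_equals(uint32_t X, uint32_t Y) { return (0u - X) == Y; }
static bool sum_is_zero(uint32_t X, uint32_t Y) { return (X + Y) == 0u; }
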
22604
22605/// Check if replacement of SQRT with RSQRT should be disabled.
22606bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22607 EVT VT = Op.getValueType();
22608
22609 // We never want to use both SQRT and RSQRT instructions for the same input.
22610 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22611 return false;
22612
22613 if (VT.isVector())
22614 return Subtarget.hasFastVectorFSQRT();
22615 return Subtarget.hasFastScalarFSQRT();
22616}
22617
22618/// The minimum architected relative accuracy is 2^-12. We need one
22619/// Newton-Raphson step to have a good float result (24 bits of precision).
22620SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22621 SelectionDAG &DAG, int Enabled,
22622 int &RefinementSteps,
22623 bool &UseOneConstNR,
22624 bool Reciprocal) const {
22625 EVT VT = Op.getValueType();
22626
22627 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22628 // It is likely not profitable to do this for f64 because a double-precision
22629 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22630 // instructions: convert to single, rsqrtss, convert back to double, refine
22631 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22632 // along with FMA, this could be a throughput win.
22633 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22634 // after legalize types.
22635 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22636 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22637 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22638 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22639 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22640 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22641 RefinementSteps = 1;
22642
22643 UseOneConstNR = false;
22644 // There is no FRSQRT for 512-bits, but there is RSQRT14.
22645 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22646 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22647 }
22648 return SDValue();
22649}
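The single refinement step referred to in the comment above is the usual Newton-Raphson iteration for 1/sqrt(a); a scalar sketch (illustration only, not the vectorized FMA sequence the backend actually emits):

// Sketch: one Newton-Raphson step on an rsqrt estimate E for input A:
// E' = E * (1.5 - 0.5 * A * E * E). One step roughly doubles the number of
// correct bits, taking the ~12-bit hardware estimate to near-float precision.
static float refine_rsqrt(float A, float E) {
  return E * (1.5f - 0.5f * A * E * E);
}
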
22650
22651/// The minimum architected relative accuracy is 2^-12. We need one
22652/// Newton-Raphson step to have a good float result (24 bits of precision).
22653SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22654 int Enabled,
22655 int &RefinementSteps) const {
22656 EVT VT = Op.getValueType();
22657
22658 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22659 // It is likely not profitable to do this for f64 because a double-precision
22660 // reciprocal estimate with refinement on x86 prior to FMA requires
22661 // 15 instructions: convert to single, rcpss, convert back to double, refine
22662 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22663 // along with FMA, this could be a throughput win.
22664
22665 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22666 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22667 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22668 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22669 // Enable estimate codegen with 1 refinement step for vector division.
22670 // Scalar division estimates are disabled because they break too much
22671 // real-world code. These defaults are intended to match GCC behavior.
22672 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22673 return SDValue();
22674
22675 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22676 RefinementSteps = 1;
22677
22678 // There is no FRCP for 512-bits, but there is RCP14.
22679 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22680 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
22681 }
22682 return SDValue();
22683}
22684
22685/// If we have at least two divisions that use the same divisor, convert to
22686/// multiplication by a reciprocal. This may need to be adjusted for a given
22687/// CPU if a division's cost is not at least twice the cost of a multiplication.
22688/// This is because we still need one division to calculate the reciprocal and
22689/// then we need two multiplies by that reciprocal as replacements for the
22690/// original divisions.
22691unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22692 return 2;
22693}
22694
22695SDValue
22696X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22697 SelectionDAG &DAG,
22698 SmallVectorImpl<SDNode *> &Created) const {
22699 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22700 if (isIntDivCheap(N->getValueType(0), Attr))
22701 return SDValue(N,0); // Lower SDIV as SDIV
22702
22703 assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
22704 "Unexpected divisor!");
22705
22706 // Only perform this transform if CMOV is supported; otherwise the select
22707 // below will become a branch.
22708 if (!Subtarget.hasCMov())
22709 return SDValue();
22710
22711 // fold (sdiv X, pow2)
22712 EVT VT = N->getValueType(0);
22713 // FIXME: Support i8.
22714 if (VT != MVT::i16 && VT != MVT::i32 &&
22715 !(Subtarget.is64Bit() && VT == MVT::i64))
22716 return SDValue();
22717
22718 unsigned Lg2 = Divisor.countTrailingZeros();
22719
22720 // If the divisor is 2 or -2, the default expansion is better.
22721 if (Lg2 == 1)
22722 return SDValue();
22723
22724 SDLoc DL(N);
22725 SDValue N0 = N->getOperand(0);
22726 SDValue Zero = DAG.getConstant(0, DL, VT);
22727 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
22728 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
22729
22730 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
22731 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
22732 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
22733 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
22734
22735 Created.push_back(Cmp.getNode());
22736 Created.push_back(Add.getNode());
22737 Created.push_back(CMov.getNode());
22738
22739 // Divide by pow2.
22740 SDValue SRA =
22741 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
22742
22743 // If we're dividing by a positive value, we're done. Otherwise, we must
22744 // negate the result.
22745 if (Divisor.isNonNegative())
22746 return SRA;
22747
22748 Created.push_back(SRA.getNode());
22749 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
22750}
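The CMOV/SRA sequence built above is the textbook expansion of signed division by a power of two; a scalar sketch (illustration only, assuming the host performs an arithmetic right shift on signed values):

#include <cstdint>

// Sketch: signed divide by +/-2^Lg2 with round-toward-zero semantics.
// Negative dividends are biased by (2^Lg2 - 1) before the shift, and the
// result is negated when the divisor is negative.
static int32_t sdiv_by_pow2(int32_t N0, unsigned Lg2, bool DivisorIsNegative) {
  int32_t Bias = int32_t((1u << Lg2) - 1);
  int32_t Biased = N0 < 0 ? N0 + Bias : N0;   // the CMOV above
  int32_t Quot = Biased >> Lg2;               // SRA
  return DivisorIsNegative ? -Quot : Quot;    // SUB 0, Quot
}
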
22751
22752/// Result of 'and' is compared against zero. Change to a BT node if possible.
22753/// Returns the BT node and the condition code needed to use it.
22754static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
22755 const SDLoc &dl, SelectionDAG &DAG,
22756 SDValue &X86CC) {
22757 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22758 SDValue Op0 = And.getOperand(0);
22759 SDValue Op1 = And.getOperand(1);
22760 if (Op0.getOpcode() == ISD::TRUNCATE)
22761 Op0 = Op0.getOperand(0);
22762 if (Op1.getOpcode() == ISD::TRUNCATE)
22763 Op1 = Op1.getOperand(0);
22764
22765 SDValue Src, BitNo;
22766 if (Op1.getOpcode() == ISD::SHL)
22767 std::swap(Op0, Op1);
22768 if (Op0.getOpcode() == ISD::SHL) {
22769 if (isOneConstant(Op0.getOperand(0))) {
22770 // If we looked past a truncate, check that it's only truncating away
22771 // known zeros.
22772 unsigned BitWidth = Op0.getValueSizeInBits();
22773 unsigned AndBitWidth = And.getValueSizeInBits();
22774 if (BitWidth > AndBitWidth) {
22775 KnownBits Known = DAG.computeKnownBits(Op0);
22776 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22777 return SDValue();
22778 }
22779 Src = Op1;
22780 BitNo = Op0.getOperand(1);
22781 }
22782 } else if (Op1.getOpcode() == ISD::Constant) {
22783 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22784 uint64_t AndRHSVal = AndRHS->getZExtValue();
22785 SDValue AndLHS = Op0;
22786
22787 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22788 Src = AndLHS.getOperand(0);
22789 BitNo = AndLHS.getOperand(1);
22790 } else {
22791 // Use BT if the immediate can't be encoded in a TEST instruction or we
22792 // are optimizing for size and the immediate won't fit in a byte.
22793 bool OptForSize = DAG.shouldOptForSize();
22794 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22795 isPowerOf2_64(AndRHSVal)) {
22796 Src = AndLHS;
22797 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22798 Src.getValueType());
22799 }
22800 }
22801 }
22802
22803 // No patterns found, give up.
22804 if (!Src.getNode())
22805 return SDValue();
22806
22807 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22808 // instruction. Since the shift amount is in-range-or-undefined, we know
22809 // that doing a bittest on the i32 value is ok. We extend to i32 because
22810 // the encoding for the i16 version is larger than the i32 version.
22811 // Also promote i16 to i32 for performance / code size reasons.
22812 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
22813 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
22814
22815 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22816 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22817 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22818 // known to be zero.
22819 if (Src.getValueType() == MVT::i64 &&
22820 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22821 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
22822
22823 // If the operand types disagree, extend the shift amount to match. Since
22824 // BT ignores high bits (like shifts) we can use anyextend.
22825 if (Src.getValueType() != BitNo.getValueType())
22826 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
22827
22828 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
22829 dl, MVT::i8);
22830 return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
22831}
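The BT node produced above tests one bit of the source and leaves it in CF; as a scalar sketch (illustration only), the condition consumed via COND_B/COND_AE is simply:

#include <cstdint>

// Sketch: BT r32, r32 sets CF to bit (BitNo mod 32) of Src; SETNE maps to
// COND_B (CF set) and SETEQ to COND_AE (CF clear) above.
static bool bit_test(uint32_t Src, unsigned BitNo) {
  return (Src >> (BitNo & 31u)) & 1u;
}
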
22832
22833/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22834/// CMPs.
22835static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22836 SDValue &Op1, bool &IsAlwaysSignaling) {
22837 unsigned SSECC;
22838 bool Swap = false;
22839
22840 // SSE Condition code mapping:
22841 // 0 - EQ
22842 // 1 - LT
22843 // 2 - LE
22844 // 3 - UNORD
22845 // 4 - NEQ
22846 // 5 - NLT
22847 // 6 - NLE
22848 // 7 - ORD
22849 switch (SetCCOpcode) {
22850 default: llvm_unreachable("Unexpected SETCC condition");
22851 case ISD::SETOEQ:
22852 case ISD::SETEQ: SSECC = 0; break;
22853 case ISD::SETOGT:
22854 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
22855 case ISD::SETLT:
22856 case ISD::SETOLT: SSECC = 1; break;
22857 case ISD::SETOGE:
22858 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
22859 case ISD::SETLE:
22860 case ISD::SETOLE: SSECC = 2; break;
22861 case ISD::SETUO: SSECC = 3; break;
22862 case ISD::SETUNE:
22863 case ISD::SETNE: SSECC = 4; break;
22864 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
22865 case ISD::SETUGE: SSECC = 5; break;
22866 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
22867 case ISD::SETUGT: SSECC = 6; break;
22868 case ISD::SETO: SSECC = 7; break;
22869 case ISD::SETUEQ: SSECC = 8; break;
22870 case ISD::SETONE: SSECC = 12; break;
22871 }
22872 if (Swap)
22873 std::swap(Op0, Op1);
22874
22875 switch (SetCCOpcode) {
22876 default:
22877 IsAlwaysSignaling = true;
22878 break;
22879 case ISD::SETEQ:
22880 case ISD::SETOEQ:
22881 case ISD::SETUEQ:
22882 case ISD::SETNE:
22883 case ISD::SETONE:
22884 case ISD::SETUNE:
22885 case ISD::SETO:
22886 case ISD::SETUO:
22887 IsAlwaysSignaling = false;
22888 break;
22889 }
22890
22891 return SSECC;
22892}
22893
22894/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
22895/// concatenate the result back.
22896static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
22897 ISD::CondCode Cond, SelectionDAG &DAG,
22898 const SDLoc &dl) {
22899 assert(VT.isInteger() && VT == LHS.getValueType() &&
22900 VT == RHS.getValueType() && "Unsupported VTs!");
22901
22902 SDValue CC = DAG.getCondCode(Cond);
22903
22904 // Extract the LHS Lo/Hi vectors
22905 SDValue LHS1, LHS2;
22906 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
22907
22908 // Extract the RHS Lo/Hi vectors
22909 SDValue RHS1, RHS2;
22910 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
22911
22912 // Issue the operation on the smaller types and concatenate the result back
22913 EVT LoVT, HiVT;
22914 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22915 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22916 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22917 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22918}
22919
22920static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22921
22922 SDValue Op0 = Op.getOperand(0);
22923 SDValue Op1 = Op.getOperand(1);
22924 SDValue CC = Op.getOperand(2);
22925 MVT VT = Op.getSimpleValueType();
22926 SDLoc dl(Op);
22927
22928 assert(VT.getVectorElementType() == MVT::i1 &&
22929 "Cannot set masked compare for this operation");
22930
22931 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22932
22933 // Prefer SETGT over SETLT.
22934 if (SetCCOpcode == ISD::SETLT) {
22935 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22936 std::swap(Op0, Op1);
22937 }
22938
22939 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22940}
22941
22942/// Given a buildvector constant, return a new vector constant with each element
22943/// incremented or decremented. If incrementing or decrementing would result in
22944/// unsigned overflow or underflow or this is not a simple vector constant,
22945/// return an empty value.
22946static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
22947 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22948 if (!BV)
22949 return SDValue();
22950
22951 MVT VT = V.getSimpleValueType();
22952 MVT EltVT = VT.getVectorElementType();
22953 unsigned NumElts = VT.getVectorNumElements();
22954 SmallVector<SDValue, 8> NewVecC;
22955 SDLoc DL(V);
22956 for (unsigned i = 0; i < NumElts; ++i) {
22957 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22958 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22959 return SDValue();
22960
22961 // Avoid overflow/underflow.
22962 const APInt &EltC = Elt->getAPIntValue();
22963 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
22964 return SDValue();
22965
22966 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22967 }
22968
22969 return DAG.getBuildVector(VT, DL, NewVecC);
22970}
22971
22972/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22973/// Op0 u<= Op1:
22974/// t = psubus Op0, Op1
22975/// pcmpeq t, <0..0>
22976static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22977 ISD::CondCode Cond, const SDLoc &dl,
22978 const X86Subtarget &Subtarget,
22979 SelectionDAG &DAG) {
22980 if (!Subtarget.hasSSE2())
22981 return SDValue();
22982
22983 MVT VET = VT.getVectorElementType();
22984 if (VET != MVT::i8 && VET != MVT::i16)
22985 return SDValue();
22986
22987 switch (Cond) {
22988 default:
22989 return SDValue();
22990 case ISD::SETULT: {
22991 // If the comparison is against a constant, we can turn this into a
22992 // setule. With psubus, setule does not require a swap. This is
22993 // beneficial because the constant in the register is no longer
22994 // clobbered as the destination, so it can be hoisted out of a loop.
22995 // Only do this pre-AVX, since the AVX vpcmp* forms are not destructive.
22996 if (Subtarget.hasAVX())
22997 return SDValue();
22998 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
22999 if (!ULEOp1)
23000 return SDValue();
23001 Op1 = ULEOp1;
23002 break;
23003 }
23004 case ISD::SETUGT: {
23005 // If the comparison is against a constant, we can turn this into a setuge.
23006 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23007 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23008 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23009 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23010 if (!UGEOp1)
23011 return SDValue();
23012 Op1 = Op0;
23013 Op0 = UGEOp1;
23014 break;
23015 }
23016 // Psubus is better than flip-sign because it requires no inversion.
23017 case ISD::SETUGE:
23018 std::swap(Op0, Op1);
23019 break;
23020 case ISD::SETULE:
23021 break;
23022 }
23023
23024 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23025 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23026 DAG.getConstant(0, dl, VT));
23027}
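The PSUBUS trick in the doc comment above relies on a basic property of unsigned saturating subtraction; a scalar sketch for bytes (illustration only):

#include <cstdint>

// Sketch: for unsigned values, A <= B exactly when usubsat(A, B) == 0, which
// is what the USUBSAT followed by PCMPEQ-with-zero sequence above checks.
static bool ule_via_subus(uint8_t A, uint8_t B) {
  uint8_t Sat = A > B ? uint8_t(A - B) : uint8_t(0);   // PSUBUSB
  return Sat == 0;                                     // PCMPEQB with zero
}
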
23028
23029static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23030 SelectionDAG &DAG) {
23031 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23032 Op.getOpcode() == ISD::STRICT_FSETCCS;
23033 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23034 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23035 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23036 MVT VT = Op->getSimpleValueType(0);
23037 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23038 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23039 SDLoc dl(Op);
23040
23041 if (isFP) {
23042#ifndef NDEBUG
23043 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23044 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
23045#endif
23046
23047 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23048 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23049
23050 // If we have a strict compare with a vXi1 result and the input is 128/256
23051 // bits we can't use a masked compare unless we have VLX. If we use a wider
23052 // compare like we do for non-strict, we might trigger spurious exceptions
23053 // from the upper elements. Instead emit an AVX compare and convert to mask.
23054 unsigned Opc;
23055 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23056 (!IsStrict || Subtarget.hasVLX() ||
23057 Op0.getSimpleValueType().is512BitVector())) {
23058 assert(VT.getVectorNumElements() <= 16);
23059 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23060 } else {
23061 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23062 // The SSE/AVX packed FP comparison nodes are defined with a
23063 // floating-point vector result that matches the operand type. This allows
23064 // them to work with an SSE1 target (integer vector types are not legal).
23065 VT = Op0.getSimpleValueType();
23066 }
23067
23068 SDValue Cmp;
23069 bool IsAlwaysSignaling;
23070 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23071 if (!Subtarget.hasAVX()) {
23072 // TODO: We could use following steps to handle a quiet compare with
23073 // signaling encodings.
23074 // 1. Get ordered masks from a quiet ISD::SETO
23075 // 2. Use the masks to mask potential unordered elements in operand A, B
23076 // 3. Get the compare results of masked A, B
23077 // 4. Calculating final result using the mask and result from 3
23078 // But currently, we just fall back to scalar operations.
23079 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23080 return SDValue();
23081
23082 // Insert an extra signaling instruction to raise exception.
23083 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23084 SDValue SignalCmp = DAG.getNode(
23085 Opc, dl, {VT, MVT::Other},
23086 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23087 // FIXME: It seems we need to update the flags of all new strict nodes.
23088 // Otherwise, mayRaiseFPException in MI will return false due to
23089 // NoFPExcept = false by default. However, I didn't find it in other
23090 // patches.
23091 SignalCmp->setFlags(Op->getFlags());
23092 Chain = SignalCmp.getValue(1);
23093 }
23094
23095 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23096 // emit two comparisons and a logic op to tie them together.
23097 if (SSECC >= 8) {
23098 // LLVM predicate is SETUEQ or SETONE.
23099 unsigned CC0, CC1;
23100 unsigned CombineOpc;
23101 if (Cond == ISD::SETUEQ) {
23102 CC0 = 3; // UNORD
23103 CC1 = 0; // EQ
23104 CombineOpc = X86ISD::FOR;
23105 } else {
23106 assert(Cond == ISD::SETONE);
23107 CC0 = 7; // ORD
23108 CC1 = 4; // NEQ
23109 CombineOpc = X86ISD::FAND;
23110 }
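// Illustrative note (not part of the original file): under IEEE semantics the
// two predicates decompose as
//   SETUEQ(a, b) = unordered(a, b) OR  (a == b)   -> UNORD OR  EQ
//   SETONE(a, b) = ordered(a, b)   AND (a != b)   -> ORD   AND NEQ
// which is exactly the CC0/CC1/CombineOpc selection made above.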
23111
23112 SDValue Cmp0, Cmp1;
23113 if (IsStrict) {
23114 Cmp0 = DAG.getNode(
23115 Opc, dl, {VT, MVT::Other},
23116 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23117 Cmp1 = DAG.getNode(
23118 Opc, dl, {VT, MVT::Other},
23119 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23120 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23121 Cmp1.getValue(1));
23122 } else {
23123 Cmp0 = DAG.getNode(
23124 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23125 Cmp1 = DAG.getNode(
23126 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23127 }
23128 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23129 } else {
23130 if (IsStrict) {
23131 Cmp = DAG.getNode(
23132 Opc, dl, {VT, MVT::Other},
23133 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23134 Chain = Cmp.getValue(1);
23135 } else
23136 Cmp = DAG.getNode(
23137 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23138 }
23139 } else {
23140 // Handle all other FP comparisons here.
23141 if (IsStrict) {
23142 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23143 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23144 Cmp = DAG.getNode(
23145 Opc, dl, {VT, MVT::Other},
23146 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23147 Chain = Cmp.getValue(1);
23148 } else
23149 Cmp = DAG.getNode(
23150 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23151 }
23152
23153 if (VT.getFixedSizeInBits() >
23154 Op.getSimpleValueType().getFixedSizeInBits()) {
23155 // We emitted a compare with an XMM/YMM result. Finish converting to a
23156 // mask register using a vptestm.
23157 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23158 Cmp = DAG.getBitcast(CastVT, Cmp);
23159 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23160 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23161 } else {
23162 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23163 // the result type of SETCC. The bitcast is expected to be optimized
23164 // away during combining/isel.
23165 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23166 }
23167
23168 if (IsStrict)
23169 return DAG.getMergeValues({Cmp, Chain}, dl);
23170
23171 return Cmp;
23172 }
23173
23174 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23175
23176 MVT VTOp0 = Op0.getSimpleValueType();
23177 (void)VTOp0;
23178 assert(VTOp0 == Op1.getSimpleValueType() &&
23179 "Expected operands with same type!");
23180 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23181 "Invalid number of packed elements for source and destination!");
23182
23183 // The non-AVX512 code below works under the assumption that source and
23184 // destination types are the same.
23185 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23186 "Value types for source and destination must be the same!");
23187
23188 // The result is boolean, but operands are int/float
23189 if (VT.getVectorElementType() == MVT::i1) {
23190 // In the AVX-512 architecture setcc returns a mask with i1 elements,
23191 // but there is no compare instruction for i8 and i16 elements in KNL.
23192 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23193 "Unexpected operand type");
23194 return LowerIntVSETCC_AVX512(Op, DAG);
23195 }
23196
23197 // Lower using XOP integer comparisons.
23198 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23199 // Translate compare code to XOP PCOM compare mode.
23200 unsigned CmpMode = 0;
23201 switch (Cond) {
23202 default: llvm_unreachable("Unexpected SETCC condition");
23203 case ISD::SETULT:
23204 case ISD::SETLT: CmpMode = 0x00; break;
23205 case ISD::SETULE:
23206 case ISD::SETLE: CmpMode = 0x01; break;
23207 case ISD::SETUGT:
23208 case ISD::SETGT: CmpMode = 0x02; break;
23209 case ISD::SETUGE:
23210 case ISD::SETGE: CmpMode = 0x03; break;
23211 case ISD::SETEQ: CmpMode = 0x04; break;
23212 case ISD::SETNE: CmpMode = 0x05; break;
23213 }
23214
23215 // Are we comparing unsigned or signed integers?
23216 unsigned Opc =
23217 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23218
23219 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23220 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23221 }
23222
23223 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23224 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23225 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23226 SDValue BC0 = peekThroughBitcasts(Op0);
23227 if (BC0.getOpcode() == ISD::AND) {
23228 APInt UndefElts;
23229 SmallVector<APInt, 64> EltBits;
23230 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23231 VT.getScalarSizeInBits(), UndefElts,
23232 EltBits, false, false)) {
23233 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23234 Cond = ISD::SETEQ;
23235 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23236 }
23237 }
23238 }
23239 }
23240
23241 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23242 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23243 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23244 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23245 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23246 unsigned BitWidth = VT.getScalarSizeInBits();
23247 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23248
23249 SDValue Result = Op0.getOperand(0);
23250 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23251 DAG.getConstant(ShiftAmt, dl, VT));
23252 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23253 DAG.getConstant(BitWidth - 1, dl, VT));
23254 return Result;
23255 }
23256 }
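// Worked example (illustrative, not from the original file), assuming 8-bit
// lanes and C == 0x10, so log2(C) == 4 and ShiftAmt == 8 - 4 - 1 == 3:
//   (x & 0x10) == 0x10  ->  SRA(SHL(x, 3), 7)
// The left shift moves bit 4 into the sign bit, and the arithmetic right
// shift by BitWidth-1 broadcasts it, giving all-ones when the bit is set and
// zero otherwise - the same boolean vector a PCMPEQ would have produced.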
23257
23258 // Break 256-bit integer vector compare into smaller ones.
23259 if (VT.is256BitVector() && !Subtarget.hasInt256())
23260 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23261
23262 if (VT == MVT::v32i16 || VT == MVT::v64i8) {
23263 assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
23264 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23265 }
23266
23267 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23268 // not-of-PCMPEQ:
23269 // X != INT_MIN --> X >s INT_MIN
23270 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23271 // +X != 0 --> +X >s 0
23272 APInt ConstValue;
23273 if (Cond == ISD::SETNE &&
23274 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23275 if (ConstValue.isMinSignedValue())
23276 Cond = ISD::SETGT;
23277 else if (ConstValue.isMaxSignedValue())
23278 Cond = ISD::SETLT;
23279 else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
23280 Cond = ISD::SETGT;
23281 }
23282
23283 // If both operands are known non-negative, then an unsigned compare is the
23284 // same as a signed compare and there's no need to flip signbits.
23285 // TODO: We could check for more general simplifications here since we're
23286 // computing known bits.
23287 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23288 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23289
23290 // Special case: Use min/max operations for unsigned compares.
23291 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23292 if (ISD::isUnsignedIntSetCC(Cond) &&
23293 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23294 TLI.isOperationLegal(ISD::UMIN, VT)) {
23295 // If we have a constant operand, increment/decrement it and change the
23296 // condition to avoid an invert.
23297 if (Cond == ISD::SETUGT) {
23298 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23299 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
23300 Op1 = UGTOp1;
23301 Cond = ISD::SETUGE;
23302 }
23303 }
23304 if (Cond == ISD::SETULT) {
23305 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23306 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
23307 Op1 = ULTOp1;
23308 Cond = ISD::SETULE;
23309 }
23310 }
23311 bool Invert = false;
23312 unsigned Opc;
23313 switch (Cond) {
23314 default: llvm_unreachable("Unexpected condition code");
23315 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
23316 case ISD::SETULE: Opc = ISD::UMIN; break;
23317 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
23318 case ISD::SETUGE: Opc = ISD::UMAX; break;
23319 }
23320
23321 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23322 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23323
23324 // If the logical-not of the result is required, perform that now.
23325 if (Invert)
23326 Result = DAG.getNOT(dl, Result, VT);
23327
23328 return Result;
23329 }
23330
23331 // Try to use SUBUS and PCMPEQ.
23332 if (FlipSigns)
23333 if (SDValue V =
23334 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23335 return V;
23336
23337 // We are handling one of the integer comparisons here. Since SSE only has
23338 // GT and EQ comparisons for integer, swapping operands and multiple
23339 // operations may be required for some comparisons.
23340 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23341 : X86ISD::PCMPGT;
23342 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23343 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23344 bool Invert = Cond == ISD::SETNE ||
23345 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23346
23347 if (Swap)
23348 std::swap(Op0, Op1);
23349
23350 // Check that the operation in question is available (most are plain SSE2,
23351 // but PCMPGTQ and PCMPEQQ have different requirements).
23352 if (VT == MVT::v2i64) {
23353 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23354 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23355
23356 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23357 // the odd elements over the even elements.
23358 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23359 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23360 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23361
23362 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23363 static const int MaskHi[] = { 1, 1, 3, 3 };
23364 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23365
23366 return DAG.getBitcast(VT, Result);
23367 }
23368
23369 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23370 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23371 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23372
23373 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23374 static const int MaskHi[] = { 1, 1, 3, 3 };
23375 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23376
23377 return DAG.getBitcast(VT, Result);
23378 }
23379
23380 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23381 // bits of the inputs before performing those operations. The lower
23382 // compare is always unsigned.
23383 SDValue SB;
23384 if (FlipSigns) {
23385 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
23386 } else {
23387 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
23388 }
23389 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23390 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23391
23392 // Cast everything to the right type.
23393 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23394 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23395
23396 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23397 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23398 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23399
23400 // Create masks for only the low parts/high parts of the 64 bit integers.
23401 static const int MaskHi[] = { 1, 1, 3, 3 };
23402 static const int MaskLo[] = { 0, 0, 2, 2 };
23403 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23404 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23405 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23406
23407 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23408 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23409
23410 if (Invert)
23411 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23412
23413 return DAG.getBitcast(VT, Result);
23414 }
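// Scalar model of the emulation above (illustrative only): with each 64-bit
// lane split into 32-bit halves,
//   a >s b  <=>  (aHi > bHi) || ((aHi == bHi) && (aLo >u bLo))
// PCMPGT is a signed compare, so the low halves always get their sign bit
// flipped (the 0x80000000 part of SB) to make that sub-compare behave as
// unsigned; the high halves are flipped as well only when FlipSigns requests
// an unsigned 64-bit compare.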
23415
23416 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23417 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23418 // pcmpeqd + pshufd + pand.
23419 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23420
23421 // First cast everything to the right type.
23422 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23423 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23424
23425 // Do the compare.
23426 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23427
23428 // Make sure the lower and upper halves are both all-ones.
23429 static const int Mask[] = { 1, 0, 3, 2 };
23430 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23431 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23432
23433 if (Invert)
23434 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23435
23436 return DAG.getBitcast(VT, Result);
23437 }
23438 }
23439
23440 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23441 // bits of the inputs before performing those operations.
23442 if (FlipSigns) {
23443 MVT EltVT = VT.getVectorElementType();
23444 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23445 VT);
23446 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23447 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23448 }
23449
23450 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23451
23452 // If the logical-not of the result is required, perform that now.
23453 if (Invert)
23454 Result = DAG.getNOT(dl, Result, VT);
23455
23456 return Result;
23457}
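The FlipSigns path above turns the unsigned compares that SSE lacks into the signed PCMPGT it does have by XORing both operands with the sign mask. A minimal scalar sketch of why that works, assuming 32-bit elements and two's complement wrapping (the helper name is hypothetical):

static bool ugt_via_signed(unsigned a, unsigned b) {
  // Flipping the sign bit maps the unsigned order onto the signed order, so a
  // signed compare of the flipped values equals the unsigned compare of the
  // originals.
  return (int)(a ^ 0x80000000u) > (int)(b ^ 0x80000000u);
}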
23458
23459// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23460static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23461 const SDLoc &dl, SelectionDAG &DAG,
23462 const X86Subtarget &Subtarget,
23463 SDValue &X86CC) {
23464 // Only support equality comparisons.
23465 if (CC != ISD::SETEQ && CC != ISD::SETNE)
23466 return SDValue();
23467
23468 // Must be a bitcast from vXi1.
23469 if (Op0.getOpcode() != ISD::BITCAST)
23470 return SDValue();
23471
23472 Op0 = Op0.getOperand(0);
23473 MVT VT = Op0.getSimpleValueType();
23474 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23475 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23476 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23477 return SDValue();
23478
23479 X86::CondCode X86Cond;
23480 if (isNullConstant(Op1)) {
23481 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23482 } else if (isAllOnesConstant(Op1)) {
23483 // C flag is set for all ones.
23484 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23485 } else
23486 return SDValue();
23487
23488 // If the input is an AND, we can combine its operands into the KTEST.
23489 bool KTestable = false;
23490 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23491 KTestable = true;
23492 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23493 KTestable = true;
23494 if (!isNullConstant(Op1))
23495 KTestable = false;
23496 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23497 SDValue LHS = Op0.getOperand(0);
23498 SDValue RHS = Op0.getOperand(1);
23499 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23500 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23501 }
23502
23503 // If the input is an OR, we can combine its operands into the KORTEST.
23504 SDValue LHS = Op0;
23505 SDValue RHS = Op0;
23506 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23507 LHS = Op0.getOperand(0);
23508 RHS = Op0.getOperand(1);
23509 }
23510
23511 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23512 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23513}
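For reference, the flag behaviour this helper depends on can be modelled in scalar form (assumption: 16-bit mask registers; the helpers are hypothetical). KORTEST ORs the two masks and sets ZF when the result is zero and CF when it is all ones, which is why a compare against zero maps to COND_E/COND_NE and a compare against all-ones maps to COND_B/COND_AE; KTEST does the analogous thing with AND for the zero case.

static bool kortest_zf(unsigned short a, unsigned short b) {
  return (unsigned short)(a | b) == 0;        // ZF: OR of the masks is zero
}
static bool kortest_cf(unsigned short a, unsigned short b) {
  return (unsigned short)(a | b) == 0xFFFF;   // CF: OR of the masks is all ones
}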
23514
23515/// Emit flags for the given setcc condition and operands. Also returns the
23516/// corresponding X86 condition code constant in X86CC.
23517SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23518 ISD::CondCode CC, const SDLoc &dl,
23519 SelectionDAG &DAG,
23520 SDValue &X86CC) const {
23521 // Optimize to BT if possible.
23522 // Lower (X & (1 << N)) == 0 to BT(X, N).
23523 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23524 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23525 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
23526 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23527 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
23528 return BT;
23529 }
23530
23531 // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
23532 // TODO: We could do AND tree with all 1s as well by using the C flag.
23533 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
23534 if (SDValue CmpZ =
23535 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
23536 return CmpZ;
23537
23538 // Try to lower using KORTEST or KTEST.
23539 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23540 return Test;
23541
23542 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
23543 // these.
23544 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
23545 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23546 // If the input is a setcc, then reuse the input setcc or use a new one with
23547 // the inverted condition.
23548 if (Op0.getOpcode() == X86ISD::SETCC) {
23549 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23550
23551 X86CC = Op0.getOperand(0);
23552 if (Invert) {
23553 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23554 CCode = X86::GetOppositeBranchCondition(CCode);
23555 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23556 }
23557
23558 return Op0.getOperand(1);
23559 }
23560 }
23561
23562 // Try to use the carry flag from the add in place of a separate CMP for:
23563 // (seteq (add X, -1), -1). Similar for setne.
23564 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23565 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
23566 if (isProfitableToUseFlagOp(Op0)) {
23567 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23568
23569 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23570 Op0.getOperand(1));
23571 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23572 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23573 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
23574 return SDValue(New.getNode(), 1);
23575 }
23576 }
23577
23578 X86::CondCode CondCode =
23579 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23580 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23581
23582 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23583 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23584 return EFLAGS;
23585}
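The BT lowering listed at the top of emitFlagsForSetcc can be pictured with a scalar model (illustrative, hypothetical helper): BT copies the selected bit into CF, so patterns like '(X & (1 << N)) == 0' collapse into a single bit test followed by a CF-based SETCC.

static bool bt_cf(unsigned x, unsigned n) {
  return (x >> n) & 1u;   // models the CF result of BT x, n
}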
23586
23587SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23588
23589 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23590 Op.getOpcode() == ISD::STRICT_FSETCCS;
23591 MVT VT = Op->getSimpleValueType(0);
23592
23593 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23594
23595 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23596 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23597 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23598 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23599 SDLoc dl(Op);
23600 ISD::CondCode CC =
23601 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23602
23603 // Handle f128 first, since one possible outcome is a normal integer
23604 // comparison which gets handled by emitFlagsForSetcc.
23605 if (Op0.getValueType() == MVT::f128) {
23606 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23607 Op.getOpcode() == ISD::STRICT_FSETCCS);
23608
23609 // If softenSetCCOperands returned a scalar, use it.
23610 if (!Op1.getNode()) {
23611 assert(Op0.getValueType() == Op.getValueType() &&
23612 "Unexpected setcc expansion!");
23613 if (IsStrict)
23614 return DAG.getMergeValues({Op0, Chain}, dl);
23615 return Op0;
23616 }
23617 }
23618
23619 if (Op0.getSimpleValueType().isInteger()) {
23620 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
23621 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
23622 // this may translate to less uops depending on uarch implementation. The
23623 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23624 // canonicalize to that CondCode.
23625 // NOTE: Only do this if incrementing the constant doesn't increase the bit
23626 // encoding size - so it must either already be an i8 or i32 immediate, or it
23627 // shrinks down to that. We don't do this for any i64's to avoid additional
23628 // constant materializations.
23629 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
23630 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23631 const APInt &Op1Val = Op1C->getAPIntValue();
23632 if (!Op1Val.isNullValue()) {
23633 // Ensure the constant+1 doesn't overflow.
23634 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23635 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23636 APInt Op1ValPlusOne = Op1Val + 1;
23637 if (Op1ValPlusOne.isSignedIntN(32) &&
23638 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23639 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23640 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
23641 : ISD::CondCode::SETUGE;
23642 }
23643 }
23644 }
23645 }
23646
23647 SDValue X86CC;
23648 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23649 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23650 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23651 }
23652
23653 // Handle floating point.
23654 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23655 if (CondCode == X86::COND_INVALID)
23656 return SDValue();
23657
23658 SDValue EFLAGS;
23659 if (IsStrict) {
23660 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23661 EFLAGS =
23662 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23663 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23664 Chain = EFLAGS.getValue(1);
23665 } else {
23666 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23667 }
23668
23669 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23670 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23671 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23672}
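The constant canonicalization in the integer path above rewrites a strict compare into its inclusive form whenever the incremented constant still fits the same immediate width, e.g. 'x > 9' becomes 'x >= 10', because the GE-style conditions do not read ZF. A scalar sketch of the identity (hypothetical helper; valid whenever c is not the maximum value of its type):

static bool sgt_as_sge(int x, int c) {
  return x >= c + 1;   // == (x > c) as long as c < INT_MAX
}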
23673
23674SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23675 SDValue LHS = Op.getOperand(0);
23676 SDValue RHS = Op.getOperand(1);
23677 SDValue Carry = Op.getOperand(2);
23678 SDValue Cond = Op.getOperand(3);
23679 SDLoc DL(Op);
23680
23681 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23682 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23683
23684 // Recreate the carry if needed.
23685 EVT CarryVT = Carry.getValueType();
23686 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23687 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23688
23689 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23690 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23691 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23692}
23693
23694// This function returns three things: the arithmetic computation itself
23695// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23696// flag and the condition code define the case in which the arithmetic
23697// computation overflows.
23698static std::pair<SDValue, SDValue>
23699getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23700 assert(Op.getResNo() == 0 && "Unexpected result number!");
23701 SDValue Value, Overflow;
23702 SDValue LHS = Op.getOperand(0);
23703 SDValue RHS = Op.getOperand(1);
23704 unsigned BaseOp = 0;
23705 SDLoc DL(Op);
23706 switch (Op.getOpcode()) {
23707 default: llvm_unreachable("Unknown ovf instruction!");
23708 case ISD::SADDO:
23709 BaseOp = X86ISD::ADD;
23710 Cond = X86::COND_O;
23711 break;
23712 case ISD::UADDO:
23713 BaseOp = X86ISD::ADD;
23714 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23715 break;
23716 case ISD::SSUBO:
23717 BaseOp = X86ISD::SUB;
23718 Cond = X86::COND_O;
23719 break;
23720 case ISD::USUBO:
23721 BaseOp = X86ISD::SUB;
23722 Cond = X86::COND_B;
23723 break;
23724 case ISD::SMULO:
23725 BaseOp = X86ISD::SMUL;
23726 Cond = X86::COND_O;
23727 break;
23728 case ISD::UMULO:
23729 BaseOp = X86ISD::UMUL;
23730 Cond = X86::COND_O;
23731 break;
23732 }
23733
23734 if (BaseOp) {
23735 // Also sets EFLAGS.
23736 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23737 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23738 Overflow = Value.getValue(1);
23739 }
23740
23741 return std::make_pair(Value, Overflow);
23742}
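A scalar sketch of the flag mapping chosen above (illustrative; 32-bit operands assumed): in the common case unsigned add overflow is observed through the carry flag (COND_B, with an INC-style COND_E special case when the RHS is 1), while signed add overflow is observed through the overflow flag (COND_O). __builtin_add_overflow is used here only as a stand-in for the OF check.

static bool uaddo(unsigned a, unsigned b) {
  return a + b < a;                          // carry out of the ADD -> COND_B
}
static bool saddo(int a, int b) {
  int r;
  return __builtin_add_overflow(a, b, &r);   // signed overflow -> COND_O
}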
23743
23744static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23745 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23746 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23747 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23748 // has only one use.
23749 SDLoc DL(Op);
23750 X86::CondCode Cond;
23751 SDValue Value, Overflow;
23752 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23753
23754 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23755 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23756 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23757}
23758
23759/// Return true if opcode is a X86 logical comparison.
23760static bool isX86LogicalCmp(SDValue Op) {
23761 unsigned Opc = Op.getOpcode();
23762 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23763 Opc == X86ISD::FCMP)
23764 return true;
23765 if (Op.getResNo() == 1 &&
23766 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23767 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23768 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23769 return true;
23770
23771 return false;
23772}
23773
23774static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23775 if (V.getOpcode() != ISD::TRUNCATE)
23776 return false;
23777
23778 SDValue VOp0 = V.getOperand(0);
23779 unsigned InBits = VOp0.getValueSizeInBits();
23780 unsigned Bits = V.getValueSizeInBits();
23781 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23782}
23783
23784SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23785 bool AddTest = true;
23786 SDValue Cond = Op.getOperand(0);
23787 SDValue Op1 = Op.getOperand(1);
23788 SDValue Op2 = Op.getOperand(2);
23789 SDLoc DL(Op);
23790 MVT VT = Op1.getSimpleValueType();
23791 SDValue CC;
23792
23793 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
23794 // are available or VBLENDV if AVX is available.
23795 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23796 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23797 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23798 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23799 bool IsAlwaysSignaling;
23800 unsigned SSECC =
23801 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23802 CondOp0, CondOp1, IsAlwaysSignaling);
23803
23804 if (Subtarget.hasAVX512()) {
23805 SDValue Cmp =
23806 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23807 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23808 assert(!VT.isVector() && "Not a scalar type?");
23809 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23810 }
23811
23812 if (SSECC < 8 || Subtarget.hasAVX()) {
23813 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
23814 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23815
23816 // If we have AVX, we can use a variable vector select (VBLENDV) instead
23817 // of 3 logic instructions for size savings and potentially speed.
23818 // Unfortunately, there is no scalar form of VBLENDV.
23819
23820 // If either operand is a +0.0 constant, don't try this. We can expect to
23821 // optimize away at least one of the logic instructions later in that
23822 // case, so that sequence would be faster than a variable blend.
23823
23824 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
23825 // uses XMM0 as the selection register. That may need just as many
23826 // instructions as the AND/ANDN/OR sequence due to register moves, so
23827 // don't bother.
23828 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
23829 !isNullFPConstant(Op2)) {
23830 // Convert to vectors, do a VSELECT, and convert back to scalar.
23831 // All of the conversions should be optimized away.
23832 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
23833 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
23834 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
23835 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
23836
23837 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
23838 VCmp = DAG.getBitcast(VCmpVT, VCmp);
23839
23840 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
23841
23842 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
23843 VSel, DAG.getIntPtrConstant(0, DL));
23844 }
23845 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
23846 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
23847 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
23848 }
23849 }
23850
23851 // AVX512 fallback is to lower selects of scalar floats to masked moves.
23852 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
23853 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
23854 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23855 }
23856
23857 if (Cond.getOpcode() == ISD::SETCC) {
23858 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
23859 Cond = NewCond;
23860 // If the condition was updated, it's possible that the operands of the
23861 // select were also updated (for example, EmitTest has a RAUW). Refresh
23862 // the local references to the select operands in case they got stale.
23863 Op1 = Op.getOperand(1);
23864 Op2 = Op.getOperand(2);
23865 }
23866 }
23867
23868 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
23869 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
23870 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
23871 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
23872 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
23873 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
23874 if (Cond.getOpcode() == X86ISD::SETCC &&
23875 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
23876 isNullConstant(Cond.getOperand(1).getOperand(1))) {
23877 SDValue Cmp = Cond.getOperand(1);
23878 SDValue CmpOp0 = Cmp.getOperand(0);
23879 unsigned CondCode = Cond.getConstantOperandVal(0);
23880
23881 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
23882 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
23883 // handling to keep the CMP with 0. This should be removed by
23884 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
23885 // cttz_zero_undef.
23886 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
23887 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
23888 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
23889 };
23890 if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
23891 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
23892 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
23893 // Keep Cmp.
23894 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23895 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
23896 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
23897
23898 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23899 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
23900
23901 // Apply further optimizations for special cases
23902 // (select (x != 0), -1, 0) -> neg & sbb
23903 // (select (x == 0), 0, -1) -> neg & sbb
23904 if (isNullConstant(Y) &&
23905 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
23906 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
23907 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
23908 Zero = DAG.getConstant(0, DL, Op.getValueType());
23909 return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
23910 }
23911
23912 Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
23913 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
23914
23915 SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
23916 SDValue Res = // Res = 0 or -1.
23917 DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
23918
23919 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
23920 Res = DAG.getNOT(DL, Res, Res.getValueType());
23921
23922 return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
23923 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
23924 Cmp.getOperand(0).getOpcode() == ISD::AND &&
23925 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
23926 SDValue Src1, Src2;
23927 // Returns true if Op2 is an XOR or OR operator and one of its operands
23928 // is equal to Op1:
23929 // ( a , a op b) || ( b , a op b)
23930 auto isOrXorPattern = [&]() {
23931 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
23932 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
23933 Src1 =
23934 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
23935 Src2 = Op1;
23936 return true;
23937 }
23938 return false;
23939 };
23940
23941 if (isOrXorPattern()) {
23942 SDValue Neg;
23943 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
23944 // We need a mask of all zeros or all ones with the same size as the
23945 // other operands.
23946 if (CmpSz > VT.getSizeInBits())
23947 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
23948 else if (CmpSz < VT.getSizeInBits())
23949 Neg = DAG.getNode(ISD::AND, DL, VT,
23950 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
23951 DAG.getConstant(1, DL, VT));
23952 else
23953 Neg = CmpOp0;
23954 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
23955 Neg); // -(and (x, 0x1))
23956 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
23957 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
23958 }
23959 }
23960 }
23961
23962 // Look past (and (setcc_carry (cmp ...)), 1).
23963 if (Cond.getOpcode() == ISD::AND &&
23964 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
23965 isOneConstant(Cond.getOperand(1)))
23966 Cond = Cond.getOperand(0);
23967
23968 // If condition flag is set by a X86ISD::CMP, then use it as the condition
23969 // setting operand in place of the X86ISD::SETCC.
23970 unsigned CondOpcode = Cond.getOpcode();
23971 if (CondOpcode == X86ISD::SETCC ||
23972 CondOpcode == X86ISD::SETCC_CARRY) {
23973 CC = Cond.getOperand(0);
23974
23975 SDValue Cmp = Cond.getOperand(1);
23976 bool IllegalFPCMov = false;
23977 if (VT.isFloatingPoint() && !VT.isVector() &&
23978 !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
23979 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
23980
23981 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
23982 Cmp.getOpcode() == X86ISD::BT) { // FIXME
23983 Cond = Cmp;
23984 AddTest = false;
23985 }
23986 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
23987 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
23988 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
23989 SDValue Value;
23990 X86::CondCode X86Cond;
23991 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23992
23993 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
23994 AddTest = false;
23995 }
23996
23997 if (AddTest) {
23998 // Look past the truncate if the high bits are known zero.
23999 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24000 Cond = Cond.getOperand(0);
24001
24002 // We know the result of AND is compared against zero. Try to match
24003 // it to BT.
24004 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24005 SDValue BTCC;
24006 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
24007 CC = BTCC;
24008 Cond = BT;
24009 AddTest = false;
24010 }
24011 }
24012 }
24013
24014 if (AddTest) {
24015 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24016 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24017 }
24018
24019 // a < b ? -1 : 0 -> RES = ~setcc_carry
24020 // a < b ? 0 : -1 -> RES = setcc_carry
24021 // a >= b ? -1 : 0 -> RES = setcc_carry
24022 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24023 if (Cond.getOpcode() == X86ISD::SUB) {
24024 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24025
24026 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24027 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24028 (isNullConstant(Op1) || isNullConstant(Op2))) {
24029 SDValue Res =
24030 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24031 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24032 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24033 return DAG.getNOT(DL, Res, Res.getValueType());
24034 return Res;
24035 }
24036 }
24037
24038 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
24039 // widen the cmov and push the truncate through. This avoids introducing a new
24040 // branch during isel and doesn't add any extensions.
24041 if (Op.getValueType() == MVT::i8 &&
24042 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24043 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24044 if (T1.getValueType() == T2.getValueType() &&
24045 // Exclude CopyFromReg to avoid partial register stalls.
24046 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24047 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24048 CC, Cond);
24049 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24050 }
24051 }
24052
24053 // Or finally, promote i8 cmovs if we have CMOV,
24054 // or i16 cmovs if it won't prevent folding a load.
24055 // FIXME: we should not limit promotion of the i8 case to only when the CMOV is
24056 // legal, but EmitLoweredSelect() cannot deal with these extensions
24057 // being inserted between two CMOV's. (This applies to the i16 case too.)
24058 // https://bugs.llvm.org/show_bug.cgi?id=40974
24059 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
24060 (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
24061 !MayFoldLoad(Op2))) {
24062 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24063 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24064 SDValue Ops[] = { Op2, Op1, CC, Cond };
24065 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24066 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24067 }
24068
24069 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24070 // condition is true.
24071 SDValue Ops[] = { Op2, Op1, CC, Cond };
24072 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24073}
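The SBB-based folds listed ahead of the CondCode handling in LowerSELECT come down to one observation: subtracting 1 from x borrows exactly when x == 0, and sign-extending that borrow gives the 0/all-ones mask the select needs. A scalar sketch (hypothetical helper; 32-bit values assumed):

static unsigned select_eqz_minus1_or_y(unsigned x, unsigned y) {
  unsigned borrow = (x == 0u);   // CF produced by SUB x, 1
  unsigned mask = 0u - borrow;   // SBB 0, 0 -> 0 or 0xFFFFFFFF
  return mask | y;               // == (x == 0 ? -1 : y)
}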
24074
24075static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24076 const X86Subtarget &Subtarget,
24077 SelectionDAG &DAG) {
24078 MVT VT = Op->getSimpleValueType(0);
24079 SDValue In = Op->getOperand(0);
24080 MVT InVT = In.getSimpleValueType();
24081 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24082 MVT VTElt = VT.getVectorElementType();
24083 SDLoc dl(Op);
24084
24085 unsigned NumElts = VT.getVectorNumElements();
24086
24087 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24088 MVT ExtVT = VT;
24089 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24090 // If v16i32 is to be avoided, we'll need to split and concatenate.
24091 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24092 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24093
24094 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24095 }
24096
24097 // Widen to 512-bits if VLX is not supported.
24098 MVT WideVT = ExtVT;
24099 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24100 NumElts *= 512 / ExtVT.getSizeInBits();
24101 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24102 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24103 In, DAG.getIntPtrConstant(0, dl));
24104 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24105 }
24106
24107 SDValue V;
24108 MVT WideEltVT = WideVT.getVectorElementType();
24109 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24110 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24111 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24112 } else {
24113 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24114 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24115 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24116 }
24117
24118 // Truncate if we had to extend i16/i8 above.
24119 if (VT != ExtVT) {
24120 WideVT = MVT::getVectorVT(VTElt, NumElts);
24121 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24122 }
24123
24124 // Extract back to 128/256-bit if we widened.
24125 if (WideVT != VT)
24126 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24127 DAG.getIntPtrConstant(0, dl));
24128
24129 return V;
24130}
24131
24132static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24133 SelectionDAG &DAG) {
24134 SDValue In = Op->getOperand(0);
24135 MVT InVT = In.getSimpleValueType();
24136
24137 if (InVT.getVectorElementType() == MVT::i1)
24138 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24139
24140 assert(Subtarget.hasAVX() && "Expected AVX support");
24141 return LowerAVXExtend(Op, DAG, Subtarget);
24142}
24143
24144// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24145// For sign extend this needs to handle all vector sizes and SSE4.1 and
24146// non-SSE4.1 targets. For zero extend this should only handle inputs of
24147// MVT::v64i8 when BWI is not supported, but AVX512 is.
24148static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24149 const X86Subtarget &Subtarget,
24150 SelectionDAG &DAG) {
24151 SDValue In = Op->getOperand(0);
24152 MVT VT = Op->getSimpleValueType(0);
24153 MVT InVT = In.getSimpleValueType();
24154
24155 MVT SVT = VT.getVectorElementType();
24156 MVT InSVT = InVT.getVectorElementType();
24157 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24158
24159 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24160 return SDValue();
24161 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24162 return SDValue();
24163 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24164 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24165 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24166 return SDValue();
24167
24168 SDLoc dl(Op);
24169 unsigned Opc = Op.getOpcode();
24170 unsigned NumElts = VT.getVectorNumElements();
24171
24172 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24173 // For 512-bit vectors, we need 128-bits or 256-bits.
24174 if (InVT.getSizeInBits() > 128) {
24175 // Input needs to be at least the same number of elements as output, and
24176 // at least 128-bits.
24177 int InSize = InSVT.getSizeInBits() * NumElts;
24178 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24179 InVT = In.getSimpleValueType();
24180 }
24181
24182 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24183 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24184 // need to be handled here for 256/512-bit results.
24185 if (Subtarget.hasInt256()) {
24186 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24187
24188 if (InVT.getVectorNumElements() != NumElts)
24189 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24190
24191 // FIXME: Apparently we create inreg operations that could be regular
24192 // extends.
24193 unsigned ExtOpc =
24194 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24195 : ISD::ZERO_EXTEND;
24196 return DAG.getNode(ExtOpc, dl, VT, In);
24197 }
24198
24199 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24200 if (Subtarget.hasAVX()) {
24201 assert(VT.is256BitVector() && "256-bit vector expected");
24202 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24203 int HalfNumElts = HalfVT.getVectorNumElements();
24204
24205 unsigned NumSrcElts = InVT.getVectorNumElements();
24206 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24207 for (int i = 0; i != HalfNumElts; ++i)
24208 HiMask[i] = HalfNumElts + i;
24209
24210 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24211 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24212 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24213 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24214 }
24215
24216 // We should only get here for sign extend.
24217 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24218 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24219
24220 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24221 SDValue Curr = In;
24222 SDValue SignExt = Curr;
24223
24224 // As SRAI is only available on i16/i32 types, we expand only up to i32
24225 // and handle i64 separately.
24226 if (InVT != MVT::v4i32) {
24227 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24228
24229 unsigned DestWidth = DestVT.getScalarSizeInBits();
24230 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24231
24232 unsigned InNumElts = InVT.getVectorNumElements();
24233 unsigned DestElts = DestVT.getVectorNumElements();
24234
24235 // Build a shuffle mask that takes each input element and places it in the
24236 // MSBs of the new element size.
24237 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24238 for (unsigned i = 0; i != DestElts; ++i)
24239 Mask[i * Scale + (Scale - 1)] = i;
24240
24241 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24242 Curr = DAG.getBitcast(DestVT, Curr);
24243
24244 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24245 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24246 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24247 }
24248
24249 if (VT == MVT::v2i64) {
24250 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24251 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24252 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24253 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24254 SignExt = DAG.getBitcast(VT, SignExt);
24255 }
24256
24257 return SignExt;
24258}
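The pre-SSE4.1 path above sign-extends without PMOVSX by shuffling each narrow element into the most significant bits of its wide lane and then shifting it back down arithmetically. A scalar sketch for the i8 -> i32 case (hypothetical helper; the final >> stands in for the arithmetic shift VSRAI performs):

static int sext_i8_to_i32(unsigned char v) {
  int widened = (int)((unsigned)v << 24);   // shuffle: byte placed in the lane's MSBs
  return widened >> 24;                     // VSRAI 24: shift back, replicating the sign
}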
24259
24260static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24261 SelectionDAG &DAG) {
24262 MVT VT = Op->getSimpleValueType(0);
24263 SDValue In = Op->getOperand(0);
24264 MVT InVT = In.getSimpleValueType();
24265 SDLoc dl(Op);
24266
24267 if (InVT.getVectorElementType() == MVT::i1)
24268 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24269
24270 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24271 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24272 "Expected same number of elements");
24273 assert((VT.getVectorElementType() == MVT::i16 ||
24274 VT.getVectorElementType() == MVT::i32 ||
24275 VT.getVectorElementType() == MVT::i64) &&
24276 "Unexpected element type");
24277 assert((InVT.getVectorElementType() == MVT::i8 ||
24278 InVT.getVectorElementType() == MVT::i16 ||
24279 InVT.getVectorElementType() == MVT::i32) &&
24280 "Unexpected element type");
24281
24282 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24283 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24284 return splitVectorIntUnary(Op, DAG);
24285 }
24286
24287 if (Subtarget.hasInt256())
24288 return Op;
24289
24290 // Optimize vectors in AVX mode
24291 // Sign extend v8i16 to v8i32 and
24292 // v4i32 to v4i64
24293 //
24294 // Divide input vector into two parts
24295 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
24296 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
24297 // concat the vectors to original VT
24298 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24299 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24300
24301 unsigned NumElems = InVT.getVectorNumElements();
24302 SmallVector<int,8> ShufMask(NumElems, -1);
24303 for (unsigned i = 0; i != NumElems/2; ++i)
24304 ShufMask[i] = i + NumElems/2;
24305
24306 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24307 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24308
24309 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24310}
24311
24312/// Change a vector store into a pair of half-size vector stores.
24313static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24314 SDValue StoredVal = Store->getValue();
24315 assert((StoredVal.getValueType().is256BitVector() ||
24316 StoredVal.getValueType().is512BitVector()) &&
24317 "Expecting 256/512-bit op");
24318
24319 // Splitting volatile memory ops is not allowed unless the operation was not
24320 // legal to begin with. Assume the input store is legal (this transform is
24321 // only used for targets with AVX). Note: It is possible that we have an
24322 // illegal type like v2i128, and so we could allow splitting a volatile store
24323 // in that case if that is important.
24324 if (!Store->isSimple())
24325 return SDValue();
24326
24327 SDLoc DL(Store);
24328 SDValue Value0, Value1;
24329 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24330 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24331 SDValue Ptr0 = Store->getBasePtr();
24332 SDValue Ptr1 =
24333 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24334 SDValue Ch0 =
24335 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24336 Store->getOriginalAlign(),
24337 Store->getMemOperand()->getFlags());
24338 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24339 Store->getPointerInfo().getWithOffset(HalfOffset),
24340 Store->getOriginalAlign(),
24341 Store->getMemOperand()->getFlags());
24342 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24343}
24344
24345/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24346/// type.
24347static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24348 SelectionDAG &DAG) {
24349 SDValue StoredVal = Store->getValue();
24350 assert(StoreVT.is128BitVector() &&
24351 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24352 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24353
24354 // Splitting volatile memory ops is not allowed unless the operation was not
24355 // legal to begin with. We are assuming the input op is legal (this transform
24356 // is only used for targets with AVX).
24357 if (!Store->isSimple())
24358 return SDValue();
24359
24360 MVT StoreSVT = StoreVT.getScalarType();
24361 unsigned NumElems = StoreVT.getVectorNumElements();
24362 unsigned ScalarSize = StoreSVT.getStoreSize();
24363
24364 SDLoc DL(Store);
24365 SmallVector<SDValue, 4> Stores;
24366 for (unsigned i = 0; i != NumElems; ++i) {
24367 unsigned Offset = i * ScalarSize;
24368 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24369 TypeSize::Fixed(Offset), DL);
24370 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24371 DAG.getIntPtrConstant(i, DL));
24372 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24373 Store->getPointerInfo().getWithOffset(Offset),
24374 Store->getOriginalAlign(),
24375 Store->getMemOperand()->getFlags());
24376 Stores.push_back(Ch);
24377 }
24378 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24379}
24380
24381static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24382 SelectionDAG &DAG) {
24383 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24384 SDLoc dl(St);
24385 SDValue StoredVal = St->getValue();
24386
24387 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24388 if (StoredVal.getValueType().isVector() &&
24389 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24390 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24391 assert(NumElts <= 8 && "Unexpected VT");
24392 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24393 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24394 "Expected AVX512F without AVX512DQI");
24395
24396 // We must pad with zeros to ensure we store zeroes to any unused bits.
24397 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24398 DAG.getUNDEF(MVT::v16i1), StoredVal,
24399 DAG.getIntPtrConstant(0, dl));
24400 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24401 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24402 // Make sure we store zeros in the extra bits.
24403 if (NumElts < 8)
24404 StoredVal = DAG.getZeroExtendInReg(
24405 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24406
24407 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24408 St->getPointerInfo(), St->getOriginalAlign(),
24409 St->getMemOperand()->getFlags());
24410 }
24411
24412 if (St->isTruncatingStore())
24413 return SDValue();
24414
24415 // If this is a 256-bit store of concatenated ops, we are better off splitting
24416 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24417 // and each half can execute independently. Some cores would split the op into
24418 // halves anyway, so the concat (vinsertf128) is purely an extra op.
24419 MVT StoreVT = StoredVal.getSimpleValueType();
24420 if (StoreVT.is256BitVector() ||
24421 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24422 !Subtarget.hasBWI())) {
24423 SmallVector<SDValue, 4> CatOps;
24424 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
24425 return splitVectorStore(St, DAG);
24426 return SDValue();
24427 }
24428
24429 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24430 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
24431 "Unexpected VT");
24432 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24433 TargetLowering::TypeWidenVector && "Unexpected type action!");
24434
24435 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24436 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24437 DAG.getUNDEF(StoreVT));
24438
24439 if (Subtarget.hasSSE2()) {
24440 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24441 // and store it.
24442 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24443 MVT CastVT = MVT::getVectorVT(StVT, 2);
24444 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24445 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24446 DAG.getIntPtrConstant(0, dl));
24447
24448 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24449 St->getPointerInfo(), St->getOriginalAlign(),
24450 St->getMemOperand()->getFlags());
24451 }
24452 assert(Subtarget.hasSSE1() && "Expected SSE");
24453 SDVTList Tys = DAG.getVTList(MVT::Other);
24454 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24455 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24456 St->getMemOperand());
24457}
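For readers following the v2i1/v4i1/v8i1 path above, a minimal scalar sketch of its bit-level effect (standalone C++, not the SelectionDAG API; the helper name is hypothetical): the mask value is truncated to a byte and any bits above NumElts are forced to zero before the store.

#include <cassert>
#include <cstdint>

// Emulate storing an N-element i1 vector (N <= 8) as a single byte,
// with all unused high bits forced to zero, matching the
// zero-extend-in-reg step in LowerStore above.
static uint8_t packMaskBitsForStore(uint16_t MaskBits, unsigned NumElts) {
  assert(NumElts <= 8 && "only v2i1/v4i1/v8i1 are handled this way");
  uint8_t Byte = static_cast<uint8_t>(MaskBits);       // TRUNCATE to i8
  if (NumElts < 8)
    Byte &= static_cast<uint8_t>((1u << NumElts) - 1); // clear padding bits
  return Byte;
}

For example, packMaskBitsForStore(0xFA, 4) stores 0x0A: the four defined mask bits are kept and the padding bits are zeroed.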
24458
24459// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24460// may emit an illegal shuffle but the expansion is still better than scalar
24461// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
24462 // we'll emit a shuffle and an arithmetic shift.
24463// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24464// TODO: It is possible to support ZExt by zeroing the undef values during
24465// the shuffle phase or after the shuffle.
24466static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24467 SelectionDAG &DAG) {
24468 MVT RegVT = Op.getSimpleValueType();
24469 assert(RegVT.isVector() && "We only custom lower vector loads.");
24470 assert(RegVT.isInteger() &&
24471 "We only custom lower integer vector loads.");
24472
24473 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24474 SDLoc dl(Ld);
24475
24476 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24477 if (RegVT.getVectorElementType() == MVT::i1) {
24478 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24479 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24480 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24481 "Expected AVX512F without AVX512DQI");
24482
24483 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24484 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24485 Ld->getMemOperand()->getFlags());
24486
24487 // Replace chain users with the new chain.
24488 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24489
24490 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24491 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24492 DAG.getBitcast(MVT::v16i1, Val),
24493 DAG.getIntPtrConstant(0, dl));
24494 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24495 }
24496
24497 return SDValue();
24498}
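The i1 load path is the mirror image: a plain i8 load supplies the mask byte and only the low NumElts bits carry meaning. A scalar sketch under the same assumptions (hypothetical helper, not LLVM API):

#include <cstdint>

// Recover one of the low NumElts mask bits from the byte loaded by
// LowerLoad above; the remaining bits of the wider register are
// don't-care, mirroring the ANY_EXTEND + EXTRACT_SUBVECTOR pair.
static bool maskBitFromLoadedByte(uint8_t Byte, unsigned Elt) {
  return (Byte >> Elt) & 1u; // callers only query Elt < NumElts
}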
24499
24500/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24501/// each of which has no other use apart from the AND / OR.
24502static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24503 Opc = Op.getOpcode();
24504 if (Opc != ISD::OR && Opc != ISD::AND)
24505 return false;
24506 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24507 Op.getOperand(0).hasOneUse() &&
24508 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24509 Op.getOperand(1).hasOneUse());
24510}
24511
24512SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24513 SDValue Chain = Op.getOperand(0);
24514 SDValue Cond = Op.getOperand(1);
24515 SDValue Dest = Op.getOperand(2);
24516 SDLoc dl(Op);
24517
24518 if (Cond.getOpcode() == ISD::SETCC &&
24519 Cond.getOperand(0).getValueType() != MVT::f128) {
24520 SDValue LHS = Cond.getOperand(0);
24521 SDValue RHS = Cond.getOperand(1);
24522 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24523
24524 // Special case for
24525 // setcc([su]{add,sub,mul}o == 0)
24526 // setcc([su]{add,sub,mul}o != 1)
24527 if (ISD::isOverflowIntrOpRes(LHS) &&
24528 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24529 (isNullConstant(RHS) || isOneConstant(RHS))) {
24530 SDValue Value, Overflow;
24531 X86::CondCode X86Cond;
24532 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24533
24534 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24535 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24536
24537 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24538 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24539 Overflow);
24540 }
24541
24542 if (LHS.getSimpleValueType().isInteger()) {
24543 SDValue CCVal;
24544 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24545 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24546 EFLAGS);
24547 }
24548
24549 if (CC == ISD::SETOEQ) {
24550 // For FCMP_OEQ, we can emit
24551 // two branches instead of an explicit AND instruction with a
24552 // separate test. However, we only do this if this block doesn't
24553 // have a fall-through edge, because this requires an explicit
24554 // jmp when the condition is false.
24555 if (Op.getNode()->hasOneUse()) {
24556 SDNode *User = *Op.getNode()->use_begin();
24557 // Look for an unconditional branch following this conditional branch.
24558 // We need this because we need to reverse the successors in order
24559 // to implement FCMP_OEQ.
24560 if (User->getOpcode() == ISD::BR) {
24561 SDValue FalseBB = User->getOperand(1);
24562 SDNode *NewBR =
24563 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24564 assert(NewBR == User);
24565 (void)NewBR;
24566 Dest = FalseBB;
24567
24568 SDValue Cmp =
24569 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24570 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24571 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24572 CCVal, Cmp);
24573 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24574 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24575 Cmp);
24576 }
24577 }
24578 } else if (CC == ISD::SETUNE) {
24579 // For FCMP_UNE, we can emit
24580 // two branches instead of an explicit OR instruction with a
24581 // separate test.
24582 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24583 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24584 Chain =
24585 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24586 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24587 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24588 Cmp);
24589 } else {
24590 X86::CondCode X86Cond =
24591 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24592 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24593 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24594 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24595 Cmp);
24596 }
24597 }
24598
24599 if (ISD::isOverflowIntrOpRes(Cond)) {
24600 SDValue Value, Overflow;
24601 X86::CondCode X86Cond;
24602 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24603
24604 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24605 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24606 Overflow);
24607 }
24608
24609 // Look past the truncate if the high bits are known zero.
24610 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24611 Cond = Cond.getOperand(0);
24612
24613 EVT CondVT = Cond.getValueType();
24614
24615 // Add an AND with 1 if we don't already have one.
24616 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24617 Cond =
24618 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24619
24620 SDValue LHS = Cond;
24621 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24622
24623 SDValue CCVal;
24624 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24625 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24626 EFLAGS);
24627}
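The SETOEQ and SETUNE special cases above trade an explicit AND/OR of two setcc results for two flag-based branches (COND_NE plus COND_P). A hedged scalar model of the predicates being implemented, in plain C++ rather than the DAG form:

#include <cmath>

// Scalar model of the two-branch FCMP_OEQ lowering: the false edge is
// taken when the operands compare not-equal (COND_NE) or are unordered
// (COND_P), so the true edge is reached only for an ordered, equal pair.
static bool takesTrueEdgeForOEQ(double LHS, double RHS) {
  bool Unordered = std::isnan(LHS) || std::isnan(RHS); // parity flag set
  bool NotEqual = !(LHS == RHS);                       // ZF clear
  return !(NotEqual || Unordered);
}

// FCMP_UNE is the complement: either branch condition reaching the
// destination means "unordered or not equal".
static bool takesTrueEdgeForUNE(double LHS, double RHS) {
  return std::isnan(LHS) || std::isnan(RHS) || LHS != RHS;
}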
24628
24629// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24630// Calls to _alloca are needed to probe the stack when allocating more than 4k
24631// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24632// that the guard pages used by the OS virtual memory manager are allocated in
24633// correct sequence.
24634SDValue
24635X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24636 SelectionDAG &DAG) const {
24637 MachineFunction &MF = DAG.getMachineFunction();
24638 bool SplitStack = MF.shouldSplitStack();
24639 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24640 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24641 SplitStack || EmitStackProbeCall;
24642 SDLoc dl(Op);
24643
24644 // Get the inputs.
24645 SDNode *Node = Op.getNode();
24646 SDValue Chain = Op.getOperand(0);
24647 SDValue Size = Op.getOperand(1);
24648 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24649 EVT VT = Node->getValueType(0);
24650
24651 // Chain the dynamic stack allocation so that it doesn't modify the stack
24652 // pointer when other instructions are using the stack.
24653 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24654
24655 bool Is64Bit = Subtarget.is64Bit();
24656 MVT SPTy = getPointerTy(DAG.getDataLayout());
24657
24658 SDValue Result;
24659 if (!Lower) {
24660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24661 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24662 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24663 " not tell us which reg is the stack pointer!");
24664
24665 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24666 const Align StackAlign = TFI.getStackAlign();
24667 if (hasInlineStackProbe(MF)) {
24668 MachineRegisterInfo &MRI = MF.getRegInfo();
24669
24670 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24671 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24672 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24673 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24674 DAG.getRegister(Vreg, SPTy));
24675 } else {
24676 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24677 Chain = SP.getValue(1);
24678 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24679 }
24680 if (Alignment && *Alignment > StackAlign)
24681 Result =
24682 DAG.getNode(ISD::AND, dl, VT, Result,
24683 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24684 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24685 } else if (SplitStack) {
24686 MachineRegisterInfo &MRI = MF.getRegInfo();
24687
24688 if (Is64Bit) {
24689 // The 64 bit implementation of segmented stacks needs to clobber both r10 and
24690 // r11. This makes it impossible to use it along with nested parameters.
24691 const Function &F = MF.getFunction();
24692 for (const auto &A : F.args()) {
24693 if (A.hasNestAttr())
24694 report_fatal_error("Cannot use segmented stacks with functions that "
24695 "have nested arguments.");
24696 }
24697 }
24698
24699 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24700 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24701 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24702 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24703 DAG.getRegister(Vreg, SPTy));
24704 } else {
24705 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24706 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
24707 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
24708
24709 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24710 Register SPReg = RegInfo->getStackRegister();
24711 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24712 Chain = SP.getValue(1);
24713
24714 if (Alignment) {
24715 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24716 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24717 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24718 }
24719
24720 Result = SP;
24721 }
24722
24723 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
24724 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
24725
24726 SDValue Ops[2] = {Result, Chain};
24727 return DAG.getMergeValues(Ops, dl);
24728}
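Both alignment adjustments in this function rely on the same mask trick: ANDing with ~(Alignment - 1) rounds the stack value down to the requested power-of-two boundary. A minimal sketch, assuming Alignment is a power of two as MaybeAlign guarantees here:

#include <cstdint>

// Round Addr down to a power-of-two Alignment, mirroring the
// ISD::AND with ~(Alignment->value() - 1ULL) emitted above.
static uint64_t alignStackDown(uint64_t Addr, uint64_t Alignment) {
  return Addr & ~(Alignment - 1ULL);
}

For instance, alignStackDown(0x1003F, 16) yields 0x10030.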
24729
24730SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24731 MachineFunction &MF = DAG.getMachineFunction();
24732 auto PtrVT = getPointerTy(MF.getDataLayout());
24733 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24734
24735 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24736 SDLoc DL(Op);
24737
24738 if (!Subtarget.is64Bit() ||
24739 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24740 // vastart just stores the address of the VarArgsFrameIndex slot into the
24741 // memory location argument.
24742 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24743 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24744 MachinePointerInfo(SV));
24745 }
24746
24747 // __va_list_tag:
24748 // gp_offset (0 - 6 * 8)
24749 // fp_offset (48 - 48 + 8 * 16)
24750 // overflow_arg_area (point to parameters coming in memory).
24751 // reg_save_area
24752 SmallVector<SDValue, 8> MemOps;
24753 SDValue FIN = Op.getOperand(1);
24754 // Store gp_offset
24755 SDValue Store = DAG.getStore(
24756 Op.getOperand(0), DL,
24757 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24758 MachinePointerInfo(SV));
24759 MemOps.push_back(Store);
24760
24761 // Store fp_offset
24762 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24763 Store = DAG.getStore(
24764 Op.getOperand(0), DL,
24765 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24766 MachinePointerInfo(SV, 4));
24767 MemOps.push_back(Store);
24768
24769 // Store ptr to overflow_arg_area
24770 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24771 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24772 Store =
24773 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24774 MemOps.push_back(Store);
24775
24776 // Store ptr to reg_save_area.
24777 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24778 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24779 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24780 Store = DAG.getStore(
24781 Op.getOperand(0), DL, RSFIN, FIN,
24782 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24783 MemOps.push_back(Store);
24784 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24785}
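For orientation, the four stores above fill the SysV x86-64 __va_list_tag record at byte offsets 0, 4, 8 and 16 (LP64 layout). A sketch of that layout with the offsets checked at compile time; the struct name is illustrative, and the two pointer fields are modeled as uint64_t so the LP64 offsets hold on any host:

#include <cstddef>
#include <cstdint>

// Illustrative model of the record LowerVASTART fills in.
struct VaListTagSketch {
  unsigned gp_offset;         // stored at offset 0
  unsigned fp_offset;         // stored at offset 4
  uint64_t overflow_arg_area; // offset 8: args passed in memory (i8*)
  uint64_t reg_save_area;     // offset 16: spilled register args (i8*)
};

static_assert(offsetof(VaListTagSketch, fp_offset) == 4, "fp_offset slot");
static_assert(offsetof(VaListTagSketch, overflow_arg_area) == 8, "overflow slot");
static_assert(offsetof(VaListTagSketch, reg_save_area) == 16, "reg_save slot");
static_assert(sizeof(VaListTagSketch) == 24, "size copied by va_copy");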
24786
24787SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24788 assert(Subtarget.is64Bit() &&
24789 "LowerVAARG only handles 64-bit va_arg!");
24790 assert(Op.getNumOperands() == 4);
24791
24792 MachineFunction &MF = DAG.getMachineFunction();
24793 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24794 // The Win64 ABI uses char* instead of a structure.
24795 return DAG.expandVAArg(Op.getNode());
24796
24797 SDValue Chain = Op.getOperand(0);
24798 SDValue SrcPtr = Op.getOperand(1);
24799 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24800 unsigned Align = Op.getConstantOperandVal(3);
24801 SDLoc dl(Op);
24802
24803 EVT ArgVT = Op.getNode()->getValueType(0);
24804 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24805 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24806 uint8_t ArgMode;
24807
24808 // Decide which area this value should be read from.
24809 // TODO: Implement the AMD64 ABI in its entirety. This simple
24810 // selection mechanism works only for the basic types.
24811 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
24812 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24813 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
24814 } else {
24815 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
24816 "Unhandled argument type in LowerVAARG");
24817 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
24818 }
24819
24820 if (ArgMode == 2) {
24821 // Sanity Check: Make sure using fp_offset makes sense.
24822 assert(!Subtarget.useSoftFloat() &&
24823 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
24824 Subtarget.hasSSE1());
24825 }
24826
24827 // Insert VAARG node into the DAG
24828 // VAARG returns two values: Variable Argument Address, Chain
24829 SDValue InstOps[] = {Chain, SrcPtr,
24830 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24831 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24832 DAG.getTargetConstant(Align, dl, MVT::i32)};
24833 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24834 SDValue VAARG = DAG.getMemIntrinsicNode(
24835 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
24836 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24837 /*Alignment=*/None,
24838 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24839 Chain = VAARG.getValue(1);
24840
24841 // Load the next argument and return it
24842 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24843}
24844
24845static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24846 SelectionDAG &DAG) {
24847 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24848 // where a va_list is still an i8*.
24849 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
24850 if (Subtarget.isCallingConvWin64(
24851 DAG.getMachineFunction().getFunction().getCallingConv()))
24852 // Probably a Win64 va_copy.
24853 return DAG.expandVACopy(Op.getNode());
24854
24855 SDValue Chain = Op.getOperand(0);
24856 SDValue DstPtr = Op.getOperand(1);
24857 SDValue SrcPtr = Op.getOperand(2);
24858 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24859 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24860 SDLoc DL(Op);
24861
24862 return DAG.getMemcpy(
24863 Chain, DL, DstPtr, SrcPtr,
24864 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
24865 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
24866 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24867}
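As the comment notes, va_copy on this path is just a fixed-size block copy of that record: 24 bytes under LP64 and 16 under the ILP32 (X32) layout. A hedged sketch with an illustrative helper name:

#include <cstring>

// Copy one lowered va_list to another, using the same byte counts the
// DAG.getMemcpy call above selects between.
static void vaCopySketch(void *Dst, const void *Src, bool IsLP64) {
  std::memcpy(Dst, Src, IsLP64 ? 24 : 16);
}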
24868
24869// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
24870static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24871 switch (Opc) {
24872 case ISD::SHL:
24873 case X86ISD::VSHL:
24874 case X86ISD::VSHLI:
24875 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24876 case ISD::SRL:
24877 case X86ISD::VSRL:
24878 case X86ISD::VSRLI:
24879 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24880 case ISD::SRA:
24881 case X86ISD::VSRA:
24882 case X86ISD::VSRAI:
24883 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24884 }
24885 llvm_unreachable("Unknown target vector shift node");
24886}
24887
24888/// Handle vector element shifts where the shift amount is a constant.
24889/// Takes immediate version of shift as input.
24890static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24891 SDValue SrcOp, uint64_t ShiftAmt,
24892 SelectionDAG &DAG) {
24893 MVT ElementType = VT.getVectorElementType();
24894
24895 // Bitcast the source vector to the output type, this is mainly necessary for
24896 // vXi8/vXi64 shifts.
24897 if (VT != SrcOp.getSimpleValueType())
24898 SrcOp = DAG.getBitcast(VT, SrcOp);
24899
24900 // Fold this packed shift into its first operand if ShiftAmt is 0.
24901 if (ShiftAmt == 0)
24902 return SrcOp;
24903
24904 // Check for ShiftAmt >= element width
24905 if (ShiftAmt >= ElementType.getSizeInBits()) {
24906 if (Opc == X86ISD::VSRAI)
24907 ShiftAmt = ElementType.getSizeInBits() - 1;
24908 else
24909 return DAG.getConstant(0, dl, VT);
24910 }
24911
24912 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
24913 && "Unknown target vector shift-by-constant node");
24914
24915 // Fold this packed vector shift into a build vector if SrcOp is a
24916 // vector of Constants or UNDEFs.
24917 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24918 SmallVector<SDValue, 8> Elts;
24919 unsigned NumElts = SrcOp->getNumOperands();
24920
24921 switch (Opc) {
24922 default: llvm_unreachable("Unknown opcode!");
24923 case X86ISD::VSHLI:
24924 for (unsigned i = 0; i != NumElts; ++i) {
24925 SDValue CurrentOp = SrcOp->getOperand(i);
24926 if (CurrentOp->isUndef()) {
24927 // Must produce 0s in the correct bits.
24928 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24929 continue;
24930 }
24931 auto *ND = cast<ConstantSDNode>(CurrentOp);
24932 const APInt &C = ND->getAPIntValue();
24933 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
24934 }
24935 break;
24936 case X86ISD::VSRLI:
24937 for (unsigned i = 0; i != NumElts; ++i) {
24938 SDValue CurrentOp = SrcOp->getOperand(i);
24939 if (CurrentOp->isUndef()) {
24940 // Must produce 0s in the correct bits.
24941 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24942 continue;
24943 }
24944 auto *ND = cast<ConstantSDNode>(CurrentOp);
24945 const APInt &C = ND->getAPIntValue();
24946 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
24947 }
24948 break;
24949 case X86ISD::VSRAI:
24950 for (unsigned i = 0; i != NumElts; ++i) {
24951 SDValue CurrentOp = SrcOp->getOperand(i);
24952 if (CurrentOp->isUndef()) {
24953 // All shifted in bits must be the same so use 0.
24954 Elts.push_back(DAG.getConstant(0, dl, ElementType));
24955 continue;
24956 }
24957 auto *ND = cast<ConstantSDNode>(CurrentOp);
24958 const APInt &C = ND->getAPIntValue();
24959 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
24960 }
24961 break;
24962 }
24963
24964 return DAG.getBuildVector(VT, dl, Elts);
24965 }
24966
24967 return DAG.getNode(Opc, dl, VT, SrcOp,
24968 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24969}
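The out-of-range handling above is the subtle part: immediate logical shifts by at least the element width fold to zero, while arithmetic right shifts clamp the amount to width - 1 so the result remains a splat of the sign bit. A scalar sketch for 32-bit elements (standalone code, not the DAG path):

#include <cstdint>

// VSRLI/VSHLI semantics: shifting by >= the element width yields 0.
static uint32_t logicalShrFold(uint32_t V, uint64_t Amt) {
  return Amt >= 32 ? 0u : V >> Amt;
}

// VSRAI semantics: the amount is clamped so every bit becomes a copy
// of the sign bit instead of the result being folded to 0.
static int32_t arithShrFold(int32_t V, uint64_t Amt) {
  if (Amt >= 32)
    Amt = 31;
  return V >> Amt; // arithmetic shift of the signed operand
}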
24970
24971/// Handle vector element shifts where the shift amount may or may not be a
24972/// constant. Takes immediate version of shift as input.
24973static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24974 SDValue SrcOp, SDValue ShAmt,
24975 const X86Subtarget &Subtarget,
24976 SelectionDAG &DAG) {
24977 MVT SVT = ShAmt.getSimpleValueType();
24978 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
24979
24980 // Catch shift-by-constant.
24981 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
24982 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
24983 CShAmt->getZExtValue(), DAG);
24984
24985 // Change opcode to non-immediate version.
24986 Opc = getTargetVShiftUniformOpcode(Opc, true);
24987
24988 // Need to build a vector containing shift amount.
24989 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
24990 // +====================+============+=======================================+
24991 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
24992 // +====================+============+=======================================+
24993 // | i64 | Yes, No | Use ShAmt as lowest elt |
24994 // | i32 | Yes | zero-extend in-reg |
24995 // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
24996 // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
24997 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
24998 // +====================+============+=======================================+
24999
25000 if (SVT == MVT::i64)
25001 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
25002 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
25003 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25004 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
25005 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
25006 ShAmt = ShAmt.getOperand(0);
25007 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
25008 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
25009 if (Subtarget.hasSSE41())
25010 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25011 MVT::v2i64, ShAmt);
25012 else {
25013 SDValue ByteShift = DAG.getTargetConstant(
25014 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25015 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25016 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25017 ByteShift);
25018 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25019 ByteShift);
25020 }
25021 } else if (Subtarget.hasSSE41() &&
25022 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25023 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25024 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25025 MVT::v2i64, ShAmt);
25026 } else {
25027 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
25028 DAG.getUNDEF(SVT)};
25029 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
25030 }
25031
25032 // The return type has to be a 128-bit type with the same element
25033 // type as the input type.
25034 MVT EltVT = VT.getVectorElementType();
25035 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25036
25037 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25038 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25039}
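One detail the table above encodes: the packed shifts read only the low 64 bits of the count operand, so the non-SSE4.1 fallback builds v4i32 (ShAmt, 0, undef, undef) with an explicit zero in lane 1 to keep that 64-bit field defined. A scalar sketch of the lane layout (the unused lanes are shown as zero purely for concreteness):

#include <cstdint>

// Lay out the 128-bit shift-count vector used by the fallback path:
// lane 0 carries the amount, lane 1 must be zero, lanes 2-3 are ignored.
static void buildShiftCountLanes(uint32_t Amt, uint32_t Lanes[4]) {
  Lanes[0] = Amt; // low half of the 64-bit count read by the shift
  Lanes[1] = 0;   // high half of that count
  Lanes[2] = 0;   // not consumed by the hardware shift
  Lanes[3] = 0;   // not consumed by the hardware shift
}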
25040
25041/// Return Mask with the necessary casting or extending
25042/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25043static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25044 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25045 const SDLoc &dl) {
25046
25047 if (isAllOnesConstant(Mask))
25048 return DAG.getConstant(1, dl, MaskVT);
25049 if (X86::isZeroNode(Mask))
25050 return DAG.getConstant(0, dl, MaskVT);
25051
25052 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25053
25054 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25055 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25056 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25057 // In 32-bit mode, a bitcast of i64 is illegal, so extend/split it.
25058 SDValue Lo, Hi;
25059 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25060 DAG.getConstant(0, dl, MVT::i32));
25061 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25062 DAG.getConstant(1, dl, MVT::i32));
25063
25064 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25065 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25066
25067 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25068 } else {
25069 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25070 Mask.getSimpleValueType().getSizeInBits());
25071 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
25072 // are extracted by EXTRACT_SUBVECTOR.
25073 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25074 DAG.getBitcast(BitcastVT, Mask),
25075 DAG.getIntPtrConstant(0, dl));
25076 }
25077}
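On 32-bit targets the direct i64-to-v64i1 bitcast is illegal, so the branch above splits the mask into two i32 halves and concatenates the resulting v32i1 pieces. A scalar sketch of that split (hypothetical helper):

#include <cstdint>

// Split a 64-bit k-mask into the two 32-bit halves that become the
// low and high v32i1 pieces of the final v64i1 mask.
static void splitMask64(uint64_t Mask, uint32_t &Lo, uint32_t &Hi) {
  Lo = static_cast<uint32_t>(Mask);       // EXTRACT_ELEMENT index 0
  Hi = static_cast<uint32_t>(Mask >> 32); // EXTRACT_ELEMENT index 1
}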
25078
25079/// Return (and \p Op, \p Mask) for compare instructions or
25080/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25081/// necessary casting or extending for \p Mask when lowering masking intrinsics
25082static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25083 SDValue PreservedSrc,
25084 const X86Subtarget &Subtarget,
25085 SelectionDAG &DAG) {
25086 MVT VT = Op.getSimpleValueType();
25087 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25088 unsigned OpcodeSelect = ISD::VSELECT;
25089 SDLoc dl(Op);
25090
25091 if (isAllOnesConstant(Mask))
25092 return Op;
25093
25094 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25095
25096 if (PreservedSrc.isUndef())
25097 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25098 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25099}
25100
25101/// Creates an SDNode for a predicated scalar operation.
25102/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25103/// The mask is coming as MVT::i8 and it should be transformed
25104/// to MVT::v1i1 while lowering masking intrinsics.
25105/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25106/// "X86select" instead of "vselect". We just can't create the "vselect" node
25107/// for a scalar instruction.
25108static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25109 SDValue PreservedSrc,
25110 const X86Subtarget &Subtarget,
25111 SelectionDAG &DAG) {
25112
25113 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25114 if (MaskConst->getZExtValue() & 0x1)
25115 return Op;
25116
25117 MVT VT = Op.getSimpleValueType();
25118 SDLoc dl(Op);
25119
25120 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
25121 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25122 DAG.getBitcast(MVT::v8i1, Mask),
25123 DAG.getIntPtrConstant(0, dl));
25124 if (Op.getOpcode() == X86ISD::FSETCCM ||
25125 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25126 Op.getOpcode() == X86ISD::VFPCLASSS)
25127 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25128
25129 if (PreservedSrc.isUndef())
25130 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25131 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25132}
25133
25134static int getSEHRegistrationNodeSize(const Function *Fn) {
25135 if (!Fn->hasPersonalityFn())
25136 report_fatal_error(
25137 "querying registration node size for function without personality");
25138 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25139 // WinEHStatePass for the full struct definition.
25140 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25141 case EHPersonality::MSVC_X86SEH: return 24;
25142 case EHPersonality::MSVC_CXX: return 16;
25143 default: break;
25144 }
25145 report_fatal_error(
25146 "can only recover FP for 32-bit MSVC EH personality functions");
25147}
25148
25149/// When the MSVC runtime transfers control to us, either to an outlined
25150/// function or when returning to a parent frame after catching an exception, we
25151/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25152/// Here's the math:
25153/// RegNodeBase = EntryEBP - RegNodeSize
25154/// ParentFP = RegNodeBase - ParentFrameOffset
25155/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25156/// subtracting the offset (negative on x86) takes us back to the parent FP.
25157static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25158 SDValue EntryEBP) {
25159 MachineFunction &MF = DAG.getMachineFunction();
25160 SDLoc dl;
25161
25162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25163 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25164
25165 // It's possible that the parent function no longer has a personality function
25166 // if the exceptional code was optimized away, in which case we just return
25167 // the incoming EBP.
25168 if (!Fn->hasPersonalityFn())
25169 return EntryEBP;
25170
25171 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25172 // registration, or the .set_setframe offset.
25173 MCSymbol *OffsetSym =
25174 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25175 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25176 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25177 SDValue ParentFrameOffset =
25178 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25179
25180 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25181 // prologue to RBP in the parent function.
25182 const X86Subtarget &Subtarget =
25183 static_cast<const X86Subtarget &>(DAG.getSubtarget());
25184 if (Subtarget.is64Bit())
25185 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25186
25187 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25188 // RegNodeBase = EntryEBP - RegNodeSize
25189 // ParentFP = RegNodeBase - ParentFrameOffset
25190 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25191 DAG.getConstant(RegNodeSize, dl, PtrVT));
25192 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25193}
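The 32-bit tail of this function performs exactly the arithmetic from the comment block: subtract the registration-node size from the incoming EBP, then subtract the (typically negative) parent frame offset. A hedged integer model of that address math, with illustrative names:

#include <cstdint>

// ParentFP = (EntryEBP - RegNodeSize) - ParentFrameOffset, where
// RegNodeSize is 24 for 32-bit SEH and 16 for C++ EH personalities.
static uint32_t recoverParentFrameSketch(uint32_t EntryEBP,
                                         uint32_t RegNodeSize,
                                         int32_t ParentFrameOffset) {
  uint32_t RegNodeBase = EntryEBP - RegNodeSize;
  return RegNodeBase - static_cast<uint32_t>(ParentFrameOffset);
}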
25194
25195SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25196 SelectionDAG &DAG) const {
25197 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25198 auto isRoundModeCurDirection = [](SDValue Rnd) {
25199 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25200 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25201
25202 return false;
25203 };
25204 auto isRoundModeSAE = [](SDValue Rnd) {
25205 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25206 unsigned RC = C->getZExtValue();
25207 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25208 // Clear the NO_EXC bit and check remaining bits.
25209 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25210 // As a convenience we allow no other bits or explicitly
25211 // current direction.
25212 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25213 }
25214 }
25215
25216 return false;
25217 };
25218 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25219 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25220 RC = C->getZExtValue();
25221 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25222 // Clear the NO_EXC bit and check remaining bits.
25223 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25224 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25225 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25226 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25227 RC == X86::STATIC_ROUNDING::TO_ZERO;
25228 }
25229 }
25230
25231 return false;
25232 };
25233
25234 SDLoc dl(Op);
25235 unsigned IntNo = Op.getConstantOperandVal(0);
25236 MVT VT = Op.getSimpleValueType();
25237 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25238
25239 // Propagate flags from original node to transformed node(s).
25240 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25241
25242 if (IntrData) {
25243 switch(IntrData->Type) {
25244 case INTR_TYPE_1OP: {
25245 // We specify 2 possible opcodes for intrinsics with rounding modes.
25246 // First, we check if the intrinsic may have non-default rounding mode,
25247 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25248 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25249 if (IntrWithRoundingModeOpcode != 0) {
25250 SDValue Rnd = Op.getOperand(2);
25251 unsigned RC = 0;
25252 if (isRoundModeSAEToX(Rnd, RC))
25253 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25254 Op.getOperand(1),
25255 DAG.getTargetConstant(RC, dl, MVT::i32));
25256 if (!isRoundModeCurDirection(Rnd))
25257 return SDValue();
25258 }
25259 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25260 Op.getOperand(1));
25261 }
25262 case INTR_TYPE_1OP_SAE: {
25263 SDValue Sae = Op.getOperand(2);
25264
25265 unsigned Opc;
25266 if (isRoundModeCurDirection(Sae))
25267 Opc = IntrData->Opc0;
25268 else if (isRoundModeSAE(Sae))
25269 Opc = IntrData->Opc1;
25270 else
25271 return SDValue();
25272
25273 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25274 }
25275 case INTR_TYPE_2OP: {
25276 SDValue Src2 = Op.getOperand(2);
25277
25278 // We specify 2 possible opcodes for intrinsics with rounding modes.
25279 // First, we check if the intrinsic may have non-default rounding mode,
25280 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25281 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25282 if (IntrWithRoundingModeOpcode != 0) {
25283 SDValue Rnd = Op.getOperand(3);
25284 unsigned RC = 0;
25285 if (isRoundModeSAEToX(Rnd, RC))
25286 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25287 Op.getOperand(1), Src2,
25288 DAG.getTargetConstant(RC, dl, MVT::i32));
25289 if (!isRoundModeCurDirection(Rnd))
25290 return SDValue();
25291 }
25292
25293 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25294 Op.getOperand(1), Src2);
25295 }
25296 case INTR_TYPE_2OP_SAE: {
25297 SDValue Sae = Op.getOperand(3);
25298
25299 unsigned Opc;
25300 if (isRoundModeCurDirection(Sae))
25301 Opc = IntrData->Opc0;
25302 else if (isRoundModeSAE(Sae))
25303 Opc = IntrData->Opc1;
25304 else
25305 return SDValue();
25306
25307 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25308 Op.getOperand(2));
25309 }
25310 case INTR_TYPE_3OP:
25311 case INTR_TYPE_3OP_IMM8: {
25312 SDValue Src1 = Op.getOperand(1);
25313 SDValue Src2 = Op.getOperand(2);
25314 SDValue Src3 = Op.getOperand(3);
25315
25316 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25317 Src3.getValueType() != MVT::i8) {
25318 Src3 = DAG.getTargetConstant(
25319 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25320 }
25321
25322 // We specify 2 possible opcodes for intrinsics with rounding modes.
25323 // First, we check if the intrinsic may have non-default rounding mode,
25324 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25325 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25326 if (IntrWithRoundingModeOpcode != 0) {
25327 SDValue Rnd = Op.getOperand(4);
25328 unsigned RC = 0;
25329 if (isRoundModeSAEToX(Rnd, RC))
25330 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25331 Src1, Src2, Src3,
25332 DAG.getTargetConstant(RC, dl, MVT::i32));
25333 if (!isRoundModeCurDirection(Rnd))
25334 return SDValue();
25335 }
25336
25337 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25338 {Src1, Src2, Src3});
25339 }
25340 case INTR_TYPE_4OP_IMM8: {
25341 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25342 SDValue Src4 = Op.getOperand(4);
25343 if (Src4.getValueType() != MVT::i8) {
25344 Src4 = DAG.getTargetConstant(
25345 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25346 }
25347
25348 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25349 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25350 Src4);
25351 }
25352 case INTR_TYPE_1OP_MASK: {
25353 SDValue Src = Op.getOperand(1);
25354 SDValue PassThru = Op.getOperand(2);
25355 SDValue Mask = Op.getOperand(3);
25356 // We add rounding mode to the Node when
25357 // - RC Opcode is specified and
25358 // - RC is not "current direction".
25359 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25360 if (IntrWithRoundingModeOpcode != 0) {
25361 SDValue Rnd = Op.getOperand(4);
25362 unsigned RC = 0;
25363 if (isRoundModeSAEToX(Rnd, RC))
25364 return getVectorMaskingNode(
25365 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25366 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25367 Mask, PassThru, Subtarget, DAG);
25368 if (!isRoundModeCurDirection(Rnd))
25369 return SDValue();
25370 }
25371 return getVectorMaskingNode(
25372 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25373 Subtarget, DAG);
25374 }
25375 case INTR_TYPE_1OP_MASK_SAE: {
25376 SDValue Src = Op.getOperand(1);
25377 SDValue PassThru = Op.getOperand(2);
25378 SDValue Mask = Op.getOperand(3);
25379 SDValue Rnd = Op.getOperand(4);
25380
25381 unsigned Opc;
25382 if (isRoundModeCurDirection(Rnd))
25383 Opc = IntrData->Opc0;
25384 else if (isRoundModeSAE(Rnd))
25385 Opc = IntrData->Opc1;
25386 else
25387 return SDValue();
25388
25389 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25390 Subtarget, DAG);
25391 }
25392 case INTR_TYPE_SCALAR_MASK: {
25393 SDValue Src1 = Op.getOperand(1);
25394 SDValue Src2 = Op.getOperand(2);
25395 SDValue passThru = Op.getOperand(3);
25396 SDValue Mask = Op.getOperand(4);
25397 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25398 // There are 2 kinds of intrinsics in this group:
25399 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
25400 // (2) With rounding mode and sae - 7 operands.
25401 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25402 if (Op.getNumOperands() == (5U + HasRounding)) {
25403 if (HasRounding) {
25404 SDValue Rnd = Op.getOperand(5);
25405 unsigned RC = 0;
25406 if (isRoundModeSAEToX(Rnd, RC))
25407 return getScalarMaskingNode(
25408 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25409 DAG.getTargetConstant(RC, dl, MVT::i32)),
25410 Mask, passThru, Subtarget, DAG);
25411 if (!isRoundModeCurDirection(Rnd))
25412 return SDValue();
25413 }
25414 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25415 Src2),
25416 Mask, passThru, Subtarget, DAG);
25417 }
25418
25419 assert(Op.getNumOperands() == (6U + HasRounding) &&
25420 "Unexpected intrinsic form");
25421 SDValue RoundingMode = Op.getOperand(5);
25422 unsigned Opc = IntrData->Opc0;
25423 if (HasRounding) {
25424 SDValue Sae = Op.getOperand(6);
25425 if (isRoundModeSAE(Sae))
25426 Opc = IntrWithRoundingModeOpcode;
25427 else if (!isRoundModeCurDirection(Sae))
25428 return SDValue();
25429 }
25430 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25431 Src2, RoundingMode),
25432 Mask, passThru, Subtarget, DAG);
25433 }
25434 case INTR_TYPE_SCALAR_MASK_RND: {
25435 SDValue Src1 = Op.getOperand(1);
25436 SDValue Src2 = Op.getOperand(2);
25437 SDValue passThru = Op.getOperand(3);
25438 SDValue Mask = Op.getOperand(4);
25439 SDValue Rnd = Op.getOperand(5);
25440
25441 SDValue NewOp;
25442 unsigned RC = 0;
25443 if (isRoundModeCurDirection(Rnd))
25444 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25445 else if (isRoundModeSAEToX(Rnd, RC))
25446 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25447 DAG.getTargetConstant(RC, dl, MVT::i32));
25448 else
25449 return SDValue();
25450
25451 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25452 }
25453 case INTR_TYPE_SCALAR_MASK_SAE: {
25454 SDValue Src1 = Op.getOperand(1);
25455 SDValue Src2 = Op.getOperand(2);
25456 SDValue passThru = Op.getOperand(3);
25457 SDValue Mask = Op.getOperand(4);
25458 SDValue Sae = Op.getOperand(5);
25459 unsigned Opc;
25460 if (isRoundModeCurDirection(Sae))
25461 Opc = IntrData->Opc0;
25462 else if (isRoundModeSAE(Sae))
25463 Opc = IntrData->Opc1;
25464 else
25465 return SDValue();
25466
25467 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25468 Mask, passThru, Subtarget, DAG);
25469 }
25470 case INTR_TYPE_2OP_MASK: {
25471 SDValue Src1 = Op.getOperand(1);
25472 SDValue Src2 = Op.getOperand(2);
25473 SDValue PassThru = Op.getOperand(3);
25474 SDValue Mask = Op.getOperand(4);
25475 SDValue NewOp;
25476 if (IntrData->Opc1 != 0) {
25477 SDValue Rnd = Op.getOperand(5);
25478 unsigned RC = 0;
25479 if (isRoundModeSAEToX(Rnd, RC))
25480 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25481 DAG.getTargetConstant(RC, dl, MVT::i32));
25482 else if (!isRoundModeCurDirection(Rnd))
25483 return SDValue();
25484 }
25485 if (!NewOp)
25486 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25487 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25488 }
25489 case INTR_TYPE_2OP_MASK_SAE: {
25490 SDValue Src1 = Op.getOperand(1);
25491 SDValue Src2 = Op.getOperand(2);
25492 SDValue PassThru = Op.getOperand(3);
25493 SDValue Mask = Op.getOperand(4);
25494
25495 unsigned Opc = IntrData->Opc0;
25496 if (IntrData->Opc1 != 0) {
25497 SDValue Sae = Op.getOperand(5);
25498 if (isRoundModeSAE(Sae))
25499 Opc = IntrData->Opc1;
25500 else if (!isRoundModeCurDirection(Sae))
25501 return SDValue();
25502 }
25503
25504 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25505 Mask, PassThru, Subtarget, DAG);
25506 }
25507 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25508 SDValue Src1 = Op.getOperand(1);
25509 SDValue Src2 = Op.getOperand(2);
25510 SDValue Src3 = Op.getOperand(3);
25511 SDValue PassThru = Op.getOperand(4);
25512 SDValue Mask = Op.getOperand(5);
25513 SDValue Sae = Op.getOperand(6);
25514 unsigned Opc;
25515 if (isRoundModeCurDirection(Sae))
25516 Opc = IntrData->Opc0;
25517 else if (isRoundModeSAE(Sae))
25518 Opc = IntrData->Opc1;
25519 else
25520 return SDValue();
25521
25522 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25523 Mask, PassThru, Subtarget, DAG);
25524 }
25525 case INTR_TYPE_3OP_MASK_SAE: {
25526 SDValue Src1 = Op.getOperand(1);
25527 SDValue Src2 = Op.getOperand(2);
25528 SDValue Src3 = Op.getOperand(3);
25529 SDValue PassThru = Op.getOperand(4);
25530 SDValue Mask = Op.getOperand(5);
25531
25532 unsigned Opc = IntrData->Opc0;
25533 if (IntrData->Opc1 != 0) {
25534 SDValue Sae = Op.getOperand(6);
25535 if (isRoundModeSAE(Sae))
25536 Opc = IntrData->Opc1;
25537 else if (!isRoundModeCurDirection(Sae))
25538 return SDValue();
25539 }
25540 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25541 Mask, PassThru, Subtarget, DAG);
25542 }
25543 case BLENDV: {
25544 SDValue Src1 = Op.getOperand(1);
25545 SDValue Src2 = Op.getOperand(2);
25546 SDValue Src3 = Op.getOperand(3);
25547
25548 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25549 Src3 = DAG.getBitcast(MaskVT, Src3);
25550
25551 // Reverse the operands to match VSELECT order.
25552 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25553 }
25554 case VPERM_2OP : {
25555 SDValue Src1 = Op.getOperand(1);
25556 SDValue Src2 = Op.getOperand(2);
25557
25558 // Swap Src1 and Src2 in the node creation
25559 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25560 }
25561 case IFMA_OP:
25562 // NOTE: We need to swizzle the operands to pass the multiply operands
25563 // first.
25564 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25565 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25566 case FPCLASSS: {
25567 SDValue Src1 = Op.getOperand(1);
25568 SDValue Imm = Op.getOperand(2);
25569 SDValue Mask = Op.getOperand(3);
25570 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25571 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25572 Subtarget, DAG);
25573 // Need to fill with zeros to ensure the bitcast will produce zeroes
25574 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25575 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25576 DAG.getConstant(0, dl, MVT::v8i1),
25577 FPclassMask, DAG.getIntPtrConstant(0, dl));
25578 return DAG.getBitcast(MVT::i8, Ins);
25579 }
25580
25581 case CMP_MASK_CC: {
25582 MVT MaskVT = Op.getSimpleValueType();
25583 SDValue CC = Op.getOperand(3);
25584 SDValue Mask = Op.getOperand(4);
25585 // We specify 2 possible opcodes for intrinsics with rounding modes.
25586 // First, we check if the intrinsic may have non-default rounding mode,
25587 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25588 if (IntrData->Opc1 != 0) {
25589 SDValue Sae = Op.getOperand(5);
25590 if (isRoundModeSAE(Sae))
25591 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25592 Op.getOperand(2), CC, Mask, Sae);
25593 if (!isRoundModeCurDirection(Sae))
25594 return SDValue();
25595 }
25596 //default rounding mode
25597 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25598 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25599 }
25600 case CMP_MASK_SCALAR_CC: {
25601 SDValue Src1 = Op.getOperand(1);
25602 SDValue Src2 = Op.getOperand(2);
25603 SDValue CC = Op.getOperand(3);
25604 SDValue Mask = Op.getOperand(4);
25605
25606 SDValue Cmp;
25607 if (IntrData->Opc1 != 0) {
25608 SDValue Sae = Op.getOperand(5);
25609 if (isRoundModeSAE(Sae))
25610 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25611 else if (!isRoundModeCurDirection(Sae))
25612 return SDValue();
25613 }
25614 //default rounding mode
25615 if (!Cmp.getNode())
25616 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25617
25618 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25619 Subtarget, DAG);
25620 // Need to fill with zeros to ensure the bitcast will produce zeroes
25621 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25622 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25623 DAG.getConstant(0, dl, MVT::v8i1),
25624 CmpMask, DAG.getIntPtrConstant(0, dl));
25625 return DAG.getBitcast(MVT::i8, Ins);
25626 }
25627 case COMI: { // Comparison intrinsics
25628 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25629 SDValue LHS = Op.getOperand(1);
25630 SDValue RHS = Op.getOperand(2);
25631 // Some conditions require the operands to be swapped.
25632 if (CC == ISD::SETLT || CC == ISD::SETLE)
25633 std::swap(LHS, RHS);
25634
25635 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25636 SDValue SetCC;
25637 switch (CC) {
25638 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25639 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25640 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25641 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25642 break;
25643 }
25644 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25645 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25646 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25647 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25648 break;
25649 }
25650 case ISD::SETGT: // (CF = 0 and ZF = 0)
25651 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25652 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25653 break;
25654 }
25655 case ISD::SETGE: // CF = 0
25656 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25657 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25658 break;
25659 default:
25660 llvm_unreachable("Unexpected illegal condition!");
25661 }
25662 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25663 }
25664 case COMI_RM: { // Comparison intrinsics with Sae
25665 SDValue LHS = Op.getOperand(1);
25666 SDValue RHS = Op.getOperand(2);
25667 unsigned CondVal = Op.getConstantOperandVal(3);
25668 SDValue Sae = Op.getOperand(4);
25669
25670 SDValue FCmp;
25671 if (isRoundModeCurDirection(Sae))
25672 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25673 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25674 else if (isRoundModeSAE(Sae))
25675 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25676 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25677 else
25678 return SDValue();
25679 // Need to fill with zeros to ensure the bitcast will produce zeroes
25680 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25681 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25682 DAG.getConstant(0, dl, MVT::v16i1),
25683 FCmp, DAG.getIntPtrConstant(0, dl));
25684 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25685 DAG.getBitcast(MVT::i16, Ins));
25686 }
25687 case VSHIFT:
25688 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25689 Op.getOperand(1), Op.getOperand(2), Subtarget,
25690 DAG);
25691 case COMPRESS_EXPAND_IN_REG: {
25692 SDValue Mask = Op.getOperand(3);
25693 SDValue DataToCompress = Op.getOperand(1);
25694 SDValue PassThru = Op.getOperand(2);
25695 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25696 return Op.getOperand(1);
25697
25698 // Avoid false dependency.
25699 if (PassThru.isUndef())
25700 PassThru = DAG.getConstant(0, dl, VT);
25701
25702 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25703 Mask);
25704 }
25705 case FIXUPIMM:
25706 case FIXUPIMM_MASKZ: {
25707 SDValue Src1 = Op.getOperand(1);
25708 SDValue Src2 = Op.getOperand(2);
25709 SDValue Src3 = Op.getOperand(3);
25710 SDValue Imm = Op.getOperand(4);
25711 SDValue Mask = Op.getOperand(5);
25712 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25713 ? Src1
25714 : getZeroVector(VT, Subtarget, DAG, dl);
25715
25716 unsigned Opc = IntrData->Opc0;
25717 if (IntrData->Opc1 != 0) {
25718 SDValue Sae = Op.getOperand(6);
25719 if (isRoundModeSAE(Sae))
25720 Opc = IntrData->Opc1;
25721 else if (!isRoundModeCurDirection(Sae))
25722 return SDValue();
25723 }
25724
25725 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25726
25727 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25728 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25729
25730 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25731 }
25732 case ROUNDP: {
25733 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
25734 // Clear the upper bits of the rounding immediate so that the legacy
25735 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25736 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25737 SDValue RoundingMode =
25738 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25739 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25740 Op.getOperand(1), RoundingMode);
25741 }
25742 case ROUNDS: {
25743 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
25744 // Clear the upper bits of the rounding immediate so that the legacy
25745 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25746 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25747 SDValue RoundingMode =
25748 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25749 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25750 Op.getOperand(1), Op.getOperand(2), RoundingMode);
25751 }
25752 case BEXTRI: {
25753 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
25754
25755 uint64_t Imm = Op.getConstantOperandVal(2);
25756 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25757 Op.getValueType());
25758 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25759 Op.getOperand(1), Control);
25760 }
25761 // ADC/ADCX/SBB
25762 case ADX: {
25763 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25764 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25765
25766 SDValue Res;
25767 // If the carry in is zero, then we should just use ADD/SUB instead of
25768 // ADC/SBB.
25769 if (isNullConstant(Op.getOperand(1))) {
25770 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25771 Op.getOperand(3));
25772 } else {
25773 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25774 DAG.getConstant(-1, dl, MVT::i8));
25775 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25776 Op.getOperand(3), GenCF.getValue(1));
25777 }
25778 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25779 SDValue Results[] = { SetCC, Res };
25780 return DAG.getMergeValues(Results, dl);
25781 }
25782 case CVTPD2PS_MASK:
25783 case CVTPD2DQ_MASK:
25784 case CVTQQ2PS_MASK:
25785 case TRUNCATE_TO_REG: {
25786 SDValue Src = Op.getOperand(1);
25787 SDValue PassThru = Op.getOperand(2);
25788 SDValue Mask = Op.getOperand(3);
25789
25790 if (isAllOnesConstant(Mask))
25791 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25792
25793 MVT SrcVT = Src.getSimpleValueType();
25794 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25795 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25796 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25797 {Src, PassThru, Mask});
25798 }
25799 case CVTPS2PH_MASK: {
25800 SDValue Src = Op.getOperand(1);
25801 SDValue Rnd = Op.getOperand(2);
25802 SDValue PassThru = Op.getOperand(3);
25803 SDValue Mask = Op.getOperand(4);
25804
25805 if (isAllOnesConstant(Mask))
25806 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
25807
25808 MVT SrcVT = Src.getSimpleValueType();
25809 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25810 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25811 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
25812 PassThru, Mask);
25813
25814 }
25815 case CVTNEPS2BF16_MASK: {
25816 SDValue Src = Op.getOperand(1);
25817 SDValue PassThru = Op.getOperand(2);
25818 SDValue Mask = Op.getOperand(3);
25819
25820 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25821 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25822
25823 // Break false dependency.
25824 if (PassThru.isUndef())
25825 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25826
25827 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25828 Mask);
25829 }
25830 default:
25831 break;
25832 }
25833 }
25834
25835 switch (IntNo) {
25836 default: return SDValue(); // Don't custom lower most intrinsics.
25837
25838 // ptest and testp intrinsics. The intrinsic these come from are designed to
25839 // return an integer value, not just an instruction so lower it to the ptest
25840 // or testp pattern and a setcc for the result.
25841 case Intrinsic::x86_avx512_ktestc_b:
25842 case Intrinsic::x86_avx512_ktestc_w:
25843 case Intrinsic::x86_avx512_ktestc_d:
25844 case Intrinsic::x86_avx512_ktestc_q:
25845 case Intrinsic::x86_avx512_ktestz_b:
25846 case Intrinsic::x86_avx512_ktestz_w:
25847 case Intrinsic::x86_avx512_ktestz_d:
25848 case Intrinsic::x86_avx512_ktestz_q:
25849 case Intrinsic::x86_sse41_ptestz:
25850 case Intrinsic::x86_sse41_ptestc:
25851 case Intrinsic::x86_sse41_ptestnzc:
25852 case Intrinsic::x86_avx_ptestz_256:
25853 case Intrinsic::x86_avx_ptestc_256:
25854 case Intrinsic::x86_avx_ptestnzc_256:
25855 case Intrinsic::x86_avx_vtestz_ps:
25856 case Intrinsic::x86_avx_vtestc_ps:
25857 case Intrinsic::x86_avx_vtestnzc_ps:
25858 case Intrinsic::x86_avx_vtestz_pd:
25859 case Intrinsic::x86_avx_vtestc_pd:
25860 case Intrinsic::x86_avx_vtestnzc_pd:
25861 case Intrinsic::x86_avx_vtestz_ps_256:
25862 case Intrinsic::x86_avx_vtestc_ps_256:
25863 case Intrinsic::x86_avx_vtestnzc_ps_256:
25864 case Intrinsic::x86_avx_vtestz_pd_256:
25865 case Intrinsic::x86_avx_vtestc_pd_256:
25866 case Intrinsic::x86_avx_vtestnzc_pd_256: {
25867 unsigned TestOpc = X86ISD::PTEST;
25868 X86::CondCode X86CC;
25869 switch (IntNo) {
25870 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
25871 case Intrinsic::x86_avx512_ktestc_b:
25872 case Intrinsic::x86_avx512_ktestc_w:
25873 case Intrinsic::x86_avx512_ktestc_d:
25874 case Intrinsic::x86_avx512_ktestc_q:
25875 // CF = 1
25876 TestOpc = X86ISD::KTEST;
25877 X86CC = X86::COND_B;
25878 break;
25879 case Intrinsic::x86_avx512_ktestz_b:
25880 case Intrinsic::x86_avx512_ktestz_w:
25881 case Intrinsic::x86_avx512_ktestz_d:
25882 case Intrinsic::x86_avx512_ktestz_q:
25883 TestOpc = X86ISD::KTEST;
25884 X86CC = X86::COND_E;
25885 break;
25886 case Intrinsic::x86_avx_vtestz_ps:
25887 case Intrinsic::x86_avx_vtestz_pd:
25888 case Intrinsic::x86_avx_vtestz_ps_256:
25889 case Intrinsic::x86_avx_vtestz_pd_256:
25890 TestOpc = X86ISD::TESTP;
25891 LLVM_FALLTHROUGH;
25892 case Intrinsic::x86_sse41_ptestz:
25893 case Intrinsic::x86_avx_ptestz_256:
25894 // ZF = 1
25895 X86CC = X86::COND_E;
25896 break;
25897 case Intrinsic::x86_avx_vtestc_ps:
25898 case Intrinsic::x86_avx_vtestc_pd:
25899 case Intrinsic::x86_avx_vtestc_ps_256:
25900 case Intrinsic::x86_avx_vtestc_pd_256:
25901 TestOpc = X86ISD::TESTP;
25902 LLVM_FALLTHROUGH;
25903 case Intrinsic::x86_sse41_ptestc:
25904 case Intrinsic::x86_avx_ptestc_256:
25905 // CF = 1
25906 X86CC = X86::COND_B;
25907 break;
25908 case Intrinsic::x86_avx_vtestnzc_ps:
25909 case Intrinsic::x86_avx_vtestnzc_pd:
25910 case Intrinsic::x86_avx_vtestnzc_ps_256:
25911 case Intrinsic::x86_avx_vtestnzc_pd_256:
25912 TestOpc = X86ISD::TESTP;
25913 LLVM_FALLTHROUGH;
25914 case Intrinsic::x86_sse41_ptestnzc:
25915 case Intrinsic::x86_avx_ptestnzc_256:
25916 // ZF and CF = 0
25917 X86CC = X86::COND_A;
25918 break;
25919 }
25920
25921 SDValue LHS = Op.getOperand(1);
25922 SDValue RHS = Op.getOperand(2);
25923 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25924 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25925 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25926 }
25927
25928 case Intrinsic::x86_sse42_pcmpistria128:
25929 case Intrinsic::x86_sse42_pcmpestria128:
25930 case Intrinsic::x86_sse42_pcmpistric128:
25931 case Intrinsic::x86_sse42_pcmpestric128:
25932 case Intrinsic::x86_sse42_pcmpistrio128:
25933 case Intrinsic::x86_sse42_pcmpestrio128:
25934 case Intrinsic::x86_sse42_pcmpistris128:
25935 case Intrinsic::x86_sse42_pcmpestris128:
25936 case Intrinsic::x86_sse42_pcmpistriz128:
25937 case Intrinsic::x86_sse42_pcmpestriz128: {
25938 unsigned Opcode;
25939 X86::CondCode X86CC;
25940 switch (IntNo) {
25941 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
25942 case Intrinsic::x86_sse42_pcmpistria128:
25943 Opcode = X86ISD::PCMPISTR;
25944 X86CC = X86::COND_A;
25945 break;
25946 case Intrinsic::x86_sse42_pcmpestria128:
25947 Opcode = X86ISD::PCMPESTR;
25948 X86CC = X86::COND_A;
25949 break;
25950 case Intrinsic::x86_sse42_pcmpistric128:
25951 Opcode = X86ISD::PCMPISTR;
25952 X86CC = X86::COND_B;
25953 break;
25954 case Intrinsic::x86_sse42_pcmpestric128:
25955 Opcode = X86ISD::PCMPESTR;
25956 X86CC = X86::COND_B;
25957 break;
25958 case Intrinsic::x86_sse42_pcmpistrio128:
25959 Opcode = X86ISD::PCMPISTR;
25960 X86CC = X86::COND_O;
25961 break;
25962 case Intrinsic::x86_sse42_pcmpestrio128:
25963 Opcode = X86ISD::PCMPESTR;
25964 X86CC = X86::COND_O;
25965 break;
25966 case Intrinsic::x86_sse42_pcmpistris128:
25967 Opcode = X86ISD::PCMPISTR;
25968 X86CC = X86::COND_S;
25969 break;
25970 case Intrinsic::x86_sse42_pcmpestris128:
25971 Opcode = X86ISD::PCMPESTR;
25972 X86CC = X86::COND_S;
25973 break;
25974 case Intrinsic::x86_sse42_pcmpistriz128:
25975 Opcode = X86ISD::PCMPISTR;
25976 X86CC = X86::COND_E;
25977 break;
25978 case Intrinsic::x86_sse42_pcmpestriz128:
25979 Opcode = X86ISD::PCMPESTR;
25980 X86CC = X86::COND_E;
25981 break;
25982 }
25983 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25984 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25985 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25986 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25987 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25988 }
25989
25990 case Intrinsic::x86_sse42_pcmpistri128:
25991 case Intrinsic::x86_sse42_pcmpestri128: {
25992 unsigned Opcode;
25993 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
25994 Opcode = X86ISD::PCMPISTR;
25995 else
25996 Opcode = X86ISD::PCMPESTR;
25997
25998 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
25999 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26000 return DAG.getNode(Opcode, dl, VTs, NewOps);
26001 }
26002
26003 case Intrinsic::x86_sse42_pcmpistrm128:
26004 case Intrinsic::x86_sse42_pcmpestrm128: {
26005 unsigned Opcode;
26006 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26007 Opcode = X86ISD::PCMPISTR;
26008 else
26009 Opcode = X86ISD::PCMPESTR;
26010
26011 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26012 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26013 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26014 }
26015
26016 case Intrinsic::eh_sjlj_lsda: {
26017 MachineFunction &MF = DAG.getMachineFunction();
26018 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26019 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26020 auto &Context = MF.getMMI().getContext();
26021 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26022 Twine(MF.getFunctionNumber()));
26023 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26024 DAG.getMCSymbol(S, PtrVT));
26025 }
26026
26027 case Intrinsic::x86_seh_lsda: {
26028 // Compute the symbol for the LSDA. We know it'll get emitted later.
26029 MachineFunction &MF = DAG.getMachineFunction();
26030 SDValue Op1 = Op.getOperand(1);
26031 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26032 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26033 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26034
26035 // Generate a simple absolute symbol reference. This intrinsic is only
26036 // supported on 32-bit Windows, which isn't PIC.
26037 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26038 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26039 }
26040
26041 case Intrinsic::eh_recoverfp: {
26042 SDValue FnOp = Op.getOperand(1);
26043 SDValue IncomingFPOp = Op.getOperand(2);
26044 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26045 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26046 if (!Fn)
26047 report_fatal_error(
26048 "llvm.eh.recoverfp must take a function as the first argument");
26049 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26050 }
26051
26052 case Intrinsic::localaddress: {
26053 // Returns one of the stack, base, or frame pointer registers, depending on
26054 // which is used to reference local variables.
26055 MachineFunction &MF = DAG.getMachineFunction();
26056 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26057 unsigned Reg;
26058 if (RegInfo->hasBasePointer(MF))
26059 Reg = RegInfo->getBaseRegister();
26060 else { // Handles the SP or FP case.
26061 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26062 if (CantUseFP)
26063 Reg = RegInfo->getPtrSizedStackRegister(MF);
26064 else
26065 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26066 }
26067 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26068 }
26069 case Intrinsic::swift_async_context_addr: {
26070 auto &MF = DAG.getMachineFunction();
26071 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26072 if (Subtarget.is64Bit()) {
26073 MF.getFrameInfo().setFrameAddressIsTaken(true);
26074 X86FI->setHasSwiftAsyncContext(true);
26075 return SDValue(
26076 DAG.getMachineNode(
26077 X86::SUB64ri8, dl, MVT::i64,
26078 DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
26079 DAG.getTargetConstant(8, dl, MVT::i32)),
26080 0);
26081 } else {
26082 // 32-bit, so there is no special extended frame; create or reuse an existing
26083 // stack slot.
26084 if (!X86FI->getSwiftAsyncContextFrameIdx())
26085 X86FI->setSwiftAsyncContextFrameIdx(
26086 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26087 return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26088 }
26089 }
26090 case Intrinsic::x86_avx512_vp2intersect_q_512:
26091 case Intrinsic::x86_avx512_vp2intersect_q_256:
26092 case Intrinsic::x86_avx512_vp2intersect_q_128:
26093 case Intrinsic::x86_avx512_vp2intersect_d_512:
26094 case Intrinsic::x86_avx512_vp2intersect_d_256:
26095 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26096 MVT MaskVT = Op.getSimpleValueType();
26097
26098 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26099 SDLoc DL(Op);
26100
26101 SDValue Operation =
26102 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26103 Op->getOperand(1), Op->getOperand(2));
26104
26105 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26106 MaskVT, Operation);
26107 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26108 MaskVT, Operation);
26109 return DAG.getMergeValues({Result0, Result1}, DL);
26110 }
26111 case Intrinsic::x86_mmx_pslli_w:
26112 case Intrinsic::x86_mmx_pslli_d:
26113 case Intrinsic::x86_mmx_pslli_q:
26114 case Intrinsic::x86_mmx_psrli_w:
26115 case Intrinsic::x86_mmx_psrli_d:
26116 case Intrinsic::x86_mmx_psrli_q:
26117 case Intrinsic::x86_mmx_psrai_w:
26118 case Intrinsic::x86_mmx_psrai_d: {
26119 SDLoc DL(Op);
26120 SDValue ShAmt = Op.getOperand(2);
26121 // If the argument is a constant, convert it to a target constant.
26122 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26123 // Clamp out-of-bounds shift amounts since they will otherwise be masked
26124 // to 8 bits, which may make them no longer out of bounds.
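// For example, a shift amount of 256 (0x100) would be masked down to 0x00 and
// turn into an in-bounds shift by zero; the getLimitedValue(255) clamp below
// keeps such values out of bounds so the hardware still sees an out-of-range
// shift count.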
26125 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26126 if (ShiftAmount == 0)
26127 return Op.getOperand(1);
26128
26129 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26130 Op.getOperand(0), Op.getOperand(1),
26131 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26132 }
26133
26134 unsigned NewIntrinsic;
26135 switch (IntNo) {
26136 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26137 case Intrinsic::x86_mmx_pslli_w:
26138 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26139 break;
26140 case Intrinsic::x86_mmx_pslli_d:
26141 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26142 break;
26143 case Intrinsic::x86_mmx_pslli_q:
26144 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26145 break;
26146 case Intrinsic::x86_mmx_psrli_w:
26147 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26148 break;
26149 case Intrinsic::x86_mmx_psrli_d:
26150 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26151 break;
26152 case Intrinsic::x86_mmx_psrli_q:
26153 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26154 break;
26155 case Intrinsic::x86_mmx_psrai_w:
26156 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26157 break;
26158 case Intrinsic::x86_mmx_psrai_d:
26159 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26160 break;
26161 }
26162
26163 // The vector shift intrinsics with scalars use 32-bit shift amounts but
26164 // the sse2/mmx shift instructions read 64 bits. Copy the 32 bits to an
26165 // MMX register.
26166 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26167 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26168 DAG.getTargetConstant(NewIntrinsic, DL,
26169 getPointerTy(DAG.getDataLayout())),
26170 Op.getOperand(1), ShAmt);
26171 }
26172 }
26173}
26174
26175static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26176 SDValue Src, SDValue Mask, SDValue Base,
26177 SDValue Index, SDValue ScaleOp, SDValue Chain,
26178 const X86Subtarget &Subtarget) {
26179 SDLoc dl(Op);
26180 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26181 // Scale must be constant.
26182 if (!C)
26183 return SDValue();
26184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26185 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26186 TLI.getPointerTy(DAG.getDataLayout()));
26187 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26188 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26189 // If source is undef or we know it won't be used, use a zero vector
26190 // to break register dependency.
26191 // TODO: use undef instead and let BreakFalseDeps deal with it?
26192 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26193 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26194
26195 // Cast mask to an integer type.
26196 Mask = DAG.getBitcast(MaskVT, Mask);
26197
26198 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26199
26200 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26201 SDValue Res =
26202 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26203 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26204 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26205}
26206
26207static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26208 SDValue Src, SDValue Mask, SDValue Base,
26209 SDValue Index, SDValue ScaleOp, SDValue Chain,
26210 const X86Subtarget &Subtarget) {
26211 MVT VT = Op.getSimpleValueType();
26212 SDLoc dl(Op);
26213 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26214 // Scale must be constant.
26215 if (!C)
26216 return SDValue();
26217 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26218 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26219 TLI.getPointerTy(DAG.getDataLayout()));
26220 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26221 VT.getVectorNumElements());
26222 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26223
26224 // We support two versions of the gather intrinsics. One with scalar mask and
26225 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26226 if (Mask.getValueType() != MaskVT)
26227 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26228
26229 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26230 // If source is undef or we know it won't be used, use a zero vector
26231 // to break register dependency.
26232 // TODO: use undef instead and let BreakFalseDeps deal with it?
26233 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26234 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26235
26236 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26237
26238 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26239 SDValue Res =
26240 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26241 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26242 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26243}
26244
26245static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26246 SDValue Src, SDValue Mask, SDValue Base,
26247 SDValue Index, SDValue ScaleOp, SDValue Chain,
26248 const X86Subtarget &Subtarget) {
26249 SDLoc dl(Op);
26250 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26251 // Scale must be constant.
26252 if (!C)
26253 return SDValue();
26254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26255 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26256 TLI.getPointerTy(DAG.getDataLayout()));
26257 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26258 Src.getSimpleValueType().getVectorNumElements());
26259 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26260
26261 // We support two versions of the scatter intrinsics. One with scalar mask and
26262 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26263 if (Mask.getValueType() != MaskVT)
26264 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26265
26266 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26267
26268 SDVTList VTs = DAG.getVTList(MVT::Other);
26269 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26270 SDValue Res =
26271 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26272 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26273 return Res;
26274}
26275
26276static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26277 SDValue Mask, SDValue Base, SDValue Index,
26278 SDValue ScaleOp, SDValue Chain,
26279 const X86Subtarget &Subtarget) {
26280 SDLoc dl(Op);
26281 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26282 // Scale must be constant.
26283 if (!C)
26284 return SDValue();
26285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26286 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26287 TLI.getPointerTy(DAG.getDataLayout()));
26288 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26289 SDValue Segment = DAG.getRegister(0, MVT::i32);
26290 MVT MaskVT =
26291 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26292 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26293 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26294 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26295 return SDValue(Res, 0);
26296}
26297
26298/// Handles the lowering of builtin intrinsics with chain that return their
26299/// value into registers EDX:EAX.
26300 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
26301/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26302/// TargetOpcode.
26303/// Returns a Glue value which can be used to add extra copy-from-reg if the
26304 /// expanded intrinsic implicitly defines extra registers (i.e. not just
26305/// EDX:EAX).
26306static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26307 SelectionDAG &DAG,
26308 unsigned TargetOpcode,
26309 unsigned SrcReg,
26310 const X86Subtarget &Subtarget,
26311 SmallVectorImpl<SDValue> &Results) {
26312 SDValue Chain = N->getOperand(0);
26313 SDValue Glue;
26314
26315 if (SrcReg) {
26316 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26317 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26318 Glue = Chain.getValue(1);
26319 }
26320
26321 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26322 SDValue N1Ops[] = {Chain, Glue};
26323 SDNode *N1 = DAG.getMachineNode(
26324 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26325 Chain = SDValue(N1, 0);
26326
26327 // Reads the content of XCR and returns it in registers EDX:EAX.
26328 SDValue LO, HI;
26329 if (Subtarget.is64Bit()) {
26330 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26331 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26332 LO.getValue(2));
26333 } else {
26334 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26335 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26336 LO.getValue(2));
26337 }
26338 Chain = HI.getValue(1);
26339 Glue = HI.getValue(2);
26340
26341 if (Subtarget.is64Bit()) {
26342 // Merge the two 32-bit values into a 64-bit one.
26343 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26344 DAG.getConstant(32, DL, MVT::i8));
26345 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26346 Results.push_back(Chain);
26347 return Glue;
26348 }
26349
26350 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26351 SDValue Ops[] = { LO, HI };
26352 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26353 Results.push_back(Pair);
26354 Results.push_back(Chain);
26355 return Glue;
26356}
26357
26358/// Handles the lowering of builtin intrinsics that read the time stamp counter
26359/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26360/// READCYCLECOUNTER nodes.
26361static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26362 SelectionDAG &DAG,
26363 const X86Subtarget &Subtarget,
26364 SmallVectorImpl<SDValue> &Results) {
26365 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26366 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26367 // and the EAX register is loaded with the low-order 32 bits.
26368 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26369 /* NoRegister */0, Subtarget,
26370 Results);
26371 if (Opcode != X86::RDTSCP)
26372 return;
26373
26374 SDValue Chain = Results[1];
26375 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
26376 // the ECX register. Add 'ecx' explicitly to the chain.
26377 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26378 Results[1] = ecx;
26379 Results.push_back(ecx.getValue(1));
26380}
26381
26382static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26383 SelectionDAG &DAG) {
26384 SmallVector<SDValue, 3> Results;
26385 SDLoc DL(Op);
26386 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26387 Results);
26388 return DAG.getMergeValues(Results, DL);
26389}
26390
26391static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26392 MachineFunction &MF = DAG.getMachineFunction();
26393 SDValue Chain = Op.getOperand(0);
26394 SDValue RegNode = Op.getOperand(2);
26395 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26396 if (!EHInfo)
26397 report_fatal_error("EH registrations only live in functions using WinEH");
26398
26399 // Cast the operand to an alloca, and remember the frame index.
26400 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26401 if (!FINode)
26402 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26403 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26404
26405 // Return the chain operand without making any DAG nodes.
26406 return Chain;
26407}
26408
26409static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26410 MachineFunction &MF = DAG.getMachineFunction();
26411 SDValue Chain = Op.getOperand(0);
26412 SDValue EHGuard = Op.getOperand(2);
26413 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26414 if (!EHInfo)
26415 report_fatal_error("EHGuard only live in functions using WinEH");
26416
26417 // Cast the operand to an alloca, and remember the frame index.
26418 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26419 if (!FINode)
26420 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26421 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26422
26423 // Return the chain operand without making any DAG nodes.
26424 return Chain;
26425}
26426
26427/// Emit Truncating Store with signed or unsigned saturation.
26428static SDValue
26429EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
26430 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26431 SelectionDAG &DAG) {
26432 SDVTList VTs = DAG.getVTList(MVT::Other);
26433 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26434 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26435 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26436 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26437}
26438
26439/// Emit Masked Truncating Store with signed or unsigned saturation.
26440static SDValue
26441EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
26442 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26443 MachineMemOperand *MMO, SelectionDAG &DAG) {
26444 SDVTList VTs = DAG.getVTList(MVT::Other);
26445 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26446 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26447 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26448}
26449
26450static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26451 SelectionDAG &DAG) {
26452 unsigned IntNo = Op.getConstantOperandVal(1);
26453 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26454 if (!IntrData) {
26455 switch (IntNo) {
26456 case llvm::Intrinsic::x86_seh_ehregnode:
26457 return MarkEHRegistrationNode(Op, DAG);
26458 case llvm::Intrinsic::x86_seh_ehguard:
26459 return MarkEHGuard(Op, DAG);
26460 case llvm::Intrinsic::x86_rdpkru: {
26461 SDLoc dl(Op);
26462 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26463 // Create a RDPKRU node and pass 0 to the ECX parameter.
26464 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26465 DAG.getConstant(0, dl, MVT::i32));
26466 }
26467 case llvm::Intrinsic::x86_wrpkru: {
26468 SDLoc dl(Op);
26469 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26470 // to the EDX and ECX parameters.
26471 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26472 Op.getOperand(0), Op.getOperand(2),
26473 DAG.getConstant(0, dl, MVT::i32),
26474 DAG.getConstant(0, dl, MVT::i32));
26475 }
26476 case llvm::Intrinsic::x86_flags_read_u32:
26477 case llvm::Intrinsic::x86_flags_read_u64:
26478 case llvm::Intrinsic::x86_flags_write_u32:
26479 case llvm::Intrinsic::x86_flags_write_u64: {
26480 // We need a frame pointer because this will get lowered to a PUSH/POP
26481 // sequence.
26482 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26483 MFI.setHasCopyImplyingStackAdjustment(true);
26484 // Don't do anything here, we will expand these intrinsics out later
26485 // during FinalizeISel in EmitInstrWithCustomInserter.
26486 return Op;
26487 }
26488 case Intrinsic::x86_lwpins32:
26489 case Intrinsic::x86_lwpins64:
26490 case Intrinsic::x86_umwait:
26491 case Intrinsic::x86_tpause: {
26492 SDLoc dl(Op);
26493 SDValue Chain = Op->getOperand(0);
26494 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26495 unsigned Opcode;
26496
26497 switch (IntNo) {
26498 default: llvm_unreachable("Impossible intrinsic");
26499 case Intrinsic::x86_umwait:
26500 Opcode = X86ISD::UMWAIT;
26501 break;
26502 case Intrinsic::x86_tpause:
26503 Opcode = X86ISD::TPAUSE;
26504 break;
26505 case Intrinsic::x86_lwpins32:
26506 case Intrinsic::x86_lwpins64:
26507 Opcode = X86ISD::LWPINS;
26508 break;
26509 }
26510
26511 SDValue Operation =
26512 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26513 Op->getOperand(3), Op->getOperand(4));
26514 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26515 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26516 Operation.getValue(1));
26517 }
26518 case Intrinsic::x86_enqcmd:
26519 case Intrinsic::x86_enqcmds: {
26520 SDLoc dl(Op);
26521 SDValue Chain = Op.getOperand(0);
26522 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26523 unsigned Opcode;
26524 switch (IntNo) {
26525 default: llvm_unreachable("Impossible intrinsic!");
26526 case Intrinsic::x86_enqcmd:
26527 Opcode = X86ISD::ENQCMD;
26528 break;
26529 case Intrinsic::x86_enqcmds:
26530 Opcode = X86ISD::ENQCMDS;
26531 break;
26532 }
26533 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26534 Op.getOperand(3));
26535 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26536 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26537 Operation.getValue(1));
26538 }
26539 case Intrinsic::x86_aesenc128kl:
26540 case Intrinsic::x86_aesdec128kl:
26541 case Intrinsic::x86_aesenc256kl:
26542 case Intrinsic::x86_aesdec256kl: {
26543 SDLoc DL(Op);
26544 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26545 SDValue Chain = Op.getOperand(0);
26546 unsigned Opcode;
26547
26548 switch (IntNo) {
26549 default: llvm_unreachable("Impossible intrinsic");
26550 case Intrinsic::x86_aesenc128kl:
26551 Opcode = X86ISD::AESENC128KL;
26552 break;
26553 case Intrinsic::x86_aesdec128kl:
26554 Opcode = X86ISD::AESDEC128KL;
26555 break;
26556 case Intrinsic::x86_aesenc256kl:
26557 Opcode = X86ISD::AESENC256KL;
26558 break;
26559 case Intrinsic::x86_aesdec256kl:
26560 Opcode = X86ISD::AESDEC256KL;
26561 break;
26562 }
26563
26564 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26565 MachineMemOperand *MMO = MemIntr->getMemOperand();
26566 EVT MemVT = MemIntr->getMemoryVT();
26567 SDValue Operation = DAG.getMemIntrinsicNode(
26568 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26569 MMO);
26570 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26571
26572 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26573 {ZF, Operation.getValue(0), Operation.getValue(2)});
26574 }
26575 case Intrinsic::x86_aesencwide128kl:
26576 case Intrinsic::x86_aesdecwide128kl:
26577 case Intrinsic::x86_aesencwide256kl:
26578 case Intrinsic::x86_aesdecwide256kl: {
26579 SDLoc DL(Op);
26580 SDVTList VTs = DAG.getVTList(
26581 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26582 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26583 SDValue Chain = Op.getOperand(0);
26584 unsigned Opcode;
26585
26586 switch (IntNo) {
26587 default: llvm_unreachable("Impossible intrinsic");
26588 case Intrinsic::x86_aesencwide128kl:
26589 Opcode = X86ISD::AESENCWIDE128KL;
26590 break;
26591 case Intrinsic::x86_aesdecwide128kl:
26592 Opcode = X86ISD::AESDECWIDE128KL;
26593 break;
26594 case Intrinsic::x86_aesencwide256kl:
26595 Opcode = X86ISD::AESENCWIDE256KL;
26596 break;
26597 case Intrinsic::x86_aesdecwide256kl:
26598 Opcode = X86ISD::AESDECWIDE256KL;
26599 break;
26600 }
26601
26602 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26603 MachineMemOperand *MMO = MemIntr->getMemOperand();
26604 EVT MemVT = MemIntr->getMemoryVT();
26605 SDValue Operation = DAG.getMemIntrinsicNode(
26606 Opcode, DL, VTs,
26607 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26608 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26609 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26610 MemVT, MMO);
26611 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26612
26613 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26614 {ZF, Operation.getValue(1), Operation.getValue(2),
26615 Operation.getValue(3), Operation.getValue(4),
26616 Operation.getValue(5), Operation.getValue(6),
26617 Operation.getValue(7), Operation.getValue(8),
26618 Operation.getValue(9)});
26619 }
26620 case Intrinsic::x86_testui: {
26621 SDLoc dl(Op);
26622 SDValue Chain = Op.getOperand(0);
26623 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26624 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26625 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26626 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26627 Operation.getValue(1));
26628 }
26629 }
26630 return SDValue();
26631 }
26632
26633 SDLoc dl(Op);
26634 switch(IntrData->Type) {
26635 default: llvm_unreachable("Unknown Intrinsic Type");
26636 case RDSEED:
26637 case RDRAND: {
26638 // Emit the node with the right value type.
26639 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26640 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26641
26642 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
26643 // Otherwise return the value from Rand, which is always 0, cast to i32.
26644 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26645 DAG.getConstant(1, dl, Op->getValueType(1)),
26646 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26647 SDValue(Result.getNode(), 1)};
26648 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26649
26650 // Return { result, isValid, chain }.
26651 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26652 SDValue(Result.getNode(), 2));
26653 }
26654 case GATHER_AVX2: {
26655 SDValue Chain = Op.getOperand(0);
26656 SDValue Src = Op.getOperand(2);
26657 SDValue Base = Op.getOperand(3);
26658 SDValue Index = Op.getOperand(4);
26659 SDValue Mask = Op.getOperand(5);
26660 SDValue Scale = Op.getOperand(6);
26661 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26662 Scale, Chain, Subtarget);
26663 }
26664 case GATHER: {
26665 //gather(v1, mask, index, base, scale);
26666 SDValue Chain = Op.getOperand(0);
26667 SDValue Src = Op.getOperand(2);
26668 SDValue Base = Op.getOperand(3);
26669 SDValue Index = Op.getOperand(4);
26670 SDValue Mask = Op.getOperand(5);
26671 SDValue Scale = Op.getOperand(6);
26672 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26673 Chain, Subtarget);
26674 }
26675 case SCATTER: {
26676 //scatter(base, mask, index, v1, scale);
26677 SDValue Chain = Op.getOperand(0);
26678 SDValue Base = Op.getOperand(2);
26679 SDValue Mask = Op.getOperand(3);
26680 SDValue Index = Op.getOperand(4);
26681 SDValue Src = Op.getOperand(5);
26682 SDValue Scale = Op.getOperand(6);
26683 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26684 Scale, Chain, Subtarget);
26685 }
26686 case PREFETCH: {
26687 const APInt &HintVal = Op.getConstantOperandAPInt(6);
26688 assert((HintVal == 2 || HintVal == 3) &&
26689 "Wrong prefetch hint in intrinsic: should be 2 or 3");
26690 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26691 SDValue Chain = Op.getOperand(0);
26692 SDValue Mask = Op.getOperand(2);
26693 SDValue Index = Op.getOperand(3);
26694 SDValue Base = Op.getOperand(4);
26695 SDValue Scale = Op.getOperand(5);
26696 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26697 Subtarget);
26698 }
26699 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26700 case RDTSC: {
26701 SmallVector<SDValue, 2> Results;
26702 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26703 Results);
26704 return DAG.getMergeValues(Results, dl);
26705 }
26706 // Read Performance Monitoring Counters.
26707 case RDPMC:
26708 // GetExtended Control Register.
26709 case XGETBV: {
26710 SmallVector<SDValue, 2> Results;
26711
26712 // RDPMC uses ECX to select the index of the performance counter to read.
26713 // XGETBV uses ECX to select the index of the XCR register to return.
26714 // The result is stored into registers EDX:EAX.
26715 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26716 Subtarget, Results);
26717 return DAG.getMergeValues(Results, dl);
26718 }
26719 // XTEST intrinsics.
26720 case XTEST: {
26721 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26722 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26723
26724 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26725 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26726 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26727 Ret, SDValue(InTrans.getNode(), 1));
26728 }
26729 case TRUNCATE_TO_MEM_VI8:
26730 case TRUNCATE_TO_MEM_VI16:
26731 case TRUNCATE_TO_MEM_VI32: {
26732 SDValue Mask = Op.getOperand(4);
26733 SDValue DataToTruncate = Op.getOperand(3);
26734 SDValue Addr = Op.getOperand(2);
26735 SDValue Chain = Op.getOperand(0);
26736
26737 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26738 assert(MemIntr && "Expected MemIntrinsicSDNode!");
26739
26740 EVT MemVT = MemIntr->getMemoryVT();
26741
26742 uint16_t TruncationOp = IntrData->Opc0;
26743 switch (TruncationOp) {
26744 case X86ISD::VTRUNC: {
26745 if (isAllOnesConstant(Mask)) // return just a truncate store
26746 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26747 MemIntr->getMemOperand());
26748
26749 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26750 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26751 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26752
26753 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26754 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26755 true /* truncating */);
26756 }
26757 case X86ISD::VTRUNCUS:
26758 case X86ISD::VTRUNCS: {
26759 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26760 if (isAllOnesConstant(Mask))
26761 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26762 MemIntr->getMemOperand(), DAG);
26763
26764 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26765 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26766
26767 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26768 VMask, MemVT, MemIntr->getMemOperand(), DAG);
26769 }
26770 default:
26771 llvm_unreachable("Unsupported truncstore intrinsic");
26772 }
26773 }
26774 }
26775}
26776
26777SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26778 SelectionDAG &DAG) const {
26779 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26780 MFI.setReturnAddressIsTaken(true);
26781
26782 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26783 return SDValue();
26784
26785 unsigned Depth = Op.getConstantOperandVal(0);
26786 SDLoc dl(Op);
26787 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26788
26789 if (Depth > 0) {
26790 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26791 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26792 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26793 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26794 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26795 MachinePointerInfo());
26796 }
26797
26798 // Just load the return address.
26799 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26800 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26801 MachinePointerInfo());
26802}
26803
26804SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26805 SelectionDAG &DAG) const {
26806 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26807 return getReturnAddressFrameIndex(DAG);
26808}
26809
26810SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
26811 MachineFunction &MF = DAG.getMachineFunction();
26812 MachineFrameInfo &MFI = MF.getFrameInfo();
26813 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26814 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26815 EVT VT = Op.getValueType();
26816
26817 MFI.setFrameAddressIsTaken(true);
26818
26819 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
26820 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
26821 // is not possible to crawl up the stack without looking at the unwind codes
26822 // simultaneously.
26823 int FrameAddrIndex = FuncInfo->getFAIndex();
26824 if (!FrameAddrIndex) {
26825 // Set up a frame object for the return address.
26826 unsigned SlotSize = RegInfo->getSlotSize();
26827 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
26828 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
26829 FuncInfo->setFAIndex(FrameAddrIndex);
26830 }
26831 return DAG.getFrameIndex(FrameAddrIndex, VT);
26832 }
26833
26834 unsigned FrameReg =
26835 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
26836 SDLoc dl(Op); // FIXME probably not meaningful
26837 unsigned Depth = Op.getConstantOperandVal(0);
26838 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
26839 (FrameReg == X86::EBP && VT == MVT::i32)) &&
26840 "Invalid Frame Register!");
26841 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
26842 while (Depth--)
26843 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
26844 MachinePointerInfo());
26845 return FrameAddr;
26846}
26847
26848// FIXME? Maybe this could be a TableGen attribute on some registers and
26849// this table could be generated automatically from RegInfo.
26850Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
26851 const MachineFunction &MF) const {
26852 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26853
26854 Register Reg = StringSwitch<unsigned>(RegName)
26855 .Case("esp", X86::ESP)
26856 .Case("rsp", X86::RSP)
26857 .Case("ebp", X86::EBP)
26858 .Case("rbp", X86::RBP)
26859 .Default(0);
26860
26861 if (Reg == X86::EBP || Reg == X86::RBP) {
26862 if (!TFI.hasFP(MF))
26863 report_fatal_error("register " + StringRef(RegName) +
26864 " is allocatable: function has no frame pointer");
26865#ifndef NDEBUG
26866 else {
26867 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26868 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
26869 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
26870 "Invalid Frame Register!");
26871 }
26872#endif
26873 }
26874
26875 if (Reg)
26876 return Reg;
26877
26878 report_fatal_error("Invalid register name global variable");
26879}
26880
26881SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
26882 SelectionDAG &DAG) const {
26883 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26884 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
26885}
26886
26887Register X86TargetLowering::getExceptionPointerRegister(
26888 const Constant *PersonalityFn) const {
26889 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
26890 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26891
26892 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
26893}
26894
26895Register X86TargetLowering::getExceptionSelectorRegister(
26896 const Constant *PersonalityFn) const {
26897 // Funclet personalities don't use selectors (the runtime does the selection).
26898 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
26899 return X86::NoRegister;
26900 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
26901}
26902
26903bool X86TargetLowering::needsFixedCatchObjects() const {
26904 return Subtarget.isTargetWin64();
26905}
26906
26907SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
26908 SDValue Chain = Op.getOperand(0);
26909 SDValue Offset = Op.getOperand(1);
26910 SDValue Handler = Op.getOperand(2);
26911 SDLoc dl (Op);
26912
26913 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26914 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26915 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
26916 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
26917 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
26918 "Invalid Frame Register!");
26919 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
26920 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
26921
26922 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
26923 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
26924 dl));
26925 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
26926 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
26927 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
26928
26929 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
26930 DAG.getRegister(StoreAddrReg, PtrVT));
26931}
26932
26933SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
26934 SelectionDAG &DAG) const {
26935 SDLoc DL(Op);
26936 // If the subtarget is not 64-bit, we may need the global base reg
26937 // after isel expands pseudos, i.e., after the CGBR pass has run.
26938 // Therefore, ask for the GlobalBaseReg now, so that the pass
26939 // inserts the code for us in case we need it.
26940 // Otherwise, we will end up in a situation where we will
26941 // reference a virtual register that is not defined!
26942 if (!Subtarget.is64Bit()) {
26943 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26944 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
26945 }
26946 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
26947 DAG.getVTList(MVT::i32, MVT::Other),
26948 Op.getOperand(0), Op.getOperand(1));
26949}
26950
26951SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
26952 SelectionDAG &DAG) const {
26953 SDLoc DL(Op);
26954 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
26955 Op.getOperand(0), Op.getOperand(1));
26956}
26957
26958SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
26959 SelectionDAG &DAG) const {
26960 SDLoc DL(Op);
26961 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
26962 Op.getOperand(0));
26963}
26964
26965static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
26966 return Op.getOperand(0);
26967}
26968
26969SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
26970 SelectionDAG &DAG) const {
26971 SDValue Root = Op.getOperand(0);
26972 SDValue Trmp = Op.getOperand(1); // trampoline
26973 SDValue FPtr = Op.getOperand(2); // nested function
26974 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
26975 SDLoc dl (Op);
26976
26977 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26978 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26979
26980 if (Subtarget.is64Bit()) {
26981 SDValue OutChains[6];
26982
26983 // Large code-model.
26984 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
26985 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
26986
26987 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
26988 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
26989
26990 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
26991
26992 // Load the pointer to the nested function into R11.
26993 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
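// Worked example: R11's encoding value is 11, so N86R11 = 11 & 0x7 = 3 and
// MOV64ri | N86R11 = 0xBB; with REX_WB = 0x49 the i16 value is 0xBB49, which
// the little-endian store below emits as the bytes 49 BB -- the two-byte
// prefix of "movabsq $imm, %r11". The 8-byte FPtr immediate is then stored
// at offset 2.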
26994 SDValue Addr = Trmp;
26995 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
26996 Addr, MachinePointerInfo(TrmpAddr));
26997
26998 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
26999 DAG.getConstant(2, dl, MVT::i64));
27000 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27001 MachinePointerInfo(TrmpAddr, 2), Align(2));
27002
27003 // Load the 'nest' parameter value into R10.
27004 // R10 is specified in X86CallingConv.td
27005 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27006 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27007 DAG.getConstant(10, dl, MVT::i64));
27008 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27009 Addr, MachinePointerInfo(TrmpAddr, 10));
27010
27011 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27012 DAG.getConstant(12, dl, MVT::i64));
27013 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27014 MachinePointerInfo(TrmpAddr, 12), Align(2));
27015
27016 // Jump to the nested function.
27017 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27018 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27019 DAG.getConstant(20, dl, MVT::i64));
27020 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27021 Addr, MachinePointerInfo(TrmpAddr, 20));
27022
27023 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
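// Worked example: ModRM = 3 | (4 << 3) | (3 << 6) = 0xE3, i.e. mod = 0b11
// (register direct), reg = 4 (the /4 opcode extension selecting JMP) and
// rm = 3 (R11's low three bits, with REX.B supplying the fourth). Together
// with the 49 FF bytes stored above this forms "49 FF E3", jmpq *%r11.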
27024 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27025 DAG.getConstant(22, dl, MVT::i64));
27026 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27027 Addr, MachinePointerInfo(TrmpAddr, 22));
27028
27029 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27030 } else {
27031 const Function *Func =
27032 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27033 CallingConv::ID CC = Func->getCallingConv();
27034 unsigned NestReg;
27035
27036 switch (CC) {
27037 default:
27038 llvm_unreachable("Unsupported calling convention");
27039 case CallingConv::C:
27040 case CallingConv::X86_StdCall: {
27041 // Pass 'nest' parameter in ECX.
27042 // Must be kept in sync with X86CallingConv.td
27043 NestReg = X86::ECX;
27044
27045 // Check that ECX wasn't needed by an 'inreg' parameter.
27046 FunctionType *FTy = Func->getFunctionType();
27047 const AttributeList &Attrs = Func->getAttributes();
27048
27049 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27050 unsigned InRegCount = 0;
27051 unsigned Idx = 1;
27052
27053 for (FunctionType::param_iterator I = FTy->param_begin(),
27054 E = FTy->param_end(); I != E; ++I, ++Idx)
27055 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
27056 const DataLayout &DL = DAG.getDataLayout();
27057 // FIXME: should only count parameters that are lowered to integers.
27058 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27059 }
27060
27061 if (InRegCount > 2) {
27062 report_fatal_error("Nest register in use - reduce number of inreg"
27063 " parameters!");
27064 }
27065 }
27066 break;
27067 }
27068 case CallingConv::X86_FastCall:
27069 case CallingConv::X86_ThisCall:
27070 case CallingConv::Fast:
27071 case CallingConv::Tail:
27072 case CallingConv::SwiftTail:
27073 // Pass 'nest' parameter in EAX.
27074 // Must be kept in sync with X86CallingConv.td
27075 NestReg = X86::EAX;
27076 break;
27077 }
27078
27079 SDValue OutChains[4];
27080 SDValue Addr, Disp;
27081
27082 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27083 DAG.getConstant(10, dl, MVT::i32));
27084 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27085
27086 // This is storing the opcode for MOV32ri.
27087 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27088 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27089 OutChains[0] =
27090 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27091 Trmp, MachinePointerInfo(TrmpAddr));
27092
27093 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27094 DAG.getConstant(1, dl, MVT::i32));
27095 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27096 MachinePointerInfo(TrmpAddr, 1), Align(1));
27097
27098 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27099 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27100 DAG.getConstant(5, dl, MVT::i32));
27101 OutChains[2] =
27102 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27103 MachinePointerInfo(TrmpAddr, 5), Align(1));
27104
27105 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27106 DAG.getConstant(6, dl, MVT::i32));
27107 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27108 MachinePointerInfo(TrmpAddr, 6), Align(1));
27109
27110 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27111 }
27112}
27113
27114SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
27115 SelectionDAG &DAG) const {
27116 /*
27117 The rounding mode is in bits 11:10 of FPSR, and has the following
27118 settings:
27119 00 Round to nearest
27120 01 Round to -inf
27121 10 Round to +inf
27122 11 Round to 0
27123
27124 FLT_ROUNDS, on the other hand, expects the following:
27125 -1 Undefined
27126 0 Round to 0
27127 1 Round to nearest
27128 2 Round to +inf
27129 3 Round to -inf
27130
27131 To perform the conversion, we use a packed lookup table of the four 2-bit
27132 values that we can index by FPSR[11:10]
27133 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27134
27135 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27136 */
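// Worked example: FPSR[11:10] = 0b01 (round toward -inf) gives
// FPSR & 0xc00 = 0x400, and 0x400 >> 9 = 2 (shifting by 9 rather than 10
// doubles the index because each table entry is 2 bits wide), so
// (0x2d >> 2) & 3 = 0x0b & 3 = 3, FLT_ROUNDS' "round to -inf". Likewise
// 0b00 maps to 1, 0b10 to 2 and 0b11 to 0.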
27137
27138 MachineFunction &MF = DAG.getMachineFunction();
27139 MVT VT = Op.getSimpleValueType();
27140 SDLoc DL(Op);
27141
27142 // Save FP Control Word to stack slot
27143 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27144 SDValue StackSlot =
27145 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27146
27147 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27148
27149 SDValue Chain = Op.getOperand(0);
27150 SDValue Ops[] = {Chain, StackSlot};
27151 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27152 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27153 Align(2), MachineMemOperand::MOStore);
27154
27155 // Load FP Control Word from stack slot
27156 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27157 Chain = CWD.getValue(1);
27158
27159 // Mask and turn the control bits into a shift for the lookup table.
27160 SDValue Shift =
27161 DAG.getNode(ISD::SRL, DL, MVT::i16,
27162 DAG.getNode(ISD::AND, DL, MVT::i16,
27163 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27164 DAG.getConstant(9, DL, MVT::i8));
27165 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27166
27167 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27168 SDValue RetVal =
27169 DAG.getNode(ISD::AND, DL, MVT::i32,
27170 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27171 DAG.getConstant(3, DL, MVT::i32));
27172
27173 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27174
27175 return DAG.getMergeValues({RetVal, Chain}, DL);
27176}
27177
27178SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27179 SelectionDAG &DAG) const {
27180 MachineFunction &MF = DAG.getMachineFunction();
27181 SDLoc DL(Op);
27182 SDValue Chain = Op.getNode()->getOperand(0);
27183
27184 // FP control word may be set only from data in memory. So we need to allocate
27185 // stack space to save/load FP control word.
27186 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27187 SDValue StackSlot =
27188 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27189 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27190 MachineMemOperand *MMO =
27191 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27192
27193 // Store FP control word into memory.
27194 SDValue Ops[] = {Chain, StackSlot};
27195 Chain = DAG.getMemIntrinsicNode(
27196 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27197
27198 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27199 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27200 Chain = CWD.getValue(1);
27201 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27202 DAG.getConstant(0xf3ff, DL, MVT::i16));
27203
27204 // Calculate new rounding mode.
27205 SDValue NewRM = Op.getNode()->getOperand(1);
27206 SDValue RMBits;
27207 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27208 uint64_t RM = CVal->getZExtValue();
27209 int FieldVal;
27210 switch (static_cast<RoundingMode>(RM)) {
27211 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27212 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27213 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27214 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27215 default:
27216 llvm_unreachable("rounding mode is not supported by X86 hardware");
27217 }
27218 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27219 } else {
27220 // Need to convert argument into bits of control word:
27221 // 0 Round to 0 -> 11
27222 // 1 Round to nearest -> 00
27223 // 2 Round to +inf -> 10
27224 // 3 Round to -inf -> 01
27225 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
27226 // To make the conversion, put all these values into a value 0xc9 and shift
27227 // it left depending on the rounding mode:
27228 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27229 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27230 // ...
27231 // (0xc9 << (2 * NewRM + 4)) & 0xc00
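// Worked example: NewRM = 2 (round to +inf) gives a shift of 2*2 + 4 = 8,
// so (0xc9 << 8) & 0xc00 = 0xc900 & 0xc00 = 0x800, i.e. bits 11:10 = 10,
// the x87 "round up" encoding; NewRM = 3 similarly yields 0x400 (01,
// round down).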
27232 SDValue ShiftValue =
27233 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27234 DAG.getNode(ISD::ADD, DL, MVT::i32,
27235 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27236 DAG.getConstant(1, DL, MVT::i8)),
27237 DAG.getConstant(4, DL, MVT::i32)));
27238 SDValue Shifted =
27239 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27240 ShiftValue);
27241 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27242 DAG.getConstant(0xc00, DL, MVT::i16));
27243 }
27244
27245 // Update rounding mode bits and store the new FP Control Word into stack.
27246 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27247 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
27248
27249 // Load FP control word from the slot.
27250 SDValue OpsLD[] = {Chain, StackSlot};
27251 MachineMemOperand *MMOL =
27252 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27253 Chain = DAG.getMemIntrinsicNode(
27254 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27255
27256 // If the target supports SSE, set MXCSR as well. Rounding mode is encoded in
27257 // same way but in bits 14:13.
27258 if (Subtarget.hasSSE1()) {
27259 // Store MXCSR into memory.
27260 Chain = DAG.getNode(
27261 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27262 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27263 StackSlot);
27264
27265 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27266 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27267 Chain = CWD.getValue(1);
27268 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27269 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27270
27271 // Shift X87 RM bits from 11:10 to 14:13.
27272 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27273 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27274 DAG.getConstant(3, DL, MVT::i8));
27275
27276 // Update rounding mode bits and store the new FP Control Word into stack.
27277 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27278 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
27279
27280 // Load MXCSR from the slot.
27281 Chain = DAG.getNode(
27282 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27283 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27284 StackSlot);
27285 }
27286
27287 return Chain;
27288}
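
// Illustrative scalar sketch of the 0xc9 shift trick used in the non-constant
// rounding-mode path above: for the operand encoding described in the comment
// (0 = toward zero, 1 = to nearest, 2 = toward +inf, 3 = toward -inf), the
// expression (0xc9 << (2 * NewRM + 4)) & 0xc00 yields the x87 RM field in
// control-word bits 11:10. The helper below is editorial, not an LLVM API.
#include <cstdint>

constexpr uint16_t X87RMBits(unsigned NewRM) {  // NewRM in [0, 3]
  return static_cast<uint16_t>((0xc9u << (2 * NewRM + 4)) & 0xc00u);
}

static_assert(X87RMBits(0) == 0xc00, "round toward zero -> 11");
static_assert(X87RMBits(1) == 0x000, "round to nearest  -> 00");
static_assert(X87RMBits(2) == 0x800, "round toward +inf -> 10");
static_assert(X87RMBits(3) == 0x400, "round toward -inf -> 01");
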
27289
27290/// Lower a vector CTLZ using native supported vector CTLZ instruction.
27291//
27292// i8/i16 vector implemented using dword LZCNT vector instruction
27293// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27294 // split the vector, perform the operation on its Lo and Hi parts and
27295// concatenate the results.
27296static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27297 const X86Subtarget &Subtarget) {
27298 assert(Op.getOpcode() == ISD::CTLZ)((void)0);
27299 SDLoc dl(Op);
27300 MVT VT = Op.getSimpleValueType();
27301 MVT EltVT = VT.getVectorElementType();
27302 unsigned NumElems = VT.getVectorNumElements();
27303
27304 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&((void)0)
27305 "Unsupported element type")((void)0);
27306
27307 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27308 if (NumElems > 16 ||
27309 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27310 return splitVectorIntUnary(Op, DAG);
27311
27312 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27313 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&((void)0)
27314 "Unsupported value type for operation")((void)0);
27315
27316 // Use native supported vector instruction vplzcntd.
27317 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27318 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27319 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27320 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27321
27322 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27323}
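
// Illustrative per-element sketch of the expansion above: zero-extend the
// element to 32 bits, take the dword leading-zero count, then subtract the
// 32 - EltBits zeros contributed by the extension. Shown for i16; the helper
// is editorial and assumes a C++20 toolchain for std::countl_zero.
#include <bit>
#include <cstdint>

constexpr unsigned Ctlz16ViaLzcnt32(uint16_t X) {
  return std::countl_zero(static_cast<uint32_t>(X)) - (32 - 16);
}

static_assert(Ctlz16ViaLzcnt32(0x0000) == 16, "all bits clear");
static_assert(Ctlz16ViaLzcnt32(0x0001) == 15, "only bit 0 set");
static_assert(Ctlz16ViaLzcnt32(0x8000) == 0, "top bit set");
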
27324
27325// Lower CTLZ using a PSHUFB lookup table implementation.
27326static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27327 const X86Subtarget &Subtarget,
27328 SelectionDAG &DAG) {
27329 MVT VT = Op.getSimpleValueType();
27330 int NumElts = VT.getVectorNumElements();
27331 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27332 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27333
27334 // Per-nibble leading zero PSHUFB lookup table.
27335 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27336 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27337 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27338 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27339
27340 SmallVector<SDValue, 64> LUTVec;
27341 for (int i = 0; i < NumBytes; ++i)
27342 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27343 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27344
27345 // Begin by bitcasting the input to a byte vector, then split those bytes
27346 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27347 // If the hi input nibble is zero then we add both results together, otherwise
27348 // we just take the hi result (by masking the lo result to zero before the
27349 // add). A scalar model of this per-nibble lookup follows the function.
27350 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27351 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27352
27353 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27354 SDValue Lo = Op0;
27355 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27356 SDValue HiZ;
27357 if (CurrVT.is512BitVector()) {
27358 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27359 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27360 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27361 } else {
27362 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27363 }
27364
27365 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27366 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27367 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27368 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27369
27370 // Merge the result from vXi8 back to VT, working on the lo/hi halves
27371 // of the current vector width in the same way we did for the nibbles.
27372 // If the upper half of the input element is zero then add the halves'
27373 // leading zero counts together, otherwise just use the upper half's.
27374 // Double the width of the result until we are at target width.
27375 while (CurrVT != VT) {
27376 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27377 int CurrNumElts = CurrVT.getVectorNumElements();
27378 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27379 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27380 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27381
27382 // Check if the upper half of the input element is zero.
27383 if (CurrVT.is512BitVector()) {
27384 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27385 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27386 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27387 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27388 } else {
27389 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27390 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27391 }
27392 HiZ = DAG.getBitcast(NextVT, HiZ);
27393
27394 // Move the upper/lower halves to the lower bits as we'll be extending to
27395 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27396 // together.
27397 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27398 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27399 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27400 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27401 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27402 CurrVT = NextVT;
27403 }
27404
27405 return Res;
27406}
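
// Illustrative byte-at-a-time model of the per-nibble PSHUFB lookup above:
// take the leading-zero count of the high nibble, and only add the low
// nibble's count when the high nibble is zero. Editorial helper, not an
// LLVM API.
#include <cstdint>

constexpr uint8_t NibbleLZ[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                  0, 0, 0, 0, 0, 0, 0, 0};

constexpr unsigned Ctlz8ViaNibbleLUT(uint8_t B) {
  unsigned Hi = B >> 4, Lo = B & 0xf;
  return NibbleLZ[Hi] + (Hi == 0 ? NibbleLZ[Lo] : 0);
}

static_assert(Ctlz8ViaNibbleLUT(0x00) == 8, "zero input");
static_assert(Ctlz8ViaNibbleLUT(0x01) == 7, "bit 0 set");
static_assert(Ctlz8ViaNibbleLUT(0x10) == 3, "bit 4 set");
static_assert(Ctlz8ViaNibbleLUT(0xff) == 0, "top bit set");
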
27407
27408static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27409 const X86Subtarget &Subtarget,
27410 SelectionDAG &DAG) {
27411 MVT VT = Op.getSimpleValueType();
27412
27413 if (Subtarget.hasCDI() &&
27414 // vXi8 vectors need to be promoted to 512-bits for vXi32.
27415 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27416 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27417
27418 // Decompose 256-bit ops into smaller 128-bit ops.
27419 if (VT.is256BitVector() && !Subtarget.hasInt256())
27420 return splitVectorIntUnary(Op, DAG);
27421
27422 // Decompose 512-bit ops into smaller 256-bit ops.
27423 if (VT.is512BitVector() && !Subtarget.hasBWI())
27424 return splitVectorIntUnary(Op, DAG);
27425
27426 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB")((void)0);
27427 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27428}
27429
27430static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27431 SelectionDAG &DAG) {
27432 MVT VT = Op.getSimpleValueType();
27433 MVT OpVT = VT;
27434 unsigned NumBits = VT.getSizeInBits();
27435 SDLoc dl(Op);
27436 unsigned Opc = Op.getOpcode();
27437
27438 if (VT.isVector())
27439 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27440
27441 Op = Op.getOperand(0);
27442 if (VT == MVT::i8) {
27443 // Zero extend to i32 since there is no i8 bsr.
27444 OpVT = MVT::i32;
27445 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27446 }
27447
27448 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27449 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27450 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27451
27452 if (Opc == ISD::CTLZ) {
27453 // If src is zero (i.e. bsr sets ZF), returns NumBits.
27454 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27455 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27456 Op.getValue(1)};
27457 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
27458 }
27459
27460 // Finally xor with NumBits-1.
27461 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27462 DAG.getConstant(NumBits - 1, dl, OpVT));
27463
27464 if (VT == MVT::i8)
27465 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27466 return Op;
27467}
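
// Illustrative scalar model of the BSR-based lowering above, for i32: BSR
// yields the index of the highest set bit, the CMOV substitutes
// 2*NumBits - 1 for a zero input, and the final XOR with NumBits - 1 turns
// the bit index into a leading-zero count. Editorial helpers, not LLVM APIs.
#include <cstdint>

constexpr unsigned BsrIndex(uint32_t X) {  // stand-in for the BSR result, X != 0
  unsigned Idx = 0;
  while (X >>= 1)
    ++Idx;
  return Idx;
}

constexpr unsigned Ctlz32ViaBsr(uint32_t X) {
  unsigned R = (X == 0) ? (32 + 32 - 1) : BsrIndex(X);  // the CMOV step
  return R ^ (32 - 1);                                  // the final XOR
}

static_assert(Ctlz32ViaBsr(0) == 32, "zero input maps to NumBits");
static_assert(Ctlz32ViaBsr(1) == 31, "");
static_assert(Ctlz32ViaBsr(0x80000000u) == 0, "");
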
27468
27469static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27470 SelectionDAG &DAG) {
27471 MVT VT = Op.getSimpleValueType();
27472 unsigned NumBits = VT.getScalarSizeInBits();
27473 SDValue N0 = Op.getOperand(0);
27474 SDLoc dl(Op);
27475
27476 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&((void)0)
27477 "Only scalar CTTZ requires custom lowering")((void)0);
27478
27479 // Issue a bsf (scan bits forward) which also sets EFLAGS.
27480 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27481 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27482
27483 // If src is zero (i.e. bsf sets ZF), returns NumBits.
27484 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27485 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27486 Op.getValue(1)};
27487 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27488}
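
// Illustrative scalar model of the BSF-based lowering above: BSF yields the
// index of the lowest set bit, and the CMOV on ZF substitutes NumBits for a
// zero input, matching ISD::CTTZ semantics. Editorial helper, not an LLVM API.
#include <cstdint>

constexpr unsigned Cttz32ViaBsf(uint32_t X) {
  if (X == 0)        // the CMOV-on-ZF path
    return 32;
  unsigned Idx = 0;  // stand-in for the BSF result
  while ((X & 1) == 0) {
    X >>= 1;
    ++Idx;
  }
  return Idx;
}

static_assert(Cttz32ViaBsf(0) == 32, "zero input maps to NumBits");
static_assert(Cttz32ViaBsf(8) == 3, "");
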
27489
27490static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27491 const X86Subtarget &Subtarget) {
27492 MVT VT = Op.getSimpleValueType();
27493 if (VT == MVT::i16 || VT == MVT::i32)
27494 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
27495
27496 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27497 return splitVectorIntBinary(Op, DAG);
27498
27499 assert(Op.getSimpleValueType().is256BitVector() &&((void)0)
27500 Op.getSimpleValueType().isInteger() &&((void)0)
27501 "Only handle AVX 256-bit vector integer operation")((void)0);
27502 return splitVectorIntBinary(Op, DAG);
27503}
27504
27505static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
27506 const X86Subtarget &Subtarget) {
27507 MVT VT = Op.getSimpleValueType();
27508 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
27509 unsigned Opcode = Op.getOpcode();
27510 SDLoc DL(Op);
27511
27512 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
27513 (VT.is256BitVector() && !Subtarget.hasInt256())) {
27514 assert(Op.getSimpleValueType().isInteger() &&((void)0)
27515 "Only handle AVX vector integer operation")((void)0);
27516 return splitVectorIntBinary(Op, DAG);
27517 }
27518
27519 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
27520 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27521 EVT SetCCResultType =
27522 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27523
27524 if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
27525 // usubsat X, Y --> (X >u Y) ? X - Y : 0
27526 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
27527 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
27528 // TODO: Move this to DAGCombiner?
27529 if (SetCCResultType == VT &&
27530 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
27531 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
27532 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
27533 }
27534
27535 // Use default expansion.
27536 return SDValue();
27537}
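
// Illustrative scalar form of the USUBSAT expansion chosen above,
// "(X >u Y) ? X - Y : 0", for a single unsigned 16-bit lane. In the vector
// code the compare produces an all-ones/all-zeros lane mask, so the select
// can degrade to a plain AND of the mask with X - Y. Editorial helper.
#include <cstdint>

constexpr uint16_t USubSat16(uint16_t X, uint16_t Y) {
  return X > Y ? static_cast<uint16_t>(X - Y) : uint16_t{0};
}

static_assert(USubSat16(5, 3) == 2, "no saturation");
static_assert(USubSat16(3, 5) == 0, "saturates to zero");
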
27538
27539static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
27540 SelectionDAG &DAG) {
27541 MVT VT = Op.getSimpleValueType();
27542 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
27543 // Since X86 does not have CMOV for 8-bit integer, we don't convert
27544 // 8-bit integer abs to NEG and CMOV.
27545 SDLoc DL(Op);
27546 SDValue N0 = Op.getOperand(0);
27547 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
27548 DAG.getConstant(0, DL, VT), N0);
27549 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
27550 SDValue(Neg.getNode(), 1)};
27551 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
27552 }
27553
27554 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
27555 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
27556 SDLoc DL(Op);
27557 SDValue Src = Op.getOperand(0);
27558 SDValue Sub =
27559 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
27560 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
27561 }
27562
27563 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
27564 assert(VT.isInteger() &&((void)0)
27565 "Only handle AVX 256-bit vector integer operation")((void)0);
27566 return splitVectorIntUnary(Op, DAG);
27567 }
27568
27569 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27570 return splitVectorIntUnary(Op, DAG);
27571
27572 // Default to expand.
27573 return SDValue();
27574}
27575
27576static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
27577 MVT VT = Op.getSimpleValueType();
27578
27579 // For AVX1 cases, split to use legal ops (everything but v4i64).
27580 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
27581 return splitVectorIntBinary(Op, DAG);
27582
27583 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27584 return splitVectorIntBinary(Op, DAG);
27585
27586 // Default to expand.
27587 return SDValue();
27588}
27589
27590static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
27591 SelectionDAG &DAG) {
27592 SDLoc dl(Op);
27593 MVT VT = Op.getSimpleValueType();
27594
27595 // Decompose 256-bit ops into 128-bit ops.
27596 if (VT.is256BitVector() && !Subtarget.hasInt256())
27597 return splitVectorIntBinary(Op, DAG);
27598
27599 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27600 return splitVectorIntBinary(Op, DAG);
27601
27602 SDValue A = Op.getOperand(0);
27603 SDValue B = Op.getOperand(1);
27604
27605 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
27606 // vector pairs, multiply and truncate.
27607 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
27608 unsigned NumElts = VT.getVectorNumElements();
27609
27610 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27611 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27612 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
27613 return DAG.getNode(
27614 ISD::TRUNCATE, dl, VT,
27615 DAG.getNode(ISD::MUL, dl, ExVT,
27616 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
27617 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
27618 }
27619
27620 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27621
27622 // Extract the lo/hi parts and any-extend them to i16.
27623 // We're going to mask off the low byte of each result element of the
27624 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
27625 // element.
27626 SDValue Undef = DAG.getUNDEF(VT);
27627 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
27628 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
27629
27630 SDValue BLo, BHi;
27631 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27632 // If the RHS is a constant, manually unpackl/unpackh.
27633 SmallVector<SDValue, 16> LoOps, HiOps;
27634 for (unsigned i = 0; i != NumElts; i += 16) {
27635 for (unsigned j = 0; j != 8; ++j) {
27636 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
27637 MVT::i16));
27638 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
27639 MVT::i16));
27640 }
27641 }
27642
27643 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27644 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27645 } else {
27646 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
27647 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
27648 }
27649
27650 // Multiply, mask the lower 8bits of the lo/hi results and pack.
27651 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
27652 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
27653 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
27654 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
27655 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27656 }
27657
27658 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
27659 if (VT == MVT::v4i32) {
27660 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&((void)0)
27661 "Should not custom lower when pmulld is available!")((void)0);
27662
27663 // Extract the odd parts.
27664 static const int UnpackMask[] = { 1, -1, 3, -1 };
27665 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
27666 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
27667
27668 // Multiply the even parts.
27669 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27670 DAG.getBitcast(MVT::v2i64, A),
27671 DAG.getBitcast(MVT::v2i64, B));
27672 // Now multiply odd parts.
27673 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
27674 DAG.getBitcast(MVT::v2i64, Aodds),
27675 DAG.getBitcast(MVT::v2i64, Bodds));
27676
27677 Evens = DAG.getBitcast(VT, Evens);
27678 Odds = DAG.getBitcast(VT, Odds);
27679
27680 // Merge the two vectors back together with a shuffle. This expands into 2
27681 // shuffles.
27682 static const int ShufMask[] = { 0, 4, 2, 6 };
27683 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
27684 }
27685
27686 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&((void)0)
27687 "Only know how to lower V2I64/V4I64/V8I64 multiply")((void)0);
27688 assert(!Subtarget.hasDQI() && "DQI should use MULLQ")((void)0);
27689
27690 // Ahi = psrlqi(a, 32);
27691 // Bhi = psrlqi(b, 32);
27692 //
27693 // AloBlo = pmuludq(a, b);
27694 // AloBhi = pmuludq(a, Bhi);
27695 // AhiBlo = pmuludq(Ahi, b);
27696 //
27697 // Hi = psllqi(AloBhi + AhiBlo, 32);
27698 // return AloBlo + Hi;
27699 KnownBits AKnown = DAG.computeKnownBits(A);
27700 KnownBits BKnown = DAG.computeKnownBits(B);
27701
27702 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
27703 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
27704 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
27705
27706 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
27707 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
27708 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
27709
27710 SDValue Zero = DAG.getConstant(0, dl, VT);
27711
27712 // Only multiply lo/hi halves that aren't known to be zero.
27713 SDValue AloBlo = Zero;
27714 if (!ALoIsZero && !BLoIsZero)
27715 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
27716
27717 SDValue AloBhi = Zero;
27718 if (!ALoIsZero && !BHiIsZero) {
27719 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
27720 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
27721 }
27722
27723 SDValue AhiBlo = Zero;
27724 if (!AHiIsZero && !BLoIsZero) {
27725 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
27726 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
27727 }
27728
27729 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
27730 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
27731
27732 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
27733}
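
// Illustrative scalar form of the 64-bit multiply decomposition above: with
// A = AHi*2^32 + ALo and B likewise, the low 64 bits of A*B are
// AloBlo + ((AloBhi + AhiBlo) << 32), i.e. three 32x32->64 multiplies, which
// is what the PMULUDQ sequence computes per lane. Editorial helper.
#include <cstdint>

constexpr uint64_t Mul64ViaPmuludq(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;
  uint64_t AloBhi = ALo * BHi;
  uint64_t AhiBlo = AHi * BLo;
  return AloBlo + ((AloBhi + AhiBlo) << 32);  // AHi*BHi vanishes mod 2^64
}

static_assert(Mul64ViaPmuludq(0x123456789abcdef0ull, 0x0fedcba987654321ull) ==
                  0x123456789abcdef0ull * 0x0fedcba987654321ull,
              "matches a plain 64-bit multiply modulo 2^64");
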
27734
27735static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
27736 MVT VT, bool IsSigned,
27737 const X86Subtarget &Subtarget,
27738 SelectionDAG &DAG,
27739 SDValue *Low = nullptr) {
27740 unsigned NumElts = VT.getVectorNumElements();
27741
27742 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
27743 // to a vXi16 type. Do the multiplies, shift the results and pack the half
27744 // lane results back together.
27745
27746 // We'll take different approaches for signed and unsigned.
27747 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
27748 // and use pmullw to calculate the full 16-bit product.
27749 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
27750 // shift them left into the upper byte of each word. This allows us to use
27751 // pmulhw to calculate the full 16-bit product. This trick means we don't
27752 // need to sign extend the bytes to use pmullw.
27753
27754 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27755 SDValue Zero = DAG.getConstant(0, dl, VT);
27756
27757 SDValue ALo, AHi;
27758 if (IsSigned) {
27759 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
27760 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
27761 } else {
27762 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
27763 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
27764 }
27765
27766 SDValue BLo, BHi;
27767 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
27768 // If the RHS is a constant, manually unpackl/unpackh and extend.
27769 SmallVector<SDValue, 16> LoOps, HiOps;
27770 for (unsigned i = 0; i != NumElts; i += 16) {
27771 for (unsigned j = 0; j != 8; ++j) {
27772 SDValue LoOp = B.getOperand(i + j);
27773 SDValue HiOp = B.getOperand(i + j + 8);
27774
27775 if (IsSigned) {
27776 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
27777 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
27778 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
27779 DAG.getConstant(8, dl, MVT::i16));
27780 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
27781 DAG.getConstant(8, dl, MVT::i16));
27782 } else {
27783 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
27784 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
27785 }
27786
27787 LoOps.push_back(LoOp);
27788 HiOps.push_back(HiOp);
27789 }
27790 }
27791
27792 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
27793 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
27794 } else if (IsSigned) {
27795 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
27796 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
27797 } else {
27798 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
27799 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
27800 }
27801
27802 // Multiply, shift the upper 8 bits of the lo/hi results down into the lower
27803 // 8 bits, and pack back to vXi8.
27804 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
27805 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
27806 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
27807
27808 if (Low) {
27809 // Mask the lower bits and pack the results to rejoin the halves.
27810 SDValue Mask = DAG.getConstant(255, dl, ExVT);
27811 SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
27812 SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
27813 *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
27814 }
27815
27816 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
27817 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
27818
27819 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
27820 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27821}
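
// Illustrative scalar form of the signed trick above: placing each byte in
// the high half of a 16-bit word (i.e. multiplying by 256) lets PMULHW's
// "high 16 bits of the 32-bit product" return the full 16-bit product of the
// two bytes, so no explicit sign extension is needed. Editorial helper.
#include <cstdint>

constexpr int16_t MulI8ViaPmulhw(int8_t A, int8_t B) {
  const int32_t HiA = static_cast<int16_t>(A) * 256;  // byte in bits 15:8
  const int32_t HiB = static_cast<int16_t>(B) * 256;
  // The product is an exact multiple of 2^16, so dividing by 2^16 is the same
  // as taking PMULHW's high 16 bits.
  return static_cast<int16_t>((HiA * HiB) / 65536);
}

static_assert(MulI8ViaPmulhw(-7, 9) == -63, "");
static_assert(MulI8ViaPmulhw(-128, -128) == 16384, "");
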
27822
27823static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
27824 SelectionDAG &DAG) {
27825 SDLoc dl(Op);
27826 MVT VT = Op.getSimpleValueType();
27827 bool IsSigned = Op->getOpcode() == ISD::MULHS;
27828 unsigned NumElts = VT.getVectorNumElements();
27829 SDValue A = Op.getOperand(0);
27830 SDValue B = Op.getOperand(1);
27831
27832 // Decompose 256-bit ops into 128-bit ops.
27833 if (VT.is256BitVector() && !Subtarget.hasInt256())
27834 return splitVectorIntBinary(Op, DAG);
27835
27836 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27837 return splitVectorIntBinary(Op, DAG);
27838
27839 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
27840 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||((void)0)
27841 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||((void)0)
27842 (VT == MVT::v16i32 && Subtarget.hasAVX512()))((void)0);
27843
27844 // PMULxD operations multiply each even value (starting at 0) of LHS with
27845 // the related value of RHS and produce a widened result.
27846 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27847 // => <2 x i64> <ae|cg>
27848 //
27849 // In other words, to have all the results, we need to perform two PMULxD:
27850 // 1. one with the even values.
27851 // 2. one with the odd values.
27852 // To achieve #2, we need to place the odd values at an even position.
27853 //
27854 // Place the odd value at an even position (basically, shift all values 1
27855 // step to the left):
27856 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
27857 9, -1, 11, -1, 13, -1, 15, -1};
27858 // <a|b|c|d> => <b|undef|d|undef>
27859 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
27860 makeArrayRef(&Mask[0], NumElts));
27861 // <e|f|g|h> => <f|undef|h|undef>
27862 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
27863 makeArrayRef(&Mask[0], NumElts));
27864
27865 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
27866 // ints.
27867 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
27868 unsigned Opcode =
27869 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
27870 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
27871 // => <2 x i64> <ae|cg>
27872 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27873 DAG.getBitcast(MulVT, A),
27874 DAG.getBitcast(MulVT, B)));
27875 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
27876 // => <2 x i64> <bf|dh>
27877 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
27878 DAG.getBitcast(MulVT, Odd0),
27879 DAG.getBitcast(MulVT, Odd1)));
27880
27881 // Shuffle it back into the right order.
27882 SmallVector<int, 16> ShufMask(NumElts);
27883 for (int i = 0; i != (int)NumElts; ++i)
27884 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
27885
27886 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
27887
27888 // If we have a signed multiply but no PMULDQ, fix up the result of an
27889 // unsigned multiply.
27890 if (IsSigned && !Subtarget.hasSSE41()) {
27891 SDValue Zero = DAG.getConstant(0, dl, VT);
27892 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
27893 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
27894 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
27895 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
27896
27897 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
27898 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
27899 }
27900
27901 return Res;
27902 }
27903
27904 // Only i8 vectors should need custom lowering after this.
27905 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||((void)0)
27906 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&((void)0)
27907 "Unsupported vector type")((void)0);
27908
27909 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
27910 // logical shift down the upper half and pack back to i8.
27911
27912 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
27913 // and then ashr/lshr the upper bits down to the lower bits before multiply.
27914
27915 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27916 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27917 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27918 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27919 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27920 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27921 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27922 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
27923 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27924 }
27925
27926 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
27927}
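
// Illustrative scalar form of the vXi8 MULHU/MULHS path above: widen to
// 16 bits, multiply, and keep bits 15:8 of the product. The signed variant
// assumes an arithmetic right shift for negative values (true on x86
// compilers and guaranteed from C++20). Editorial helpers, not LLVM APIs.
#include <cstdint>

constexpr uint8_t MulHU8(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((static_cast<uint16_t>(A) * B) >> 8);
}

constexpr int8_t MulHS8(int8_t A, int8_t B) {
  return static_cast<int8_t>((static_cast<int16_t>(A) * B) >> 8);
}

static_assert(MulHU8(200, 200) == 156, "high byte of 40000");
static_assert(MulHS8(-100, 100) == -40, "high byte of -10000");
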
27928
27929// Custom lowering for SMULO/UMULO.
27930static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
27931 SelectionDAG &DAG) {
27932 MVT VT = Op.getSimpleValueType();
27933
27934 // Scalars defer to LowerXALUO.
27935 if (!VT.isVector())
27936 return LowerXALUO(Op, DAG);
27937
27938 SDLoc dl(Op);
27939 bool IsSigned = Op->getOpcode() == ISD::SMULO;
27940 SDValue A = Op.getOperand(0);
27941 SDValue B = Op.getOperand(1);
27942 EVT OvfVT = Op->getValueType(1);
27943
27944 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
27945 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
27946 // Extract the LHS Lo/Hi vectors
27947 SDValue LHSLo, LHSHi;
27948 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
27949
27950 // Extract the RHS Lo/Hi vectors
27951 SDValue RHSLo, RHSHi;
27952 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
27953
27954 EVT LoOvfVT, HiOvfVT;
27955 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
27956 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
27957 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
27958
27959 // Issue the split operations.
27960 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
27961 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
27962
27963 // Join the separate data results and the overflow results.
27964 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
27965 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
27966 Hi.getValue(1));
27967
27968 return DAG.getMergeValues({Res, Ovf}, dl);
27969 }
27970
27971 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27972 EVT SetccVT =
27973 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27974
27975 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
27976 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
27977 unsigned NumElts = VT.getVectorNumElements();
27978 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27979 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27980 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
27981 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
27982 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
27983
27984 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
27985
27986 SDValue Ovf;
27987 if (IsSigned) {
27988 SDValue High, LowSign;
27989 if (OvfVT.getVectorElementType() == MVT::i1 &&
27990 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
27991 // Rather than truncating, try to do the compare on vXi16 or vXi32.
27992 // Shift the high down filling with sign bits.
27993 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
27994 // Fill all 16 bits with the sign bit from the low.
27995 LowSign =
27996 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
27997 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
27998 15, DAG);
27999 SetccVT = OvfVT;
28000 if (!Subtarget.hasBWI()) {
28001 // We can't do a vXi16 compare so sign extend to v16i32.
28002 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28003 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28004 }
28005 } else {
28006 // Otherwise do the compare at vXi8.
28007 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28008 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28009 LowSign =
28010 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28011 }
28012
28013 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28014 } else {
28015 SDValue High =
28016 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28017 if (OvfVT.getVectorElementType() == MVT::i1 &&
28018 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28019 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28020 SetccVT = OvfVT;
28021 if (!Subtarget.hasBWI()) {
28022 // We can't do a vXi16 compare so sign extend to v16i32.
28023 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28024 }
28025 } else {
28026 // Otherwise do the compare at vXi8.
28027 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28028 }
28029
28030 Ovf =
28031 DAG.getSetCC(dl, SetccVT, High,
28032 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28033 }
28034
28035 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28036
28037 return DAG.getMergeValues({Low, Ovf}, dl);
28038 }
28039
28040 SDValue Low;
28041 SDValue High =
28042 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28043
28044 SDValue Ovf;
28045 if (IsSigned) {
28046 // SMULO overflows if the high bits don't match the sign of the low.
28047 SDValue LowSign =
28048 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28049 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28050 } else {
28051 // UMULO overflows if the high bits are non-zero.
28052 Ovf =
28053 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28054 }
28055
28056 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28057
28058 return DAG.getMergeValues({Low, Ovf}, dl);
28059}
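
// Illustrative scalar form of the overflow tests used above for vXi8
// SMULO/UMULO: the unsigned multiply overflows iff the high byte of the
// widened product is non-zero; the signed multiply overflows iff the high
// byte differs from the sign-fill of the low byte. The arithmetic right
// shift of a negative value assumes two's complement (guaranteed from
// C++20). Editorial helpers, not LLVM APIs.
#include <cstdint>

constexpr bool UMulO8(uint8_t A, uint8_t B) {
  const int P = static_cast<int>(A) * B;
  return (P >> 8) != 0;  // any bits above the low byte => overflow
}

constexpr bool SMulO8(int8_t A, int8_t B) {
  const int P = static_cast<int>(A) * B;
  const int High = P >> 8;                  // high byte, sign-filled
  const int LowSign = (P & 0x80) ? -1 : 0;  // sign-fill of the low byte
  return High != LowSign;
}

static_assert(!UMulO8(15, 17) && UMulO8(16, 16), "255 fits, 256 does not");
static_assert(!SMulO8(-8, 16) && SMulO8(-9, 16), "-128 fits, -144 does not");
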
28060
28061SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28062 assert(Subtarget.isTargetWin64() && "Unexpected target")((void)0);
28063 EVT VT = Op.getValueType();
28064 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&((void)0)
28065 "Unexpected return type for lowering")((void)0);
28066
28067 RTLIB::Libcall LC;
28068 bool isSigned;
28069 switch (Op->getOpcode()) {
28070 default: llvm_unreachable("Unexpected request for libcall!")__builtin_unreachable();
28071 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28072 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28073 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28074 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28075 }
28076
28077 SDLoc dl(Op);
28078 SDValue InChain = DAG.getEntryNode();
28079
28080 TargetLowering::ArgListTy Args;
28081 TargetLowering::ArgListEntry Entry;
28082 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28083 EVT ArgVT = Op->getOperand(i).getValueType();
28084 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&((void)0)
28085 "Unexpected argument type for lowering")((void)0);
28086 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28087 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28088 MachinePointerInfo MPI =
28089 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28090 Entry.Node = StackPtr;
28091 InChain =
28092 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28093 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28094 Entry.Ty = PointerType::get(ArgTy,0);
28095 Entry.IsSExt = false;
28096 Entry.IsZExt = false;
28097 Args.push_back(Entry);
28098 }
28099
28100 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28101 getPointerTy(DAG.getDataLayout()));
28102
28103 TargetLowering::CallLoweringInfo CLI(DAG);
28104 CLI.setDebugLoc(dl)
28105 .setChain(InChain)
28106 .setLibCallee(
28107 getLibcallCallingConv(LC),
28108 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28109 std::move(Args))
28110 .setInRegister()
28111 .setSExtResult(isSigned)
28112 .setZExtResult(!isSigned);
28113
28114 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28115 return DAG.getBitcast(VT, CallInfo.first);
28116}
28117
28118// Return true if the required (according to Opcode) shift-imm form is natively
28119// supported by the Subtarget
28120static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
28121 unsigned Opcode) {
28122 if (VT.getScalarSizeInBits() < 16)
28123 return false;
28124
28125 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
28126 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
28127 return true;
28128
28129 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
28130 (VT.is256BitVector() && Subtarget.hasInt256());
28131
28132 bool AShift = LShift && (Subtarget.hasAVX512() ||
28133 (VT != MVT::v2i64 && VT != MVT::v4i64));
28134 return (Opcode == ISD::SRA) ? AShift : LShift;
28135}
28136
28137// The shift amount is a variable, but it is the same for all vector lanes.
28138// These instructions are defined together with shift-immediate.
28139static
28140bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
28141 unsigned Opcode) {
28142 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
28143}
28144
28145// Return true if the required (according to Opcode) variable-shift form is
28146// natively supported by the Subtarget
28147static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
28148 unsigned Opcode) {
28149
28150 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
28151 return false;
28152
28153 // vXi16 supported only on AVX-512, BWI
28154 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
28155 return false;
28156
28157 if (Subtarget.hasAVX512())
28158 return true;
28159
28160 bool LShift = VT.is128BitVector() || VT.is256BitVector();
28161 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
28162 return (Opcode == ISD::SRA) ? AShift : LShift;
28163}
28164
28165static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
28166 const X86Subtarget &Subtarget) {
28167 MVT VT = Op.getSimpleValueType();
28168 SDLoc dl(Op);
28169 SDValue R = Op.getOperand(0);
28170 SDValue Amt = Op.getOperand(1);
28171 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28172
28173 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28174 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type")((void)0);
28175 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28176 SDValue Ex = DAG.getBitcast(ExVT, R);
28177
28178 // ashr(R, 63) === cmp_slt(R, 0)
28179 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28180 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&((void)0)
28181 "Unsupported PCMPGT op")((void)0);
28182 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28183 }
28184
28185 if (ShiftAmt >= 32) {
28186 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28187 SDValue Upper =
28188 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28189 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28190 ShiftAmt - 32, DAG);
28191 if (VT == MVT::v2i64)
28192 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28193 if (VT == MVT::v4i64)
28194 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28195 {9, 1, 11, 3, 13, 5, 15, 7});
28196 } else {
28197 // SRA upper i32, SRL whole i64 and select lower i32.
28198 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28199 ShiftAmt, DAG);
28200 SDValue Lower =
28201 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28202 Lower = DAG.getBitcast(ExVT, Lower);
28203 if (VT == MVT::v2i64)
28204 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28205 if (VT == MVT::v4i64)
28206 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28207 {8, 1, 10, 3, 12, 5, 14, 7});
28208 }
28209 return DAG.getBitcast(VT, Ex);
28210 };
28211
28212 // Optimize shl/srl/sra with constant shift amount.
28213 APInt APIntShiftAmt;
28214 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28215 return SDValue();
28216
28217 // If the shift amount is out of range, return undef.
28218 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28219 return DAG.getUNDEF(VT);
28220
28221 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28222
28223 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
28224 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28225
28226 // i64 SRA needs to be performed as partial shifts.
28227 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28228 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28229 Op.getOpcode() == ISD::SRA)
28230 return ArithmeticShiftRight64(ShiftAmt);
28231
28232 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28233 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28234 unsigned NumElts = VT.getVectorNumElements();
28235 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28236
28237 // Simple i8 add case
28238 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
28239 return DAG.getNode(ISD::ADD, dl, VT, R, R);
28240
28241 // ashr(R, 7) === cmp_slt(R, 0)
28242 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28243 SDValue Zeros = DAG.getConstant(0, dl, VT);
28244 if (VT.is512BitVector()) {
28245 assert(VT == MVT::v64i8 && "Unexpected element type!")((void)0);
28246 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28247 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28248 }
28249 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28250 }
28251
28252 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28253 if (VT == MVT::v16i8 && Subtarget.hasXOP())
28254 return SDValue();
28255
28256 if (Op.getOpcode() == ISD::SHL) {
28257 // Make a large shift.
28258 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28259 ShiftAmt, DAG);
28260 SHL = DAG.getBitcast(VT, SHL);
28261 // Zero out the rightmost bits.
28262 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28263 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28264 }
28265 if (Op.getOpcode() == ISD::SRL) {
28266 // Make a large shift.
28267 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28268 ShiftAmt, DAG);
28269 SRL = DAG.getBitcast(VT, SRL);
28270 // Zero out the leftmost bits.
28271 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28272 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28273 }
28274 if (Op.getOpcode() == ISD::SRA) {
28275 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
28276 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28277
28278 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28279 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28280 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28281 return Res;
28282 }
28283 llvm_unreachable("Unknown shift opcode.")__builtin_unreachable();
28284 }
28285
28286 return SDValue();
28287}
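
// Illustrative scalar form of the vXi8 arithmetic-shift emulation above:
// with M = 128 >> Amt, ashr(X, Amt) == sub(xor(lshr(X, Amt), M), M), because
// the XOR/SUB pair re-creates the sign extension that the logical shift
// dropped. X is passed as the raw byte pattern. Editorial helper.
#include <cstdint>

constexpr int AShr8ViaLshr(uint8_t X, unsigned Amt) {  // Amt in [0, 7]
  const unsigned U = X >> Amt;     // lshr(X, Amt)
  const unsigned M = 128u >> Amt;  // the sign-bit mask, shifted
  return static_cast<int>(U ^ M) - static_cast<int>(M);
}

static_assert(AShr8ViaLshr(0xFF, 3) == -1, "ashr(-1, 3)");
static_assert(AShr8ViaLshr(0x80, 1) == -64, "ashr(-128, 1)");
static_assert(AShr8ViaLshr(100, 2) == 25, "ashr(100, 2)");
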
28288
28289static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
28290 const X86Subtarget &Subtarget) {
28291 MVT VT = Op.getSimpleValueType();
28292 SDLoc dl(Op);
28293 SDValue R = Op.getOperand(0);
28294 SDValue Amt = Op.getOperand(1);
28295 unsigned Opcode = Op.getOpcode();
28296 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28297 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
28298
28299 if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
28300 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
28301 MVT EltVT = VT.getVectorElementType();
28302 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!")((void)0);
28303 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
28304 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
28305 else if (EltVT.bitsLT(MVT::i32))
28306 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28307
28308 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
28309 }
28310
28311 // vXi8 shifts - shift as v8i16 + mask result.
28312 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28313 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28314 VT == MVT::v64i8) &&
28315 !Subtarget.hasXOP()) {
28316 unsigned NumElts = VT.getVectorNumElements();
28317 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28318 if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28319 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28320 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28321 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28322
28323 // Create the mask using vXi16 shifts. For shift-rights we need to move
28324 // the upper byte down before splatting the vXi8 mask.
28325 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28326 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28327 BaseShAmt, Subtarget, DAG);
28328 if (Opcode != ISD::SHL)
28329 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28330 8, DAG);
28331 BitMask = DAG.getBitcast(VT, BitMask);
28332 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28333 SmallVector<int, 64>(NumElts, 0));
28334
28335 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28336 DAG.getBitcast(ExtVT, R), BaseShAmt,
28337 Subtarget, DAG);
28338 Res = DAG.getBitcast(VT, Res);
28339 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28340
28341 if (Opcode == ISD::SRA) {
28342 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28343 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
28344 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28345 SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
28346 BaseShAmt, Subtarget, DAG);
28347 SignMask = DAG.getBitcast(VT, SignMask);
28348 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28349 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28350 }
28351 return Res;
28352 }
28353 }
28354 }
28355
28356 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
28357 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
28358 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
28359 Amt = Amt.getOperand(0);
28360 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
28361 std::vector<SDValue> Vals(Ratio);
28362 for (unsigned i = 0; i != Ratio; ++i)
28363 Vals[i] = Amt.getOperand(i);
28364 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
28365 for (unsigned j = 0; j != Ratio; ++j)
28366 if (Vals[j] != Amt.getOperand(i + j))
28367 return SDValue();
28368 }
28369
28370 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
28371 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
28372 }
28373 return SDValue();
28374}
28375
28376// Convert a shift/rotate left amount to a multiplication scale factor.
28377static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28378 const X86Subtarget &Subtarget,
28379 SelectionDAG &DAG) {
28380 MVT VT = Amt.getSimpleValueType();
28381 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28382 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28383 (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
28384 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
28385 return SDValue();
28386
28387 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
28388 SmallVector<SDValue, 8> Elts;
28389 MVT SVT = VT.getVectorElementType();
28390 unsigned SVTBits = SVT.getSizeInBits();
28391 APInt One(SVTBits, 1);
28392 unsigned NumElems = VT.getVectorNumElements();
28393
28394 for (unsigned i = 0; i != NumElems; ++i) {
28395 SDValue Op = Amt->getOperand(i);
28396 if (Op->isUndef()) {
28397 Elts.push_back(Op);
28398 continue;
28399 }
28400
28401 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
28402 APInt C(SVTBits, ND->getZExtValue());
28403 uint64_t ShAmt = C.getZExtValue();
28404 if (ShAmt >= SVTBits) {
28405 Elts.push_back(DAG.getUNDEF(SVT));
28406 continue;
28407 }
28408 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
28409 }
28410 return DAG.getBuildVector(VT, dl, Elts);
28411 }
28412
28413 // If the target doesn't support variable shifts, use either FP conversion
28414 // or integer multiplication to avoid shifting each element individually.
28415 if (VT == MVT::v4i32) {
28416 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
28417 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
28418 DAG.getConstant(0x3f800000U, dl, VT));
28419 Amt = DAG.getBitcast(MVT::v4f32, Amt);
28420 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
28421 }
28422
28423 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
28424 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
28425 SDValue Z = DAG.getConstant(0, dl, VT);
28426 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
28427 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
28428 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
28429 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
28430 if (Subtarget.hasSSE41())
28431 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28432
28433 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
28434 DAG.getBitcast(VT, Hi),
28435 {0, 2, 4, 6, 8, 10, 12, 14});
28436 }
28437
28438 return SDValue();
28439}
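
// Illustrative scalar form of the v4i32 trick above that turns a shift
// amount into a power-of-two multiplier without a variable shift: writing
// the amount into the float exponent field (bias 127) produces the value
// 2^Amt, which the float->int conversion then materialises. Assumes IEEE-754
// binary32 and a C++20 toolchain for std::bit_cast. Editorial helper.
#include <bit>
#include <cstdint>

constexpr uint32_t Pow2ViaFloatExponent(uint32_t Amt) {  // Amt in [0, 31]
  const uint32_t Bits = (Amt << 23) + 0x3f800000u;       // exponent = Amt + 127
  return static_cast<uint32_t>(std::bit_cast<float>(Bits));
}

static_assert(Pow2ViaFloatExponent(0) == 1u, "");
static_assert(Pow2ViaFloatExponent(5) == 32u, "x << 5 becomes x * 32");
static_assert(Pow2ViaFloatExponent(31) == 0x80000000u, "");
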
28440
28441static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
28442 SelectionDAG &DAG) {
28443 MVT VT = Op.getSimpleValueType();
28444 SDLoc dl(Op);
28445 SDValue R = Op.getOperand(0);
28446 SDValue Amt = Op.getOperand(1);
28447 unsigned EltSizeInBits = VT.getScalarSizeInBits();
28448 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28449
28450 unsigned Opc = Op.getOpcode();
28451 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
28452 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
28453
28454 assert(VT.isVector() && "Custom lowering only for vector shifts!")((void)0);
28455 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!")((void)0);
28456
28457 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
28458 return V;
28459
28460 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
28461 return V;
28462
28463 if (SupportedVectorVarShift(VT, Subtarget, Opc))
28464 return Op;
28465
28466 // XOP has 128-bit variable logical/arithmetic shifts.
28467 // +ve/-ve Amt = shift left/right.
28468 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
28469 VT == MVT::v8i16 || VT == MVT::v16i8)) {
28470 if (Opc == ISD::SRL || Opc == ISD::SRA) {
28471 SDValue Zero = DAG.getConstant(0, dl, VT);
28472 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
28473 }
28474 if (Opc == ISD::SHL || Opc == ISD::SRL)
28475 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
28476 if (Opc == ISD::SRA)
28477 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
28478 }
28479
28480 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
28481 // shifts per-lane and then shuffle the partial results back together.
28482 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
28483 // Splat the shift amounts so the scalar shifts above will catch it.
28484 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
28485 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
28486 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
28487 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
28488 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
28489 }
28490
28491 // i64 vector arithmetic shift can be emulated with the transform:
28492 // M = lshr(SIGN_MASK, Amt)
28493 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
28494 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
28495 Opc == ISD::SRA) {
28496 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
28497 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
28498 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28499 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
28500 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
28501 return R;
28502 }
28503
28504 // If possible, lower this shift as a sequence of two shifts by
28505 // constant plus a BLENDing shuffle instead of scalarizing it.
28506 // Example:
28507 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
28508 //
28509 // Could be rewritten as:
28510 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
28511 //
28512 // The advantage is that the two shifts from the example would be
28513 // lowered as X86ISD::VSRLI nodes in parallel before blending.
28514 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
28515 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28516 SDValue Amt1, Amt2;
28517 unsigned NumElts = VT.getVectorNumElements();
28518 SmallVector<int, 8> ShuffleMask;
28519 for (unsigned i = 0; i != NumElts; ++i) {
28520 SDValue A = Amt->getOperand(i);
28521 if (A.isUndef()) {
28522 ShuffleMask.push_back(SM_SentinelUndef);
28523 continue;
28524 }
28525 if (!Amt1 || Amt1 == A) {
28526 ShuffleMask.push_back(i);
28527 Amt1 = A;
28528 continue;
28529 }
28530 if (!Amt2 || Amt2 == A) {
28531 ShuffleMask.push_back(i + NumElts);
28532 Amt2 = A;
28533 continue;
28534 }
28535 break;
28536 }
28537
28538 // Only perform this blend if we can perform it without loading a mask.
28539 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
28540 (VT != MVT::v16i16 ||
28541 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
28542 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
28543 canWidenShuffleElements(ShuffleMask))) {
28544 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
28545 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
28546 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
28547 Cst2->getAPIntValue().ult(EltSizeInBits)) {
28548 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28549 Cst1->getZExtValue(), DAG);
28550 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
28551 Cst2->getZExtValue(), DAG);
28552 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
28553 }
28554 }
28555 }
28556
28557 // If possible, lower this packed shift into a vector multiply instead of
28558 // expanding it into a sequence of scalar shifts.
28559 if (Opc == ISD::SHL)
28560 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
28561 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
28562
28563 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
28564 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
28565 if (Opc == ISD::SRL && ConstantAmt &&
28566 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
28567 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28568 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28569 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28570 SDValue Zero = DAG.getConstant(0, dl, VT);
28571 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
28572 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
28573 return DAG.getSelect(dl, VT, ZAmt, R, Res);
28574 }
28575 }
28576
28577 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
28578 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
28579 // TODO: Special case handling for shift by 0/1, really we can afford either
28580 // of these cases in pre-SSE41/XOP/AVX512 but not both.
28581 if (Opc == ISD::SRA && ConstantAmt &&
28582 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
28583 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
28584 !Subtarget.hasAVX512()) ||
28585 DAG.isKnownNeverZero(Amt))) {
28586 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
28587 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
28588 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
28589 SDValue Amt0 =
28590 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
28591 SDValue Amt1 =
28592 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
28593 SDValue Sra1 =
28594 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
28595 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
28596 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
28597 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
28598 }
28599 }
28600
28601 // v4i32 Non Uniform Shifts.
28602 // If the shift amount is constant we can shift each lane using the SSE2
28603 // immediate shifts, else we need to zero-extend each lane to the lower i64
28604 // and shift using the SSE2 variable shifts.
28605 // The separate results can then be blended together.
28606 if (VT == MVT::v4i32) {
28607 SDValue Amt0, Amt1, Amt2, Amt3;
28608 if (ConstantAmt) {
28609 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
28610 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
28611 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
28612 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
28613 } else {
28614 // The SSE2 shifts use the lower i64 as the same shift amount for
28615 // all lanes and the upper i64 is ignored. On AVX we're better off
28616 // just zero-extending, but for SSE just duplicating the top 16-bits is
28617 // cheaper and has the same effect for out of range values.
28618 if (Subtarget.hasAVX()) {
28619 SDValue Z = DAG.getConstant(0, dl, VT);
28620 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
28621 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
28622 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
28623 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
28624 } else {
28625 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
28626 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28627 {4, 5, 6, 7, -1, -1, -1, -1});
28628 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28629 {0, 1, 1, 1, -1, -1, -1, -1});
28630 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
28631 {2, 3, 3, 3, -1, -1, -1, -1});
28632 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28633 {0, 1, 1, 1, -1, -1, -1, -1});
28634 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
28635 {2, 3, 3, 3, -1, -1, -1, -1});
28636 }
28637 }
28638
28639 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
28640 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
28641 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
28642 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
28643 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
28644
28645 // Merge the shifted lane results optimally with/without PBLENDW.
28646 // TODO - ideally shuffle combining would handle this.
28647 if (Subtarget.hasSSE41()) {
28648 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
28649 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
28650 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
28651 }
28652 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
28653 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
28654 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
28655 }
28656
28657 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
28658 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
28659 // make the existing SSE solution better.
28660 // NOTE: We honor the preferred vector width before promoting to 512 bits.
28661 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
28662 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
28663 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
28664 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
28665 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
28666 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
28667 "Unexpected vector type");
28668 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
28669 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
28670 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28671 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
28672 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
28673 return DAG.getNode(ISD::TRUNCATE, dl, VT,
28674 DAG.getNode(Opc, dl, ExtVT, R, Amt));
28675 }
28676
28677 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
28678 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
28679 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
28680 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28681 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28682 !Subtarget.hasXOP()) {
28683 int NumElts = VT.getVectorNumElements();
28684 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
28685
28686 // Extend constant shift amount to vXi16 (it doesn't matter if the type
28687 // isn't legal).
28688 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28689 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
28690 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
28691 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
28692 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
28693 "Constant build vector expected");
28694
28695 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
28696 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
28697 : DAG.getZExtOrTrunc(R, dl, ExVT);
28698 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
28699 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
28700 return DAG.getZExtOrTrunc(R, dl, VT);
28701 }
28702
28703 SmallVector<SDValue, 16> LoAmt, HiAmt;
28704 for (int i = 0; i != NumElts; i += 16) {
28705 for (int j = 0; j != 8; ++j) {
28706 LoAmt.push_back(Amt.getOperand(i + j));
28707 HiAmt.push_back(Amt.getOperand(i + j + 8));
28708 }
28709 }
28710
28711 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
28712 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
28713 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
28714
28715 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
28716 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
28717 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
28718 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
28719 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
28720 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
28721 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
28722 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
28723 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
28724 }
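// ---- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ----
// Scalar model of the vXi8 path above for one byte and a constant shift s
// in [0,7]: widen to 16 bits (zero- or sign-extended), multiply by
// 2^(8 - s), then take bits [15:8], which is what the MUL + VSRLI-by-8 +
// pack sequence computes per lane. The names are invented.
#include <cstdint>

static uint8_t srl8_via_mul(uint8_t x, unsigned s) {
  uint16_t scale = uint16_t(1u << (8 - s));                 // 2^(8 - s)
  return uint8_t((uint16_t(x) * scale) >> 8);               // logical shift right
}

static int8_t sra8_via_mul(int8_t x, unsigned s) {
  uint16_t scale = uint16_t(1u << (8 - s));
  uint16_t wide = uint16_t(int16_t(x));                     // sign-extend to 16 bits
  return int8_t(uint8_t(uint16_t(wide * scale) >> 8));      // arithmetic shift right
}
// ------------------------------------------------------------------------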
28725
28726 if (VT == MVT::v16i8 ||
28727 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
28728 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
28729 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
28730
28731 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
28732 if (VT.is512BitVector()) {
28733 // On AVX512BW targets we make use of the fact that VSELECT lowers
28734 // to a masked blend which selects bytes based just on the sign bit
28735 // extracted to a mask.
28736 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
28737 V0 = DAG.getBitcast(VT, V0);
28738 V1 = DAG.getBitcast(VT, V1);
28739 Sel = DAG.getBitcast(VT, Sel);
28740 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
28741 ISD::SETGT);
28742 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
28743 } else if (Subtarget.hasSSE41()) {
28744 // On SSE41 targets we can use PBLENDVB which selects bytes based just
28745 // on the sign bit.
28746 V0 = DAG.getBitcast(VT, V0);
28747 V1 = DAG.getBitcast(VT, V1);
28748 Sel = DAG.getBitcast(VT, Sel);
28749 return DAG.getBitcast(SelVT,
28750 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
28751 }
28752 // On pre-SSE41 targets we test for the sign bit by comparing to
28753 // zero - a negative value will set all bits of the lanes to true
28754 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
28755 SDValue Z = DAG.getConstant(0, dl, SelVT);
28756 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
28757 return DAG.getSelect(dl, SelVT, C, V0, V1);
28758 };
28759
28760 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
28761 // We can safely do this using i16 shifts as we're only interested in
28762 // the 3 lower bits of each byte.
28763 Amt = DAG.getBitcast(ExtVT, Amt);
28764 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
28765 Amt = DAG.getBitcast(VT, Amt);
28766
28767 if (Opc == ISD::SHL || Opc == ISD::SRL) {
28768 // r = VSELECT(r, shift(r, 4), a);
28769 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
28770 R = SignBitSelect(VT, Amt, M, R);
28771
28772 // a += a
28773 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28774
28775 // r = VSELECT(r, shift(r, 2), a);
28776 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
28777 R = SignBitSelect(VT, Amt, M, R);
28778
28779 // a += a
28780 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28781
28782 // return VSELECT(r, shift(r, 1), a);
28783 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
28784 R = SignBitSelect(VT, Amt, M, R);
28785 return R;
28786 }
28787
28788 if (Opc == ISD::SRA) {
28789 // For SRA we need to unpack each byte to the higher byte of an i16 vector
28790 // so we can correctly sign-extend. We don't care what happens to the
28791 // lower byte.
28792 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28793 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
28794 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
28795 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
28796 ALo = DAG.getBitcast(ExtVT, ALo);
28797 AHi = DAG.getBitcast(ExtVT, AHi);
28798 RLo = DAG.getBitcast(ExtVT, RLo);
28799 RHi = DAG.getBitcast(ExtVT, RHi);
28800
28801 // r = VSELECT(r, shift(r, 4), a);
28802 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
28803 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
28804 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28805 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28806
28807 // a += a
28808 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28809 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28810
28811 // r = VSELECT(r, shift(r, 2), a);
28812 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
28813 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
28814 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28815 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28816
28817 // a += a
28818 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
28819 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
28820
28821 // r = VSELECT(r, shift(r, 1), a);
28822 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
28823 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
28824 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
28825 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
28826
28827 // Logical shift the result back to the lower byte, leaving a zero upper
28828 // byte meaning that we can safely pack with PACKUSWB.
28829 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
28830 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
28831 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28832 }
28833 }
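// ---- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ----
// Scalar model of the variable vXi8 shift above: the three low bits of the
// amount are moved to the top of the byte (amt << 5), and each select
// stage conditionally applies a shift of 4, 2 and then 1 based on the
// current sign bit, doubling the mask between stages. SHL is shown; SRL is
// identical with '>>'. The name is invented.
#include <cstdint>

static uint8_t shl8_staged(uint8_t r, uint8_t amt) {
  uint8_t a = uint8_t(amt << 5);            // bits 2..0 -> bits 7..5
  if (a & 0x80) r = uint8_t(r << 4);        // stage 1: shift by 4 if bit 2 is set
  a = uint8_t(a + a);                       // a += a
  if (a & 0x80) r = uint8_t(r << 2);        // stage 2: shift by 2 if bit 1 is set
  a = uint8_t(a + a);
  if (a & 0x80) r = uint8_t(r << 1);        // stage 3: shift by 1 if bit 0 is set
  return r;
}
// ------------------------------------------------------------------------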
28834
28835 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
28836 MVT ExtVT = MVT::v8i32;
28837 SDValue Z = DAG.getConstant(0, dl, VT);
28838 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
28839 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
28840 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
28841 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
28842 ALo = DAG.getBitcast(ExtVT, ALo);
28843 AHi = DAG.getBitcast(ExtVT, AHi);
28844 RLo = DAG.getBitcast(ExtVT, RLo);
28845 RHi = DAG.getBitcast(ExtVT, RHi);
28846 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
28847 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
28848 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
28849 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
28850 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28851 }
28852
28853 if (VT == MVT::v8i16) {
28854 // If we have a constant shift amount, the non-SSE41 path is best as
28855 // avoiding bitcasts makes it easier to constant-fold and reduce to PBLENDW.
28856 bool UseSSE41 = Subtarget.hasSSE41() &&
28857 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28858
28859 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
28860 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
28861 // the sign bit.
28862 if (UseSSE41) {
28863 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
28864 V0 = DAG.getBitcast(ExtVT, V0);
28865 V1 = DAG.getBitcast(ExtVT, V1);
28866 Sel = DAG.getBitcast(ExtVT, Sel);
28867 return DAG.getBitcast(
28868 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
28869 }
28870 // On pre-SSE41 targets we splat the sign bit - a negative value will
28871 // set all bits of the lanes to true and VSELECT uses that in
28872 // its OR(AND(V0,C),AND(V1,~C)) lowering.
28873 SDValue C =
28874 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
28875 return DAG.getSelect(dl, VT, C, V0, V1);
28876 };
28877
28878 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
28879 if (UseSSE41) {
28880 // On SSE41 targets we need to replicate the shift mask in both
28881 // bytes for PBLENDVB.
28882 Amt = DAG.getNode(
28883 ISD::OR, dl, VT,
28884 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
28885 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
28886 } else {
28887 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
28888 }
28889
28890 // r = VSELECT(r, shift(r, 8), a);
28891 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
28892 R = SignBitSelect(Amt, M, R);
28893
28894 // a += a
28895 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28896
28897 // r = VSELECT(r, shift(r, 4), a);
28898 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
28899 R = SignBitSelect(Amt, M, R);
28900
28901 // a += a
28902 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28903
28904 // r = VSELECT(r, shift(r, 2), a);
28905 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
28906 R = SignBitSelect(Amt, M, R);
28907
28908 // a += a
28909 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
28910
28911 // return VSELECT(r, shift(r, 1), a);
28912 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
28913 R = SignBitSelect(Amt, M, R);
28914 return R;
28915 }
28916
28917 // Decompose 256-bit shifts into 128-bit shifts.
28918 if (VT.is256BitVector())
28919 return splitVectorIntBinary(Op, DAG);
28920
28921 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28922 return splitVectorIntBinary(Op, DAG);
28923
28924 return SDValue();
28925}
28926
28927static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
28928 SelectionDAG &DAG) {
28929 MVT VT = Op.getSimpleValueType();
28930 assert(VT.isVector() && "Custom lowering only for vector rotates!");
28931
28932 SDLoc DL(Op);
28933 SDValue R = Op.getOperand(0);
28934 SDValue Amt = Op.getOperand(1);
28935 unsigned Opcode = Op.getOpcode();
28936 unsigned EltSizeInBits = VT.getScalarSizeInBits();
28937 int NumElts = VT.getVectorNumElements();
28938
28939 // Check for constant splat rotation amount.
28940 APInt CstSplatValue;
28941 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
28942
28943 // Check for splat rotate by zero.
28944 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
28945 return R;
28946
28947 // AVX512 implicitly uses modulo rotation amounts.
28948 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
28949 // Attempt to rotate by immediate.
28950 if (IsCstSplat) {
28951 unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
28952 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28953 return DAG.getNode(RotOpc, DL, VT, R,
28954 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28955 }
28956
28957 // Else, fall-back on VPROLV/VPRORV.
28958 return Op;
28959 }
28960
28961 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
28962 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
28963 unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
28964 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
28965 }
28966
28967 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
28968
28969 // XOP has 128-bit vector variable + immediate rotates.
28970 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
28971 // XOP implicitly uses modulo rotation amounts.
28972 if (Subtarget.hasXOP()) {
28973 if (VT.is256BitVector())
28974 return splitVectorIntBinary(Op, DAG);
28975 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
28976
28977 // Attempt to rotate by immediate.
28978 if (IsCstSplat) {
28979 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
28980 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
28981 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
28982 }
28983
28984 // Use general rotate by variable (per-element).
28985 return Op;
28986 }
28987
28988 // Split 256-bit integers on pre-AVX2 targets.
28989 if (VT.is256BitVector() && !Subtarget.hasAVX2())
28990 return splitVectorIntBinary(Op, DAG);
28991
28992 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
28993 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
28994 VT == MVT::v32i16) &&
28995 Subtarget.hasAVX2())) &&
28996 "Only vXi32/vXi16/vXi8 vector rotates supported");
28997
28998 // Rotate by a uniform constant - expand back to shifts.
28999 if (IsCstSplat)
29000 return SDValue();
29001
29002 bool IsSplatAmt = DAG.isSplatValue(Amt);
29003
29004 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
29005 // the amount bit.
29006 if (EltSizeInBits == 8 && !IsSplatAmt) {
29007 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
29008 return SDValue();
29009
29010 // We don't need ModuloAmt here as we just peek at individual bits.
29011 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29012
29013 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29014 if (Subtarget.hasSSE41()) {
29015 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29016 // on the sign bit.
29017 V0 = DAG.getBitcast(VT, V0);
29018 V1 = DAG.getBitcast(VT, V1);
29019 Sel = DAG.getBitcast(VT, Sel);
29020 return DAG.getBitcast(SelVT,
29021 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
29022 }
29023 // On pre-SSE41 targets we test for the sign bit by comparing to
29024 // zero - a negative value will set all bits of the lanes to true
29025 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29026 SDValue Z = DAG.getConstant(0, DL, SelVT);
29027 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
29028 return DAG.getSelect(DL, SelVT, C, V0, V1);
29029 };
29030
29031 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29032 // We can safely do this using i16 shifts as we're only interested in
29033 // the 3 lower bits of each byte.
29034 Amt = DAG.getBitcast(ExtVT, Amt);
29035 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
29036 Amt = DAG.getBitcast(VT, Amt);
29037
29038 // r = VSELECT(r, rot(r, 4), a);
29039 SDValue M;
29040 M = DAG.getNode(
29041 ISD::OR, DL, VT,
29042 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
29043 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
29044 R = SignBitSelect(VT, Amt, M, R);
29045
29046 // a += a
29047 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29048
29049 // r = VSELECT(r, rot(r, 2), a);
29050 M = DAG.getNode(
29051 ISD::OR, DL, VT,
29052 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
29053 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
29054 R = SignBitSelect(VT, Amt, M, R);
29055
29056 // a += a
29057 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29058
29059 // return VSELECT(r, rot(r, 1), a);
29060 M = DAG.getNode(
29061 ISD::OR, DL, VT,
29062 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
29063 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
29064 return SignBitSelect(VT, Amt, M, R);
29065 }
29066
29067 // ISD::ROT* uses modulo rotate amounts.
29068 if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) {
29069 // If the amount is a splat, perform the modulo BEFORE the splat,
29070 // this helps LowerScalarVariableShift to remove the splat later.
29071 Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt);
29072 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29073 DAG.getConstant(EltSizeInBits - 1, DL, VT));
29074 Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT),
29075 SmallVector<int>(NumElts, 0));
29076 } else {
29077 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29078 DAG.getConstant(EltSizeInBits - 1, DL, VT));
29079 }
29080
29081 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29082 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
29083 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
29084
29085 // Fallback for splats + all supported variable shifts.
29086 // Fallback for non-constant AVX2 vXi16 as well.
29087 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
29088 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
29089 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
29090 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
29091 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
29092 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
29093 }
29094
29095 // As with shifts, convert the rotation amount to a multiplication factor.
29096 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
29097 assert(Scale && "Failed to convert ROTL amount to scale");
29098
29099 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
29100 if (EltSizeInBits == 16) {
29101 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
29102 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
29103 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29104 }
29105
29106 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
29107 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
29108 // that can then be OR'd with the lower 32-bits.
29109 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
29110 static const int OddMask[] = {1, -1, 3, -1};
29111 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
29112 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
29113
29114 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29115 DAG.getBitcast(MVT::v2i64, R),
29116 DAG.getBitcast(MVT::v2i64, Scale));
29117 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29118 DAG.getBitcast(MVT::v2i64, R13),
29119 DAG.getBitcast(MVT::v2i64, Scale13));
29120 Res02 = DAG.getBitcast(VT, Res02);
29121 Res13 = DAG.getBitcast(VT, Res13);
29122
29123 return DAG.getNode(ISD::OR, DL, VT,
29124 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
29125 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
29126}
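// ---- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ----
// Scalar model of the rotate-as-multiply fallback in LowerRotate above:
// multiplying by 2^amt in twice the element width leaves the shifted-out
// bits in the high half, and OR-ing the two halves back together yields
// the rotate. This is what MUL/MULHU compute for vXi16 and what the two
// PMULUDQs compute for v4i32. The name is invented.
#include <cstdint>

static uint32_t rotl32_via_mul(uint32_t x, unsigned amt) {
  amt &= 31;                                          // modulo rotate amount
  uint64_t prod = uint64_t(x) * (uint64_t(1) << amt); // widening multiply by 2^amt
  return uint32_t(prod) | uint32_t(prod >> 32);       // low half | wrapped high bits
}
// ------------------------------------------------------------------------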
29127
29128/// Returns true if the operand type is exactly twice the native width, and
29129/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
29130/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
29131/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
29132bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
29133 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
29134
29135 if (OpWidth == 64)
29136 return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
29137 if (OpWidth == 128)
29138 return Subtarget.hasCmpxchg16b();
29139
29140 return false;
29141}
29142
29143bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29144 Type *MemType = SI->getValueOperand()->getType();
29145
29146 bool NoImplicitFloatOps =
29147 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29148 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29149 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29150 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29151 return false;
29152
29153 return needsCmpXchgNb(MemType);
29154}
29155
29156// Note: this turns large loads into lock cmpxchg8b/16b.
29157// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
29158TargetLowering::AtomicExpansionKind
29159X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29160 Type *MemType = LI->getType();
29161
29162 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
29163 // can use movq to do the load. If we have X87 we can load into an 80-bit
29164 // X87 register and store it to a stack temporary.
29165 bool NoImplicitFloatOps =
29166 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29167 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29168 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29169 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29170 return AtomicExpansionKind::None;
29171
29172 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29173 : AtomicExpansionKind::None;
29174}
29175
29176TargetLowering::AtomicExpansionKind
29177X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29178 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29179 Type *MemType = AI->getType();
29180
29181 // If the operand is too big, we must see if cmpxchg8/16b is available
29182 // and default to library calls otherwise.
29183 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
29184 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29185 : AtomicExpansionKind::None;
29186 }
29187
29188 AtomicRMWInst::BinOp Op = AI->getOperation();
29189 switch (Op) {
29190 default:
29191 llvm_unreachable("Unknown atomic operation");
29192 case AtomicRMWInst::Xchg:
29193 case AtomicRMWInst::Add:
29194 case AtomicRMWInst::Sub:
29195 // It's better to use xadd, xsub or xchg for these in all cases.
29196 return AtomicExpansionKind::None;
29197 case AtomicRMWInst::Or:
29198 case AtomicRMWInst::And:
29199 case AtomicRMWInst::Xor:
29200 // If the atomicrmw's result isn't actually used, we can just add a "lock"
29201 // prefix to a normal instruction for these operations.
29202 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
29203 : AtomicExpansionKind::None;
29204 case AtomicRMWInst::Nand:
29205 case AtomicRMWInst::Max:
29206 case AtomicRMWInst::Min:
29207 case AtomicRMWInst::UMax:
29208 case AtomicRMWInst::UMin:
29209 case AtomicRMWInst::FAdd:
29210 case AtomicRMWInst::FSub:
29211 // These always require a non-trivial set of data operations on x86. We must
29212 // use a cmpxchg loop.
29213 return AtomicExpansionKind::CmpXChg;
29214 }
29215}
29216
29217LoadInst *
29218X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
29219 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29220 Type *MemType = AI->getType();
29221 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
29222 // there is no benefit in turning such RMWs into loads, and it is actually
29223 // harmful as it introduces a mfence.
29224 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
29225 return nullptr;
29226
29227 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
29228 // lowering available in lowerAtomicArith.
29229 // TODO: push more cases through this path.
29230 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
29231 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
29232 AI->use_empty())
29233 return nullptr;
29234
29235 IRBuilder<> Builder(AI);
29236 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29237 auto SSID = AI->getSyncScopeID();
29238 // We must restrict the ordering to avoid generating loads with Release or
29239 // ReleaseAcquire orderings.
29240 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
29241
29242 // Before the load we need a fence. Here is an example lifted from
29243 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
29244 // is required:
29245 // Thread 0:
29246 // x.store(1, relaxed);
29247 // r1 = y.fetch_add(0, release);
29248 // Thread 1:
29249 // y.fetch_add(42, acquire);
29250 // r2 = x.load(relaxed);
29251 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
29252 // lowered to just a load without a fence. An mfence flushes the store buffer,
29253 // making the optimization clearly correct.
29254 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
29255 // otherwise, we might be able to be more aggressive on relaxed idempotent
29256 // rmw. In practice, they do not look useful, so we don't try to be
29257 // especially clever.
29258 if (SSID == SyncScope::SingleThread)
29259 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
29260 // the IR level, so we must wrap it in an intrinsic.
29261 return nullptr;
29262
29263 if (!Subtarget.hasMFence())
29264 // FIXME: it might make sense to use a locked operation here but on a
29265 // different cache-line to prevent cache-line bouncing. In practice it
29266 // is probably a small win, and x86 processors without mfence are rare
29267 // enough that we do not bother.
29268 return nullptr;
29269
29270 Function *MFence =
29271 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
29272 Builder.CreateCall(MFence, {});
29273
29274 // Finally we can emit the atomic load.
29275 LoadInst *Loaded = Builder.CreateAlignedLoad(
29276 AI->getType(), AI->getPointerOperand(), AI->getAlign());
29277 Loaded->setAtomic(Order, SSID);
29278 AI->replaceAllUsesWith(Loaded);
29279 AI->eraseFromParent();
29280 return Loaded;
29281}
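// ---- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ----
// Source-level shape of the transformation performed above, assuming an
// mfence is available and the scope is not single-threaded: an idempotent
// RMW with a visible result becomes a full fence followed by an ordinary
// atomic load. The ordering is simplified to seq_cst here; the real code
// restricts it via getStrongestFailureOrdering. The name is invented.
#include <atomic>
#include <emmintrin.h>   // _mm_mfence (SSE2)

static int idempotent_rmw_as_fenced_load(std::atomic<int> &y) {
  // Originally: return y.fetch_add(0, std::memory_order_seq_cst);
  _mm_mfence();                                 // flush the store buffer first
  return y.load(std::memory_order_seq_cst);     // then a plain atomic load
}
// ------------------------------------------------------------------------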
29282
29283bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
29284 if (!SI.isUnordered())
29285 return false;
29286 return ExperimentalUnorderedISEL;
29287}
29288bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
29289 if (!LI.isUnordered())
29290 return false;
29291 return ExperimentalUnorderedISEL;
29292}
29293
29294
29295/// Emit a locked operation on a stack location which does not change any
29296/// memory location, but does involve a lock prefix. Location is chosen to be
29297/// a) very likely accessed only by a single thread to minimize cache traffic,
29298/// and b) definitely dereferenceable. Returns the new Chain result.
29299static SDValue emitLockedStackOp(SelectionDAG &DAG,
29300 const X86Subtarget &Subtarget, SDValue Chain,
29301 const SDLoc &DL) {
29302 // Implementation notes:
29303 // 1) LOCK prefix creates a full read/write reordering barrier for memory
29304 // operations issued by the current processor. As such, the location
29305 // referenced is not relevant for the ordering properties of the instruction.
29306 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
29307 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
29308 // 2) Using an immediate operand appears to be the best encoding choice
29309 // here since it doesn't require an extra register.
29310 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
29311 // is small enough it might just be measurement noise.)
29312 // 4) When choosing offsets, there are several contributing factors:
29313 // a) If there's no redzone, we default to TOS. (We could allocate a cache
29314 // line aligned stack object to improve this case.)
29315 // b) To minimize our chances of introducing a false dependence, we prefer
29316 // to offset the stack usage from TOS slightly.
29317 // c) To minimize concerns about cross thread stack usage - in particular,
29318 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
29319 // captures state in the TOS frame and accesses it from many threads -
29320 // we want to use an offset such that the offset is in a distinct cache
29321 // line from the TOS frame.
29322 //
29323 // For a general discussion of the tradeoffs and benchmark results, see:
29324 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
29325
29326 auto &MF = DAG.getMachineFunction();
29327 auto &TFL = *Subtarget.getFrameLowering();
29328 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
29329
29330 if (Subtarget.is64Bit()) {
29331 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29332 SDValue Ops[] = {
29333 DAG.getRegister(X86::RSP, MVT::i64), // Base
29334 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29335 DAG.getRegister(0, MVT::i64), // Index
29336 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29337 DAG.getRegister(0, MVT::i16), // Segment.
29338 Zero,
29339 Chain};
29340 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29341 MVT::Other, Ops);
29342 return SDValue(Res, 1);
29343 }
29344
29345 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29346 SDValue Ops[] = {
29347 DAG.getRegister(X86::ESP, MVT::i32), // Base
29348 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29349 DAG.getRegister(0, MVT::i32), // Index
29350 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29351 DAG.getRegister(0, MVT::i16), // Segment.
29352 Zero,
29353 Chain
29354 };
29355 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29356 MVT::Other, Ops);
29357 return SDValue(Res, 1);
29358}
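// ---- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ----
// Rough source-level equivalent of the machine node built above: a LOCK'd
// OR of an immediate zero against a stack slot, used only for its fencing
// effect. The -64 offset mirrors the red-zone case; the helper name,
// offsets and register choices are illustrative assumptions only.
static inline void locked_stack_fence() {
#if defined(__x86_64__)
  asm volatile("lock orl $0, -64(%%rsp)" ::: "memory", "cc");
#else
  asm volatile("lock orl $0, (%%esp)" ::: "memory", "cc");
#endif
}
// ------------------------------------------------------------------------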
29359
29360static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
29361 SelectionDAG &DAG) {
29362 SDLoc dl(Op);
29363 AtomicOrdering FenceOrdering =
29364 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
29365 SyncScope::ID FenceSSID =
29366 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
29367
29368 // The only fence that needs an instruction is a sequentially-consistent
29369 // cross-thread fence.
29370 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
29371 FenceSSID == SyncScope::System) {
29372 if (Subtarget.hasMFence())
29373 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
29374
29375 SDValue Chain = Op.getOperand(0);
29376 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
29377 }
29378
29379 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29380 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
29381}
29382
29383static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
29384 SelectionDAG &DAG) {
29385 MVT T = Op.getSimpleValueType();
29386 SDLoc DL(Op);
29387 unsigned Reg = 0;
29388 unsigned size = 0;
29389 switch(T.SimpleTy) {
29390 default: llvm_unreachable("Invalid value type!");
29391 case MVT::i8: Reg = X86::AL; size = 1; break;
29392 case MVT::i16: Reg = X86::AX; size = 2; break;
29393 case MVT::i32: Reg = X86::EAX; size = 4; break;
29394 case MVT::i64:
29395 assert(Subtarget.is64Bit() && "Node not type legal!");
29396 Reg = X86::RAX; size = 8;
29397 break;
29398 }
29399 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
29400 Op.getOperand(2), SDValue());
29401 SDValue Ops[] = { cpIn.getValue(0),
29402 Op.getOperand(1),
29403 Op.getOperand(3),
29404 DAG.getTargetConstant(size, DL, MVT::i8),
29405 cpIn.getValue(1) };
29406 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29407 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
29408 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
29409 Ops, T, MMO);
29410
29411 SDValue cpOut =
29412 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
29413 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
29414 MVT::i32, cpOut.getValue(2));
29415 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
29416
29417 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29418 cpOut, Success, EFLAGS.getValue(1));
29419}
29420
29421// Create MOVMSKB, taking into account whether we need to split for AVX1.
29422static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
29423 const X86Subtarget &Subtarget) {
29424 MVT InVT = V.getSimpleValueType();
29425
29426 if (InVT == MVT::v64i8) {
29427 SDValue Lo, Hi;
29428 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29429 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
29430 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
29431 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
29432 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
29433 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
29434 DAG.getConstant(32, DL, MVT::i8));
29435 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
29436 }
29437 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29438 SDValue Lo, Hi;
29439 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29440 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29441 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29442 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29443 DAG.getConstant(16, DL, MVT::i8));
29444 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29445 }
29446
29447 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29448}
29449
29450static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
29451 SelectionDAG &DAG) {
29452 SDValue Src = Op.getOperand(0);
29453 MVT SrcVT = Src.getSimpleValueType();
29454 MVT DstVT = Op.getSimpleValueType();
29455
29456 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
29457 // half to v32i1 and concatenating the result.
29458 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
29459 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29460 assert(Subtarget.hasBWI() && "Expected BWI target");
29461 SDLoc dl(Op);
29462 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29463 DAG.getIntPtrConstant(0, dl));
29464 Lo = DAG.getBitcast(MVT::v32i1, Lo);
29465 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29466 DAG.getIntPtrConstant(1, dl));
29467 Hi = DAG.getBitcast(MVT::v32i1, Hi);
29468 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
29469 }
29470
29471 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
29472 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
29473 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
29474 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
29475 SDLoc DL(Op);
29476 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
29477 V = getPMOVMSKB(DL, V, DAG, Subtarget);
29478 return DAG.getZExtOrTrunc(V, DL, DstVT);
29479 }
29480
29481 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
29482 SrcVT == MVT::i64) && "Unexpected VT!");
29483
29484 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29485 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
29486 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
29487 // This conversion needs to be expanded.
29488 return SDValue();
29489
29490 SDLoc dl(Op);
29491 if (SrcVT.isVector()) {
29492 // Widen the input vector in the case of MVT::v2i32.
29493 // Example: from MVT::v2i32 to MVT::v4i32.
29494 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
29495 SrcVT.getVectorNumElements() * 2);
29496 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
29497 DAG.getUNDEF(SrcVT));
29498 } else {
29499 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
29500 "Unexpected source type in LowerBITCAST");
29501 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
29502 }
29503
29504 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
29505 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
29506
29507 if (DstVT == MVT::x86mmx)
29508 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
29509
29510 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
29511 DAG.getIntPtrConstant(0, dl));
29512}
29513
29514/// Compute the horizontal sum of bytes in V for the elements of VT.
29515///
29516/// Requires V to be a byte vector and VT to be an integer vector type with
29517/// wider elements than V's type. The width of the elements of VT determines
29518/// how many bytes of V are summed horizontally to produce each element of the
29519/// result.
29520static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
29521 const X86Subtarget &Subtarget,
29522 SelectionDAG &DAG) {
29523 SDLoc DL(V);
29524 MVT ByteVecVT = V.getSimpleValueType();
29525 MVT EltVT = VT.getVectorElementType();
29526 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
29527 "Expected value to have byte element type.");
29528 assert(EltVT != MVT::i8 &&
29529 "Horizontal byte sum only makes sense for wider elements!");
29530 unsigned VecSize = VT.getSizeInBits();
29531 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
29532
29533 // The PSADBW instruction horizontally adds all bytes and leaves the result
29534 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
29535 if (EltVT == MVT::i64) {
29536 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
29537 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29538 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
29539 return DAG.getBitcast(VT, V);
29540 }
29541
29542 if (EltVT == MVT::i32) {
29543 // We unpack the low half and high half into i32s interleaved with zeros so
29544 // that we can use PSADBW to horizontally sum them. The most useful part of
29545 // this is that it lines up the results of two PSADBW instructions to be
29546 // two v2i64 vectors which concatenated are the 4 population counts. We can
29547 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
29548 SDValue Zeros = DAG.getConstant(0, DL, VT);
29549 SDValue V32 = DAG.getBitcast(VT, V);
29550 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
29551 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
29552
29553 // Do the horizontal sums into two v2i64s.
29554 Zeros = DAG.getConstant(0, DL, ByteVecVT);
29555 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
29556 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29557 DAG.getBitcast(ByteVecVT, Low), Zeros);
29558 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
29559 DAG.getBitcast(ByteVecVT, High), Zeros);
29560
29561 // Merge them together.
29562 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
29563 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
29564 DAG.getBitcast(ShortVecVT, Low),
29565 DAG.getBitcast(ShortVecVT, High));
29566
29567 return DAG.getBitcast(VT, V);
29568 }
29569
29570 // The only element type left is i16.
29571 assert(EltVT == MVT::i16 && "Unknown how to handle type");
29572
29573 // To obtain pop count for each i16 element starting from the pop count for
29574 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
29575 // right by 8. It is important to shift as i16s as i8 vector shift isn't
29576 // directly supported.
29577 SDValue ShifterV = DAG.getConstant(8, DL, VT);
29578 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29579 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
29580 DAG.getBitcast(ByteVecVT, V));
29581 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
29582}
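// ---- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ----
// Scalar model of the i16 case above for one lane: if the two bytes of v
// already hold per-byte pop counts (each <= 8), the shift/add/shift
// sequence leaves their sum in the lane. The byte-wise ISD::ADD cannot
// carry between bytes because the counts are small, so a plain 16-bit add
// behaves the same here. The name is invented.
#include <cstdint>

static uint16_t bytesum16(uint16_t v) {        // v = (hiCnt << 8) | loCnt
  uint16_t shl = uint16_t(v << 8);             // loCnt moves to the high byte
  uint16_t sum = uint16_t(shl + v);            // high byte now holds loCnt + hiCnt
  return uint16_t(sum >> 8);                   // shift the sum back down
}
// ------------------------------------------------------------------------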
29583
29584static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
29585 const X86Subtarget &Subtarget,
29586 SelectionDAG &DAG) {
29587 MVT VT = Op.getSimpleValueType();
29588 MVT EltVT = VT.getVectorElementType();
29589 int NumElts = VT.getVectorNumElements();
29590 (void)EltVT;
29591 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
29592
29593 // Implement a lookup table in register by using an algorithm based on:
29594 // http://wm.ite.pl/articles/sse-popcount.html
29595 //
29596 // The general idea is that every nibble of each input byte is an index into
29597 // an in-register pre-computed pop count table. We then split the input
29598 // vector into two new ones: (1) a vector with only the shifted-right
29599 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
29600 // masked out higher ones) for each byte. PSHUFB is used separately with both
29601 // to index the in-register table. Next, both are added and the result is an
29602 // i8 vector where each element contains the pop count for its input byte.
29603 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
29604 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
29605 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
29606 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
29607
29608 SmallVector<SDValue, 64> LUTVec;
29609 for (int i = 0; i < NumElts; ++i)
29610 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29611 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
29612 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
29613
29614 // High nibbles
29615 SDValue FourV = DAG.getConstant(4, DL, VT);
29616 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
29617
29618 // Low nibbles
29619 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
29620
29621 // The input vector is used as the shuffle mask that index elements into the
29622 // LUT. After counting low and high nibbles, add the vector to obtain the
29623 // final pop count per i8 element.
29624 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
29625 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
29626 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
29627}
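// ---- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ----
// Scalar model of the in-register LUT used above: each PSHUFB is a
// 16-entry table lookup, so the per-byte pop count is LUT[high nibble] +
// LUT[low nibble]. The name is invented.
#include <cstdint>

static uint8_t popcnt8_lut(uint8_t x) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return uint8_t(LUT[x >> 4] + LUT[x & 0x0F]);   // high-nibble + low-nibble counts
}
// ------------------------------------------------------------------------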
29628
29629// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
29630// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
29631static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29632 SelectionDAG &DAG) {
29633 MVT VT = Op.getSimpleValueType();
29634 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
29635 "Unknown CTPOP type to handle");
29636 SDLoc DL(Op.getNode());
29637 SDValue Op0 = Op.getOperand(0);
29638
29639 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
29640 if (Subtarget.hasVPOPCNTDQ()) {
29641 unsigned NumElems = VT.getVectorNumElements();
29642 assert((VT.getVectorElementType() == MVT::i8 ||
29643 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
29644 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
29645 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29646 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
29647 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
29648 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
29649 }
29650 }
29651
29652 // Decompose 256-bit ops into smaller 128-bit ops.
29653 if (VT.is256BitVector() && !Subtarget.hasInt256())
29654 return splitVectorIntUnary(Op, DAG);
29655
29656 // Decompose 512-bit ops into smaller 256-bit ops.
29657 if (VT.is512BitVector() && !Subtarget.hasBWI())
29658 return splitVectorIntUnary(Op, DAG);
29659
29660 // For element types greater than i8, do vXi8 pop counts and a bytesum.
29661 if (VT.getScalarType() != MVT::i8) {
29662 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
29663 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
29664 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
29665 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
29666 }
29667
29668 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
29669 if (!Subtarget.hasSSSE3())
29670 return SDValue();
29671
29672 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
29673}
29674
29675static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
29676 SelectionDAG &DAG) {
29677 assert(Op.getSimpleValueType().isVector() &&
29678 "We only do custom lowering for vector population count.");
29679 return LowerVectorCTPOP(Op, Subtarget, DAG);
29680}
29681
29682static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
29683 MVT VT = Op.getSimpleValueType();
29684 SDValue In = Op.getOperand(0);
29685 SDLoc DL(Op);
29686
29687 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
29688 // perform the BITREVERSE.
29689 if (!VT.isVector()) {
29690 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
29691 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
29692 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
29693 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
29694 DAG.getIntPtrConstant(0, DL));
29695 }
29696
29697 int NumElts = VT.getVectorNumElements();
29698 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
29699
29700 // Decompose 256-bit ops into smaller 128-bit ops.
29701 if (VT.is256BitVector())
29702 return splitVectorIntUnary(Op, DAG);
29703
29704 assert(VT.is128BitVector() &&
29705 "Only 128-bit vector bitreverse lowering supported.");
29706
29707 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
29708 // perform the BSWAP in the shuffle.
29709 // It's best to shuffle using the second operand as this will implicitly allow
29710 // memory folding for multiple vectors.
29711 SmallVector<SDValue, 16> MaskElts;
29712 for (int i = 0; i != NumElts; ++i) {
29713 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
29714 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
29715 int PermuteByte = SourceByte | (2 << 5);
29716 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
29717 }
29718 }
29719
29720 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
29721 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
29722 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
29723 Res, Mask);
29724 return DAG.getBitcast(VT, Res);
29725}
29726
29727static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
29728 SelectionDAG &DAG) {
29729 MVT VT = Op.getSimpleValueType();
29730
29731 if (Subtarget.hasXOP() && !VT.is512BitVector())
29732 return LowerBITREVERSE_XOP(Op, DAG);
29733
29734 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
29735
29736 SDValue In = Op.getOperand(0);
29737 SDLoc DL(Op);
29738
29739 assert(VT.getScalarType() == MVT::i8 &&
29740 "Only byte vector BITREVERSE supported");
29741
29742 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
29743 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
29744 return splitVectorIntUnary(Op, DAG);
29745
29746 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
29747 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
29748 return splitVectorIntUnary(Op, DAG);
29749
29750 unsigned NumElts = VT.getVectorNumElements();
29751
29752 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
29753 if (Subtarget.hasGFNI()) {
29754 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29755 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
29756 Matrix = DAG.getBitcast(VT, Matrix);
29757 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
29758 DAG.getTargetConstant(0, DL, MVT::i8));
29759 }
29760
29761 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its
29762 // two nibbles, and a PSHUFB lookup finds the bit reverse of each 0-15
29763 // value (moved to the other nibble); a scalar sketch follows this function.
29764 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
29765 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
29766 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
29767
29768 const int LoLUT[16] = {
29769 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
29770 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
29771 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
29772 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
29773 const int HiLUT[16] = {
29774 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
29775 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
29776 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
29777 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
29778
29779 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
29780 for (unsigned i = 0; i < NumElts; ++i) {
29781 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
29782 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
29783 }
29784
29785 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
29786 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
29787 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
29788 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
29789 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29790}
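// ---- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ----
// Scalar model of the PSHUFB bit-reverse above: the low nibble indexes a
// table of bit-reversed values pre-shifted into the high nibble, the high
// nibble indexes the complementary table, and OR merges the two halves.
// The name is invented.
#include <cstdint>

static uint8_t bitreverse8_lut(uint8_t x) {
  static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                    0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
  static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                    0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};
  return uint8_t(LoLUT[x & 0x0F] | HiLUT[x >> 4]);   // mirrors the two PSHUFBs + OR
}
// ------------------------------------------------------------------------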
29791
29792static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
29793 SelectionDAG &DAG) {
29794 SDLoc DL(Op);
29795 SDValue X = Op.getOperand(0);
29796 MVT VT = Op.getSimpleValueType();
29797
29798 // Special case. If the input fits in 8 bits we can use a single 8-bit TEST.
29799 if (VT == MVT::i8 ||
29800 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
29801 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29802 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
29803 DAG.getConstant(0, DL, MVT::i8));
29804 // Copy the inverse of the parity flag into a register with setcc.
29805 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29806 // Extend to the original type.
29807 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29808 }
29809
29810 if (VT == MVT::i64) {
29811 // Xor the high and low 32-bits together using a 32-bit operation.
29812 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
29813 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
29814 DAG.getConstant(32, DL, MVT::i8)));
29815 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
29816 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
29817 }
29818
29819 if (VT != MVT::i16) {
29820 // Xor the high and low 16-bits together using a 32-bit operation.
29821 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
29822 DAG.getConstant(16, DL, MVT::i8));
29823 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
29824 } else {
29825 // If the input is 16-bits, we need to extend to use an i32 shift below.
29826 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
29827 }
29828
29829 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
29830 // This should allow an h-reg to be used to save a shift.
29831 SDValue Hi = DAG.getNode(
29832 ISD::TRUNCATE, DL, MVT::i8,
29833 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
29834 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
29835 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
29836 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
29837
29838 // Copy the inverse of the parity flag into a register with setcc.
29839 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
29840 // Extend to the original type.
29841 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
29842}
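// ---- Editor's illustrative sketch (not part of X86ISelLowering.cpp) ----
// Scalar model of the XOR folding performed above: the parity of a value
// equals the parity of the XOR of its halves, so a 64-bit input is folded
// down to a single byte, whose parity the hardware then reports through
// EFLAGS.PF (read with SETNP). The name is invented.
#include <cstdint>

static unsigned parity64(uint64_t x) {
  uint32_t v = uint32_t(x) ^ uint32_t(x >> 32);  // fold 64 -> 32
  v ^= v >> 16;                                  // fold 32 -> 16
  v ^= v >> 8;                                   // fold 16 -> 8
  v &= 0xFF;                                     // only the low byte matters now
  v ^= v >> 4;                                   // finish the parity in software
  v ^= v >> 2;                                   // (this stands in for reading
  v ^= v >> 1;                                   //  the parity flag via SETNP)
  return v & 1;                                  // 1 if an odd number of set bits
}
// ------------------------------------------------------------------------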
29843
29844static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
29845 const X86Subtarget &Subtarget) {
29846 unsigned NewOpc = 0;
29847 switch (N->getOpcode()) {
29848 case ISD::ATOMIC_LOAD_ADD:
29849 NewOpc = X86ISD::LADD;
29850 break;
29851 case ISD::ATOMIC_LOAD_SUB:
29852 NewOpc = X86ISD::LSUB;
29853 break;
29854 case ISD::ATOMIC_LOAD_OR:
29855 NewOpc = X86ISD::LOR;
29856 break;
29857 case ISD::ATOMIC_LOAD_XOR:
29858 NewOpc = X86ISD::LXOR;
29859 break;
29860 case ISD::ATOMIC_LOAD_AND:
29861 NewOpc = X86ISD::LAND;
29862 break;
29863 default:
29864 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
29865 }
29866
29867 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
29868
29869 return DAG.getMemIntrinsicNode(
29870 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
29871 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
29872 /*MemVT=*/N->getSimpleValueType(0), MMO);
29873}
29874
29875/// Lower atomic_load_ops into LOCK-prefixed operations.
29876static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
29877 const X86Subtarget &Subtarget) {
29878 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
29879 SDValue Chain = N->getOperand(0);
29880 SDValue LHS = N->getOperand(1);
29881 SDValue RHS = N->getOperand(2);
29882 unsigned Opc = N->getOpcode();
29883 MVT VT = N->getSimpleValueType(0);
29884 SDLoc DL(N);
29885
29886 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
29887 // can only be lowered when the result is unused. They should have already
29888 // been transformed into a cmpxchg loop in AtomicExpand.
29889 if (N->hasAnyUseOfValue(0)) {
29890 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
29891 // select LXADD if LOCK_SUB can't be selected.
29892 if (Opc == ISD::ATOMIC_LOAD_SUB) {
29893 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
29894 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
29895 RHS, AN->getMemOperand());
29896 }
29897 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
29898 "Used AtomicRMW ops other than Add should have been expanded!");
29899 return N;
29900 }
29901
29902 // Specialized lowering for the canonical form of an idempotent atomicrmw.
29903 // The core idea here is that since the memory location isn't actually
29904 // changing, all we need is a lowering for the *ordering* impacts of the
29905 // atomicrmw. As such, we can choose a different operation and memory
29906 // location to minimize impact on other code.
29907 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
29908 // On X86, the only ordering which actually requires an instruction is
29909 // seq_cst that isn't SingleThread; everything else just needs to be
29910 // preserved during codegen and then dropped. Note that we expect (but don't
29911 // assume) that orderings other than seq_cst and acq_rel have been
29912 // canonicalized to a store or load.
29913 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
29914 AN->getSyncScopeID() == SyncScope::System) {
29915 // Prefer a locked operation against a stack location to minimize cache
29916 // traffic. This assumes that stack locations are very likely to be
29917 // accessed only by the owning thread.
29918 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
29919 assert(!N->hasAnyUseOfValue(0));
29920 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29921 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29922 DAG.getUNDEF(VT), NewChain);
29923 }
29924 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29925 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
29926 assert(!N->hasAnyUseOfValue(0));
29927 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29928 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29929 DAG.getUNDEF(VT), NewChain);
29930 }
29931
29932 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
29933 // RAUW the chain, but don't worry about the result, as it's unused.
29934 assert(!N->hasAnyUseOfValue(0));
29935 // NOTE: The getUNDEF is needed to give something for the unused result 0.
29936 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
29937 DAG.getUNDEF(VT), LockOp.getValue(1));
29938}
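// [Editorial sketch -- not part of X86ISelLowering.cpp.] A minimal, hedged
// illustration of the source-level patterns lowerAtomicArith() distinguishes,
// assuming a typical x86-64 <atomic> implementation; names are hypothetical.
#include <atomic>

// Result unused: stays a plain LOCK-prefixed RMW (e.g. lock addl).
void bump(std::atomic<int> &Counter) { Counter.fetch_add(1); }

// Result used: only ADD (and SUB rewritten as ADD of the negation) can be
// selected as LXADD (lock xadd); other used RMW ops should already have been
// expanded to a cmpxchg loop by AtomicExpand.
int take(std::atomic<int> &Counter) { return Counter.fetch_add(1); }

// Idempotent RMW: only the seq_cst ordering matters, so the lowering above
// may emit a locked operation against a stack slot instead of touching Flag.
void fenceLike(std::atomic<int> &Flag) { Flag.fetch_or(0); }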
29939
29940static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
29941 const X86Subtarget &Subtarget) {
29942 auto *Node = cast<AtomicSDNode>(Op.getNode());
29943 SDLoc dl(Node);
29944 EVT VT = Node->getMemoryVT();
29945
29946 bool IsSeqCst =
29947 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
29948 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
29949
29950 // If this store is not sequentially consistent and the type is legal
29951 // we can just keep it.
29952 if (!IsSeqCst && IsTypeLegal)
29953 return Op;
29954
29955 if (VT == MVT::i64 && !IsTypeLegal) {
29956 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
29957 // is enabled.
29958 bool NoImplicitFloatOps =
29959 DAG.getMachineFunction().getFunction().hasFnAttribute(
29960 Attribute::NoImplicitFloat);
29961 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
29962 SDValue Chain;
29963 if (Subtarget.hasSSE1()) {
29964 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
29965 Node->getOperand(2));
29966 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29967 SclToVec = DAG.getBitcast(StVT, SclToVec);
29968 SDVTList Tys = DAG.getVTList(MVT::Other);
29969 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
29970 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
29971 MVT::i64, Node->getMemOperand());
29972 } else if (Subtarget.hasX87()) {
29973 // First load this into an 80-bit X87 register using a stack temporary.
29974 // This will put the whole integer into the significand.
29975 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29976 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29977 MachinePointerInfo MPI =
29978 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29979 Chain =
29980 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
29981 MPI, MaybeAlign(), MachineMemOperand::MOStore);
29982 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29983 SDValue LdOps[] = {Chain, StackPtr};
29984 SDValue Value =
29985 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
29986 /*Align*/ None, MachineMemOperand::MOLoad);
29987 Chain = Value.getValue(1);
29988
29989 // Now use an FIST to do the atomic store.
29990 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
29991 Chain =
29992 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
29993 StoreOps, MVT::i64, Node->getMemOperand());
29994 }
29995
29996 if (Chain) {
29997 // If this is a sequentially consistent store, also emit an appropriate
29998 // barrier.
29999 if (IsSeqCst)
30000 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
30001
30002 return Chain;
30003 }
30004 }
30005 }
30006
30007 // Convert seq_cst store -> xchg
30008 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
30009 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
30010 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
30011 Node->getMemoryVT(),
30012 Node->getOperand(0),
30013 Node->getOperand(1), Node->getOperand(2),
30014 Node->getMemOperand());
30015 return Swap.getValue(1);
30016}
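// [Editorial sketch -- not part of X86ISelLowering.cpp.] A minimal example of
// the stores LowerATOMIC_STORE() handles: in the lowering above a seq_cst
// store of a legal type becomes an implicitly locked XCHG (or a plain MOV
// followed by a locked stack op), while an i64 store on a 32-bit target falls
// back to SSE/x87 moves or a cmpxchg-based swap.
#include <atomic>

void relaxedStore(std::atomic<long> &A, long V) {
  A.store(V, std::memory_order_relaxed); // legal type, not seq_cst: kept as a plain store
}
void seqCstStore(std::atomic<long> &A, long V) {
  A.store(V);                            // seq_cst: converted to an atomic swap (xchg)
}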
30017
30018static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
30019 SDNode *N = Op.getNode();
30020 MVT VT = N->getSimpleValueType(0);
30021 unsigned Opc = Op.getOpcode();
30022
30023 // Let legalize expand this if it isn't a legal type yet.
30024 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30025 return SDValue();
30026
30027 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30028 SDLoc DL(N);
30029
30030 // Set the carry flag.
30031 SDValue Carry = Op.getOperand(2);
30032 EVT CarryVT = Carry.getValueType();
30033 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
30034 Carry, DAG.getAllOnesConstant(DL, CarryVT));
30035
30036 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
30037 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
30038 Op.getOperand(0), Op.getOperand(1),
30039 Carry.getValue(1));
30040
30041 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
30042 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
30043 Sum.getValue(1), DL, DAG);
30044 if (N->getValueType(1) == MVT::i1)
30045 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
30046
30047 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
30048}
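// [Editorial sketch -- not part of X86ISelLowering.cpp.] The carry chain that
// ADDCARRY/X86ISD::ADC model, written out for two 64-bit limbs; the all-ones
// add in LowerADDSUBCARRY re-creates CF from the incoming carry operand
// before the ADC/SBB consumes it.
#include <cstdint>

void add128(uint64_t ALo, uint64_t AHi, uint64_t BLo, uint64_t BHi,
            uint64_t &RLo, uint64_t &RHi) {
  uint64_t Lo = ALo + BLo;
  uint64_t Carry = (Lo < ALo) ? 1 : 0; // carry out of the low limb (CF)
  RLo = Lo;
  RHi = AHi + BHi + Carry;             // what a single ADC computes
}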
30049
30050static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
30051 SelectionDAG &DAG) {
30052 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
30053
30054 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
30055 // which returns the values as { float, float } (in XMM0) or
30056 // { double, double } (which is returned in XMM0, XMM1).
30057 SDLoc dl(Op);
30058 SDValue Arg = Op.getOperand(0);
30059 EVT ArgVT = Arg.getValueType();
30060 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30061
30062 TargetLowering::ArgListTy Args;
30063 TargetLowering::ArgListEntry Entry;
30064
30065 Entry.Node = Arg;
30066 Entry.Ty = ArgTy;
30067 Entry.IsSExt = false;
30068 Entry.IsZExt = false;
30069 Args.push_back(Entry);
30070
30071 bool isF64 = ArgVT == MVT::f64;
30072 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
30073 // the small struct {f32, f32} is returned in (eax, edx). For f64,
30074 // the results are returned via SRet in memory.
30075 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30076 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
30077 const char *LibcallName = TLI.getLibcallName(LC);
30078 SDValue Callee =
30079 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
30080
30081 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
30082 : (Type *)FixedVectorType::get(ArgTy, 4);
30083
30084 TargetLowering::CallLoweringInfo CLI(DAG);
30085 CLI.setDebugLoc(dl)
30086 .setChain(DAG.getEntryNode())
30087 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
30088
30089 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
30090
30091 if (isF64)
30092 // Returned in xmm0 and xmm1.
30093 return CallResult.first;
30094
30095 // Returned in bits 0:31 and 32:63 of xmm0.
30096 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30097 CallResult.first, DAG.getIntPtrConstant(0, dl));
30098 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30099 CallResult.first, DAG.getIntPtrConstant(1, dl));
30100 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
30101 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
30102}
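// [Editorial sketch -- not part of X86ISelLowering.cpp.] Reference semantics
// of the combined call: the Darwin x86-64 lowering above batches both results
// into a single __sincos_stret libcall, returned as { float, float } packed
// in xmm0 or { double, double } in xmm0/xmm1, instead of two separate calls.
#include <cmath>

void sincosRef(double X, double &S, double &C) {
  S = std::sin(X);
  C = std::cos(X);
}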
30103
30104/// Widen a vector input to a vector of NVT. The
30105/// input vector must have the same element type as NVT.
30106static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
30107 bool FillWithZeroes = false) {
30108 // Check if InOp already has the right width.
30109 MVT InVT = InOp.getSimpleValueType();
30110 if (InVT == NVT)
30111 return InOp;
30112
30113 if (InOp.isUndef())
30114 return DAG.getUNDEF(NVT);
30115
30116 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
30117 "input and widen element type must match");
30118
30119 unsigned InNumElts = InVT.getVectorNumElements();
30120 unsigned WidenNumElts = NVT.getVectorNumElements();
30121 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
30122 "Unexpected request for vector widening");
30123
30124 SDLoc dl(InOp);
30125 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
30126 InOp.getNumOperands() == 2) {
30127 SDValue N1 = InOp.getOperand(1);
30128 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
30129 N1.isUndef()) {
30130 InOp = InOp.getOperand(0);
30131 InVT = InOp.getSimpleValueType();
30132 InNumElts = InVT.getVectorNumElements();
30133 }
30134 }
30135 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
30136 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
30137 SmallVector<SDValue, 16> Ops;
30138 for (unsigned i = 0; i < InNumElts; ++i)
30139 Ops.push_back(InOp.getOperand(i));
30140
30141 EVT EltVT = InOp.getOperand(0).getValueType();
30142
30143 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
30144 DAG.getUNDEF(EltVT);
30145 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
30146 Ops.push_back(FillVal);
30147 return DAG.getBuildVector(NVT, dl, Ops);
30148 }
30149 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
30150 DAG.getUNDEF(NVT);
30151 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
30152 InOp, DAG.getIntPtrConstant(0, dl));
30153}
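// [Editorial sketch -- not part of X86ISelLowering.cpp.] Scalar model of
// ExtendToType: keep the original elements, then pad up to the widened
// element count with zeros (FillWithZeroes) or with an unspecified value
// standing in for undef.
#include <cstddef>
#include <vector>

std::vector<int> extendToWidth(std::vector<int> In, std::size_t WidenNumElts,
                               bool FillWithZeroes) {
  In.resize(WidenNumElts, FillWithZeroes ? 0 : -1); // -1 stands in for "undef"
  return In;
}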
30154
30155static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
30156 SelectionDAG &DAG) {
30157 assert(Subtarget.hasAVX512() &&
30158 "MGATHER/MSCATTER are supported on AVX-512 arch only");
30159
30160 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
30161 SDValue Src = N->getValue();
30162 MVT VT = Src.getSimpleValueType();
30163 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
30164 SDLoc dl(Op);
30165
30166 SDValue Scale = N->getScale();
30167 SDValue Index = N->getIndex();
30168 SDValue Mask = N->getMask();
30169 SDValue Chain = N->getChain();
30170 SDValue BasePtr = N->getBasePtr();
30171
30172 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
30173 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
30174 // If the index is v2i64 and we have VLX we can use xmm for data and index.
30175 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
30176 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30177 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
30178 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
30179 SDVTList VTs = DAG.getVTList(MVT::Other);
30180 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30181 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30182 N->getMemoryVT(), N->getMemOperand());
30183 }
30184 return SDValue();
30185 }
30186
30187 MVT IndexVT = Index.getSimpleValueType();
30188
30189 // If the index is v2i32, we're being called by type legalization and we
30190 // should just let the default handling take care of it.
30191 if (IndexVT == MVT::v2i32)
30192 return SDValue();
30193
30194 // If we don't have VLX and neither the data nor the index is 512 bits, we
30195 // need to widen until one is.
30196 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
30197 !Index.getSimpleValueType().is512BitVector()) {
30198 // Determine how much we need to widen by to get a 512-bit type.
30199 unsigned Factor = std::min(512/VT.getSizeInBits(),
30200 512/IndexVT.getSizeInBits());
30201 unsigned NumElts = VT.getVectorNumElements() * Factor;
30202
30203 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30204 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30205 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30206
30207 Src = ExtendToType(Src, VT, DAG);
30208 Index = ExtendToType(Index, IndexVT, DAG);
30209 Mask = ExtendToType(Mask, MaskVT, DAG, true);
30210 }
30211
30212 SDVTList VTs = DAG.getVTList(MVT::Other);
30213 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30214 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30215 N->getMemoryVT(), N->getMemOperand());
30216}
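// [Editorial sketch -- not part of X86ISelLowering.cpp.] Worked example of
// the widening factor used above when VLX is unavailable:
// Factor = min(512/DataBits, 512/IndexBits). With v8i32 data (256 bits) and a
// v8i32 index (256 bits) this gives min(2, 2) = 2, so both are widened to
// 512-bit v16i32 and the mask to v16i1, padded with zeros.
#include <algorithm>

constexpr unsigned widenFactor(unsigned DataBits, unsigned IndexBits) {
  return std::min(512u / DataBits, 512u / IndexBits);
}
static_assert(widenFactor(256, 256) == 2, "v8i32 data/index widen 2x to 512 bits");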
30217
30218static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
30219 SelectionDAG &DAG) {
30220
30221 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
30222 MVT VT = Op.getSimpleValueType();
30223 MVT ScalarVT = VT.getScalarType();
30224 SDValue Mask = N->getMask();
30225 MVT MaskVT = Mask.getSimpleValueType();
30226 SDValue PassThru = N->getPassThru();
30227 SDLoc dl(Op);
30228
30229 // Handle AVX masked loads which don't support passthru other than 0.
30230 if (MaskVT.getVectorElementType() != MVT::i1) {
30231 // We also allow undef in the isel pattern.
30232 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
30233 return Op;
30234
30235 SDValue NewLoad = DAG.getMaskedLoad(
30236 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30237 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
30238 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
30239 N->isExpandingLoad());
30240 // Emit a blend.
30241 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
30242 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
30243 }
30244
30245 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
30246 "Expanding masked load is supported on AVX-512 target only!");
30247
30248 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
30249 "Expanding masked load is supported for 32 and 64-bit types only!");
30250
30251 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30252 "Cannot lower masked load op.");
30253
30254 assert((ScalarVT.getSizeInBits() >= 32 ||
30255 (Subtarget.hasBWI() &&
30256 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30257 "Unsupported masked load op.");
30258
30259 // This operation is legal for targets with VLX, but without
30260 // VLX the vector should be widened to 512 bits.
30261 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
30262 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30263 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
30264
30265 // Mask element has to be i1.
30266 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30267 "Unexpected mask type");
30268
30269 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30270
30271 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30272 SDValue NewLoad = DAG.getMaskedLoad(
30273 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30274 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
30275 N->getExtensionType(), N->isExpandingLoad());
30276
30277 SDValue Extract =
30278 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
30279 DAG.getIntPtrConstant(0, dl));
30280 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
30281 return DAG.getMergeValues(RetOps, dl);
30282}
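// [Editorial sketch -- not part of X86ISelLowering.cpp.] Scalar model of the
// blend emitted for AVX (non-i1-mask) masked loads whose passthru is neither
// undef nor zero: load with a zero passthru, then select per lane.
#include <cstddef>

void maskedLoadModel(const int *Mem, const bool *Mask, const int *PassThru,
                     int *Out, std::size_t N) {
  for (std::size_t I = 0; I != N; ++I) {
    int Loaded = Mask[I] ? Mem[I] : 0;       // hardware masked load, zero fill
    Out[I] = Mask[I] ? Loaded : PassThru[I]; // the VSELECT blend added by LowerMLOAD
  }
}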
30283
30284static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
30285 SelectionDAG &DAG) {
30286 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
30287 SDValue DataToStore = N->getValue();
30288 MVT VT = DataToStore.getSimpleValueType();
30289 MVT ScalarVT = VT.getScalarType();
30290 SDValue Mask = N->getMask();
30291 SDLoc dl(Op);
30292
30293 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
30294 "Compressing masked store is supported on AVX-512 target only!");
30295
30296 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
30297 "Compressing masked store is supported for 32 and 64-bit types only!");
30298
30299 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30300 "Cannot lower masked store op.");
30301
30302 assert((ScalarVT.getSizeInBits() >= 32 ||
30303 (Subtarget.hasBWI() &&
30304 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30305 "Unsupported masked store op.");
30306
30307 // This operation is legal for targets with VLX, but without
30308 // VLX the vector should be widened to 512 bits.
30309 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
30310 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30311
30312 // Mask element has to be i1.
30313 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30314 "Unexpected mask type");
30315
30316 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30317
30318 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
30319 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30320 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
30321 N->getOffset(), Mask, N->getMemoryVT(),
30322 N->getMemOperand(), N->getAddressingMode(),
30323 N->isTruncatingStore(), N->isCompressingStore());
30324}
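// [Editorial sketch -- not part of X86ISelLowering.cpp.] Scalar model of a
// masked store: only lanes whose mask bit is set touch memory, which is why
// the widened mask above is padded with zeros (ExtendToType(..., true)).
#include <cstddef>

void maskedStoreModel(int *Mem, const bool *Mask, const int *Data, std::size_t N) {
  for (std::size_t I = 0; I != N; ++I)
    if (Mask[I])
      Mem[I] = Data[I]; // untouched lanes keep their old memory contents
}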
30325
30326static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
30327 SelectionDAG &DAG) {
30328 assert(Subtarget.hasAVX2() &&
30329 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
30330
30331 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
30332 SDLoc dl(Op);
30333 MVT VT = Op.getSimpleValueType();
30334 SDValue Index = N->getIndex();
30335 SDValue Mask = N->getMask();
30336 SDValue PassThru = N->getPassThru();
30337 MVT IndexVT = Index.getSimpleValueType();
30338
30339 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
30340
30341 // If the index is v2i32, we're being called by type legalization.
30342 if (IndexVT == MVT::v2i32)
30343 return SDValue();
30344
30345 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
30346 // need to widen until one is.
30347 MVT OrigVT = VT;
30348 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30349 !IndexVT.is512BitVector()) {
30350 // Determine how much we need to widen by to get a 512-bit type.
30351 unsigned Factor = std::min(512/VT.getSizeInBits(),
30352 512/IndexVT.getSizeInBits());
30353
30354 unsigned NumElts = VT.getVectorNumElements() * Factor;
30355
30356 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30357 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30358 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30359
30360 PassThru = ExtendToType(PassThru, VT, DAG);
30361 Index = ExtendToType(Index, IndexVT, DAG);
30362 Mask = ExtendToType(Mask, MaskVT, DAG, true);
30363 }
30364
30365 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
30366 N->getScale() };
30367 SDValue NewGather = DAG.getMemIntrinsicNode(
30368 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
30369 N->getMemOperand());
30370 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
30371 NewGather, DAG.getIntPtrConstant(0, dl));
30372 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
30373}
30374
30375static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
30376 SDLoc dl(Op);
30377 SDValue Src = Op.getOperand(0);
30378 MVT DstVT = Op.getSimpleValueType();
30379
30380 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
30381 unsigned SrcAS = N->getSrcAddressSpace();
30382
30383 assert(SrcAS != N->getDestAddressSpace() &&
30384 "addrspacecast must be between different address spaces");
30385
30386 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
30387 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
30388 } else if (DstVT == MVT::i64) {
30389 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
30390 } else if (DstVT == MVT::i32) {
30391 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
30392 } else {
30393 report_fatal_error("Bad address space in addrspacecast");
30394 }
30395 return Op;
30396}
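// [Editorial sketch -- not part of X86ISelLowering.cpp.] Scalar model of the
// extensions chosen above: a 32-bit unsigned-pointer address space
// zero-extends to a 64-bit pointer value, a signed 32-bit one sign-extends,
// and casting down to 32 bits truncates. Helper names are hypothetical.
#include <cstdint>

uint64_t fromPtr32Unsigned(uint32_t P) { return static_cast<uint64_t>(P); } // zext
int64_t fromPtr32Signed(int32_t P) { return static_cast<int64_t>(P); }      // sext
uint32_t toPtr32(uint64_t P) { return static_cast<uint32_t>(P); }           // trunc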
30397
30398SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
30399 SelectionDAG &DAG) const {
30400 // TODO: Eventually, the lowering of these nodes should be informed by or
30401 // deferred to the GC strategy for the function in which they appear. For
30402 // now, however, they must be lowered to something. Since they are logically
30403 // no-ops in the case of a null GC strategy (or a GC strategy which does not
30404 // require special handling for these nodes), lower them as literal NOOPs for
30405 // the time being.
30406 SmallVector<SDValue, 2> Ops;
30407
30408 Ops.push_back(Op.getOperand(0));
30409 if (Op->getGluedNode())
30410 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
30411
30412 SDLoc OpDL(Op);
30413 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
30414 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
30415
30416 return NOOP;
30417}
30418
30419// Custom split CVTPS2PH with wide types.
30420static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
30421 SDLoc dl(Op);
30422 EVT VT = Op.getValueType();
30423 SDValue Lo, Hi;
30424 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
30425 EVT LoVT, HiVT;
30426 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30427 SDValue RC = Op.getOperand(1);
30428 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
30429 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
30430 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30431}
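// [Editorial sketch -- not part of X86ISelLowering.cpp.] The split-and-concat
// shape used by LowerCVTPS2PH (and by the CVTPH2PS cases below): apply the
// conversion to the low half and the high half of a too-wide vector
// separately, then concatenate the results. Op stands in for the conversion.
#include <cstddef>
#include <vector>

std::vector<int> splitAndConcat(const std::vector<int> &In, int (*Op)(int)) {
  std::size_t Half = In.size() / 2;
  std::vector<int> Out;
  Out.reserve(In.size());
  for (std::size_t I = 0; I != Half; ++I)
    Out.push_back(Op(In[I]));           // Lo half
  for (std::size_t I = Half; I != In.size(); ++I)
    Out.push_back(Op(In[I]));           // Hi half
  return Out;                           // CONCAT_VECTORS
}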
30432
30433/// Provide custom lowering hooks for some operations.
30434SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
30435 switch (Op.getOpcode()) {
30436 default: llvm_unreachable("Should not custom lower this!");
30437 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
30438 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
30439 return LowerCMP_SWAP(Op, Subtarget, DAG);
30440 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
30441 case ISD::ATOMIC_LOAD_ADD:
30442 case ISD::ATOMIC_LOAD_SUB:
30443 case ISD::ATOMIC_LOAD_OR:
30444 case ISD::ATOMIC_LOAD_XOR:
30445 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
30446 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
30447 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
30448 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
30449 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
30450 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
30451 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
30452 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
30453 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
30454 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
30455 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
30456 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
30457 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
30458 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
30459 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
30460 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
30461 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
30462 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
30463 case ISD::SHL_PARTS:
30464 case ISD::SRA_PARTS:
30465 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
30466 case ISD::FSHL:
30467 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
30468 case ISD::STRICT_SINT_TO_FP:
30469 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
30470 case ISD::STRICT_UINT_TO_FP:
30471 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
30472 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
30473 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
30474 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
30475 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
30476 case ISD::ZERO_EXTEND_VECTOR_INREG:
30477 case ISD::SIGN_EXTEND_VECTOR_INREG:
30478 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
30479 case ISD::FP_TO_SINT:
30480 case ISD::STRICT_FP_TO_SINT:
30481 case ISD::FP_TO_UINT:
30482 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
30483 case ISD::FP_TO_SINT_SAT:
30484 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
30485 case ISD::FP_EXTEND:
30486 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
30487 case ISD::FP_ROUND:
30488 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
30489 case ISD::FP16_TO_FP:
30490 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
30491 case ISD::FP_TO_FP16:
30492 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
30493 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
30494 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
30495 case ISD::FADD:
30496 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
30497 case ISD::FROUND: return LowerFROUND(Op, DAG);
30498 case ISD::FABS:
30499 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
30500 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
30501 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
30502 case ISD::LRINT:
30503 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
30504 case ISD::SETCC:
30505 case ISD::STRICT_FSETCC:
30506 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
30507 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
30508 case ISD::SELECT: return LowerSELECT(Op, DAG);
30509 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
30510 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
30511 case ISD::VASTART: return LowerVASTART(Op, DAG);
30512 case ISD::VAARG: return LowerVAARG(Op, DAG);
30513 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
30514 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
30515 case ISD::INTRINSIC_VOID:
30516 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
30517 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
30518 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
30519 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
30520 case ISD::FRAME_TO_ARGS_OFFSET:
30521 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
30522 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
30523 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
30524 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
30525 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
30526 case ISD::EH_SJLJ_SETUP_DISPATCH:
30527 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
30528 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
30529 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
30530 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
30531 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
30532 case ISD::CTLZ:
30533 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
30534 case ISD::CTTZ:
30535 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
30536 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
30537 case ISD::MULHS:
30538 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
30539 case ISD::ROTL:
30540 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
30541 case ISD::SRA:
30542 case ISD::SRL:
30543 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
30544 case ISD::SADDO:
30545 case ISD::UADDO:
30546 case ISD::SSUBO:
30547 case ISD::USUBO: return LowerXALUO(Op, DAG);
30548 case ISD::SMULO:
30549 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
30550 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
30551 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
30552 case ISD::SADDO_CARRY:
30553 case ISD::SSUBO_CARRY:
30554 case ISD::ADDCARRY:
30555 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
30556 case ISD::ADD:
30557 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
30558 case ISD::UADDSAT:
30559 case ISD::SADDSAT:
30560 case ISD::USUBSAT:
30561 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
30562 case ISD::SMAX:
30563 case ISD::SMIN:
30564 case ISD::UMAX:
30565 case ISD::UMIN: return LowerMINMAX(Op, DAG);
30566 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
30567 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
30568 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
30569 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
30570 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
30571 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
30572 case ISD::GC_TRANSITION_START:
30573 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
30574 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
30575 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
30576 }
30577}
30578
30579/// Replace a node with an illegal result type with a new node built out of
30580/// custom code.
30581void X86TargetLowering::ReplaceNodeResults(SDNode *N,
30582 SmallVectorImpl<SDValue>&Results,
30583 SelectionDAG &DAG) const {
30584 SDLoc dl(N);
30585 switch (N->getOpcode()) {
30586 default:
30587#ifndef NDEBUG
30588 dbgs() << "ReplaceNodeResults: ";
30589 N->dump(&DAG);
30590#endif
30591 llvm_unreachable("Do not know how to custom type legalize this operation!");
30592 case X86ISD::CVTPH2PS: {
30593 EVT VT = N->getValueType(0);
30594 SDValue Lo, Hi;
30595 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
30596 EVT LoVT, HiVT;
30597 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30598 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
30599 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
30600 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30601 Results.push_back(Res);
30602 return;
30603 }
30604 case X86ISD::STRICT_CVTPH2PS: {
30605 EVT VT = N->getValueType(0);
30606 SDValue Lo, Hi;
30607 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
30608 EVT LoVT, HiVT;
30609 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30610 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
30611 {N->getOperand(0), Lo});
30612 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
30613 {N->getOperand(0), Hi});
30614 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
30615 Lo.getValue(1), Hi.getValue(1));
30616 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30617 Results.push_back(Res);
30618 Results.push_back(Chain);
30619 return;
30620 }
30621 case X86ISD::CVTPS2PH:
30622 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
30623 return;
30624 case ISD::CTPOP: {
30625 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
30626 // Use a v2i64 if possible.
30627 bool NoImplicitFloatOps =
30628 DAG.getMachineFunction().getFunction().hasFnAttribute(
30629 Attribute::NoImplicitFloat);
30630 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
30631 SDValue Wide =
30632 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
30633 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
30634 // Bit count should fit in 32-bits, extract it as that and then zero
30635 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
30636 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
30637 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
30638 DAG.getIntPtrConstant(0, dl));
30639 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
30640 Results.push_back(Wide);
30641 }
30642 return;
30643 }
30644 case ISD::MUL: {
30645 EVT VT = N->getValueType(0);
30646 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30647 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
30648 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
30649 // elements are needed.
30650 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30651 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
30652 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
30653 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
30654 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30655 unsigned NumConcats = 16 / VT.getVectorNumElements();
30656 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30657 ConcatOps[0] = Res;
30658 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
30659 Results.push_back(Res);
30660 return;
30661 }
30662 case X86ISD::VPMADDWD:
30663 case X86ISD::AVG: {
30664 // Legalize types for X86ISD::AVG/VPMADDWD by widening.
30665 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30666
30667 EVT VT = N->getValueType(0);
30668 EVT InVT = N->getOperand(0).getValueType();
30669 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
30670 "Expected a VT that divides into 128 bits.");
30671 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30672 "Unexpected type action!");
30673 unsigned NumConcat = 128 / InVT.getSizeInBits();
30674
30675 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
30676 InVT.getVectorElementType(),
30677 NumConcat * InVT.getVectorNumElements());
30678 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
30679 VT.getVectorElementType(),
30680 NumConcat * VT.getVectorNumElements());
30681
30682 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
30683 Ops[0] = N->getOperand(0);
30684 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30685 Ops[0] = N->getOperand(1);
30686 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
30687
30688 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
30689 Results.push_back(Res);
30690 return;
30691 }
30692 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
30693 case X86ISD::FMINC:
30694 case X86ISD::FMIN:
30695 case X86ISD::FMAXC:
30696 case X86ISD::FMAX: {
30697 EVT VT = N->getValueType(0);
30698 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
30699 SDValue UNDEF = DAG.getUNDEF(VT);
30700 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30701 N->getOperand(0), UNDEF);
30702 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
30703 N->getOperand(1), UNDEF);
30704 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
30705 return;
30706 }
30707 case ISD::SDIV:
30708 case ISD::UDIV:
30709 case ISD::SREM:
30710 case ISD::UREM: {
30711 EVT VT = N->getValueType(0);
30712 if (VT.isVector()) {
30713 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30714 "Unexpected type action!");
30715 // If this RHS is a constant splat vector we can widen this and let
30716 // division/remainder by constant optimize it.
30717 // TODO: Can we do something for non-splat?
30718 APInt SplatVal;
30719 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
30720 unsigned NumConcats = 128 / VT.getSizeInBits();
30721 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
30722 Ops0[0] = N->getOperand(0);
30723 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
30724 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
30725 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
30726 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
30727 Results.push_back(Res);
30728 }
30729 return;
30730 }
30731
30732 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
30733 Results.push_back(V);
30734 return;
30735 }
30736 case ISD::TRUNCATE: {
30737 MVT VT = N->getSimpleValueType(0);
30738 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
30739 return;
30740
30741 // The generic legalizer will try to widen the input type to the same
30742 // number of elements as the widened result type. But this isn't always
30743 // the best thing so do some custom legalization to avoid some cases.
30744 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
30745 SDValue In = N->getOperand(0);
30746 EVT InVT = In.getValueType();
30747
30748 unsigned InBits = InVT.getSizeInBits();
30749 if (128 % InBits == 0) {
30750 // 128-bit and smaller inputs should avoid the truncate altogether and
30751 // just use a build_vector that will become a shuffle.
30752 // TODO: Widen and use a shuffle directly?
30753 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
30754 EVT EltVT = VT.getVectorElementType();
30755 unsigned WidenNumElts = WidenVT.getVectorNumElements();
30756 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
30757 // Use the original element count so we don't do more scalar opts than
30758 // necessary.
30759 unsigned MinElts = VT.getVectorNumElements();
30760 for (unsigned i=0; i < MinElts; ++i) {
30761 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
30762 DAG.getIntPtrConstant(i, dl));
30763 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
30764 }
30765 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
30766 return;
30767 }
30768 // With AVX512 there are some cases that can use a target specific
30769 // truncate node to go from 256/512 to less than 128 with zeros in the
30770 // upper elements of the 128 bit result.
30771 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
30772 // We can use VTRUNC directly for 256 bits with VLX, or for any 512 bits.
30773 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
30774 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30775 return;
30776 }
30777 // There's one case we can widen to 512 bits and use VTRUNC.
30778 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
30779 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
30780 DAG.getUNDEF(MVT::v4i64));
30781 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
30782 return;
30783 }
30784 }
30785 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
30786 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
30787 isTypeLegal(MVT::v4i64)) {
30788 // Input needs to be split and output needs to be widened. Let's use two
30789 // VTRUNCs, and shuffle their results together into the wider type.
30790 SDValue Lo, Hi;
30791 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
30792
30793 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
30794 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
30795 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
30796 { 0, 1, 2, 3, 16, 17, 18, 19,
30797 -1, -1, -1, -1, -1, -1, -1, -1 });
30798 Results.push_back(Res);
30799 return;
30800 }
30801
30802 return;
30803 }
30804 case ISD::ANY_EXTEND:
30805 // Right now, only MVT::v8i8 has Custom action for an illegal type.
30806 // It's intended to custom handle the input type.
30807 assert(N->getValueType(0) == MVT::v8i8 &&
30808 "Do not know how to legalize this Node");
30809 return;
30810 case ISD::SIGN_EXTEND:
30811 case ISD::ZERO_EXTEND: {
30812 EVT VT = N->getValueType(0);
30813 SDValue In = N->getOperand(0);
30814 EVT InVT = In.getValueType();
30815 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
30816 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
30817 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
30818 "Unexpected type action!");
30819 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
30820 // Custom split this so we can extend i8/i16->i32 invec. This is better
30821 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
30822 // sra, and then an extend from i32 to i64 using pcmpgt. By custom splitting
30823 // we allow the sra from the extend to i32 to be shared by the split.
30824 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
30825
30826 // Fill a vector with sign bits for each element.
30827 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
30828 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
30829
30830 // Create an unpackl and unpackh to interleave the sign bits then bitcast
30831 // to v2i64.
30832 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30833 {0, 4, 1, 5});
30834 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
30835 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
30836 {2, 6, 3, 7});
30837 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
30838
30839 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30840 Results.push_back(Res);
30841 return;
30842 }
30843
30844 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
30845 if (!InVT.is128BitVector()) {
30846 // Not a 128 bit vector, but maybe type legalization will promote
30847 // it to 128 bits.
30848 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
30849 return;
30850 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
30851 if (!InVT.is128BitVector())
30852 return;
30853
30854 // Promote the input to 128 bits. Type legalization will turn this into
30855 // zext_inreg/sext_inreg.
30856 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
30857 }
30858
30859 // Perform custom splitting instead of the two stage extend we would get
30860 // by default.
30861 EVT LoVT, HiVT;
30862 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
30863 assert(isTypeLegal(LoVT) && "Split VT not legal?");
30864
30865 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
30866
30867 // We need to shift the input over by half the number of elements.
30868 unsigned NumElts = InVT.getVectorNumElements();
30869 unsigned HalfNumElts = NumElts / 2;
30870 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
30871 for (unsigned i = 0; i != HalfNumElts; ++i)
30872 ShufMask[i] = i + HalfNumElts;
30873
30874 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
30875 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
30876
30877 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30878 Results.push_back(Res);
30879 }
30880 return;
30881 }
30882 case ISD::FP_TO_SINT:
30883 case ISD::STRICT_FP_TO_SINT:
30884 case ISD::FP_TO_UINT:
30885 case ISD::STRICT_FP_TO_UINT: {
30886 bool IsStrict = N->isStrictFPOpcode();
30887 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
30888 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
30889 EVT VT = N->getValueType(0);
30890 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
30891 EVT SrcVT = Src.getValueType();
30892
30893 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
30894 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30895 "Unexpected type action!");
30896
30897 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
30898 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
30899 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
30900 VT.getVectorNumElements());
30901 SDValue Res;
30902 SDValue Chain;
30903 if (IsStrict) {
30904 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
30905 {N->getOperand(0), Src});
30906 Chain = Res.getValue(1);
30907 } else
30908 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
30909
30910 // Preserve what we know about the size of the original result. If the
30911 // result is v2i32, we have to manually widen the assert.
30912 if (PromoteVT == MVT::v2i32)
30913 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
30914 DAG.getUNDEF(MVT::v2i32));
30915
30916 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
30917 Res.getValueType(), Res,
30918 DAG.getValueType(VT.getVectorElementType()));
30919
30920 if (PromoteVT == MVT::v2i32)
30921 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
30922 DAG.getIntPtrConstant(0, dl));
30923
30924 // Truncate back to the original width.
30925 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
30926
30927 // Now widen to 128 bits.
30928 unsigned NumConcats = 128 / VT.getSizeInBits();
30929 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
30930 VT.getVectorNumElements() * NumConcats);
30931 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
30932 ConcatOps[0] = Res;
30933 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
30934 Results.push_back(Res);
30935 if (IsStrict)
30936 Results.push_back(Chain);
30937 return;
30938 }
30939
30940
30941 if (VT == MVT::v2i32) {
30942 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
30943 "Strict unsigned conversion requires AVX512");
30944 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30945 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
30946 "Unexpected type action!");
30947 if (Src.getValueType() == MVT::v2f64) {
30948 if (!IsSigned && !Subtarget.hasAVX512()) {
30949 SDValue Res =
30950 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
30951 Results.push_back(Res);
30952 return;
30953 }
30954
30955 unsigned Opc;
30956 if (IsStrict)
30957 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
30958 else
30959 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
30960
30961 // If we have VLX we can emit a target specific FP_TO_UINT node.
30962 if (!IsSigned && !Subtarget.hasVLX()) {
30963 // Otherwise we can defer to the generic legalizer which will widen
30964 // the input as well. This will be further widened during op
30965 // legalization to v8i32<-v8f64.
30966 // For strict nodes we'll need to widen ourselves.
30967 // FIXME: Fix the type legalizer to safely widen strict nodes?
30968 if (!IsStrict)
30969 return;
30970 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
30971 DAG.getConstantFP(0.0, dl, MVT::v2f64));
30972 Opc = N->getOpcode();
30973 }
30974 SDValue Res;
30975 SDValue Chain;
30976 if (IsStrict) {
30977 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
30978 {N->getOperand(0), Src});
30979 Chain = Res.getValue(1);
30980 } else {
30981 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
30982 }
30983 Results.push_back(Res);
30984 if (IsStrict)
30985 Results.push_back(Chain);
30986 return;
30987 }
30988
30989 // Custom widen strict v2f32->v2i32 by padding with zeros.
30990 // FIXME: Should generic type legalizer do this?
30991 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
30992 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
30993 DAG.getConstantFP(0.0, dl, MVT::v2f32));
30994 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
30995 {N->getOperand(0), Src});
30996 Results.push_back(Res);
30997 Results.push_back(Res.getValue(1));
30998 return;
30999 }
31000
31001 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
31002 // so early out here.
31003 return;
31004 }
31005
31006 assert(!VT.isVector() && "Vectors should have been handled above!");
31007
31008 if (Subtarget.hasDQI() && VT == MVT::i64 &&
31009 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
31010 assert(!Subtarget.is64Bit() && "i64 should be legal");
31011 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
31012 // If we use a 128-bit result we might need to use a target specific node.
31013 unsigned SrcElts =
31014 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
31015 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
31016 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
31017 unsigned Opc = N->getOpcode();
31018 if (NumElts != SrcElts) {
31019 if (IsStrict)
31020 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31021 else
31022 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31023 }
31024
31025 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
31026 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
31027 DAG.getConstantFP(0.0, dl, VecInVT), Src,
31028 ZeroIdx);
31029 SDValue Chain;
31030 if (IsStrict) {
31031 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
31032 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
31033 Chain = Res.getValue(1);
31034 } else
31035 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
31036 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
31037 Results.push_back(Res);
31038 if (IsStrict)
31039 Results.push_back(Chain);
31040 return;
31041 }
31042
31043 SDValue Chain;
31044 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
31045 Results.push_back(V);
31046 if (IsStrict)
31047 Results.push_back(Chain);
31048 }
31049 return;
31050 }
31051 case ISD::LRINT:
31052 case ISD::LLRINT: {
31053 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
31054 Results.push_back(V);
31055 return;
31056 }
31057
31058 case ISD::SINT_TO_FP:
31059 case ISD::STRICT_SINT_TO_FP:
31060 case ISD::UINT_TO_FP:
31061 case ISD::STRICT_UINT_TO_FP: {
31062 bool IsStrict = N->isStrictFPOpcode();
31063 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
31064 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
31065 EVT VT = N->getValueType(0);
31066 if (VT != MVT::v2f32)
31067 return;
31068 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31069 EVT SrcVT = Src.getValueType();
31070 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
31071 if (IsStrict) {
31072 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
31073 : X86ISD::STRICT_CVTUI2P;
31074 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
31075 {N->getOperand(0), Src});
31076 Results.push_back(Res);
31077 Results.push_back(Res.getValue(1));
31078 } else {
31079 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
31080 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
31081 }
31082 return;
31083 }
31084 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
31085 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
31086 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
31087 SDValue One = DAG.getConstant(1, dl, SrcVT);
31088 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
31089 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
31090 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
31091 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
31092 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
31093 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
31094 for (int i = 0; i != 2; ++i) {
31095 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
31096 SignSrc, DAG.getIntPtrConstant(i, dl));
31097 if (IsStrict)
31098 SignCvts[i] =
31099 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
31100 {N->getOperand(0), Elt});
31101 else
31102 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
31103 };
31104 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
31105 SDValue Slow, Chain;
31106 if (IsStrict) {
31107 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31108 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
31109 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
31110 {Chain, SignCvt, SignCvt});
31111 Chain = Slow.getValue(1);
31112 } else {
31113 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
31114 }
31115 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
31116 IsNeg =
31117 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
31118 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
31119 Results.push_back(Cvt);
31120 if (IsStrict)
31121 Results.push_back(Chain);
31122 return;
31123 }
31124
31125 if (SrcVT != MVT::v2i32)
31126 return;
31127
31128 if (IsSigned || Subtarget.hasAVX512()) {
31129 if (!IsStrict)
31130 return;
31131
31132 // Custom widen strict v2i32->v2f32 to avoid scalarization.
31133 // FIXME: Should generic type legalizer do this?
31134 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
31135 DAG.getConstant(0, dl, MVT::v2i32));
31136 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
31137 {N->getOperand(0), Src});
31138 Results.push_back(Res);
31139 Results.push_back(Res.getValue(1));
31140 return;
31141 }
31142
31143 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31144 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
31145 SDValue VBias =
31146 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
31147 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
31148 DAG.getBitcast(MVT::v2i64, VBias));
31149 Or = DAG.getBitcast(MVT::v2f64, Or);
31150 if (IsStrict) {
31151 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
31152 {N->getOperand(0), Or, VBias});
31153 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
31154 {MVT::v4f32, MVT::Other},
31155 {Sub.getValue(1), Sub});
31156 Results.push_back(Res);
31157 Results.push_back(Res.getValue(1));
31158 } else {
31159 // TODO: Are there any fast-math-flags to propagate here?
31160 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
31161 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
31162 }
31163 return;
31164 }
31165 case ISD::STRICT_FP_ROUND:
31166 case ISD::FP_ROUND: {
31167 bool IsStrict = N->isStrictFPOpcode();
31168 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31169 if (!isTypeLegal(Src.getValueType()))
31170 return;
31171 SDValue V;
31172 if (IsStrict)
31173 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
31174 {N->getOperand(0), N->getOperand(1)});
31175 else
31176 V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
31177 Results.push_back(V);
31178 if (IsStrict)
31179 Results.push_back(V.getValue(1));
31180 return;
31181 }
31182 case ISD::FP_EXTEND:
31183 case ISD::STRICT_FP_EXTEND: {
31184 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
31185 // No other ValueType for FP_EXTEND should reach this point.
31186 assert(N->getValueType(0) == MVT::v2f32 &&
31187 "Do not know how to legalize this Node");
31188 return;
31189 }
31190 case ISD::INTRINSIC_W_CHAIN: {
31191 unsigned IntNo = N->getConstantOperandVal(1);
31192 switch (IntNo) {
31193 default : llvm_unreachable("Do not know how to custom type "
31194 "legalize this intrinsic operation!");
31195 case Intrinsic::x86_rdtsc:
31196 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
31197 Results);
31198 case Intrinsic::x86_rdtscp:
31199 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
31200 Results);
31201 case Intrinsic::x86_rdpmc:
31202 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
31203 Results);
31204 return;
31205 case Intrinsic::x86_xgetbv:
31206 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
31207 Results);
31208 return;
31209 }
31210 }
31211 case ISD::READCYCLECOUNTER: {
31212 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
31213 }
31214 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
31215 EVT T = N->getValueType(0);
31216 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
31217 bool Regs64bit = T == MVT::i128;
31218 assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
31219 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
31220 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
31221 SDValue cpInL, cpInH;
31222 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31223 DAG.getConstant(0, dl, HalfT));
31224 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31225 DAG.getConstant(1, dl, HalfT));
31226 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
31227 Regs64bit ? X86::RAX : X86::EAX,
31228 cpInL, SDValue());
31229 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
31230 Regs64bit ? X86::RDX : X86::EDX,
31231 cpInH, cpInL.getValue(1));
31232 SDValue swapInL, swapInH;
31233 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31234 DAG.getConstant(0, dl, HalfT));
31235 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31236 DAG.getConstant(1, dl, HalfT));
31237 swapInH =
31238 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
31239 swapInH, cpInH.getValue(1));
31240
31241 // In 64-bit mode we might need the base pointer in RBX, but we can't know
31242 // until later. So we keep the RBX input in a vreg and use a custom
31243 // inserter.
31244 // Since RBX will be a reserved register, the register allocator will not
31245 // ensure that its value is properly saved and restored around this
31246 // live-range.
31247 SDValue Result;
31248 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31249 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
31250 if (Regs64bit) {
31251 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
31252 swapInH.getValue(1)};
31253 Result =
31254 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
31255 } else {
31256 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
31257 swapInH.getValue(1));
31258 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
31259 swapInL.getValue(1)};
31260 Result =
31261 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
31262 }
31263
31264 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
31265 Regs64bit ? X86::RAX : X86::EAX,
31266 HalfT, Result.getValue(1));
31267 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
31268 Regs64bit ? X86::RDX : X86::EDX,
31269 HalfT, cpOutL.getValue(2));
31270 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
31271
31272 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
31273 MVT::i32, cpOutH.getValue(2));
31274 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
31275 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
31276
31277 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
31278 Results.push_back(Success);
31279 Results.push_back(EFLAGS.getValue(1));
31280 return;
31281 }
31282 case ISD::ATOMIC_LOAD: {
31283 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31284 bool NoImplicitFloatOps =
31285 DAG.getMachineFunction().getFunction().hasFnAttribute(
31286 Attribute::NoImplicitFloat);
31287 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31288 auto *Node = cast<AtomicSDNode>(N);
31289 if (Subtarget.hasSSE1()) {
31290 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
31291 // Then extract the lower 64-bits.
31292 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31293 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
31294 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31295 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31296 MVT::i64, Node->getMemOperand());
31297 if (Subtarget.hasSSE2()) {
31298 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
31299 DAG.getIntPtrConstant(0, dl));
31300 Results.push_back(Res);
31301 Results.push_back(Ld.getValue(1));
31302 return;
31303 }
31304 // We use an alternative sequence for SSE1 that extracts as v2f32 and
31305 // then casts to i64. This avoids a 128-bit stack temporary being
31306 // created by type legalization if we were to cast v4f32->v2i64.
31307 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
31308 DAG.getIntPtrConstant(0, dl));
31309 Res = DAG.getBitcast(MVT::i64, Res);
31310 Results.push_back(Res);
31311 Results.push_back(Ld.getValue(1));
31312 return;
31313 }
31314 if (Subtarget.hasX87()) {
31315 // First load this into an 80-bit X87 register. This will put the whole
31316 // integer into the significand.
31317 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31318 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31319 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
31320 dl, Tys, Ops, MVT::i64,
31321 Node->getMemOperand());
31322 SDValue Chain = Result.getValue(1);
31323
31324 // Now store the X87 register to a stack temporary and convert to i64.
31325 // This store is not atomic and doesn't need to be.
31326 // FIXME: We don't need a stack temporary if the result of the load
31327 // is already being stored. We could just directly store there.
31328 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31329 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31330 MachinePointerInfo MPI =
31331 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31332 SDValue StoreOps[] = { Chain, Result, StackPtr };
31333 Chain = DAG.getMemIntrinsicNode(
31334 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
31335 MPI, None /*Align*/, MachineMemOperand::MOStore);
31336
31337 // Finally load the value back from the stack temporary and return it.
31338 // This load is not atomic and doesn't need to be.
31339 // This load will be further type legalized.
31340 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
31341 Results.push_back(Result);
31342 Results.push_back(Result.getValue(1));
31343 return;
31344 }
31345 }
31346 // TODO: Use MOVLPS when SSE1 is available?
31347 // Delegate to generic TypeLegalization. Situations we can really handle
31348 // should have already been dealt with by AtomicExpandPass.cpp.
31349 break;
31350 }
31351 case ISD::ATOMIC_SWAP:
31352 case ISD::ATOMIC_LOAD_ADD:
31353 case ISD::ATOMIC_LOAD_SUB:
31354 case ISD::ATOMIC_LOAD_AND:
31355 case ISD::ATOMIC_LOAD_OR:
31356 case ISD::ATOMIC_LOAD_XOR:
31357 case ISD::ATOMIC_LOAD_NAND:
31358 case ISD::ATOMIC_LOAD_MIN:
31359 case ISD::ATOMIC_LOAD_MAX:
31360 case ISD::ATOMIC_LOAD_UMIN:
31361 case ISD::ATOMIC_LOAD_UMAX:
31362 // Delegate to generic TypeLegalization. Situations we can really handle
31363 // should have already been dealt with by AtomicExpandPass.cpp.
31364 break;
31365
31366 case ISD::BITCAST: {
31367 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31368 EVT DstVT = N->getValueType(0);
31369 EVT SrcVT = N->getOperand(0).getValueType();
31370
31371 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
31372 // we can split using the k-register rather than memory.
31373 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
31374 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31375 SDValue Lo, Hi;
31376 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31377 Lo = DAG.getBitcast(MVT::i32, Lo);
31378 Hi = DAG.getBitcast(MVT::i32, Hi);
31379 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
31380 Results.push_back(Res);
31381 return;
31382 }
31383
31384 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
31385 // FIXME: Use v4f32 for SSE1?
31386 assert(Subtarget.hasSSE2() && "Requires SSE2");
31387 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
31388 "Unexpected type action!");
31389 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
31390 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
31391 N->getOperand(0));
31392 Res = DAG.getBitcast(WideVT, Res);
31393 Results.push_back(Res);
31394 return;
31395 }
31396
31397 return;
31398 }
31399 case ISD::MGATHER: {
31400 EVT VT = N->getValueType(0);
31401 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
31402 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
31403 auto *Gather = cast<MaskedGatherSDNode>(N);
31404 SDValue Index = Gather->getIndex();
31405 if (Index.getValueType() != MVT::v2i64)
31406 return;
31407 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31408 "Unexpected type action!");
31409 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31410 SDValue Mask = Gather->getMask();
31411 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31412 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
31413 Gather->getPassThru(),
31414 DAG.getUNDEF(VT));
31415 if (!Subtarget.hasVLX()) {
31416 // We need to widen the mask, but the instruction will only use 2
31417 // of its elements. So we can use undef.
31418 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
31419 DAG.getUNDEF(MVT::v2i1));
31420 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
31421 }
31422 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
31423 Gather->getBasePtr(), Index, Gather->getScale() };
31424 SDValue Res = DAG.getMemIntrinsicNode(
31425 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
31426 Gather->getMemoryVT(), Gather->getMemOperand());
31427 Results.push_back(Res);
31428 Results.push_back(Res.getValue(1));
31429 return;
31430 }
31431 return;
31432 }
31433 case ISD::LOAD: {
31434 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
31435 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
31436 // cast since type legalization will try to use an i64 load.
31437 MVT VT = N->getSimpleValueType(0);
31438 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
31439 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31440 "Unexpected type action!");
31441 if (!ISD::isNON_EXTLoad(N))
31442 return;
31443 auto *Ld = cast<LoadSDNode>(N);
31444 if (Subtarget.hasSSE2()) {
31445 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
31446 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
31447 Ld->getPointerInfo(), Ld->getOriginalAlign(),
31448 Ld->getMemOperand()->getFlags());
31449 SDValue Chain = Res.getValue(1);
31450 MVT VecVT = MVT::getVectorVT(LdVT, 2);
31451 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
31452 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
31453 Res = DAG.getBitcast(WideVT, Res);
31454 Results.push_back(Res);
31455 Results.push_back(Chain);
31456 return;
31457 }
31458 assert(Subtarget.hasSSE1() && "Expected SSE");
31459 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
31460 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
31461 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31462 MVT::i64, Ld->getMemOperand());
31463 Results.push_back(Res);
31464 Results.push_back(Res.getValue(1));
31465 return;
31466 }
31467 case ISD::ADDRSPACECAST: {
31468 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
31469 Results.push_back(V);
31470 return;
31471 }
31472 case ISD::BITREVERSE:
31473 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31474 assert(Subtarget.hasXOP() && "Expected XOP");
31475 // We can use VPPERM by copying to a vector register and back. We'll need
31476 // to move the scalar in two i32 pieces.
31477 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
31478 return;
31479 }
31480}
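
The ATOMIC_CMP_SWAP_WITH_SUCCESS expansion above splits the wide compare/swap value into two register-sized halves (EXTRACT_ELEMENT with constants 0 and 1), routes them to CMPXCHG8B/CMPXCHG16B through the fixed register pairs, and reassembles the result with BUILD_PAIR. As a minimal standalone sketch of that split/reassemble arithmetic on a 128-bit value, independent of SelectionDAG (the helper names below are illustrative only, not taken from this file):

#include <cstdint>

// Illustrative counterparts of EXTRACT_ELEMENT(0), EXTRACT_ELEMENT(1) and
// BUILD_PAIR for an i128 value: element 0 is the low half, element 1 the high.
static inline uint64_t extractLo(unsigned __int128 V) { return (uint64_t)V; }
static inline uint64_t extractHi(unsigned __int128 V) { return (uint64_t)(V >> 64); }
static inline unsigned __int128 buildPair(uint64_t Lo, uint64_t Hi) {
  return ((unsigned __int128)Hi << 64) | Lo;
}

// Round trip: splitting and rebuilding yields the original value, which is
// exactly the invariant the lowering relies on when it feeds RAX:RDX/RBX:RCX.
static bool roundTripsOk(unsigned __int128 V) {
  return buildPair(extractLo(V), extractHi(V)) == V;
}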
31481
31482const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
31483 switch ((X86ISD::NodeType)Opcode) {
31484 case X86ISD::FIRST_NUMBER: break;
31485#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
31486 NODE_NAME_CASE(BSF)
31487 NODE_NAME_CASE(BSR)
31488 NODE_NAME_CASE(FSHL)
31489 NODE_NAME_CASE(FSHR)
31490 NODE_NAME_CASE(FAND)
31491 NODE_NAME_CASE(FANDN)
31492 NODE_NAME_CASE(FOR)
31493 NODE_NAME_CASE(FXOR)
31494 NODE_NAME_CASE(FILD)
31495 NODE_NAME_CASE(FIST)
31496 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
31497 NODE_NAME_CASE(FLD)
31498 NODE_NAME_CASE(FST)
31499 NODE_NAME_CASE(CALL)
31500 NODE_NAME_CASE(CALL_RVMARKER)
31501 NODE_NAME_CASE(BT)
31502 NODE_NAME_CASE(CMP)
31503 NODE_NAME_CASE(FCMP)
31504 NODE_NAME_CASE(STRICT_FCMP)
31505 NODE_NAME_CASE(STRICT_FCMPS)
31506 NODE_NAME_CASE(COMI)
31507 NODE_NAME_CASE(UCOMI)
31508 NODE_NAME_CASE(CMPM)
31509 NODE_NAME_CASE(CMPMM)
31510 NODE_NAME_CASE(STRICT_CMPM)
31511 NODE_NAME_CASE(CMPMM_SAE)
31512 NODE_NAME_CASE(SETCC)
31513 NODE_NAME_CASE(SETCC_CARRY)
31514 NODE_NAME_CASE(FSETCC)
31515 NODE_NAME_CASE(FSETCCM)
31516 NODE_NAME_CASE(FSETCCM_SAE)
31517 NODE_NAME_CASE(CMOV)
31518 NODE_NAME_CASE(BRCOND)
31519 NODE_NAME_CASE(RET_FLAG)
31520 NODE_NAME_CASE(IRET)
31521 NODE_NAME_CASE(REP_STOS)
31522 NODE_NAME_CASE(REP_MOVS)
31523 NODE_NAME_CASE(GlobalBaseReg)
31524 NODE_NAME_CASE(Wrapper)
31525 NODE_NAME_CASE(WrapperRIP)
31526 NODE_NAME_CASE(MOVQ2DQ)
31527 NODE_NAME_CASE(MOVDQ2Q)
31528 NODE_NAME_CASE(MMX_MOVD2W)
31529 NODE_NAME_CASE(MMX_MOVW2D)
31530 NODE_NAME_CASE(PEXTRB)
31531 NODE_NAME_CASE(PEXTRW)
31532 NODE_NAME_CASE(INSERTPS)
31533 NODE_NAME_CASE(PINSRB)
31534 NODE_NAME_CASE(PINSRW)
31535 NODE_NAME_CASE(PSHUFB)
31536 NODE_NAME_CASE(ANDNP)
31537 NODE_NAME_CASE(BLENDI)
31538 NODE_NAME_CASE(BLENDV)
31539 NODE_NAME_CASE(HADD)
31540 NODE_NAME_CASE(HSUB)
31541 NODE_NAME_CASE(FHADD)
31542 NODE_NAME_CASE(FHSUB)
31543 NODE_NAME_CASE(CONFLICT)
31544 NODE_NAME_CASE(FMAX)
31545 NODE_NAME_CASE(FMAXS)
31546 NODE_NAME_CASE(FMAX_SAE)
31547 NODE_NAME_CASE(FMAXS_SAE)
31548 NODE_NAME_CASE(FMIN)
31549 NODE_NAME_CASE(FMINS)
31550 NODE_NAME_CASE(FMIN_SAE)
31551 NODE_NAME_CASE(FMINS_SAE)
31552 NODE_NAME_CASE(FMAXC)
31553 NODE_NAME_CASE(FMINC)
31554 NODE_NAME_CASE(FRSQRT)
31555 NODE_NAME_CASE(FRCP)
31556 NODE_NAME_CASE(EXTRQI)
31557 NODE_NAME_CASE(INSERTQI)
31558 NODE_NAME_CASE(TLSADDR)
31559 NODE_NAME_CASE(TLSBASEADDR)
31560 NODE_NAME_CASE(TLSCALL)
31561 NODE_NAME_CASE(EH_SJLJ_SETJMP)
31562 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
31563 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
31564 NODE_NAME_CASE(EH_RETURN)
31565 NODE_NAME_CASE(TC_RETURN)
31566 NODE_NAME_CASE(FNSTCW16m)
31567 NODE_NAME_CASE(FLDCW16m)
31568 NODE_NAME_CASE(LCMPXCHG_DAG)
31569 NODE_NAME_CASE(LCMPXCHG8_DAG)
31570 NODE_NAME_CASE(LCMPXCHG16_DAG)
31571 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
31572 NODE_NAME_CASE(LADD)
31573 NODE_NAME_CASE(LSUB)
31574 NODE_NAME_CASE(LOR)
31575 NODE_NAME_CASE(LXOR)
31576 NODE_NAME_CASE(LAND)
31577 NODE_NAME_CASE(VZEXT_MOVL)
31578 NODE_NAME_CASE(VZEXT_LOAD)
31579 NODE_NAME_CASE(VEXTRACT_STORE)
31580 NODE_NAME_CASE(VTRUNC)
31581 NODE_NAME_CASE(VTRUNCS)
31582 NODE_NAME_CASE(VTRUNCUS)
31583 NODE_NAME_CASE(VMTRUNC)
31584 NODE_NAME_CASE(VMTRUNCS)
31585 NODE_NAME_CASE(VMTRUNCUS)
31586 NODE_NAME_CASE(VTRUNCSTORES)
31587 NODE_NAME_CASE(VTRUNCSTOREUS)
31588 NODE_NAME_CASE(VMTRUNCSTORES)
31589 NODE_NAME_CASE(VMTRUNCSTOREUS)
31590 NODE_NAME_CASE(VFPEXT)
31591 NODE_NAME_CASE(STRICT_VFPEXT)
31592 NODE_NAME_CASE(VFPEXT_SAE)
31593 NODE_NAME_CASE(VFPEXTS)
31594 NODE_NAME_CASE(VFPEXTS_SAE)
31595 NODE_NAME_CASE(VFPROUND)
31596 NODE_NAME_CASE(STRICT_VFPROUND)
31597 NODE_NAME_CASE(VMFPROUND)
31598 NODE_NAME_CASE(VFPROUND_RND)
31599 NODE_NAME_CASE(VFPROUNDS)
31600 NODE_NAME_CASE(VFPROUNDS_RND)
31601 NODE_NAME_CASE(VSHLDQ)
31602 NODE_NAME_CASE(VSRLDQ)
31603 NODE_NAME_CASE(VSHL)
31604 NODE_NAME_CASE(VSRL)
31605 NODE_NAME_CASE(VSRA)
31606 NODE_NAME_CASE(VSHLI)
31607 NODE_NAME_CASE(VSRLI)
31608 NODE_NAME_CASE(VSRAI)
31609 NODE_NAME_CASE(VSHLV)
31610 NODE_NAME_CASE(VSRLV)
31611 NODE_NAME_CASE(VSRAV)
31612 NODE_NAME_CASE(VROTLI)
31613 NODE_NAME_CASE(VROTRI)
31614 NODE_NAME_CASE(VPPERM)
31615 NODE_NAME_CASE(CMPP)
31616 NODE_NAME_CASE(STRICT_CMPP)
31617 NODE_NAME_CASE(PCMPEQ)
31618 NODE_NAME_CASE(PCMPGT)
31619 NODE_NAME_CASE(PHMINPOS)
31620 NODE_NAME_CASE(ADD)
31621 NODE_NAME_CASE(SUB)
31622 NODE_NAME_CASE(ADC)
31623 NODE_NAME_CASE(SBB)
31624 NODE_NAME_CASE(SMUL)
31625 NODE_NAME_CASE(UMUL)
31626 NODE_NAME_CASE(OR)
31627 NODE_NAME_CASE(XOR)
31628 NODE_NAME_CASE(AND)
31629 NODE_NAME_CASE(BEXTR)
31630 NODE_NAME_CASE(BEXTRI)
31631 NODE_NAME_CASE(BZHI)
31632 NODE_NAME_CASE(PDEP)
31633 NODE_NAME_CASE(PEXT)
31634 NODE_NAME_CASE(MUL_IMM)
31635 NODE_NAME_CASE(MOVMSK)
31636 NODE_NAME_CASE(PTEST)
31637 NODE_NAME_CASE(TESTP)
31638 NODE_NAME_CASE(KORTEST)
31639 NODE_NAME_CASE(KTEST)
31640 NODE_NAME_CASE(KADD)
31641 NODE_NAME_CASE(KSHIFTL)
31642 NODE_NAME_CASE(KSHIFTR)
31643 NODE_NAME_CASE(PACKSS)
31644 NODE_NAME_CASE(PACKUS)
31645 NODE_NAME_CASE(PALIGNR)
31646 NODE_NAME_CASE(VALIGN)
31647 NODE_NAME_CASE(VSHLD)
31648 NODE_NAME_CASE(VSHRD)
31649 NODE_NAME_CASE(VSHLDV)
31650 NODE_NAME_CASE(VSHRDV)
31651 NODE_NAME_CASE(PSHUFD)
31652 NODE_NAME_CASE(PSHUFHW)
31653 NODE_NAME_CASE(PSHUFLW)
31654 NODE_NAME_CASE(SHUFP)
31655 NODE_NAME_CASE(SHUF128)
31656 NODE_NAME_CASE(MOVLHPS)
31657 NODE_NAME_CASE(MOVHLPS)
31658 NODE_NAME_CASE(MOVDDUP)
31659 NODE_NAME_CASE(MOVSHDUP)
31660 NODE_NAME_CASE(MOVSLDUP)
31661 NODE_NAME_CASE(MOVSD)
31662 NODE_NAME_CASE(MOVSS)
31663 NODE_NAME_CASE(UNPCKL)
31664 NODE_NAME_CASE(UNPCKH)
31665 NODE_NAME_CASE(VBROADCAST)
31666 NODE_NAME_CASE(VBROADCAST_LOAD)
31667 NODE_NAME_CASE(VBROADCASTM)
31668 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
31669 NODE_NAME_CASE(VPERMILPV)
31670 NODE_NAME_CASE(VPERMILPI)
31671 NODE_NAME_CASE(VPERM2X128)
31672 NODE_NAME_CASE(VPERMV)
31673 NODE_NAME_CASE(VPERMV3)
31674 NODE_NAME_CASE(VPERMI)
31675 NODE_NAME_CASE(VPTERNLOG)
31676 NODE_NAME_CASE(VFIXUPIMM)
31677 NODE_NAME_CASE(VFIXUPIMM_SAE)
31678 NODE_NAME_CASE(VFIXUPIMMS)
31679 NODE_NAME_CASE(VFIXUPIMMS_SAE)
31680 NODE_NAME_CASE(VRANGE)
31681 NODE_NAME_CASE(VRANGE_SAE)
31682 NODE_NAME_CASE(VRANGES)
31683 NODE_NAME_CASE(VRANGES_SAE)
31684 NODE_NAME_CASE(PMULUDQ)
31685 NODE_NAME_CASE(PMULDQ)
31686 NODE_NAME_CASE(PSADBW)
31687 NODE_NAME_CASE(DBPSADBW)
31688 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
31689 NODE_NAME_CASE(VAARG_64)
31690 NODE_NAME_CASE(VAARG_X32)
31691 NODE_NAME_CASE(WIN_ALLOCA)
31692 NODE_NAME_CASE(MEMBARRIER)
31693 NODE_NAME_CASE(MFENCE)
31694 NODE_NAME_CASE(SEG_ALLOCA)
31695 NODE_NAME_CASE(PROBED_ALLOCA)
31696 NODE_NAME_CASE(RDRAND)
31697 NODE_NAME_CASE(RDSEED)
31698 NODE_NAME_CASE(RDPKRU)
31699 NODE_NAME_CASE(WRPKRU)
31700 NODE_NAME_CASE(VPMADDUBSW)
31701 NODE_NAME_CASE(VPMADDWD)
31702 NODE_NAME_CASE(VPSHA)
31703 NODE_NAME_CASE(VPSHL)
31704 NODE_NAME_CASE(VPCOM)
31705 NODE_NAME_CASE(VPCOMU)
31706 NODE_NAME_CASE(VPERMIL2)
31707 NODE_NAME_CASE(FMSUB)
31708 NODE_NAME_CASE(STRICT_FMSUB)
31709 NODE_NAME_CASE(FNMADD)
31710 NODE_NAME_CASE(STRICT_FNMADD)
31711 NODE_NAME_CASE(FNMSUB)
31712 NODE_NAME_CASE(STRICT_FNMSUB)
31713 NODE_NAME_CASE(FMADDSUB)
31714 NODE_NAME_CASE(FMSUBADD)
31715 NODE_NAME_CASE(FMADD_RND)
31716 NODE_NAME_CASE(FNMADD_RND)
31717 NODE_NAME_CASE(FMSUB_RND)
31718 NODE_NAME_CASE(FNMSUB_RND)
31719 NODE_NAME_CASE(FMADDSUB_RND)
31720 NODE_NAME_CASE(FMSUBADD_RND)
31721 NODE_NAME_CASE(VPMADD52H)
31722 NODE_NAME_CASE(VPMADD52L)
31723 NODE_NAME_CASE(VRNDSCALE)
31724 NODE_NAME_CASE(STRICT_VRNDSCALE)
31725 NODE_NAME_CASE(VRNDSCALE_SAE)
31726 NODE_NAME_CASE(VRNDSCALES)
31727 NODE_NAME_CASE(VRNDSCALES_SAE)
31728 NODE_NAME_CASE(VREDUCE)
31729 NODE_NAME_CASE(VREDUCE_SAE)
31730 NODE_NAME_CASE(VREDUCES)
31731 NODE_NAME_CASE(VREDUCES_SAE)
31732 NODE_NAME_CASE(VGETMANT)
31733 NODE_NAME_CASE(VGETMANT_SAE)
31734 NODE_NAME_CASE(VGETMANTS)
31735 NODE_NAME_CASE(VGETMANTS_SAE)
31736 NODE_NAME_CASE(PCMPESTR)
31737 NODE_NAME_CASE(PCMPISTR)
31738 NODE_NAME_CASE(XTEST)
31739 NODE_NAME_CASE(COMPRESS)
31740 NODE_NAME_CASE(EXPAND)
31741 NODE_NAME_CASE(SELECTS)
31742 NODE_NAME_CASE(ADDSUB)
31743 NODE_NAME_CASE(RCP14)
31744 NODE_NAME_CASE(RCP14S)
31745 NODE_NAME_CASE(RCP28)
31746 NODE_NAME_CASE(RCP28_SAE)
31747 NODE_NAME_CASE(RCP28S)
31748 NODE_NAME_CASE(RCP28S_SAE)
31749 NODE_NAME_CASE(EXP2)
31750 NODE_NAME_CASE(EXP2_SAE)
31751 NODE_NAME_CASE(RSQRT14)
31752 NODE_NAME_CASE(RSQRT14S)
31753 NODE_NAME_CASE(RSQRT28)
31754 NODE_NAME_CASE(RSQRT28_SAE)
31755 NODE_NAME_CASE(RSQRT28S)
31756 NODE_NAME_CASE(RSQRT28S_SAE)
31757 NODE_NAME_CASE(FADD_RND)
31758 NODE_NAME_CASE(FADDS)
31759 NODE_NAME_CASE(FADDS_RND)
31760 NODE_NAME_CASE(FSUB_RND)
31761 NODE_NAME_CASE(FSUBS)
31762 NODE_NAME_CASE(FSUBS_RND)
31763 NODE_NAME_CASE(FMUL_RND)
31764 NODE_NAME_CASE(FMULS)
31765 NODE_NAME_CASE(FMULS_RND)
31766 NODE_NAME_CASE(FDIV_RND)
31767 NODE_NAME_CASE(FDIVS)
31768 NODE_NAME_CASE(FDIVS_RND)
31769 NODE_NAME_CASE(FSQRT_RND)
31770 NODE_NAME_CASE(FSQRTS)
31771 NODE_NAME_CASE(FSQRTS_RND)
31772 NODE_NAME_CASE(FGETEXP)
31773 NODE_NAME_CASE(FGETEXP_SAE)
31774 NODE_NAME_CASE(FGETEXPS)
31775 NODE_NAME_CASE(FGETEXPS_SAE)
31776 NODE_NAME_CASE(SCALEF)
31777 NODE_NAME_CASE(SCALEF_RND)
31778 NODE_NAME_CASE(SCALEFS)
31779 NODE_NAME_CASE(SCALEFS_RND)
31780 NODE_NAME_CASE(AVG)
31781 NODE_NAME_CASE(MULHRS)
31782 NODE_NAME_CASE(SINT_TO_FP_RND)
31783 NODE_NAME_CASE(UINT_TO_FP_RND)
31784 NODE_NAME_CASE(CVTTP2SI)
31785 NODE_NAME_CASE(CVTTP2UI)
31786 NODE_NAME_CASE(STRICT_CVTTP2SI)
31787 NODE_NAME_CASE(STRICT_CVTTP2UI)
31788 NODE_NAME_CASE(MCVTTP2SI)
31789 NODE_NAME_CASE(MCVTTP2UI)
31790 NODE_NAME_CASE(CVTTP2SI_SAE)
31791 NODE_NAME_CASE(CVTTP2UI_SAE)
31792 NODE_NAME_CASE(CVTTS2SI)
31793 NODE_NAME_CASE(CVTTS2UI)
31794 NODE_NAME_CASE(CVTTS2SI_SAE)
31795 NODE_NAME_CASE(CVTTS2UI_SAE)
31796 NODE_NAME_CASE(CVTSI2P)
31797 NODE_NAME_CASE(CVTUI2P)
31798 NODE_NAME_CASE(STRICT_CVTSI2P)
31799 NODE_NAME_CASE(STRICT_CVTUI2P)
31800 NODE_NAME_CASE(MCVTSI2P)
31801 NODE_NAME_CASE(MCVTUI2P)
31802 NODE_NAME_CASE(VFPCLASS)
31803 NODE_NAME_CASE(VFPCLASSS)
31804 NODE_NAME_CASE(MULTISHIFT)
31805 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
31806 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
31807 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
31808 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
31809 NODE_NAME_CASE(CVTPS2PH)
31810 NODE_NAME_CASE(STRICT_CVTPS2PH)
31811 NODE_NAME_CASE(MCVTPS2PH)
31812 NODE_NAME_CASE(CVTPH2PS)
31813 NODE_NAME_CASE(STRICT_CVTPH2PS)
31814 NODE_NAME_CASE(CVTPH2PS_SAE)
31815 NODE_NAME_CASE(CVTP2SI)
31816 NODE_NAME_CASE(CVTP2UI)
31817 NODE_NAME_CASE(MCVTP2SI)
31818 NODE_NAME_CASE(MCVTP2UI)
31819 NODE_NAME_CASE(CVTP2SI_RND)
31820 NODE_NAME_CASE(CVTP2UI_RND)
31821 NODE_NAME_CASE(CVTS2SI)
31822 NODE_NAME_CASE(CVTS2UI)
31823 NODE_NAME_CASE(CVTS2SI_RND)
31824 NODE_NAME_CASE(CVTS2UI_RND)
31825 NODE_NAME_CASE(CVTNE2PS2BF16)
31826 NODE_NAME_CASE(CVTNEPS2BF16)
31827 NODE_NAME_CASE(MCVTNEPS2BF16)
31828 NODE_NAME_CASE(DPBF16PS)
31829 NODE_NAME_CASE(LWPINS)
31830 NODE_NAME_CASE(MGATHER)
31831 NODE_NAME_CASE(MSCATTER)
31832 NODE_NAME_CASE(VPDPBUSD)
31833 NODE_NAME_CASE(VPDPBUSDS)
31834 NODE_NAME_CASE(VPDPWSSD)
31835 NODE_NAME_CASE(VPDPWSSDS)
31836 NODE_NAME_CASE(VPSHUFBITQMB)
31837 NODE_NAME_CASE(GF2P8MULB)
31838 NODE_NAME_CASE(GF2P8AFFINEQB)
31839 NODE_NAME_CASE(GF2P8AFFINEINVQB)
31840 NODE_NAME_CASE(NT_CALL)
31841 NODE_NAME_CASE(NT_BRIND)
31842 NODE_NAME_CASE(UMWAIT)
31843 NODE_NAME_CASE(TPAUSE)
31844 NODE_NAME_CASE(ENQCMD)
31845 NODE_NAME_CASE(ENQCMDS)
31846 NODE_NAME_CASE(VP2INTERSECT)
31847 NODE_NAME_CASE(AESENC128KL)
31848 NODE_NAME_CASE(AESDEC128KL)
31849 NODE_NAME_CASE(AESENC256KL)
31850 NODE_NAME_CASE(AESDEC256KL)
31851 NODE_NAME_CASE(AESENCWIDE128KL)
31852 NODE_NAME_CASE(AESDECWIDE128KL)
31853 NODE_NAME_CASE(AESENCWIDE256KL)
31854 NODE_NAME_CASE(AESDECWIDE256KL)
31855 NODE_NAME_CASE(TESTUI)
31856 }
31857 return nullptr;
31858#undef NODE_NAME_CASE
31859}
31860
31861/// Return true if the addressing mode represented by AM is legal for this
31862/// target, for a load/store of the specified type.
31863bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
31864 const AddrMode &AM, Type *Ty,
31865 unsigned AS,
31866 Instruction *I) const {
31867 // X86 supports extremely general addressing modes.
31868 CodeModel::Model M = getTargetMachine().getCodeModel();
31869
31870 // X86 allows a sign-extended 32-bit immediate field as a displacement.
31871 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
31872 return false;
31873
31874 if (AM.BaseGV) {
31875 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
31876
31877 // If a reference to this global requires an extra load, we can't fold it.
31878 if (isGlobalStubReference(GVFlags))
31879 return false;
31880
31881 // If BaseGV requires a register for the PIC base, we cannot also have a
31882 // BaseReg specified.
31883 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
31884 return false;
31885
31886 // If lower 4G is not available, then we must use rip-relative addressing.
31887 if ((M != CodeModel::Small || isPositionIndependent()) &&
31888 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
31889 return false;
31890 }
31891
31892 switch (AM.Scale) {
31893 case 0:
31894 case 1:
31895 case 2:
31896 case 4:
31897 case 8:
31898 // These scales always work.
31899 break;
31900 case 3:
31901 case 5:
31902 case 9:
31903 // These scales are formed with basereg+scalereg. Only accept if there is
31904 // no basereg yet.
31905 if (AM.HasBaseReg)
31906 return false;
31907 break;
31908 default: // Other stuff never works.
31909 return false;
31910 }
31911
31912 return true;
31913}
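
The scale handling above mirrors the hardware addressing form base + index*scale + disp32: the SIB byte can encode scales 1, 2, 4 and 8 directly, while 3, 5 and 9 are only reachable as index*(scale-1) + index, which spends the base-register slot. A small standalone sketch of that check; the helper name is illustrative and not taken from the source:

// Illustrative only: can "Scale" be expressed by an x86 address of the form
// Base + Index*Scale + Disp, possibly consuming the base-register slot
// (as isLegalAddressingMode above allows for scales 3, 5 and 9)?
static bool isRepresentableX86Scale(unsigned Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0: case 1: case 2: case 4: case 8:
    return true;                // directly encodable in the SIB byte
  case 3: case 5: case 9:
    return !HasBaseReg;         // formed as Index*(Scale-1) + Index, so the base must be free
  default:
    return false;               // other scales never work
  }
}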
31914
31915bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
31916 unsigned Bits = Ty->getScalarSizeInBits();
31917
31918 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
31919 // particularly cheaper than those without.
31920 if (Bits == 8)
31921 return false;
31922
31923 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
31924 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
31925 if (Subtarget.hasXOP() &&
31926 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
31927 return false;
31928
31929 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
31930 // shifts just as cheap as scalar ones.
31931 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
31932 return false;
31933
31934 // AVX512BW has shifts such as vpsllvw.
31935 if (Subtarget.hasBWI() && Bits == 16)
31936 return false;
31937
31938 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
31939 // fully general vector.
31940 return true;
31941}
31942
31943bool X86TargetLowering::isBinOp(unsigned Opcode) const {
31944 switch (Opcode) {
31945 // These are non-commutative binops.
31946 // TODO: Add more X86ISD opcodes once we have test coverage.
31947 case X86ISD::ANDNP:
31948 case X86ISD::PCMPGT:
31949 case X86ISD::FMAX:
31950 case X86ISD::FMIN:
31951 case X86ISD::FANDN:
31952 return true;
31953 }
31954
31955 return TargetLoweringBase::isBinOp(Opcode);
31956}
31957
31958bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
31959 switch (Opcode) {
31960 // TODO: Add more X86ISD opcodes once we have test coverage.
31961 case X86ISD::PCMPEQ:
31962 case X86ISD::PMULDQ:
31963 case X86ISD::PMULUDQ:
31964 case X86ISD::FMAXC:
31965 case X86ISD::FMINC:
31966 case X86ISD::FAND:
31967 case X86ISD::FOR:
31968 case X86ISD::FXOR:
31969 return true;
31970 }
31971
31972 return TargetLoweringBase::isCommutativeBinOp(Opcode);
31973}
31974
31975bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
31976 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31977 return false;
31978 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
31979 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
31980 return NumBits1 > NumBits2;
31981}
31982
31983bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
31984 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
31985 return false;
31986
31987 if (!isTypeLegal(EVT::getEVT(Ty1)))
31988 return false;
31989
31990 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
31991
31992 // Assuming the caller doesn't have a zeroext or signext return parameter,
31993 // truncation all the way down to i1 is valid.
31994 return true;
31995}
31996
31997bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
31998 return isInt<32>(Imm);
31999}
32000
32001bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
32002 // Can also use sub to handle negated immediates.
32003 return isInt<32>(Imm);
32004}
32005
32006bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
32007 return isInt<32>(Imm);
32008}
32009
32010bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
32011 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
32012 return false;
32013 unsigned NumBits1 = VT1.getSizeInBits();
32014 unsigned NumBits2 = VT2.getSizeInBits();
32015 return NumBits1 > NumBits2;
32016}
32017
32018bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
32019 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32020 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
32021}
32022
32023bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
32024 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32025 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
32026}
32027
32028bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
32029 EVT VT1 = Val.getValueType();
32030 if (isZExtFree(VT1, VT2))
32031 return true;
32032
32033 if (Val.getOpcode() != ISD::LOAD)
32034 return false;
32035
32036 if (!VT1.isSimple() || !VT1.isInteger() ||
32037 !VT2.isSimple() || !VT2.isInteger())
32038 return false;
32039
32040 switch (VT1.getSimpleVT().SimpleTy) {
32041 default: break;
32042 case MVT::i8:
32043 case MVT::i16:
32044 case MVT::i32:
32045 // X86 has 8, 16, and 32-bit zero-extending loads.
32046 return true;
32047 }
32048
32049 return false;
32050}
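
The "implicitly zero-extends" comments above refer to the general x86-64 rule that writing a 32-bit register clears the upper 32 bits of the corresponding 64-bit register, so widening an i32 result to i64 typically needs no extra instruction. A tiny illustration in plain C++; the expected codegen described in the comments is the usual behaviour, not a claim taken from this file:

#include <cstdint>

// On x86-64 the 32-bit addition already defines the full 64-bit register with
// its upper half cleared, so the widening cast below is normally free: the
// compiler can simply use the 64-bit register without a separate zext.
uint64_t widenSum(uint32_t A, uint32_t B) {
  uint32_t S = A + B;     // e.g. a 32-bit add that zeroes the destination's upper half
  return (uint64_t)S;     // no additional zero-extension instruction expected
}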
32051
32052bool X86TargetLowering::shouldSinkOperands(Instruction *I,
32053 SmallVectorImpl<Use *> &Ops) const {
32054 // A uniform shift amount in a vector shift or funnel shift may be much
32055 // cheaper than a generic variable vector shift, so make that pattern visible
32056 // to SDAG by sinking the shuffle instruction next to the shift.
32057 int ShiftAmountOpNum = -1;
32058 if (I->isShift())
32059 ShiftAmountOpNum = 1;
32060 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
32061 if (II->getIntrinsicID() == Intrinsic::fshl ||
32062 II->getIntrinsicID() == Intrinsic::fshr)
32063 ShiftAmountOpNum = 2;
32064 }
32065
32066 if (ShiftAmountOpNum == -1)
32067 return false;
32068
32069 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
32070 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
32071 isVectorShiftByScalarCheap(I->getType())) {
32072 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
32073 return true;
32074 }
32075
32076 return false;
32077}
32078
32079bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
32080 if (!Subtarget.is64Bit())
32081 return false;
32082 return TargetLowering::shouldConvertPhiType(From, To);
32083}
32084
32085bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
32086 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
32087 return false;
32088
32089 EVT SrcVT = ExtVal.getOperand(0).getValueType();
32090
32091 // There is no extending load for vXi1.
32092 if (SrcVT.getScalarType() == MVT::i1)
32093 return false;
32094
32095 return true;
32096}
32097
32098bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
32099 EVT VT) const {
32100 if (!Subtarget.hasAnyFMA())
32101 return false;
32102
32103 VT = VT.getScalarType();
32104
32105 if (!VT.isSimple())
32106 return false;
32107
32108 switch (VT.getSimpleVT().SimpleTy) {
32109 case MVT::f32:
32110 case MVT::f64:
32111 return true;
32112 default:
32113 break;
32114 }
32115
32116 return false;
32117}
32118
32119bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
32120 // i16 instructions are longer (0x66 prefix) and potentially slower.
32121 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
32122}
32123
32124/// Targets can use this to indicate that they only support *some*
32125/// VECTOR_SHUFFLE operations, those with specific masks.
32126/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
32127/// are assumed to be legal.
32128bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
32129 if (!VT.isSimple())
32130 return false;
32131
32132 // Not for i1 vectors
32133 if (VT.getSimpleVT().getScalarType() == MVT::i1)
32134 return false;
32135
32136 // Very little shuffling can be done for 64-bit vectors right now.
32137 if (VT.getSimpleVT().getSizeInBits() == 64)
32138 return false;
32139
32140 // We only care that the types being shuffled are legal. The lowering can
32141 // handle any possible shuffle mask that results.
32142 return isTypeLegal(VT.getSimpleVT());
32143}
32144
32145bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
32146 EVT VT) const {
32147 // Don't convert an 'and' into a shuffle that we don't directly support.
32148 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
32149 if (!Subtarget.hasAVX2())
32150 if (VT == MVT::v32i8 || VT == MVT::v16i16)
32151 return false;
32152
32153 // Just delegate to the generic legality, clear masks aren't special.
32154 return isShuffleMaskLegal(Mask, VT);
32155}
32156
32157bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
32158 // If the subtarget is using thunks, we need to not generate jump tables.
32159 if (Subtarget.useIndirectThunkBranches())
32160 return false;
32161
32162 // Otherwise, fallback on the generic logic.
32163 return TargetLowering::areJTsAllowed(Fn);
32164}
32165
32166//===----------------------------------------------------------------------===//
32167// X86 Scheduler Hooks
32168//===----------------------------------------------------------------------===//
32169
32170// Returns true if EFLAG is consumed after this iterator in the rest of the
32171// basic block or any successors of the basic block.
32172static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
32173 MachineBasicBlock *BB) {
32174 // Scan forward through BB for a use/def of EFLAGS.
32175 for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
32176 miI != miE; ++miI) {
32177 const MachineInstr& mi = *miI;
32178 if (mi.readsRegister(X86::EFLAGS))
32179 return true;
32180 // If we found a def, we can stop searching.
32181 if (mi.definesRegister(X86::EFLAGS))
32182 return false;
32183 }
32184
32185 // If we hit the end of the block, check whether EFLAGS is live into a
32186 // successor.
32187 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
32188 sEnd = BB->succ_end();
32189 sItr != sEnd; ++sItr) {
32190 MachineBasicBlock* succ = *sItr;
32191 if (succ->isLiveIn(X86::EFLAGS))
32192 return true;
32193 }
32194
32195 return false;
32196}
32197
32198/// Utility function to emit xbegin specifying the start of an RTM region.
32199static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
32200 const TargetInstrInfo *TII) {
32201 const DebugLoc &DL = MI.getDebugLoc();
32202
32203 const BasicBlock *BB = MBB->getBasicBlock();
32204 MachineFunction::iterator I = ++MBB->getIterator();
32205
32206 // For the v = xbegin(), we generate
32207 //
32208 // thisMBB:
32209 // xbegin sinkMBB
32210 //
32211 // mainMBB:
32212 // s0 = -1
32213 //
32214 // fallBB:
32215 // eax = # XABORT_DEF
32216 // s1 = eax
32217 //
32218 // sinkMBB:
32219 // v = phi(s0/mainBB, s1/fallBB)
32220
32221 MachineBasicBlock *thisMBB = MBB;
32222 MachineFunction *MF = MBB->getParent();
32223 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
32224 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32225 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32226 MF->insert(I, mainMBB);
32227 MF->insert(I, fallMBB);
32228 MF->insert(I, sinkMBB);
32229
32230 if (isEFLAGSLiveAfter(MI, MBB)) {
32231 mainMBB->addLiveIn(X86::EFLAGS);
32232 fallMBB->addLiveIn(X86::EFLAGS);
32233 sinkMBB->addLiveIn(X86::EFLAGS);
32234 }
32235
32236 // Transfer the remainder of BB and its successor edges to sinkMBB.
32237 sinkMBB->splice(sinkMBB->begin(), MBB,
32238 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32239 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32240
32241 MachineRegisterInfo &MRI = MF->getRegInfo();
32242 Register DstReg = MI.getOperand(0).getReg();
32243 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
32244 Register mainDstReg = MRI.createVirtualRegister(RC);
32245 Register fallDstReg = MRI.createVirtualRegister(RC);
32246
32247 // thisMBB:
32248 // xbegin fallMBB
32249 // # fallthrough to mainMBB
32250 // # abort branches to fallMBB
32251 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
32252 thisMBB->addSuccessor(mainMBB);
32253 thisMBB->addSuccessor(fallMBB);
32254
32255 // mainMBB:
32256 // mainDstReg := -1
32257 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
32258 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32259 mainMBB->addSuccessor(sinkMBB);
32260
32261 // fallMBB:
32262 // ; pseudo instruction to model hardware's definition from XABORT
32263 // EAX := XABORT_DEF
32264 // fallDstReg := EAX
32265 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
32266 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
32267 .addReg(X86::EAX);
32268 fallMBB->addSuccessor(sinkMBB);
32269
32270 // sinkMBB:
32271 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
32272 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
32273 .addReg(mainDstReg).addMBB(mainMBB)
32274 .addReg(fallDstReg).addMBB(fallMBB);
32275
32276 MI.eraseFromParent();
32277 return sinkMBB;
32278}
32279
32280MachineBasicBlock *
32281X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
32282 MachineBasicBlock *MBB) const {
32283 // Emit va_arg instruction on X86-64.
32284
32285 // Operands to this pseudo-instruction:
32286 // 0 ) Output : destination address (reg)
32287 // 1-5) Input : va_list address (addr, i64mem)
32288 // 6 ) ArgSize : Size (in bytes) of vararg type
32289 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
32290 // 8 ) Align : Alignment of type
32291 // 9 ) EFLAGS (implicit-def)
32292
32293 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
32294 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
32295
32296 Register DestReg = MI.getOperand(0).getReg();
32297 MachineOperand &Base = MI.getOperand(1);
32298 MachineOperand &Scale = MI.getOperand(2);
32299 MachineOperand &Index = MI.getOperand(3);
32300 MachineOperand &Disp = MI.getOperand(4);
32301 MachineOperand &Segment = MI.getOperand(5);
32302 unsigned ArgSize = MI.getOperand(6).getImm();
32303 unsigned ArgMode = MI.getOperand(7).getImm();
32304 Align Alignment = Align(MI.getOperand(8).getImm());
32305
32306 MachineFunction *MF = MBB->getParent();
32307
32308 // Memory Reference
32309 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
32310
32311 MachineMemOperand *OldMMO = MI.memoperands().front();
32312
32313 // Clone the MMO into two separate MMOs for loading and storing
32314 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
32315 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
32316 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
32317 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
32318
32319 // Machine Information
32320 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32321 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
32322 const TargetRegisterClass *AddrRegClass =
32323 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
32324 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
32325 const DebugLoc &DL = MI.getDebugLoc();
32326
32327 // struct va_list {
32328 // i32 gp_offset
32329 // i32 fp_offset
32330 // i64 overflow_area (address)
32331 // i64 reg_save_area (address)
32332 // }
32333 // sizeof(va_list) = 24
32334 // alignment(va_list) = 8
32335
32336 unsigned TotalNumIntRegs = 6;
32337 unsigned TotalNumXMMRegs = 8;
32338 bool UseGPOffset = (ArgMode == 1);
32339 bool UseFPOffset = (ArgMode == 2);
32340 unsigned MaxOffset = TotalNumIntRegs * 8 +
32341 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
32342
32343 /* Align ArgSize to a multiple of 8 */
32344 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
32345 bool NeedsAlign = (Alignment > 8);
32346
32347 MachineBasicBlock *thisMBB = MBB;
32348 MachineBasicBlock *overflowMBB;
32349 MachineBasicBlock *offsetMBB;
32350 MachineBasicBlock *endMBB;
32351
32352 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
32353 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
32354 unsigned OffsetReg = 0;
32355
32356 if (!UseGPOffset && !UseFPOffset) {
32357 // If we only pull from the overflow region, we don't create a branch.
32358 // We don't need to alter control flow.
32359 OffsetDestReg = 0; // unused
32360 OverflowDestReg = DestReg;
32361
32362 offsetMBB = nullptr;
32363 overflowMBB = thisMBB;
32364 endMBB = thisMBB;
32365 } else {
32366 // First emit code to check if gp_offset (or fp_offset) is below the bound.
32367 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
32368 // If not, pull from overflow_area. (branch to overflowMBB)
32369 //
32370 // thisMBB
32371 // | .
32372 // | .
32373 // offsetMBB overflowMBB
32374 // | .
32375 // | .
32376 // endMBB
32377
32378 // Registers for the PHI in endMBB
32379 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
32380 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
32381
32382 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32383 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32384 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32385 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32386
32387 MachineFunction::iterator MBBIter = ++MBB->getIterator();
32388
32389 // Insert the new basic blocks
32390 MF->insert(MBBIter, offsetMBB);
32391 MF->insert(MBBIter, overflowMBB);
32392 MF->insert(MBBIter, endMBB);
32393
32394 // Transfer the remainder of MBB and its successor edges to endMBB.
32395 endMBB->splice(endMBB->begin(), thisMBB,
32396 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
32397 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
32398
32399 // Make offsetMBB and overflowMBB successors of thisMBB
32400 thisMBB->addSuccessor(offsetMBB);
32401 thisMBB->addSuccessor(overflowMBB);
32402
32403 // endMBB is a successor of both offsetMBB and overflowMBB
32404 offsetMBB->addSuccessor(endMBB);
32405 overflowMBB->addSuccessor(endMBB);
32406
32407 // Load the offset value into a register
32408 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32409 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
32410 .add(Base)
32411 .add(Scale)
32412 .add(Index)
32413 .addDisp(Disp, UseFPOffset ? 4 : 0)
32414 .add(Segment)
32415 .setMemRefs(LoadOnlyMMO);
32416
32417 // Check if there is enough room left to pull this argument.
32418 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
32419 .addReg(OffsetReg)
32420 .addImm(MaxOffset + 8 - ArgSizeA8);
32421
32422 // Branch to "overflowMBB" if offset >= max
32423 // Fall through to "offsetMBB" otherwise
32424 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
32425 .addMBB(overflowMBB).addImm(X86::COND_AE);
32426 }
32427
32428 // In offsetMBB, emit code to use the reg_save_area.
32429 if (offsetMBB) {
32430 assert(OffsetReg != 0);
32431
32432 // Read the reg_save_area address.
32433 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
32434 BuildMI(
32435 offsetMBB, DL,
32436 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32437 RegSaveReg)
32438 .add(Base)
32439 .add(Scale)
32440 .add(Index)
32441 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
32442 .add(Segment)
32443 .setMemRefs(LoadOnlyMMO);
32444
32445 if (Subtarget.isTarget64BitLP64()) {
32446 // Zero-extend the offset
32447 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
32448 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
32449 .addImm(0)
32450 .addReg(OffsetReg)
32451 .addImm(X86::sub_32bit);
32452
32453 // Add the offset to the reg_save_area to get the final address.
32454 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
32455 .addReg(OffsetReg64)
32456 .addReg(RegSaveReg);
32457 } else {
32458 // Add the offset to the reg_save_area to get the final address.
32459 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
32460 .addReg(OffsetReg)
32461 .addReg(RegSaveReg);
32462 }
32463
32464 // Compute the offset for the next argument
32465 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
32466 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
32467 .addReg(OffsetReg)
32468 .addImm(UseFPOffset ? 16 : 8);
32469
32470 // Store it back into the va_list.
32471 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
32472 .add(Base)
32473 .add(Scale)
32474 .add(Index)
32475 .addDisp(Disp, UseFPOffset ? 4 : 0)
32476 .add(Segment)
32477 .addReg(NextOffsetReg)
32478 .setMemRefs(StoreOnlyMMO);
32479
32480 // Jump to endMBB
32481 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
32482 .addMBB(endMBB);
32483 }
32484
32485 //
32486 // Emit code to use overflow area
32487 //
32488
32489 // Load the overflow_area address into a register.
32490 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
32491 BuildMI(overflowMBB, DL,
32492 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
32493 OverflowAddrReg)
32494 .add(Base)
32495 .add(Scale)
32496 .add(Index)
32497 .addDisp(Disp, 8)
32498 .add(Segment)
32499 .setMemRefs(LoadOnlyMMO);
32500
32501 // If we need to align it, do so. Otherwise, just copy the address
32502 // to OverflowDestReg.
32503 if (NeedsAlign) {
32504 // Align the overflow address
32505 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
32506
32507 // aligned_addr = (addr + (align-1)) & ~(align-1)
32508 BuildMI(
32509 overflowMBB, DL,
32510 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32511 TmpReg)
32512 .addReg(OverflowAddrReg)
32513 .addImm(Alignment.value() - 1);
32514
32515 BuildMI(
32516 overflowMBB, DL,
32517 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
32518 OverflowDestReg)
32519 .addReg(TmpReg)
32520 .addImm(~(uint64_t)(Alignment.value() - 1));
32521 } else {
32522 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
32523 .addReg(OverflowAddrReg);
32524 }
32525
32526 // Compute the next overflow address after this argument.
32527 // (the overflow address should be kept 8-byte aligned)
32528 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
32529 BuildMI(
32530 overflowMBB, DL,
32531 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
32532 NextAddrReg)
32533 .addReg(OverflowDestReg)
32534 .addImm(ArgSizeA8);
32535
32536 // Store the new overflow address.
32537 BuildMI(overflowMBB, DL,
32538 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
32539 .add(Base)
32540 .add(Scale)
32541 .add(Index)
32542 .addDisp(Disp, 8)
32543 .add(Segment)
32544 .addReg(NextAddrReg)
32545 .setMemRefs(StoreOnlyMMO);
32546
32547 // If we branched, emit the PHI to the front of endMBB.
32548 if (offsetMBB) {
32549 BuildMI(*endMBB, endMBB->begin(), DL,
32550 TII->get(X86::PHI), DestReg)
32551 .addReg(OffsetDestReg).addMBB(offsetMBB)
32552 .addReg(OverflowDestReg).addMBB(overflowMBB);
32553 }
32554
32555 // Erase the pseudo instruction
32556 MI.eraseFromParent();
32557
32558 return endMBB;
32559}
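
The control flow built above corresponds to what a C implementation of va_arg would do against the SysV x86-64 va_list layout described in the comment (gp_offset, fp_offset, overflow area, reg_save_area), using the same 6*8-byte GP / 8*16-byte XMM save-area bounds. A compact sketch of that decision for a register-sized integer argument (ArgMode == 1); this is an illustration under those assumptions, not the emitted machine code:

#include <cstdint>
#include <cstring>

struct SysVVaList {          // matches the layout in the comment above
  uint32_t gp_offset;        // next unused byte offset within the 6*8-byte GP save area
  uint32_t fp_offset;        // next unused byte offset within the 8*16-byte XMM save area
  void *overflow_arg_area;   // stack area for arguments that did not fit in registers
  void *reg_save_area;
};

// Illustrative va_arg for an 8-byte integer argument: pull from the register
// save area while room remains, otherwise fall back to the overflow area.
static int64_t nextGPArg(SysVVaList &VL) {
  int64_t V;
  if (VL.gp_offset <= 6 * 8 - 8) {                    // same bound as CMP32ri above
    std::memcpy(&V, (char *)VL.reg_save_area + VL.gp_offset, sizeof(V));
    VL.gp_offset += 8;                                // advance to the next GP slot
    return V;
  }
  std::memcpy(&V, VL.overflow_arg_area, sizeof(V));   // overflow path, kept 8-byte aligned
  VL.overflow_arg_area = (char *)VL.overflow_arg_area + 8;
  return V;
}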
32560
32561// The EFLAGS operand of SelectItr might be missing a kill marker
32562// because there were multiple uses of EFLAGS, and ISel didn't know
32563// which to mark. Figure out whether SelectItr should have had a
32564// kill marker, and set it if it should. Returns the correct kill
32565// marker value.
32566static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
32567 MachineBasicBlock* BB,
32568 const TargetRegisterInfo* TRI) {
32569 if (isEFLAGSLiveAfter(SelectItr, BB))
32570 return false;
32571
32572 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
32573 // out. SelectMI should have a kill flag on EFLAGS.
32574 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
32575 return true;
32576}
32577
32578// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
32579// together with other CMOV pseudo-opcodes into a single basic-block with
32580// conditional jump around it.
32581static bool isCMOVPseudo(MachineInstr &MI) {
32582 switch (MI.getOpcode()) {
32583 case X86::CMOV_FR32:
32584 case X86::CMOV_FR32X:
32585 case X86::CMOV_FR64:
32586 case X86::CMOV_FR64X:
32587 case X86::CMOV_GR8:
32588 case X86::CMOV_GR16:
32589 case X86::CMOV_GR32:
32590 case X86::CMOV_RFP32:
32591 case X86::CMOV_RFP64:
32592 case X86::CMOV_RFP80:
32593 case X86::CMOV_VR64:
32594 case X86::CMOV_VR128:
32595 case X86::CMOV_VR128X:
32596 case X86::CMOV_VR256:
32597 case X86::CMOV_VR256X:
32598 case X86::CMOV_VR512:
32599 case X86::CMOV_VK1:
32600 case X86::CMOV_VK2:
32601 case X86::CMOV_VK4:
32602 case X86::CMOV_VK8:
32603 case X86::CMOV_VK16:
32604 case X86::CMOV_VK32:
32605 case X86::CMOV_VK64:
32606 return true;
32607
32608 default:
32609 return false;
32610 }
32611}
32612
32613// Helper function, which inserts PHI functions into SinkMBB:
32614// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
32615// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
32616// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
32617// the last PHI function inserted.
32618static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
32619 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
32620 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
32621 MachineBasicBlock *SinkMBB) {
32622 MachineFunction *MF = TrueMBB->getParent();
32623 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
32624 const DebugLoc &DL = MIItBegin->getDebugLoc();
32625
32626 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
32627 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32628
32629 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
32630
32631 // As we are creating the PHIs, we have to be careful if there is more than
32632 // one. Later CMOVs may reference the results of earlier CMOVs, but later
32633 // PHIs have to reference the individual true/false inputs from earlier PHIs.
32634 // That also means that PHI construction must work forward from earlier to
32635 // later, and that the code must maintain a mapping from earlier PHI's
32636 // destination registers, and the registers that went into the PHI.
32637 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
32638 MachineInstrBuilder MIB;
32639
32640 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
32641 Register DestReg = MIIt->getOperand(0).getReg();
32642 Register Op1Reg = MIIt->getOperand(1).getReg();
32643 Register Op2Reg = MIIt->getOperand(2).getReg();
32644
32645 // If this CMOV we are generating is the opposite condition from
32646 // the jump we generated, then we have to swap the operands for the
32647 // PHI that is going to be generated.
32648 if (MIIt->getOperand(3).getImm() == OppCC)
32649 std::swap(Op1Reg, Op2Reg);
32650
32651 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
32652 Op1Reg = RegRewriteTable[Op1Reg].first;
32653
32654 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
32655 Op2Reg = RegRewriteTable[Op2Reg].second;
32656
32657 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
32658 .addReg(Op1Reg)
32659 .addMBB(FalseMBB)
32660 .addReg(Op2Reg)
32661 .addMBB(TrueMBB);
32662
32663 // Add this PHI to the rewrite table.
32664 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
32665 }
32666
32667 return MIB;
32668}
32669
32670 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
32671MachineBasicBlock *
32672X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
32673 MachineInstr &SecondCascadedCMOV,
32674 MachineBasicBlock *ThisMBB) const {
32675 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32676 const DebugLoc &DL = FirstCMOV.getDebugLoc();
32677
32678 // We lower cascaded CMOVs such as
32679 //
32680 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
32681 //
32682 // to two successive branches.
32683 //
32684 // Without this, we would add a PHI between the two jumps, which ends up
32685 // creating a few copies all around. For instance, for
32686 //
32687 // (sitofp (zext (fcmp une)))
32688 //
32689 // we would generate:
32690 //
32691 // ucomiss %xmm1, %xmm0
32692 // movss <1.0f>, %xmm0
32693 // movaps %xmm0, %xmm1
32694 // jne .LBB5_2
32695 // xorps %xmm1, %xmm1
32696 // .LBB5_2:
32697 // jp .LBB5_4
32698 // movaps %xmm1, %xmm0
32699 // .LBB5_4:
32700 // retq
32701 //
32702 // because this custom-inserter would have generated:
32703 //
32704 // A
32705 // | \
32706 // | B
32707 // | /
32708 // C
32709 // | \
32710 // | D
32711 // | /
32712 // E
32713 //
32714 // A: X = ...; Y = ...
32715 // B: empty
32716 // C: Z = PHI [X, A], [Y, B]
32717 // D: empty
32718 // E: PHI [X, C], [Z, D]
32719 //
32720 // If we lower both CMOVs in a single step, we can instead generate:
32721 //
32722 // A
32723 // | \
32724 // | C
32725 // | /|
32726 // |/ |
32727 // | |
32728 // | D
32729 // | /
32730 // E
32731 //
32732 // A: X = ...; Y = ...
32733 // D: empty
32734 // E: PHI [X, A], [X, C], [Y, D]
32735 //
32736 // Which, in our sitofp/fcmp example, gives us something like:
32737 //
32738 // ucomiss %xmm1, %xmm0
32739 // movss <1.0f>, %xmm0
32740 // jne .LBB5_4
32741 // jp .LBB5_4
32742 // xorps %xmm0, %xmm0
32743 // .LBB5_4:
32744 // retq
32745 //
32746
32747 // We lower cascaded CMOV into two successive branches to the same block.
32748 // EFLAGS is used by both, so mark it as live in the second.
32749 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32750 MachineFunction *F = ThisMBB->getParent();
32751 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32752 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
32753 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32754
32755 MachineFunction::iterator It = ++ThisMBB->getIterator();
32756 F->insert(It, FirstInsertedMBB);
32757 F->insert(It, SecondInsertedMBB);
32758 F->insert(It, SinkMBB);
32759
32760 // For a cascaded CMOV, we lower it to two successive branches to
32761 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
32762 // the FirstInsertedMBB.
32763 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
32764
32765 // If the EFLAGS register isn't dead in the terminator, then claim that it's
32766 // live into the sink and copy blocks.
32767 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32768 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
32769 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
32770 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
32771 SinkMBB->addLiveIn(X86::EFLAGS);
32772 }
32773
32774 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32775 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
32776 std::next(MachineBasicBlock::iterator(FirstCMOV)),
32777 ThisMBB->end());
32778 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32779
32780 // Fallthrough block for ThisMBB.
32781 ThisMBB->addSuccessor(FirstInsertedMBB);
32782 // The true block target of the first branch is always SinkMBB.
32783 ThisMBB->addSuccessor(SinkMBB);
32784 // Fallthrough block for FirstInsertedMBB.
32785 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
32786 // The true block for the branch of FirstInsertedMBB.
32787 FirstInsertedMBB->addSuccessor(SinkMBB);
32788 // This is fallthrough.
32789 SecondInsertedMBB->addSuccessor(SinkMBB);
32790
32791 // Create the conditional branch instructions.
32792 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
32793 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
32794
32795 X86::CondCode SecondCC =
32796 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
32797 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
32798
32799 // SinkMBB:
32800 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
32801 Register DestReg = FirstCMOV.getOperand(0).getReg();
32802 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
32803 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
32804 MachineInstrBuilder MIB =
32805 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
32806 .addReg(Op1Reg)
32807 .addMBB(SecondInsertedMBB)
32808 .addReg(Op2Reg)
32809 .addMBB(ThisMBB);
32810
32811 // SecondInsertedMBB provides the same incoming value as
32812 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
32813 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
32814 // Copy the PHI result to the register defined by the second CMOV.
32815 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
32816 TII->get(TargetOpcode::COPY),
32817 SecondCascadedCMOV.getOperand(0).getReg())
32818 .addReg(FirstCMOV.getOperand(0).getReg());
32819
32820 // Now remove the CMOVs.
32821 FirstCMOV.eraseFromParent();
32822 SecondCascadedCMOV.eraseFromParent();
32823
32824 return SinkMBB;
32825}
32826
32827MachineBasicBlock *
32828X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
32829 MachineBasicBlock *ThisMBB) const {
32830 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32831 const DebugLoc &DL = MI.getDebugLoc();
32832
32833 // To "insert" a SELECT_CC instruction, we actually have to insert the
32834 // diamond control-flow pattern. The incoming instruction knows the
32835 // destination vreg to set, the condition code register to branch on, the
32836 // true/false values to select between and a branch opcode to use.
32837
32838 // ThisMBB:
32839 // ...
32840 // TrueVal = ...
32841 // cmpTY ccX, r1, r2
32842 // bCC copy1MBB
32843 // fallthrough --> FalseMBB
32844
32845 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
32846 // as described above, by inserting a BB, and then making a PHI at the join
32847 // point to select the true and false operands of the CMOV in the PHI.
32848 //
32849 // The code also handles two different cases of multiple CMOV opcodes
32850 // in a row.
32851 //
32852 // Case 1:
32853 // In this case, there are multiple CMOVs in a row, all of which are based on
32854 // the same condition setting (or the exact opposite condition setting).
32855 // In this case we can lower all the CMOVs using a single inserted BB, and
32856 // then make a number of PHIs at the join point to model the CMOVs. The only
32857 // trickiness here is that in a case like:
32858 //
32859 // t2 = CMOV cond1 t1, f1
32860 // t3 = CMOV cond1 t2, f2
32861 //
32862 // when rewriting this into PHIs, we have to perform some renaming on the
32863 // temps since you cannot have a PHI operand refer to a PHI result earlier
32864 // in the same block. The "simple" but wrong lowering would be:
32865 //
32866 // t2 = PHI t1(BB1), f1(BB2)
32867 // t3 = PHI t2(BB1), f2(BB2)
32868 //
32869 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
32870 // renaming is to note that on the path through BB1, t2 is really just a
32871 // copy of t1, and do that renaming, properly generating:
32872 //
32873 // t2 = PHI t1(BB1), f1(BB2)
32874 // t3 = PHI t1(BB1), f2(BB2)
32875 //
32876 // Case 2:
32877 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
32878 // function - EmitLoweredCascadedSelect.
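// Concretely, case 1 is lowered to a single diamond (sketch of the code below):
//
//   ThisMBB:   ... ; JCC_1 %SinkMBB, CC   (fallthrough to FalseMBB)
//   FalseMBB:  (empty, falls through to SinkMBB)
//   SinkMBB:   one PHI per CMOV in the group, with the renaming described
//              above applied by createPHIsForCMOVsInSinkBB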
32879
32880 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
32881 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
32882 MachineInstr *LastCMOV = &MI;
32883 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
32884
32885 // First, check for case 1, where there are multiple CMOVs with the same
32886 // condition. Of the two cases of multiple-CMOV lowerings, case 1 reduces
32887 // the number of jumps the most.
32888
32889 if (isCMOVPseudo(MI)) {
32890 // See if we have a string of CMOVs with the same condition. Skip over
32891 // intervening debug insts.
32892 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
32893 (NextMIIt->getOperand(3).getImm() == CC ||
32894 NextMIIt->getOperand(3).getImm() == OppCC)) {
32895 LastCMOV = &*NextMIIt;
32896 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
32897 }
32898 }
32899
32900 // This checks for case 2, but only if we didn't already find case 1,
32901 // as indicated by LastCMOV == &MI.
32902 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
32903 NextMIIt->getOpcode() == MI.getOpcode() &&
32904 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
32905 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
32906 NextMIIt->getOperand(1).isKill()) {
32907 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
32908 }
32909
32910 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
32911 MachineFunction *F = ThisMBB->getParent();
32912 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
32913 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
32914
32915 MachineFunction::iterator It = ++ThisMBB->getIterator();
32916 F->insert(It, FalseMBB);
32917 F->insert(It, SinkMBB);
32918
32919 // If the EFLAGS register isn't dead in the terminator, then claim that it's
32920 // live into the sink and copy blocks.
32921 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
32922 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
32923 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
32924 FalseMBB->addLiveIn(X86::EFLAGS);
32925 SinkMBB->addLiveIn(X86::EFLAGS);
32926 }
32927
32928 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
32929 auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
32930 auto DbgIt = MachineBasicBlock::iterator(MI);
32931 while (DbgIt != DbgEnd) {
32932 auto Next = std::next(DbgIt);
32933 if (DbgIt->isDebugInstr())
32934 SinkMBB->push_back(DbgIt->removeFromParent());
32935 DbgIt = Next;
32936 }
32937
32938 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
32939 SinkMBB->splice(SinkMBB->end(), ThisMBB,
32940 std::next(MachineBasicBlock::iterator(LastCMOV)),
32941 ThisMBB->end());
32942 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
32943
32944 // Fallthrough block for ThisMBB.
32945 ThisMBB->addSuccessor(FalseMBB);
32946 // The true block target of the first (or only) branch is always SinkMBB.
32947 ThisMBB->addSuccessor(SinkMBB);
32948 // Fallthrough block for FalseMBB.
32949 FalseMBB->addSuccessor(SinkMBB);
32950
32951 // Create the conditional branch instruction.
32952 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
32953
32954 // SinkMBB:
32955 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
32956 // ...
32957 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
32958 MachineBasicBlock::iterator MIItEnd =
32959 std::next(MachineBasicBlock::iterator(LastCMOV));
32960 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
32961
32962 // Now remove the CMOV(s).
32963 ThisMBB->erase(MIItBegin, MIItEnd);
32964
32965 return SinkMBB;
32966}
32967
32968static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
32969 if (IsLP64) {
32970 if (isInt<8>(Imm))
32971 return X86::SUB64ri8;
32972 return X86::SUB64ri32;
32973 } else {
32974 if (isInt<8>(Imm))
32975 return X86::SUB32ri8;
32976 return X86::SUB32ri;
32977 }
32978}
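// For example, getSUBriOpcode(/*IsLP64=*/true, 16) returns X86::SUB64ri8
// because 16 fits in a signed 8-bit immediate, while an immediate such as
// 4096 does not and yields X86::SUB64ri32.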
32979
32980MachineBasicBlock *
32981X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
32982 MachineBasicBlock *MBB) const {
32983 MachineFunction *MF = MBB->getParent();
32984 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32985 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
32986 const DebugLoc &DL = MI.getDebugLoc();
32987 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
32988
32989 const unsigned ProbeSize = getStackProbeSize(*MF);
32990
32991 MachineRegisterInfo &MRI = MF->getRegInfo();
32992 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32993 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32994 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
32995
32996 MachineFunction::iterator MBBIter = ++MBB->getIterator();
32997 MF->insert(MBBIter, testMBB);
32998 MF->insert(MBBIter, blockMBB);
32999 MF->insert(MBBIter, tailMBB);
33000
33001 Register sizeVReg = MI.getOperand(1).getReg();
33002
33003 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
33004
33005 Register TmpStackPtr = MRI.createVirtualRegister(
33006 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33007 Register FinalStackPtr = MRI.createVirtualRegister(
33008 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33009
33010 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
33011 .addReg(physSPReg);
33012 {
33013 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
33014 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
33015 .addReg(TmpStackPtr)
33016 .addReg(sizeVReg);
33017 }
33018
33019 // testMBB: compare the final stack pointer against the current stack pointer.
33020
33021 BuildMI(testMBB, DL,
33022 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
33023 .addReg(FinalStackPtr)
33024 .addReg(physSPReg);
33025
33026 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
33027 .addMBB(tailMBB)
33028 .addImm(X86::COND_GE);
33029 testMBB->addSuccessor(blockMBB);
33030 testMBB->addSuccessor(tailMBB);
33031
33032 // Touch the block, then extend it. This is the opposite order from a static
33033 // probe, where we allocate and then touch; it avoids having to probe the
33034 // tail of the static alloca. Possible scenarios are:
33035 //
33036 // + ---- <- ------------ <- ------------- <- ------------ +
33037 // | |
33038 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
33039 // | |
33040 // + <- ----------- <- ------------ <- ----------- <- ------------ +
33041 //
33042 // The property we want to enforce is to never have more than [page alloc] between two probes.
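// A sketch of what this expands to on a 64-bit target (virtual register
// names are illustrative; see the BuildMI calls below):
//
//   %tmp   = COPY $rsp
//   %final = SUB64rr %tmp, %size
// testMBB:
//   CMP64rr %final, $rsp
//   JCC_1 %tailMBB, COND_GE              ; stop once $rsp has reached %final
// blockMBB:
//   XOR64mi8 [$rsp + 0], 0               ; touch the current page
//   SUB64ri8/ri32 $rsp, ProbeSize        ; extend by at most one probe interval
//   JMP_1 %testMBB
// tailMBB:
//   %result = COPY %final                ; replaces the PROBED_ALLOCA pseudo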
33043
33044 const unsigned XORMIOpc =
33045 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
33046 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
33047 .addImm(0);
33048
33049 BuildMI(blockMBB, DL,
33050 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
33051 .addReg(physSPReg)
33052 .addImm(ProbeSize);
33053
33054
33055 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
33056 blockMBB->addSuccessor(testMBB);
33057
33058 // Replace the original instruction's result with the expected final stack pointer.
33059 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
33060 .addReg(FinalStackPtr);
33061
33062 tailMBB->splice(tailMBB->end(), MBB,
33063 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33064 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
33065 MBB->addSuccessor(testMBB);
33066
33067 // Delete the original pseudo instruction.
33068 MI.eraseFromParent();
33069
33070 // And we're done.
33071 return tailMBB;
33072}
33073
33074MachineBasicBlock *
33075X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
33076 MachineBasicBlock *BB) const {
33077 MachineFunction *MF = BB->getParent();
33078 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33079 const DebugLoc &DL = MI.getDebugLoc();
33080 const BasicBlock *LLVM_BB = BB->getBasicBlock();
33081
33082 assert(MF->shouldSplitStack())((void)0);
33083
33084 const bool Is64Bit = Subtarget.is64Bit();
33085 const bool IsLP64 = Subtarget.isTarget64BitLP64();
33086
33087 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
33088 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
33089
33090 // BB:
33091 // ... [Till the alloca]
33092 // If stacklet is not large enough, jump to mallocMBB
33093 //
33094 // bumpMBB:
33095 // Allocate by subtracting from RSP
33096 // Jump to continueMBB
33097 //
33098 // mallocMBB:
33099 // Allocate by call to runtime
33100 //
33101 // continueMBB:
33102 // ...
33103 // [rest of original BB]
33104 //
33105
33106 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33107 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33108 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33109
33110 MachineRegisterInfo &MRI = MF->getRegInfo();
33111 const TargetRegisterClass *AddrRegClass =
33112 getRegClassFor(getPointerTy(MF->getDataLayout()));
33113
33114 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33115 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33116 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
33117 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
33118 sizeVReg = MI.getOperand(1).getReg(),
33119 physSPReg =
33120 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
33121
33122 MachineFunction::iterator MBBIter = ++BB->getIterator();
33123
33124 MF->insert(MBBIter, bumpMBB);
33125 MF->insert(MBBIter, mallocMBB);
33126 MF->insert(MBBIter, continueMBB);
33127
33128 continueMBB->splice(continueMBB->begin(), BB,
33129 std::next(MachineBasicBlock::iterator(MI)), BB->end());
33130 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
33131
33132 // Add code to the main basic block to check if the stack limit has been hit,
33133 // and if so, jump to mallocMBB; otherwise fall through to bumpMBB.
33134 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
33135 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
33136 .addReg(tmpSPVReg).addReg(sizeVReg);
33137 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
33138 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
33139 .addReg(SPLimitVReg);
33140 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
33141
33142 // bumpMBB simply decreases the stack pointer, since we know the current
33143 // stacklet has enough space.
33144 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
33145 .addReg(SPLimitVReg);
33146 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
33147 .addReg(SPLimitVReg);
33148 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33149
33150 // Calls into a routine in libgcc to allocate more space from the heap.
33151 const uint32_t *RegMask =
33152 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
33153 if (IsLP64) {
33154 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
33155 .addReg(sizeVReg);
33156 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33157 .addExternalSymbol("__morestack_allocate_stack_space")
33158 .addRegMask(RegMask)
33159 .addReg(X86::RDI, RegState::Implicit)
33160 .addReg(X86::RAX, RegState::ImplicitDefine);
33161 } else if (Is64Bit) {
33162 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
33163 .addReg(sizeVReg);
33164 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33165 .addExternalSymbol("__morestack_allocate_stack_space")
33166 .addRegMask(RegMask)
33167 .addReg(X86::EDI, RegState::Implicit)
33168 .addReg(X86::EAX, RegState::ImplicitDefine);
33169 } else {
33170 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
33171 .addImm(12);
33172 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
33173 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
33174 .addExternalSymbol("__morestack_allocate_stack_space")
33175 .addRegMask(RegMask)
33176 .addReg(X86::EAX, RegState::ImplicitDefine);
33177 }
33178
33179 if (!Is64Bit)
33180 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
33181 .addImm(16);
33182
33183 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
33184 .addReg(IsLP64 ? X86::RAX : X86::EAX);
33185 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33186
33187 // Set up the CFG correctly.
33188 BB->addSuccessor(bumpMBB);
33189 BB->addSuccessor(mallocMBB);
33190 mallocMBB->addSuccessor(continueMBB);
33191 bumpMBB->addSuccessor(continueMBB);
33192
33193 // Take care of the PHI nodes.
33194 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
33195 MI.getOperand(0).getReg())
33196 .addReg(mallocPtrVReg)
33197 .addMBB(mallocMBB)
33198 .addReg(bumpSPPtrVReg)
33199 .addMBB(bumpMBB);
33200
33201 // Delete the original pseudo instruction.
33202 MI.eraseFromParent();
33203
33204 // And we're done.
33205 return continueMBB;
33206}
33207
33208MachineBasicBlock *
33209X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
33210 MachineBasicBlock *BB) const {
33211 MachineFunction *MF = BB->getParent();
33212 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33213 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
33214 const DebugLoc &DL = MI.getDebugLoc();
33215
33216 assert(!isAsynchronousEHPersonality(((void)0)
33217 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&((void)0)
33218 "SEH does not use catchret!")((void)0);
33219
33220 // Only 32-bit EH needs to worry about manually restoring stack pointers.
33221 if (!Subtarget.is32Bit())
33222 return BB;
33223
33224 // C++ EH creates a new target block to hold the restore code, and wires up
33225 // the new block to the return destination with a normal JMP_4.
33226 MachineBasicBlock *RestoreMBB =
33227 MF->CreateMachineBasicBlock(BB->getBasicBlock());
33228 assert(BB->succ_size() == 1)((void)0);
33229 MF->insert(std::next(BB->getIterator()), RestoreMBB);
33230 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
33231 BB->addSuccessor(RestoreMBB);
33232 MI.getOperand(0).setMBB(RestoreMBB);
33233
33234 // Marking this as an EH pad but not a funclet entry block causes PEI to
33235 // restore stack pointers in the block.
33236 RestoreMBB->setIsEHPad(true);
33237
33238 auto RestoreMBBI = RestoreMBB->begin();
33239 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
33240 return BB;
33241}
33242
33243MachineBasicBlock *
33244X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
33245 MachineBasicBlock *BB) const {
33246 // So, here we replace TLSADDR with the sequence:
33247 // adjust_stackdown -> TLSADDR -> adjust_stackup.
33248 // We need this because TLSADDR is lowered into calls
33249 // inside MC; therefore, without the two markers, shrink-wrapping
33250 // may push the prologue/epilogue past them.
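// After this runs, the block contains, in order (sketch):
//   <call frame setup>    ; CALLSEQ_START marker
//   TLSADDR ...           ; the original instruction, deliberately kept
//   <call frame destroy>  ; CALLSEQ_END marker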
33251 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33252 const DebugLoc &DL = MI.getDebugLoc();
33253 MachineFunction &MF = *BB->getParent();
33254
33255 // Emit CALLSEQ_START right before the instruction.
33256 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
33257 MachineInstrBuilder CallseqStart =
33258 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
33259 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
33260
33261 // Emit CALLSEQ_END right after the instruction.
33262 // We don't call erase from parent because we want to keep the
33263 // original instruction around.
33264 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
33265 MachineInstrBuilder CallseqEnd =
33266 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
33267 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
33268
33269 return BB;
33270}
33271
33272MachineBasicBlock *
33273X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
33274 MachineBasicBlock *BB) const {
33275 // This is pretty easy. We're taking the value that we received from
33276 // our load from the relocation, sticking it in either RDI (x86-64)
33277 // or EAX and doing an indirect call. The return value will then
33278 // be in the normal return register.
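// On x86-64 this amounts to loading the descriptor address into RDI with a
// RIP-relative MOV64rm and then issuing CALL64m through [RDI], with the
// result returned in RAX; in assembly the usual Darwin TLV sequence is
// roughly "movq _var@TLVP(%rip), %rdi; callq *(%rdi)".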
33279 MachineFunction *F = BB->getParent();
33280 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33281 const DebugLoc &DL = MI.getDebugLoc();
33282
33283 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?")((void)0);
33284 assert(MI.getOperand(3).isGlobal() && "This should be a global")((void)0);
33285
33286 // Get a register mask for the lowered call.
33287 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
33288 // proper register mask.
33289 const uint32_t *RegMask =
33290 Subtarget.is64Bit() ?
33291 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
33292 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
33293 if (Subtarget.is64Bit()) {
33294 MachineInstrBuilder MIB =
33295 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
33296 .addReg(X86::RIP)
33297 .addImm(0)
33298 .addReg(0)
33299 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33300 MI.getOperand(3).getTargetFlags())
33301 .addReg(0);
33302 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
33303 addDirectMem(MIB, X86::RDI);
33304 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
33305 } else if (!isPositionIndependent()) {
33306 MachineInstrBuilder MIB =
33307 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33308 .addReg(0)
33309 .addImm(0)
33310 .addReg(0)
33311 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33312 MI.getOperand(3).getTargetFlags())
33313 .addReg(0);
33314 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33315 addDirectMem(MIB, X86::EAX);
33316 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33317 } else {
33318 MachineInstrBuilder MIB =
33319 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33320 .addReg(TII->getGlobalBaseReg(F))
33321 .addImm(0)
33322 .addReg(0)
33323 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33324 MI.getOperand(3).getTargetFlags())
33325 .addReg(0);
33326 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33327 addDirectMem(MIB, X86::EAX);
33328 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33329 }
33330
33331 MI.eraseFromParent(); // The pseudo instruction is gone now.
33332 return BB;
33333}
33334
33335static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
33336 switch (RPOpc) {
33337 case X86::INDIRECT_THUNK_CALL32:
33338 return X86::CALLpcrel32;
33339 case X86::INDIRECT_THUNK_CALL64:
33340 return X86::CALL64pcrel32;
33341 case X86::INDIRECT_THUNK_TCRETURN32:
33342 return X86::TCRETURNdi;
33343 case X86::INDIRECT_THUNK_TCRETURN64:
33344 return X86::TCRETURNdi64;
33345 }
33346 llvm_unreachable("not indirect thunk opcode")__builtin_unreachable();
33347}
33348
33349static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
33350 unsigned Reg) {
33351 if (Subtarget.useRetpolineExternalThunk()) {
33352 // When using an external thunk for retpolines, we pick names that match the
33353 // names GCC happens to use as well. This helps simplify the implementation
33354 // of the thunks for kernels where they have no easy ability to create
33355 // aliases and are doing non-trivial configuration of the thunk's body. For
33356 // example, the Linux kernel will do boot-time hot patching of the thunk
33357 // bodies and cannot easily export aliases of these to loaded modules.
33358 //
33359 // Note that at any point in the future, we may need to change the semantics
33360 // of how we implement retpolines and at that time will likely change the
33361 // name of the called thunk. Essentially, there is no hard guarantee that
33362 // LLVM will generate calls to specific thunks, we merely make a best-effort
33363 // attempt to help out kernels and other systems where duplicating the
33364 // thunks is costly.
33365 switch (Reg) {
33366 case X86::EAX:
33367 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((void)0);
33368 return "__x86_indirect_thunk_eax";
33369 case X86::ECX:
33370 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((void)0);
33371 return "__x86_indirect_thunk_ecx";
33372 case X86::EDX:
33373 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((void)0);
33374 return "__x86_indirect_thunk_edx";
33375 case X86::EDI:
33376 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((void)0);
33377 return "__x86_indirect_thunk_edi";
33378 case X86::R11:
33379 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!")((void)0);
33380 return "__x86_indirect_thunk_r11";
33381 }
33382 llvm_unreachable("unexpected reg for external indirect thunk")__builtin_unreachable();
33383 }
33384
33385 if (Subtarget.useRetpolineIndirectCalls() ||
33386 Subtarget.useRetpolineIndirectBranches()) {
33387 // When targeting an internal COMDAT thunk use an LLVM-specific name.
33388 switch (Reg) {
33389 case X86::EAX:
33390 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((void)0);
33391 return "__llvm_retpoline_eax";
33392 case X86::ECX:
33393 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((void)0);
33394 return "__llvm_retpoline_ecx";
33395 case X86::EDX:
33396 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((void)0);
33397 return "__llvm_retpoline_edx";
33398 case X86::EDI:
33399 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!")((void)0);
33400 return "__llvm_retpoline_edi";
33401 case X86::R11:
33402 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!")((void)0);
33403 return "__llvm_retpoline_r11";
33404 }
33405 llvm_unreachable("unexpected reg for retpoline")__builtin_unreachable();
33406 }
33407
33408 if (Subtarget.useLVIControlFlowIntegrity()) {
33409 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!")((void)0);
33410 return "__llvm_lvi_thunk_r11";
33411 }
33412 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature")__builtin_unreachable();
33413}
33414
33415MachineBasicBlock *
33416X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
33417 MachineBasicBlock *BB) const {
33418 // Copy the virtual register into the R11 physical register and
33419 // call the retpoline thunk.
33420 const DebugLoc &DL = MI.getDebugLoc();
33421 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33422 Register CalleeVReg = MI.getOperand(0).getReg();
33423 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
33424
33425 // Find an available scratch register to hold the callee. On 64-bit, we can
33426 // just use R11, but we scan for uses anyway to ensure we don't generate
33427 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
33428 // already a register use operand to the call to hold the callee. If none
33429 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
33430 // register and ESI is the base pointer to realigned stack frames with VLAs.
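// On a 64-bit target the end result is, roughly:
//   $r11 = COPY %callee
//   CALL64pcrel32 @<thunk>, implicit killed $r11
// where <thunk> is one of the symbols returned by getIndirectThunkSymbol,
// e.g. __llvm_retpoline_r11.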
33431 SmallVector<unsigned, 3> AvailableRegs;
33432 if (Subtarget.is64Bit())
33433 AvailableRegs.push_back(X86::R11);
33434 else
33435 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
33436
33437 // Zero out any registers that are already used.
33438 for (const auto &MO : MI.operands()) {
33439 if (MO.isReg() && MO.isUse())
33440 for (unsigned &Reg : AvailableRegs)
33441 if (Reg == MO.getReg())
33442 Reg = 0;
33443 }
33444
33445 // Choose the first remaining non-zero available register.
33446 unsigned AvailableReg = 0;
33447 for (unsigned MaybeReg : AvailableRegs) {
33448 if (MaybeReg) {
33449 AvailableReg = MaybeReg;
33450 break;
33451 }
33452 }
33453 if (!AvailableReg)
33454 report_fatal_error("calling convention incompatible with retpoline, no "
33455 "available registers");
33456
33457 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
33458
33459 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
33460 .addReg(CalleeVReg);
33461 MI.getOperand(0).ChangeToES(Symbol);
33462 MI.setDesc(TII->get(Opc));
33463 MachineInstrBuilder(*BB->getParent(), &MI)
33464 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
33465 return BB;
33466}
33467
33468 /// A SetJmp call implies that control flow may later be redirected by the
33469 /// corresponding LongJmp.
33470 /// Instead of using the 'return' instruction, the long jump fixes the stack and
33471 /// performs an indirect branch. To do so it uses the registers that were stored
33472 /// in the jump buffer (when calling SetJmp).
33473 /// If the shadow stack is enabled we need to fix it as well, because some
33474 /// return addresses will be skipped.
33475/// The function will save the SSP for future fixing in the function
33476/// emitLongJmpShadowStackFix.
33477/// \sa emitLongJmpShadowStackFix
33478/// \param [in] MI The temporary Machine Instruction for the builtin.
33479/// \param [in] MBB The Machine Basic Block that will be modified.
33480void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
33481 MachineBasicBlock *MBB) const {
33482 const DebugLoc &DL = MI.getDebugLoc();
33483 MachineFunction *MF = MBB->getParent();
33484 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33485 MachineRegisterInfo &MRI = MF->getRegInfo();
33486 MachineInstrBuilder MIB;
33487
33488 // Memory Reference.
33489 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33490 MI.memoperands_end());
33491
33492 // Initialize a register with zero.
33493 MVT PVT = getPointerTy(MF->getDataLayout());
33494 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33495 Register ZReg = MRI.createVirtualRegister(PtrRC);
33496 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
33497 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
33498 .addDef(ZReg)
33499 .addReg(ZReg, RegState::Undef)
33500 .addReg(ZReg, RegState::Undef);
33501
33502 // Read the current SSP Register value to the zeroed register.
33503 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33504 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33505 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33506
33507 // Write the SSP register value to pointer slot 3 of the input memory buffer.
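// Together with the loads in emitEHSjLjLongJmp below, the buffer layout used
// here is one pointer-sized slot each: slot 0 holds the frame pointer,
// slot 1 the resume address (LabelOffset), slot 2 the stack pointer
// (SPOffset), and slot 3 the shadow stack pointer (SSPOffset).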
33508 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33509 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
33510 const int64_t SSPOffset = 3 * PVT.getStoreSize();
33511 const unsigned MemOpndSlot = 1;
33512 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33513 if (i == X86::AddrDisp)
33514 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
33515 else
33516 MIB.add(MI.getOperand(MemOpndSlot + i));
33517 }
33518 MIB.addReg(SSPCopyReg);
33519 MIB.setMemRefs(MMOs);
33520}
33521
33522MachineBasicBlock *
33523X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
33524 MachineBasicBlock *MBB) const {
33525 const DebugLoc &DL = MI.getDebugLoc();
33526 MachineFunction *MF = MBB->getParent();
33527 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33528 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33529 MachineRegisterInfo &MRI = MF->getRegInfo();
33530
33531 const BasicBlock *BB = MBB->getBasicBlock();
33532 MachineFunction::iterator I = ++MBB->getIterator();
33533
33534 // Memory Reference
33535 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33536 MI.memoperands_end());
33537
33538 unsigned DstReg;
33539 unsigned MemOpndSlot = 0;
33540
33541 unsigned CurOp = 0;
33542
33543 DstReg = MI.getOperand(CurOp++).getReg();
33544 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33545 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!")((void)0);
33546 (void)TRI;
33547 Register mainDstReg = MRI.createVirtualRegister(RC);
33548 Register restoreDstReg = MRI.createVirtualRegister(RC);
33549
33550 MemOpndSlot = CurOp;
33551
33552 MVT PVT = getPointerTy(MF->getDataLayout());
33553 assert((PVT == MVT::i64 || PVT == MVT::i32) &&((void)0)
33554 "Invalid Pointer Size!")((void)0);
33555
33556 // For v = setjmp(buf), we generate
33557 //
33558 // thisMBB:
33559 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
33560 // SjLjSetup restoreMBB
33561 //
33562 // mainMBB:
33563 // v_main = 0
33564 //
33565 // sinkMBB:
33566 // v = phi(main, restore)
33567 //
33568 // restoreMBB:
33569 // if base pointer being used, load it from frame
33570 // v_restore = 1
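// This mirrors the builtin's semantics: a direct return from the setjmp-style
// call yields 0 (v_main), while arrival via the corresponding longjmp yields
// 1 (v_restore).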
33571
33572 MachineBasicBlock *thisMBB = MBB;
33573 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33574 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33575 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
33576 MF->insert(I, mainMBB);
33577 MF->insert(I, sinkMBB);
33578 MF->push_back(restoreMBB);
33579 restoreMBB->setHasAddressTaken();
33580
33581 MachineInstrBuilder MIB;
33582
33583 // Transfer the remainder of BB and its successor edges to sinkMBB.
33584 sinkMBB->splice(sinkMBB->begin(), MBB,
33585 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33586 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33587
33588 // thisMBB:
33589 unsigned PtrStoreOpc = 0;
33590 unsigned LabelReg = 0;
33591 const int64_t LabelOffset = 1 * PVT.getStoreSize();
33592 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33593 !isPositionIndependent();
33594
33595 // Prepare IP either in reg or imm.
33596 if (!UseImmLabel) {
33597 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33598 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33599 LabelReg = MRI.createVirtualRegister(PtrRC);
33600 if (Subtarget.is64Bit()) {
33601 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
33602 .addReg(X86::RIP)
33603 .addImm(0)
33604 .addReg(0)
33605 .addMBB(restoreMBB)
33606 .addReg(0);
33607 } else {
33608 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
33609 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
33610 .addReg(XII->getGlobalBaseReg(MF))
33611 .addImm(0)
33612 .addReg(0)
33613 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
33614 .addReg(0);
33615 }
33616 } else
33617 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33618 // Store IP
33619 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
33620 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33621 if (i == X86::AddrDisp)
33622 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
33623 else
33624 MIB.add(MI.getOperand(MemOpndSlot + i));
33625 }
33626 if (!UseImmLabel)
33627 MIB.addReg(LabelReg);
33628 else
33629 MIB.addMBB(restoreMBB);
33630 MIB.setMemRefs(MMOs);
33631
33632 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33633 emitSetJmpShadowStackFix(MI, thisMBB);
33634 }
33635
33636 // Setup
33637 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
33638 .addMBB(restoreMBB);
33639
33640 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33641 MIB.addRegMask(RegInfo->getNoPreservedMask());
33642 thisMBB->addSuccessor(mainMBB);
33643 thisMBB->addSuccessor(restoreMBB);
33644
33645 // mainMBB:
33646 // EAX = 0
33647 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
33648 mainMBB->addSuccessor(sinkMBB);
33649
33650 // sinkMBB:
33651 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
33652 TII->get(X86::PHI), DstReg)
33653 .addReg(mainDstReg).addMBB(mainMBB)
33654 .addReg(restoreDstReg).addMBB(restoreMBB);
33655
33656 // restoreMBB:
33657 if (RegInfo->hasBasePointer(*MF)) {
33658 const bool Uses64BitFramePtr =
33659 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
33660 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
33661 X86FI->setRestoreBasePointer(MF);
33662 Register FramePtr = RegInfo->getFrameRegister(*MF);
33663 Register BasePtr = RegInfo->getBaseRegister();
33664 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
33665 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
33666 FramePtr, true, X86FI->getRestoreBasePointerOffset())
33667 .setMIFlag(MachineInstr::FrameSetup);
33668 }
33669 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
33670 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33671 restoreMBB->addSuccessor(sinkMBB);
33672
33673 MI.eraseFromParent();
33674 return sinkMBB;
33675}
33676
33677/// Fix the shadow stack using the previously saved SSP pointer.
33678/// \sa emitSetJmpShadowStackFix
33679/// \param [in] MI The temporary Machine Instruction for the builtin.
33680/// \param [in] MBB The Machine Basic Block that will be modified.
33681/// \return The sink MBB that will perform the future indirect branch.
33682MachineBasicBlock *
33683X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
33684 MachineBasicBlock *MBB) const {
33685 const DebugLoc &DL = MI.getDebugLoc();
33686 MachineFunction *MF = MBB->getParent();
33687 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33688 MachineRegisterInfo &MRI = MF->getRegInfo();
33689
33690 // Memory Reference
33691 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33692 MI.memoperands_end());
33693
33694 MVT PVT = getPointerTy(MF->getDataLayout());
33695 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
33696
33697 // checkSspMBB:
33698 // xor vreg1, vreg1
33699 // rdssp vreg1
33700 // test vreg1, vreg1
33701 // je sinkMBB # Jump if Shadow Stack is not supported
33702 // fallMBB:
33703 // mov buf+24/12(%rip), vreg2
33704 // sub vreg1, vreg2
33705 // jbe sinkMBB # No need to fix the Shadow Stack
33706 // fixShadowMBB:
33707 // shr 3/2, vreg2
33708 // incssp vreg2 # fix the SSP according to the lower 8 bits
33709 // shr 8, vreg2
33710 // je sinkMBB
33711 // fixShadowLoopPrepareMBB:
33712 // shl vreg2
33713 // mov 128, vreg3
33714 // fixShadowLoopMBB:
33715 // incssp vreg3
33716 // dec vreg2
33717 // jne fixShadowLoopMBB # Iterate until you finish fixing
33718 // # the Shadow Stack
33719 // sinkMBB:
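// Worked example (64-bit): if the saved SSP is 577 entries (0x1208 bytes)
// above the current SSP, then vreg2 = 0x1208 >> 3 = 577. The first incssp
// consumes 577 & 255 = 65 entries; 577 >> 8 = 2 is non-zero, so after the
// single shift left the loop runs 4 times, adding 4 * 128 = 512 more
// entries, for a total of 65 + 512 = 577.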
33720
33721 MachineFunction::iterator I = ++MBB->getIterator();
33722 const BasicBlock *BB = MBB->getBasicBlock();
33723
33724 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
33725 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33726 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
33727 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
33728 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
33729 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33730 MF->insert(I, checkSspMBB);
33731 MF->insert(I, fallMBB);
33732 MF->insert(I, fixShadowMBB);
33733 MF->insert(I, fixShadowLoopPrepareMBB);
33734 MF->insert(I, fixShadowLoopMBB);
33735 MF->insert(I, sinkMBB);
33736
33737 // Transfer the remainder of BB and its successor edges to sinkMBB.
33738 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
33739 MBB->end());
33740 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33741
33742 MBB->addSuccessor(checkSspMBB);
33743
33744 // Initialize a register with zero.
33745 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
33746 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
33747
33748 if (PVT == MVT::i64) {
33749 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
33750 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
33751 .addImm(0)
33752 .addReg(ZReg)
33753 .addImm(X86::sub_32bit);
33754 ZReg = TmpZReg;
33755 }
33756
33757 // Read the current SSP Register value to the zeroed register.
33758 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
33759 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
33760 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
33761
33762 // Check whether the SSP register value is zero and, if so, jump directly
33763 // to the sink.
33764 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
33765 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
33766 .addReg(SSPCopyReg)
33767 .addReg(SSPCopyReg);
33768 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33769 checkSspMBB->addSuccessor(sinkMBB);
33770 checkSspMBB->addSuccessor(fallMBB);
33771
33772 // Reload the previously saved SSP register value.
33773 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
33774 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33775 const int64_t SPPOffset = 3 * PVT.getStoreSize();
33776 MachineInstrBuilder MIB =
33777 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
33778 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33779 const MachineOperand &MO = MI.getOperand(i);
33780 if (i == X86::AddrDisp)
33781 MIB.addDisp(MO, SPPOffset);
33782 else if (MO.isReg()) // Don't add the whole operand, we don't want to
33783 // preserve kill flags.
33784 MIB.addReg(MO.getReg());
33785 else
33786 MIB.add(MO);
33787 }
33788 MIB.setMemRefs(MMOs);
33789
33790 // Subtract the current SSP from the previous SSP.
33791 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
33792 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
33793 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
33794 .addReg(PrevSSPReg)
33795 .addReg(SSPCopyReg);
33796
33797 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
33798 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
33799 fallMBB->addSuccessor(sinkMBB);
33800 fallMBB->addSuccessor(fixShadowMBB);
33801
33802 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
33803 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
33804 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
33805 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
33806 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
33807 .addReg(SspSubReg)
33808 .addImm(Offset);
33809
33810 // Increment the SSP using only the lower 8 bits of the delta.
33811 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
33812 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
33813
33814 // Reset the lower 8 bits.
33815 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
33816 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
33817 .addReg(SspFirstShrReg)
33818 .addImm(8);
33819
33820 // Jump if the result of the shift is zero.
33821 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
33822 fixShadowMBB->addSuccessor(sinkMBB);
33823 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
33824
33825 // Do a single shift left.
33826 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
33827 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
33828 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
33829 .addReg(SspSecondShrReg);
33830
33831 // Save the value 128 to a register (will be used next with incssp).
33832 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
33833 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
33834 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
33835 .addImm(128);
33836 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
33837
33838 // Since incssp only looks at the lower 8 bits, we might need to do several
33839 // iterations of incssp until we finish fixing the shadow stack.
33840 Register DecReg = MRI.createVirtualRegister(PtrRC);
33841 Register CounterReg = MRI.createVirtualRegister(PtrRC);
33842 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
33843 .addReg(SspAfterShlReg)
33844 .addMBB(fixShadowLoopPrepareMBB)
33845 .addReg(DecReg)
33846 .addMBB(fixShadowLoopMBB);
33847
33848 // Every iteration we increase the SSP by 128.
33849 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
33850
33851 // Every iteration we decrement the counter by 1.
33852 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
33853 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
33854
33855 // Jump if the counter is not zero yet.
33856 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
33857 fixShadowLoopMBB->addSuccessor(sinkMBB);
33858 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
33859
33860 return sinkMBB;
33861}
33862
33863MachineBasicBlock *
33864X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
33865 MachineBasicBlock *MBB) const {
33866 const DebugLoc &DL = MI.getDebugLoc();
33867 MachineFunction *MF = MBB->getParent();
33868 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33869 MachineRegisterInfo &MRI = MF->getRegInfo();
33870
33871 // Memory Reference
33872 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
33873 MI.memoperands_end());
33874
33875 MVT PVT = getPointerTy(MF->getDataLayout());
33876 assert((PVT == MVT::i64 || PVT == MVT::i32) &&((void)0)
33877 "Invalid Pointer Size!")((void)0);
33878
33879 const TargetRegisterClass *RC =
33880 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33881 Register Tmp = MRI.createVirtualRegister(RC);
33882 // Since FP is only updated here but NOT referenced, it's treated as GPR.
33883 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
33884 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
33885 Register SP = RegInfo->getStackRegister();
33886
33887 MachineInstrBuilder MIB;
33888
33889 const int64_t LabelOffset = 1 * PVT.getStoreSize();
33890 const int64_t SPOffset = 2 * PVT.getStoreSize();
33891
33892 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
33893 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
33894
33895 MachineBasicBlock *thisMBB = MBB;
33896
33897 // When CET and the shadow stack are enabled, we need to fix the shadow stack.
33898 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
33899 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
33900 }
33901
33902 // Reload FP
33903 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
33904 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33905 const MachineOperand &MO = MI.getOperand(i);
33906 if (MO.isReg()) // Don't add the whole operand, we don't want to
33907 // preserve kill flags.
33908 MIB.addReg(MO.getReg());
33909 else
33910 MIB.add(MO);
33911 }
33912 MIB.setMemRefs(MMOs);
33913
33914 // Reload IP
33915 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
33916 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33917 const MachineOperand &MO = MI.getOperand(i);
33918 if (i == X86::AddrDisp)
33919 MIB.addDisp(MO, LabelOffset);
33920 else if (MO.isReg()) // Don't add the whole operand, we don't want to
33921 // preserve kill flags.
33922 MIB.addReg(MO.getReg());
33923 else
33924 MIB.add(MO);
33925 }
33926 MIB.setMemRefs(MMOs);
33927
33928 // Reload SP
33929 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
33930 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
33931 if (i == X86::AddrDisp)
33932 MIB.addDisp(MI.getOperand(i), SPOffset);
33933 else
33934 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
33935 // the last instruction of the expansion.
33936 }
33937 MIB.setMemRefs(MMOs);
33938
33939 // Jump
33940 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
33941
33942 MI.eraseFromParent();
33943 return thisMBB;
33944}
33945
33946void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
33947 MachineBasicBlock *MBB,
33948 MachineBasicBlock *DispatchBB,
33949 int FI) const {
33950 const DebugLoc &DL = MI.getDebugLoc();
33951 MachineFunction *MF = MBB->getParent();
33952 MachineRegisterInfo *MRI = &MF->getRegInfo();
33953 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33954
33955 MVT PVT = getPointerTy(MF->getDataLayout());
33956 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!")((void)0);
33957
33958 unsigned Op = 0;
33959 unsigned VR = 0;
33960
33961 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
33962 !isPositionIndependent();
33963
33964 if (UseImmLabel) {
33965 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
33966 } else {
33967 const TargetRegisterClass *TRC =
33968 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
33969 VR = MRI->createVirtualRegister(TRC);
33970 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
33971
33972 if (Subtarget.is64Bit())
33973 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
33974 .addReg(X86::RIP)
33975 .addImm(1)
33976 .addReg(0)
33977 .addMBB(DispatchBB)
33978 .addReg(0);
33979 else
33980 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
33981 .addReg(0) /* TII->getGlobalBaseReg(MF) */
33982 .addImm(1)
33983 .addReg(0)
33984 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
33985 .addReg(0);
33986 }
33987
33988 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
33989 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
33990 if (UseImmLabel)
33991 MIB.addMBB(DispatchBB);
33992 else
33993 MIB.addReg(VR);
33994}
33995
33996MachineBasicBlock *
33997X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
33998 MachineBasicBlock *BB) const {
33999 const DebugLoc &DL = MI.getDebugLoc();
34000 MachineFunction *MF = BB->getParent();
34001 MachineRegisterInfo *MRI = &MF->getRegInfo();
34002 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34003 int FI = MF->getFrameInfo().getFunctionContextIndex();
34004
34005 // Get a mapping of the call site numbers to all of the landing pads they're
34006 // associated with.
34007 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
34008 unsigned MaxCSNum = 0;
34009 for (auto &MBB : *MF) {
34010 if (!MBB.isEHPad())
34011 continue;
34012
34013 MCSymbol *Sym = nullptr;
34014 for (const auto &MI : MBB) {
34015 if (MI.isDebugInstr())
34016 continue;
34017
34018 assert(MI.isEHLabel() && "expected EH_LABEL")((void)0);
34019 Sym = MI.getOperand(0).getMCSymbol();
34020 break;
34021 }
34022
34023 if (!MF->hasCallSiteLandingPad(Sym))
34024 continue;
34025
34026 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
34027 CallSiteNumToLPad[CSI].push_back(&MBB);
34028 MaxCSNum = std::max(MaxCSNum, CSI);
34029 }
34030 }
34031
34032 // Get an ordered list of the machine basic blocks for the jump table.
34033 std::vector<MachineBasicBlock *> LPadList;
34034 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
34035 LPadList.reserve(CallSiteNumToLPad.size());
34036
34037 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
34038 for (auto &LP : CallSiteNumToLPad[CSI]) {
34039 LPadList.push_back(LP);
34040 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
34041 }
34042 }
34043
34044 assert(!LPadList.empty() &&((void)0)
34045 "No landing pad destinations for the dispatch jump table!")((void)0);
34046
34047 // Create the MBBs for the dispatch code.
34048
34049 // Shove the dispatch's address into the return slot in the function context.
34050 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
34051 DispatchBB->setIsEHPad(true);
34052
34053 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
34054 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
34055 DispatchBB->addSuccessor(TrapBB);
34056
34057 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
34058 DispatchBB->addSuccessor(DispContBB);
34059
34060 // Insert MBBs.
34061 MF->push_back(DispatchBB);
34062 MF->push_back(DispContBB);
34063 MF->push_back(TrapBB);
34064
34065 // Insert code into the entry block that creates and registers the function
34066 // context.
34067 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
34068
34069 // Create the jump table and associated information
34070 unsigned JTE = getJumpTableEncoding();
34071 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
34072 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
34073
34074 const X86RegisterInfo &RI = TII->getRegisterInfo();
34075 // Add a register mask with no preserved registers. This results in all
34076 // registers being marked as clobbered.
34077 if (RI.hasBasePointer(*MF)) {
34078 const bool FPIs64Bit =
34079 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
34080 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
34081 MFI->setRestoreBasePointer(MF);
34082
34083 Register FP = RI.getFrameRegister(*MF);
34084 Register BP = RI.getBaseRegister();
34085 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
34086 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
34087 MFI->getRestoreBasePointerOffset())
34088 .addRegMask(RI.getNoPreservedMask());
34089 } else {
34090 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
34091 .addRegMask(RI.getNoPreservedMask());
34092 }
34093
34094 // IReg is used as an index in a memory operand and therefore can't be SP
34095 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
34096 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
34097 Subtarget.is64Bit() ? 8 : 4);
34098 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
34099 .addReg(IReg)
34100 .addImm(LPadList.size());
34101 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
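// Call-site indices >= LPadList.size() are invalid and fall into TrapBB,
// which contains only an X86::TRAP.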
34102
34103 if (Subtarget.is64Bit()) {
34104 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34105 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
34106
34107 // leaq .LJTI0_0(%rip), BReg
34108 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
34109 .addReg(X86::RIP)
34110 .addImm(1)
34111 .addReg(0)
34112 .addJumpTableIndex(MJTI)
34113 .addReg(0);
34114 // movzx IReg64, IReg
34115 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
34116 .addImm(0)
34117 .addReg(IReg)
34118 .addImm(X86::sub_32bit);
34119
34120 switch (JTE) {
34121 case MachineJumpTableInfo::EK_BlockAddress:
34122 // jmpq *(BReg,IReg64,8)
34123 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
34124 .addReg(BReg)
34125 .addImm(8)
34126 .addReg(IReg64)
34127 .addImm(0)
34128 .addReg(0);
34129 break;
34130 case MachineJumpTableInfo::EK_LabelDifference32: {
34131 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
34132 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
34133 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34134
34135 // movl (BReg,IReg64,4), OReg
34136 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
34137 .addReg(BReg)
34138 .addImm(4)
34139 .addReg(IReg64)
34140 .addImm(0)
34141 .addReg(0);
34142 // movsx OReg64, OReg
34143 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
34144 // addq BReg, OReg64, TReg
34145 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
34146 .addReg(OReg64)
34147 .addReg(BReg);
34148 // jmpq *TReg
34149 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
34150 break;
34151 }
34152 default:
34153 llvm_unreachable("Unexpected jump table encoding")__builtin_unreachable();
34154 }
34155 } else {
34156 // jmpl *.LJTI0_0(,IReg,4)
34157 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
34158 .addReg(0)
34159 .addImm(4)
34160 .addReg(IReg)
34161 .addJumpTableIndex(MJTI)
34162 .addReg(0);
34163 }
34164
34165 // Add the jump table entries as successors to the MBB.
34166 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
34167 for (auto &LP : LPadList)
34168 if (SeenMBBs.insert(LP).second)
34169 DispContBB->addSuccessor(LP);
34170
34171 // N.B. the order the invoke BBs are processed in doesn't matter here.
34172 SmallVector<MachineBasicBlock *, 64> MBBLPads;
34173 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
34174 for (MachineBasicBlock *MBB : InvokeBBs) {
34175 // Remove the landing pad successor from the invoke block and replace it
34176 // with the new dispatch block.
34177 // Keep a copy of Successors since it's modified inside the loop.
34178 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
34179 MBB->succ_rend());
34180 // FIXME: Avoid quadratic complexity.
34181 for (auto MBBS : Successors) {
34182 if (MBBS->isEHPad()) {
34183 MBB->removeSuccessor(MBBS);
34184 MBBLPads.push_back(MBBS);
34185 }
34186 }
34187
34188 MBB->addSuccessor(DispatchBB);
34189
34190 // Find the invoke call and mark all of the callee-saved registers as
34191 // 'implicit defined' so that they're spilled. This prevents later passes
34192 // from moving instructions to before the EH block, where they would never
34193 // be executed.
34194 for (auto &II : reverse(*MBB)) {
34195 if (!II.isCall())
34196 continue;
34197
34198 DenseMap<unsigned, bool> DefRegs;
34199 for (auto &MOp : II.operands())
34200 if (MOp.isReg())
34201 DefRegs[MOp.getReg()] = true;
34202
34203 MachineInstrBuilder MIB(*MF, &II);
34204 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
34205 unsigned Reg = SavedRegs[RegIdx];
34206 if (!DefRegs[Reg])
34207 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
34208 }
34209
34210 break;
34211 }
34212 }
34213
34214 // Mark all former landing pads as non-landing pads. The dispatch is the only
34215 // landing pad now.
34216 for (auto &LP : MBBLPads)
34217 LP->setIsEHPad(false);
34218
34219 // The instruction is gone now.
34220 MI.eraseFromParent();
34221 return BB;
34222}
34223
34224MachineBasicBlock *
34225X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
34226 MachineBasicBlock *BB) const {
34227 MachineFunction *MF = BB->getParent();
34228 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34229 const DebugLoc &DL = MI.getDebugLoc();
34230
34231 auto TMMImmToTMMReg = [](unsigned Imm) {
34232 assert (Imm < 8 && "Illegal tmm index")((void)0);
34233 return X86::TMM0 + Imm;
34234 };
34235 switch (MI.getOpcode()) {
34236 default: llvm_unreachable("Unexpected instr type to insert")__builtin_unreachable();
34237 case X86::TLS_addr32:
34238 case X86::TLS_addr64:
34239 case X86::TLS_addrX32:
34240 case X86::TLS_base_addr32:
34241 case X86::TLS_base_addr64:
34242 case X86::TLS_base_addrX32:
34243 return EmitLoweredTLSAddr(MI, BB);
34244 case X86::INDIRECT_THUNK_CALL32:
34245 case X86::INDIRECT_THUNK_CALL64:
34246 case X86::INDIRECT_THUNK_TCRETURN32:
34247 case X86::INDIRECT_THUNK_TCRETURN64:
34248 return EmitLoweredIndirectThunk(MI, BB);
34249 case X86::CATCHRET:
34250 return EmitLoweredCatchRet(MI, BB);
34251 case X86::SEG_ALLOCA_32:
34252 case X86::SEG_ALLOCA_64:
34253 return EmitLoweredSegAlloca(MI, BB);
34254 case X86::PROBED_ALLOCA_32:
34255 case X86::PROBED_ALLOCA_64:
34256 return EmitLoweredProbedAlloca(MI, BB);
34257 case X86::TLSCall_32:
34258 case X86::TLSCall_64:
34259 return EmitLoweredTLSCall(MI, BB);
34260 case X86::CMOV_FR32:
34261 case X86::CMOV_FR32X:
34262 case X86::CMOV_FR64:
34263 case X86::CMOV_FR64X:
34264 case X86::CMOV_GR8:
34265 case X86::CMOV_GR16:
34266 case X86::CMOV_GR32:
34267 case X86::CMOV_RFP32:
34268 case X86::CMOV_RFP64:
34269 case X86::CMOV_RFP80:
34270 case X86::CMOV_VR64:
34271 case X86::CMOV_VR128:
34272 case X86::CMOV_VR128X:
34273 case X86::CMOV_VR256:
34274 case X86::CMOV_VR256X:
34275 case X86::CMOV_VR512:
34276 case X86::CMOV_VK1:
34277 case X86::CMOV_VK2:
34278 case X86::CMOV_VK4:
34279 case X86::CMOV_VK8:
34280 case X86::CMOV_VK16:
34281 case X86::CMOV_VK32:
34282 case X86::CMOV_VK64:
34283 return EmitLoweredSelect(MI, BB);
34284
34285 case X86::RDFLAGS32:
34286 case X86::RDFLAGS64: {
34287 unsigned PushF =
34288 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
34289 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
34290 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
34291 // Permit reads of the EFLAGS and DF registers without them being defined.
34292 // This intrinsic exists to read external processor state in flags, such as
34293 // the trap flag, interrupt flag, and direction flag, none of which are
34294 // modeled by the backend.
34295 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
34296 "Unexpected register in operand!");
34297 Push->getOperand(2).setIsUndef();
34298 assert(Push->getOperand(3).getReg() == X86::DF &&
34299 "Unexpected register in operand!");
34300 Push->getOperand(3).setIsUndef();
34301 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
34302
34303 MI.eraseFromParent(); // The pseudo is gone now.
34304 return BB;
34305 }
34306
34307 case X86::WRFLAGS32:
34308 case X86::WRFLAGS64: {
34309 unsigned Push =
34310 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
34311 unsigned PopF =
34312 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
34313 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
34314 BuildMI(*BB, MI, DL, TII->get(PopF));
34315
34316 MI.eraseFromParent(); // The pseudo is gone now.
34317 return BB;
34318 }
34319
34320 case X86::FP32_TO_INT16_IN_MEM:
34321 case X86::FP32_TO_INT32_IN_MEM:
34322 case X86::FP32_TO_INT64_IN_MEM:
34323 case X86::FP64_TO_INT16_IN_MEM:
34324 case X86::FP64_TO_INT32_IN_MEM:
34325 case X86::FP64_TO_INT64_IN_MEM:
34326 case X86::FP80_TO_INT16_IN_MEM:
34327 case X86::FP80_TO_INT32_IN_MEM:
34328 case X86::FP80_TO_INT64_IN_MEM: {
34329 // Change the floating point control register to use "round towards zero"
34330 // mode when truncating to an integer value.
34331 int OrigCWFrameIdx =
34332 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34333 addFrameReference(BuildMI(*BB, MI, DL,
34334 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
34335
34336 // Load the old value of the control word...
34337 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34338 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
34339 OrigCWFrameIdx);
34340
34341 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
34342 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34343 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
34344 .addReg(OldCW, RegState::Kill).addImm(0xC00);
34345
34346 // Extract to 16 bits.
34347 Register NewCW16 =
34348 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
34349 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
34350 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
34351
34352 // Prepare memory for FLDCW.
34353 int NewCWFrameIdx =
34354 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34355 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
34356 NewCWFrameIdx)
34357 .addReg(NewCW16, RegState::Kill);
34358
34359 // Reload the modified control word now...
34360 addFrameReference(BuildMI(*BB, MI, DL,
34361 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
34362
34363 // Get the X86 opcode to use.
34364 unsigned Opc;
34365 switch (MI.getOpcode()) {
34366 default: llvm_unreachable("illegal opcode!");
34367 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
34368 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
34369 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
34370 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
34371 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
34372 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
34373 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
34374 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
34375 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
34376 }
34377
34378 X86AddressMode AM = getAddressFromInstr(&MI, 0);
34379 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
34380 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
34381
34382 // Reload the original control word now.
34383 addFrameReference(BuildMI(*BB, MI, DL,
34384 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
34385
34386 MI.eraseFromParent(); // The pseudo instruction is gone now.
34387 return BB;
34388 }
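// A minimal standalone sketch (plain C++, not part of this file) of the
// control-word edit performed by the OR above: bits 10-11 of the x87 control
// word form the rounding-control (RC) field, and 0b11 there selects
// round-toward-zero. The 0x037F starting value is only an illustrative
// default.
#include <cstdint>
static_assert(((uint16_t(0x037F | 0xC00) >> 10) & 0x3) == 0x3,
              "OR-ing 0xC00 forces RC = 0b11 (round toward zero)");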
34389
34390 // xbegin
34391 case X86::XBEGIN:
34392 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
34393
34394 case X86::VAARG_64:
34395 case X86::VAARG_X32:
34396 return EmitVAARGWithCustomInserter(MI, BB);
34397
34398 case X86::EH_SjLj_SetJmp32:
34399 case X86::EH_SjLj_SetJmp64:
34400 return emitEHSjLjSetJmp(MI, BB);
34401
34402 case X86::EH_SjLj_LongJmp32:
34403 case X86::EH_SjLj_LongJmp64:
34404 return emitEHSjLjLongJmp(MI, BB);
34405
34406 case X86::Int_eh_sjlj_setup_dispatch:
34407 return EmitSjLjDispatchBlock(MI, BB);
34408
34409 case TargetOpcode::STATEPOINT:
34410 // As an implementation detail, STATEPOINT shares the STACKMAP format at
34411 // this point in the process. We diverge later.
34412 return emitPatchPoint(MI, BB);
34413
34414 case TargetOpcode::STACKMAP:
34415 case TargetOpcode::PATCHPOINT:
34416 return emitPatchPoint(MI, BB);
34417
34418 case TargetOpcode::PATCHABLE_EVENT_CALL:
34419 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
34420 return BB;
34421
34422 case X86::LCMPXCHG8B: {
34423 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34424 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
34425 // requires a memory operand. If the current architecture happens to be
34426 // i686 and the current function needs a base pointer
34427 // - which is ESI for i686 - the register allocator would not be able to
34428 // allocate registers for an address of the form X(%reg, %reg, Y):
34429 // there would never be enough unreserved registers during regalloc
34430 // (without the need for a base ptr, the only option would be X(%edi, %esi, Y)).
34431 // We give the register allocator a hand by precomputing the address in
34432 // a new vreg using LEA.
34433
34434 // If it is not i686 or there is no base pointer, there is nothing to do here.
34435 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
34436 return BB;
34437
34438 // Even though this code does not necessarily need the base pointer to
34439 // be ESI, we check for that. The reason: if this assert fails, something
34440 // has changed in the compiler's base pointer handling, and it most
34441 // probably has to be addressed somehow here.
34442 assert(TRI->getBaseRegister() == X86::ESI &&
34443 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
34444 "base pointer in mind");
34445
34446 MachineRegisterInfo &MRI = MF->getRegInfo();
34447 MVT SPTy = getPointerTy(MF->getDataLayout());
34448 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
34449 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
34450
34451 X86AddressMode AM = getAddressFromInstr(&MI, 0);
34452 // Regalloc does not need any help when the memory operand of CMPXCHG8B
34453 // does not use an index register.
34454 if (AM.IndexReg == X86::NoRegister)
34455 return BB;
34456
34457 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
34458 // four operand definitions that are E[ABCD] registers. We skip them and
34459 // then insert the LEA.
34460 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
34461 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
34462 RMBBI->definesRegister(X86::EBX) ||
34463 RMBBI->definesRegister(X86::ECX) ||
34464 RMBBI->definesRegister(X86::EDX))) {
34465 ++RMBBI;
34466 }
34467 MachineBasicBlock::iterator MBBI(RMBBI);
34468 addFullAddress(
34469 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
34470
34471 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
34472
34473 return BB;
34474 }
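// Standalone sketch (plain C++, not LLVM code, helper name is illustrative)
// of what the LEA inserted above precomputes: the full X(%reg, %reg, Y)
// address collapses into a single register, so the CMPXCHG8B memory operand
// no longer needs an index register.
#include <cstdint>
static inline uint32_t leaAddress(uint32_t Base, uint32_t Index,
                                  uint32_t Scale, uint32_t Disp) {
  return Base + Index * Scale + Disp; // disp(base, index, scale)
}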
34475 case X86::LCMPXCHG16B_NO_RBX: {
34476 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34477 Register BasePtr = TRI->getBaseRegister();
34478 if (TRI->hasBasePointer(*MF) &&
34479 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
34480 if (!BB->isLiveIn(BasePtr))
34481 BB->addLiveIn(BasePtr);
34482 // Save RBX into a virtual register.
34483 Register SaveRBX =
34484 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34485 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34486 .addReg(X86::RBX);
34487 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34488 MachineInstrBuilder MIB =
34489 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
34490 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34491 MIB.add(MI.getOperand(Idx));
34492 MIB.add(MI.getOperand(X86::AddrNumOperands));
34493 MIB.addReg(SaveRBX);
34494 } else {
34495 // Simple case, just copy the virtual register to RBX.
34496 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
34497 .add(MI.getOperand(X86::AddrNumOperands));
34498 MachineInstrBuilder MIB =
34499 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
34500 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
34501 MIB.add(MI.getOperand(Idx));
34502 }
34503 MI.eraseFromParent();
34504 return BB;
34505 }
34506 case X86::MWAITX: {
34507 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34508 Register BasePtr = TRI->getBaseRegister();
34509 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
34510 // If there is no need to save the base pointer, we generate MWAITXrrr;
34511 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
34512 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
34513 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34514 .addReg(MI.getOperand(0).getReg());
34515 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34516 .addReg(MI.getOperand(1).getReg());
34517 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
34518 .addReg(MI.getOperand(2).getReg());
34519 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
34520 MI.eraseFromParent();
34521 } else {
34522 if (!BB->isLiveIn(BasePtr)) {
34523 BB->addLiveIn(BasePtr);
34524 }
34525 // Parameters can be copied into ECX and EAX but not EBX yet.
34526 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
34527 .addReg(MI.getOperand(0).getReg());
34528 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
34529 .addReg(MI.getOperand(1).getReg());
34530 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
34531 // Save RBX into a virtual register.
34532 Register SaveRBX =
34533 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34534 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
34535 .addReg(X86::RBX);
34536 // Generate mwaitx pseudo.
34537 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
34538 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
34539 .addDef(Dst) // Destination tied in with SaveRBX.
34540 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
34541 .addUse(SaveRBX); // Save of base pointer.
34542 MI.eraseFromParent();
34543 }
34544 return BB;
34545 }
34546 case TargetOpcode::PREALLOCATED_SETUP: {
34547 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
34548 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34549 MFI->setHasPreallocatedCall(true);
34550 int64_t PreallocatedId = MI.getOperand(0).getImm();
34551 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
34552 assert(StackAdjustment != 0 && "0 stack adjustment");
34553 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
34554 << StackAdjustment << "\n");
34555 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
34556 .addReg(X86::ESP)
34557 .addImm(StackAdjustment);
34558 MI.eraseFromParent();
34559 return BB;
34560 }
34561 case TargetOpcode::PREALLOCATED_ARG: {
34562 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
34563 int64_t PreallocatedId = MI.getOperand(1).getImm();
34564 int64_t ArgIdx = MI.getOperand(2).getImm();
34565 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
34566 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
34567 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
34568 << ", arg offset " << ArgOffset << "\n");
34569 // stack pointer + offset
34570 addRegOffset(
34571 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
34572 X86::ESP, false, ArgOffset);
34573 MI.eraseFromParent();
34574 return BB;
34575 }
34576 case X86::PTDPBSSD:
34577 case X86::PTDPBSUD:
34578 case X86::PTDPBUSD:
34579 case X86::PTDPBUUD:
34580 case X86::PTDPBF16PS: {
34581 unsigned Opc;
34582 switch (MI.getOpcode()) {
34583 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
34584 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
34585 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
34586 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
34587 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
34588 }
34589
34590 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34591 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
34592 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
34593 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
34594 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
34595
34596 MI.eraseFromParent(); // The pseudo is gone now.
34597 return BB;
34598 }
34599 case X86::PTILEZERO: {
34600 unsigned Imm = MI.getOperand(0).getImm();
34601 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
34602 MI.eraseFromParent(); // The pseudo is gone now.
34603 return BB;
34604 }
34605 case X86::PTILELOADD:
34606 case X86::PTILELOADDT1:
34607 case X86::PTILESTORED: {
34608 unsigned Opc;
34609 switch (MI.getOpcode()) {
34610 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
34611 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
34612 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
34613 }
34614
34615 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
34616 unsigned CurOp = 0;
34617 if (Opc != X86::TILESTORED)
34618 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34619 RegState::Define);
34620
34621 MIB.add(MI.getOperand(CurOp++)); // base
34622 MIB.add(MI.getOperand(CurOp++)); // scale
34623 MIB.add(MI.getOperand(CurOp++)); // index -- stride
34624 MIB.add(MI.getOperand(CurOp++)); // displacement
34625 MIB.add(MI.getOperand(CurOp++)); // segment
34626
34627 if (Opc == X86::TILESTORED)
34628 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
34629 RegState::Undef);
34630
34631 MI.eraseFromParent(); // The pseudo is gone now.
34632 return BB;
34633 }
34634 }
34635}
34636
34637//===----------------------------------------------------------------------===//
34638// X86 Optimization Hooks
34639//===----------------------------------------------------------------------===//
34640
34641bool
34642X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
34643 const APInt &DemandedBits,
34644 const APInt &DemandedElts,
34645 TargetLoweringOpt &TLO) const {
34646 EVT VT = Op.getValueType();
34647 unsigned Opcode = Op.getOpcode();
34648 unsigned EltSize = VT.getScalarSizeInBits();
34649
34650 if (VT.isVector()) {
34651 // If the constant is all sign bits in the active bits, then we should
34652 // sign-extend it to the entire constant to allow it to act as a boolean
34653 // constant vector.
34654 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
34655 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
34656 return false;
34657 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
34658 if (!DemandedElts[i] || V.getOperand(i).isUndef())
34659 continue;
34660 const APInt &Val = V.getConstantOperandAPInt(i);
34661 if (Val.getBitWidth() > Val.getNumSignBits() &&
34662 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
34663 return true;
34664 }
34665 return false;
34666 };
34667 // For vectors - if we have a constant, then try to sign extend.
34668 // TODO: Handle AND/ANDN cases.
34669 unsigned ActiveBits = DemandedBits.getActiveBits();
34670 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
34671 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
34672 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
34673 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
34674 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
34675 VT.getVectorNumElements());
34676 SDValue NewC =
34677 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
34678 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
34679 SDValue NewOp =
34680 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
34681 return TLO.CombineTo(Op, NewOp);
34682 }
34683 return false;
34684 }
34685
34686 // Only optimize Ands to prevent shrinking a constant that could be
34687 // matched by movzx.
34688 if (Opcode != ISD::AND)
34689 return false;
34690
34691 // Make sure the RHS really is a constant.
34692 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
34693 if (!C)
34694 return false;
34695
34696 const APInt &Mask = C->getAPIntValue();
34697
34698 // Clear all non-demanded bits initially.
34699 APInt ShrunkMask = Mask & DemandedBits;
34700
34701 // Find the width of the shrunk mask.
34702 unsigned Width = ShrunkMask.getActiveBits();
34703
34704 // If the mask is all 0s there's nothing to do here.
34705 if (Width == 0)
34706 return false;
34707
34708 // Find the next power of 2 width, rounding up to a byte.
34709 Width = PowerOf2Ceil(std::max(Width, 8U));
34710 // Truncate the width to size to handle illegal types.
34711 Width = std::min(Width, EltSize);
34712
34713 // Calculate a possible zero extend mask for this constant.
34714 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
34715
34716 // If we aren't changing the mask, just return true to keep it and prevent
34717 // the caller from optimizing.
34718 if (ZeroExtendMask == Mask)
34719 return true;
34720
34721 // Make sure the new mask can be represented by a combination of mask bits
34722 // and non-demanded bits.
34723 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
34724 return false;
34725
34726 // Replace the constant with the zero extend mask.
34727 SDLoc DL(Op);
34728 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
34729 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
34730 return TLO.CombineTo(Op, NewOp);
34731}
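// Standalone sketch (plain C++20, scalar 64-bit case only, no LLVM types,
// helper name is illustrative) of the mask-widening logic above: shrink the
// AND mask to the demanded bits, round the active width up to a power of two
// of at least one byte, clamp to the element size, and accept the widened
// (movzx-friendly) mask only if it stays within Mask | ~DemandedBits.
#include <algorithm>
#include <bit>
#include <cstdint>

static bool canWidenToZeroExtendMask(uint64_t Mask, uint64_t DemandedBits,
                                     unsigned EltSize, uint64_t &ZExtMask) {
  uint64_t Shrunk = Mask & DemandedBits;
  if (Shrunk == 0)
    return false;                                   // nothing demanded
  unsigned Width = 64 - std::countl_zero(Shrunk);   // active bits
  Width = std::min(std::bit_ceil(std::max(Width, 8u)), EltSize);
  ZExtMask = Width >= 64 ? ~0ULL : ((1ULL << Width) - 1);
  return (ZExtMask & ~(Mask | ~DemandedBits)) == 0; // subset check
}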
34732
34733void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
34734 KnownBits &Known,
34735 const APInt &DemandedElts,
34736 const SelectionDAG &DAG,
34737 unsigned Depth) const {
34738 unsigned BitWidth = Known.getBitWidth();
34739 unsigned NumElts = DemandedElts.getBitWidth();
34740 unsigned Opc = Op.getOpcode();
34741 EVT VT = Op.getValueType();
34742 assert((Opc >= ISD::BUILTIN_OP_END ||
34743 Opc == ISD::INTRINSIC_WO_CHAIN ||
34744 Opc == ISD::INTRINSIC_W_CHAIN ||
34745 Opc == ISD::INTRINSIC_VOID) &&
34746 "Should use MaskedValueIsZero if you don't know whether Op"
34747 " is a target node!");
34748
34749 Known.resetAll();
34750 switch (Opc) {
34751 default: break;
34752 case X86ISD::SETCC:
34753 Known.Zero.setBitsFrom(1);
34754 break;
34755 case X86ISD::MOVMSK: {
34756 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
34757 Known.Zero.setBitsFrom(NumLoBits);
34758 break;
34759 }
34760 case X86ISD::PEXTRB:
34761 case X86ISD::PEXTRW: {
34762 SDValue Src = Op.getOperand(0);
34763 EVT SrcVT = Src.getValueType();
34764 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
34765 Op.getConstantOperandVal(1));
34766 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
34767 Known = Known.anyextOrTrunc(BitWidth);
34768 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
34769 break;
34770 }
34771 case X86ISD::VSRAI:
34772 case X86ISD::VSHLI:
34773 case X86ISD::VSRLI: {
34774 unsigned ShAmt = Op.getConstantOperandVal(1);
34775 if (ShAmt >= VT.getScalarSizeInBits()) {
34776 Known.setAllZero();
34777 break;
34778 }
34779
34780 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34781 if (Opc == X86ISD::VSHLI) {
34782 Known.Zero <<= ShAmt;
34783 Known.One <<= ShAmt;
34784 // Low bits are known zero.
34785 Known.Zero.setLowBits(ShAmt);
34786 } else if (Opc == X86ISD::VSRLI) {
34787 Known.Zero.lshrInPlace(ShAmt);
34788 Known.One.lshrInPlace(ShAmt);
34789 // High bits are known zero.
34790 Known.Zero.setHighBits(ShAmt);
34791 } else {
34792 Known.Zero.ashrInPlace(ShAmt);
34793 Known.One.ashrInPlace(ShAmt);
34794 }
34795 break;
34796 }
34797 case X86ISD::PACKUS: {
34798 // PACKUS is just a truncation if the upper half is zero.
34799 APInt DemandedLHS, DemandedRHS;
34800 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
34801
34802 Known.One = APInt::getAllOnesValue(BitWidth * 2);
34803 Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
34804
34805 KnownBits Known2;
34806 if (!!DemandedLHS) {
34807 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
34808 Known = KnownBits::commonBits(Known, Known2);
34809 }
34810 if (!!DemandedRHS) {
34811 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
34812 Known = KnownBits::commonBits(Known, Known2);
34813 }
34814
34815 if (Known.countMinLeadingZeros() < BitWidth)
34816 Known.resetAll();
34817 Known = Known.trunc(BitWidth);
34818 break;
34819 }
34820 case X86ISD::VBROADCAST: {
34821 SDValue Src = Op.getOperand(0);
34822 if (!Src.getSimpleValueType().isVector()) {
34823 Known = DAG.computeKnownBits(Src, Depth + 1);
34824 return;
34825 }
34826 break;
34827 }
34828 case X86ISD::ANDNP: {
34829 KnownBits Known2;
34830 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34831 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34832
34833 // ANDNP = (~X & Y);
34834 Known.One &= Known2.Zero;
34835 Known.Zero |= Known2.One;
34836 break;
34837 }
34838 case X86ISD::FOR: {
34839 KnownBits Known2;
34840 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34841 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34842
34843 Known |= Known2;
34844 break;
34845 }
34846 case X86ISD::PSADBW: {
34847 assert(VT.getScalarType() == MVT::i64 &&
34848 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
34849 "Unexpected PSADBW types");
34850
34851 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
34852 Known.Zero.setBitsFrom(16);
34853 break;
34854 }
34855 case X86ISD::PMULUDQ: {
34856 KnownBits Known2;
34857 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34858 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34859
34860 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
34861 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
34862 Known = KnownBits::mul(Known, Known2);
34863 break;
34864 }
34865 case X86ISD::CMOV: {
34866 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
34867 // If we don't know any bits, early out.
34868 if (Known.isUnknown())
34869 break;
34870 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
34871
34872 // Only known if known in both the LHS and RHS.
34873 Known = KnownBits::commonBits(Known, Known2);
34874 break;
34875 }
34876 case X86ISD::BEXTR:
34877 case X86ISD::BEXTRI: {
34878 SDValue Op0 = Op.getOperand(0);
34879 SDValue Op1 = Op.getOperand(1);
34880
34881 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
34882 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
34883 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
34884
34885 // If the length is 0, the result is 0.
34886 if (Length == 0) {
34887 Known.setAllZero();
34888 break;
34889 }
34890
34891 if ((Shift + Length) <= BitWidth) {
34892 Known = DAG.computeKnownBits(Op0, Depth + 1);
34893 Known = Known.extractBits(Length, Shift);
34894 Known = Known.zextOrTrunc(BitWidth);
34895 }
34896 }
34897 break;
34898 }
34899 case X86ISD::PDEP: {
34900 KnownBits Known2;
34901 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34902 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
34903 // Zeros are retained from the mask operand, but not ones.
34904 Known.One.clearAllBits();
34905 // The result will have at least as many trailing zeros as the non-mask
34906 // operand since bits can only map to the same or higher bit position.
34907 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
34908 break;
34909 }
34910 case X86ISD::PEXT: {
34911 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
34912 // The result has at least as many leading zeros as there are known zero bits in the mask.
34913 unsigned Count = Known.Zero.countPopulation();
34914 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
34915 Known.One.clearAllBits();
34916 break;
34917 }
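// Standalone software model (plain C++, not the BMI2 intrinsic, helper name
// is illustrative) of the PEXT fact used above: pext packs the source bits
// selected by the mask into the low end of the result, so the result can
// have at most popcount(Mask) non-zero bits and therefore at least as many
// leading zeros as the mask has zero bits.
#include <cstdint>

static uint64_t pext64(uint64_t Src, uint64_t Mask) {
  uint64_t Res = 0;
  unsigned Out = 0;
  for (unsigned i = 0; i != 64; ++i)
    if (Mask & (1ULL << i)) {
      if (Src & (1ULL << i))
        Res |= 1ULL << Out;
      ++Out;
    }
  return Res;
}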
34918 case X86ISD::VTRUNC:
34919 case X86ISD::VTRUNCS:
34920 case X86ISD::VTRUNCUS:
34921 case X86ISD::CVTSI2P:
34922 case X86ISD::CVTUI2P:
34923 case X86ISD::CVTP2SI:
34924 case X86ISD::CVTP2UI:
34925 case X86ISD::MCVTP2SI:
34926 case X86ISD::MCVTP2UI:
34927 case X86ISD::CVTTP2SI:
34928 case X86ISD::CVTTP2UI:
34929 case X86ISD::MCVTTP2SI:
34930 case X86ISD::MCVTTP2UI:
34931 case X86ISD::MCVTSI2P:
34932 case X86ISD::MCVTUI2P:
34933 case X86ISD::VFPROUND:
34934 case X86ISD::VMFPROUND:
34935 case X86ISD::CVTPS2PH:
34936 case X86ISD::MCVTPS2PH: {
34937 // Truncations/Conversions - upper elements are known zero.
34938 EVT SrcVT = Op.getOperand(0).getValueType();
34939 if (SrcVT.isVector()) {
34940 unsigned NumSrcElts = SrcVT.getVectorNumElements();
34941 if (NumElts > NumSrcElts &&
34942 DemandedElts.countTrailingZeros() >= NumSrcElts)
34943 Known.setAllZero();
34944 }
34945 break;
34946 }
34947 case X86ISD::STRICT_CVTTP2SI:
34948 case X86ISD::STRICT_CVTTP2UI:
34949 case X86ISD::STRICT_CVTSI2P:
34950 case X86ISD::STRICT_CVTUI2P:
34951 case X86ISD::STRICT_VFPROUND:
34952 case X86ISD::STRICT_CVTPS2PH: {
34953 // Strict Conversions - upper elements are known zero.
34954 EVT SrcVT = Op.getOperand(1).getValueType();
34955 if (SrcVT.isVector()) {
34956 unsigned NumSrcElts = SrcVT.getVectorNumElements();
34957 if (NumElts > NumSrcElts &&
34958 DemandedElts.countTrailingZeros() >= NumSrcElts)
34959 Known.setAllZero();
34960 }
34961 break;
34962 }
34963 case X86ISD::MOVQ2DQ: {
34964 // Move from MMX to XMM. Upper half of XMM should be 0.
34965 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
34966 Known.setAllZero();
34967 break;
34968 }
34969 }
34970
34971 // Handle target shuffles.
34972 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
34973 if (isTargetShuffle(Opc)) {
34974 SmallVector<int, 64> Mask;
34975 SmallVector<SDValue, 2> Ops;
34976 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
34977 unsigned NumOps = Ops.size();
34978 unsigned NumElts = VT.getVectorNumElements();
34979 if (Mask.size() == NumElts) {
34980 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
34981 Known.Zero.setAllBits(); Known.One.setAllBits();
34982 for (unsigned i = 0; i != NumElts; ++i) {
34983 if (!DemandedElts[i])
34984 continue;
34985 int M = Mask[i];
34986 if (M == SM_SentinelUndef) {
34987 // For UNDEF elements, we don't know anything about the common state
34988 // of the shuffle result.
34989 Known.resetAll();
34990 break;
34991 }
34992 if (M == SM_SentinelZero) {
34993 Known.One.clearAllBits();
34994 continue;
34995 }
34996 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
34997 "Shuffle index out of range");
34998
34999 unsigned OpIdx = (unsigned)M / NumElts;
35000 unsigned EltIdx = (unsigned)M % NumElts;
35001 if (Ops[OpIdx].getValueType() != VT) {
35002 // TODO - handle target shuffle ops with different value types.
35003 Known.resetAll();
35004 break;
35005 }
35006 DemandedOps[OpIdx].setBit(EltIdx);
35007 }
35008 // Known bits are the values that are shared by every demanded element.
35009 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
35010 if (!DemandedOps[i])
35011 continue;
35012 KnownBits Known2 =
35013 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
35014 Known = KnownBits::commonBits(Known, Known2);
35015 }
35016 }
35017 }
35018 }
35019}
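// Tiny standalone model (plain C++, not the LLVM KnownBits class) of the
// "common bits" merge used for shuffles and CMOV above: a bit is known in
// the result only if it is known, with the same value, in every contributing
// operand.
#include <cstdint>

struct KnownModel {
  uint64_t Zero = 0; // bits known to be 0
  uint64_t One = 0;  // bits known to be 1
};

static KnownModel commonBits(KnownModel A, KnownModel B) {
  return {A.Zero & B.Zero, A.One & B.One};
}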
35020
35021unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
35022 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
35023 unsigned Depth) const {
35024 EVT VT = Op.getValueType();
35025 unsigned VTBits = VT.getScalarSizeInBits();
35026 unsigned Opcode = Op.getOpcode();
35027 switch (Opcode) {
35028 case X86ISD::SETCC_CARRY:
35029 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
35030 return VTBits;
35031
35032 case X86ISD::VTRUNC: {
35033 SDValue Src = Op.getOperand(0);
35034 MVT SrcVT = Src.getSimpleValueType();
35035 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
35036 assert(VTBits < NumSrcBits && "Illegal truncation input type");
35037 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
35038 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
35039 if (Tmp > (NumSrcBits - VTBits))
35040 return Tmp - (NumSrcBits - VTBits);
35041 return 1;
35042 }
35043
35044 case X86ISD::PACKSS: {
35045 // PACKSS is just a truncation if the sign bits extend to the packed size.
35046 APInt DemandedLHS, DemandedRHS;
35047 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
35048 DemandedRHS);
35049
35050 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
35051 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
35052 if (!!DemandedLHS)
35053 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
35054 if (!!DemandedRHS)
35055 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
35056 unsigned Tmp = std::min(Tmp0, Tmp1);
35057 if (Tmp > (SrcBits - VTBits))
35058 return Tmp - (SrcBits - VTBits);
35059 return 1;
35060 }
35061
35062 case X86ISD::VBROADCAST: {
35063 SDValue Src = Op.getOperand(0);
35064 if (!Src.getSimpleValueType().isVector())
35065 return DAG.ComputeNumSignBits(Src, Depth + 1);
35066 break;
35067 }
35068
35069 case X86ISD::VSHLI: {
35070 SDValue Src = Op.getOperand(0);
35071 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
35072 if (ShiftVal.uge(VTBits))
35073 return VTBits; // Shifted all bits out --> zero.
35074 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35075 if (ShiftVal.uge(Tmp))
35076 return 1; // Shifted all sign bits out --> unknown.
35077 return Tmp - ShiftVal.getZExtValue();
35078 }
35079
35080 case X86ISD::VSRAI: {
35081 SDValue Src = Op.getOperand(0);
35082 APInt ShiftVal = Op.getConstantOperandAPInt(1);
35083 if (ShiftVal.uge(VTBits - 1))
35084 return VTBits; // Sign splat.
35085 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35086 ShiftVal += Tmp;
35087 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
35088 }
35089
35090 case X86ISD::FSETCC:
35091 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
35092 if (VT == MVT::f32 || VT == MVT::f64 ||
35093 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
35094 return VTBits;
35095 break;
35096
35097 case X86ISD::PCMPGT:
35098 case X86ISD::PCMPEQ:
35099 case X86ISD::CMPP:
35100 case X86ISD::VPCOM:
35101 case X86ISD::VPCOMU:
35102 // Vector compares return zero/all-bits result values.
35103 return VTBits;
35104
35105 case X86ISD::ANDNP: {
35106 unsigned Tmp0 =
35107 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
35108 if (Tmp0 == 1) return 1; // Early out.
35109 unsigned Tmp1 =
35110 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
35111 return std::min(Tmp0, Tmp1);
35112 }
35113
35114 case X86ISD::CMOV: {
35115 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
35116 if (Tmp0 == 1) return 1; // Early out.
35117 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
35118 return std::min(Tmp0, Tmp1);
35119 }
35120 }
35121
35122 // Handle target shuffles.
35123 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
35124 if (isTargetShuffle(Opcode)) {
35125 SmallVector<int, 64> Mask;
35126 SmallVector<SDValue, 2> Ops;
35127 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
35128 unsigned NumOps = Ops.size();
35129 unsigned NumElts = VT.getVectorNumElements();
35130 if (Mask.size() == NumElts) {
35131 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
35132 for (unsigned i = 0; i != NumElts; ++i) {
35133 if (!DemandedElts[i])
35134 continue;
35135 int M = Mask[i];
35136 if (M == SM_SentinelUndef) {
35137 // For UNDEF elements, we don't know anything about the common state
35138 // of the shuffle result.
35139 return 1;
35140 } else if (M == SM_SentinelZero) {
35141 // Zero = all sign bits.
35142 continue;
35143 }
35144 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
35145 "Shuffle index out of range");
35146
35147 unsigned OpIdx = (unsigned)M / NumElts;
35148 unsigned EltIdx = (unsigned)M % NumElts;
35149 if (Ops[OpIdx].getValueType() != VT) {
35150 // TODO - handle target shuffle ops with different value types.
35151 return 1;
35152 }
35153 DemandedOps[OpIdx].setBit(EltIdx);
35154 }
35155 unsigned Tmp0 = VTBits;
35156 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
35157 if (!DemandedOps[i])
35158 continue;
35159 unsigned Tmp1 =
35160 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
35161 Tmp0 = std::min(Tmp0, Tmp1);
35162 }
35163 return Tmp0;
35164 }
35165 }
35166 }
35167
35168 // Fallback case.
35169 return 1;
35170}
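// Standalone sketch (plain C++, helper name is illustrative) of the VSRAI
// rule used above: an arithmetic right shift by N duplicates the sign bit N
// more times, so the known sign-bit count grows by N, saturating at the
// element width.
#include <algorithm>

static unsigned signBitsAfterAShr(unsigned KnownSignBits, unsigned ShiftAmt,
                                  unsigned VTBits) {
  if (ShiftAmt >= VTBits - 1)
    return VTBits; // the value becomes a splat of the sign bit
  return std::min(KnownSignBits + ShiftAmt, VTBits);
}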
35171
35172SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
35173 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
35174 return N->getOperand(0);
35175 return N;
35176}
35177
35178// Helper to look for a normal load that can be narrowed into a vzload with the
35179// specified VT and memory VT. Returns SDValue() on failure.
35180static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
35181 SelectionDAG &DAG) {
35182 // Can't if the load is volatile or atomic.
35183 if (!LN->isSimple())
35184 return SDValue();
35185
35186 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35187 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
35188 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
35189 LN->getPointerInfo(), LN->getOriginalAlign(),
35190 LN->getMemOperand()->getFlags());
35191}
35192
35193// Attempt to match a combined shuffle mask against supported unary shuffle
35194// instructions.
35195// TODO: Investigate sharing more of this with shuffle lowering.
35196static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35197 bool AllowFloatDomain, bool AllowIntDomain,
35198 SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
35199 const X86Subtarget &Subtarget, unsigned &Shuffle,
35200 MVT &SrcVT, MVT &DstVT) {
35201 unsigned NumMaskElts = Mask.size();
35202 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
35203
35204 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
35205 if (MaskEltSize == 32 && Mask[0] == 0) {
35206 if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
35207 Shuffle = X86ISD::VZEXT_MOVL;
35208 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35209 return true;
35210 }
35211 if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35212 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35213 Shuffle = X86ISD::VZEXT_MOVL;
35214 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35215 return true;
35216 }
35217 }
35218
35219 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
35220 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
35221 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
35222 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
35223 unsigned MaxScale = 64 / MaskEltSize;
35224 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
35225 bool MatchAny = true;
35226 bool MatchZero = true;
35227 unsigned NumDstElts = NumMaskElts / Scale;
35228 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
35229 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
35230 MatchAny = MatchZero = false;
35231 break;
35232 }
35233 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
35234 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
35235 }
35236 if (MatchAny || MatchZero) {
35237 assert(MatchZero && "Failed to match zext but matched aext?");
35238 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
35239 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
35240 MVT::getIntegerVT(MaskEltSize);
35241 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
35242
35243 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
35244 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
35245
35246 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
35247 if (SrcVT.getVectorNumElements() != NumDstElts)
35248 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
35249
35250 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
35251 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
35252 return true;
35253 }
35254 }
35255 }
35256
35257 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
35258 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
35259 isUndefOrEqual(Mask[0], 0) &&
35260 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35261 Shuffle = X86ISD::VZEXT_MOVL;
35262 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35263 return true;
35264 }
35265
35266 // Check if we have SSE3, which will let us use MOVDDUP etc. The
35267 // instructions are no slower than UNPCKLPD but have the option to
35268 // fold the input operand into even an unaligned memory load.
35269 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
35270 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
35271 Shuffle = X86ISD::MOVDDUP;
35272 SrcVT = DstVT = MVT::v2f64;
35273 return true;
35274 }
35275 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35276 Shuffle = X86ISD::MOVSLDUP;
35277 SrcVT = DstVT = MVT::v4f32;
35278 return true;
35279 }
35280 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
35281 Shuffle = X86ISD::MOVSHDUP;
35282 SrcVT = DstVT = MVT::v4f32;
35283 return true;
35284 }
35285 }
35286
35287 if (MaskVT.is256BitVector() && AllowFloatDomain) {
35288 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
35289 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35290 Shuffle = X86ISD::MOVDDUP;
35291 SrcVT = DstVT = MVT::v4f64;
35292 return true;
35293 }
35294 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35295 Shuffle = X86ISD::MOVSLDUP;
35296 SrcVT = DstVT = MVT::v8f32;
35297 return true;
35298 }
35299 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
35300 Shuffle = X86ISD::MOVSHDUP;
35301 SrcVT = DstVT = MVT::v8f32;
35302 return true;
35303 }
35304 }
35305
35306 if (MaskVT.is512BitVector() && AllowFloatDomain) {
35307 assert(Subtarget.hasAVX512() &&
35308 "AVX512 required for 512-bit vector shuffles");
35309 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35310 Shuffle = X86ISD::MOVDDUP;
35311 SrcVT = DstVT = MVT::v8f64;
35312 return true;
35313 }
35314 if (isTargetShuffleEquivalent(
35315 MaskVT, Mask,
35316 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
35317 Shuffle = X86ISD::MOVSLDUP;
35318 SrcVT = DstVT = MVT::v16f32;
35319 return true;
35320 }
35321 if (isTargetShuffleEquivalent(
35322 MaskVT, Mask,
35323 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
35324 Shuffle = X86ISD::MOVSHDUP;
35325 SrcVT = DstVT = MVT::v16f32;
35326 return true;
35327 }
35328 }
35329
35330 return false;
35331}
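// Standalone sketch (plain C++, helper name is illustrative) of the mask
// test behind the isTargetShuffleEquivalent calls above, simplified here to
// ignore zeroable elements: every mask entry must either be undef (-1) or
// equal the expected pattern entry, e.g. {0, 0, 2, 2} for MOVSLDUP.
#include <cstddef>
#include <vector>

static bool matchesAllowingUndef(const std::vector<int> &Mask,
                                 const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (std::size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] != -1 && Mask[i] != Expected[i])
      return false;
  return true;
}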
35332
35333// Attempt to match a combined shuffle mask against supported unary immediate
35334// permute instructions.
35335// TODO: Investigate sharing more of this with shuffle lowering.
35336static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
35337 const APInt &Zeroable,
35338 bool AllowFloatDomain, bool AllowIntDomain,
35339 const X86Subtarget &Subtarget,
35340 unsigned &Shuffle, MVT &ShuffleVT,
35341 unsigned &PermuteImm) {
35342 unsigned NumMaskElts = Mask.size();
35343 unsigned InputSizeInBits = MaskVT.getSizeInBits();
35344 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
35345 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
35346 bool ContainsZeros = isAnyZero(Mask);
35347
35348 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
35349 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
35350 // Check for lane crossing permutes.
35351 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
35352 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
35353 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
35354 Shuffle = X86ISD::VPERMI;
35355 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
35356 PermuteImm = getV4X86ShuffleImm(Mask);
35357 return true;
35358 }
35359 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
35360 SmallVector<int, 4> RepeatedMask;
35361 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
35362 Shuffle = X86ISD::VPERMI;
35363 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
35364 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
35365 return true;
35366 }
35367 }
35368 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
35369 // VPERMILPD can permute with a non-repeating shuffle.
35370 Shuffle = X86ISD::VPERMILPI;
35371 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
35372 PermuteImm = 0;
35373 for (int i = 0, e = Mask.size(); i != e; ++i) {
35374 int M = Mask[i];
35375 if (M == SM_SentinelUndef)
35376 continue;
35377 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
35378 PermuteImm |= (M & 1) << i;
35379 }
35380 return true;
35381 }
35382 }
35383
35384 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
35385 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
35386 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
35387 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
35388 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
35389 SmallVector<int, 4> RepeatedMask;
35390 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35391 // Narrow the repeated mask to create 32-bit element permutes.
35392 SmallVector<int, 4> WordMask = RepeatedMask;
35393 if (MaskScalarSizeInBits == 64)
35394 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
35395
35396 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
35397 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
35398 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
35399 PermuteImm = getV4X86ShuffleImm(WordMask);
35400 return true;
35401 }
35402 }
35403
35404 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
35405 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
35406 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35407 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35408 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35409 SmallVector<int, 4> RepeatedMask;
35410 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
35411 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
35412 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
35413
35414 // PSHUFLW: permute lower 4 elements only.
35415 if (isUndefOrInRange(LoMask, 0, 4) &&
35416 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
35417 Shuffle = X86ISD::PSHUFLW;
35418 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35419 PermuteImm = getV4X86ShuffleImm(LoMask);
35420 return true;
35421 }
35422
35423 // PSHUFHW: permute upper 4 elements only.
35424 if (isUndefOrInRange(HiMask, 4, 8) &&
35425 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
35426 // Offset the HiMask so that we can create the shuffle immediate.
35427 int OffsetHiMask[4];
35428 for (int i = 0; i != 4; ++i)
35429 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
35430
35431 Shuffle = X86ISD::PSHUFHW;
35432 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
35433 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
35434 return true;
35435 }
35436 }
35437 }
35438
35439 // Attempt to match against byte/bit shifts.
35440 if (AllowIntDomain &&
35441 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35442 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35443 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35444 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
35445 Mask, 0, Zeroable, Subtarget);
35446 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
35447 32 <= ShuffleVT.getScalarSizeInBits())) {
35448 PermuteImm = (unsigned)ShiftAmt;
35449 return true;
35450 }
35451 }
35452
35453 // Attempt to match against bit rotates.
35454 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
35455 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
35456 Subtarget.hasAVX512())) {
35457 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
35458 Subtarget, Mask);
35459 if (0 < RotateAmt) {
35460 Shuffle = X86ISD::VROTLI;
35461 PermuteImm = (unsigned)RotateAmt;
35462 return true;
35463 }
35464 }
35465
35466 return false;
35467}
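// Standalone sketch (plain C++, helper name is illustrative) of how a
// 4-element repeated mask packs into the 8-bit immediate consumed by
// PSHUFD/PSHUFLW/PSHUFHW above: two bits per destination element, element i
// stored at bit position 2*i, with undef entries treated as 0 here for
// simplicity.
static unsigned packV4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= static_cast<unsigned>(Mask[i] < 0 ? 0 : (Mask[i] & 0x3)) << (2 * i);
  return Imm;
}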
35468
35469// Attempt to match a combined unary shuffle mask against supported binary
35470// shuffle instructions.
35471// TODO: Investigate sharing more of this with shuffle lowering.
35472static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35473 bool AllowFloatDomain, bool AllowIntDomain,
35474 SDValue &V1, SDValue &V2, const SDLoc &DL,
35475 SelectionDAG &DAG, const X86Subtarget &Subtarget,
35476 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
35477 bool IsUnary) {
35478 unsigned NumMaskElts = Mask.size();
35479 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35480
35481 if (MaskVT.is128BitVector()) {
35482 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
35483 V2 = V1;
35484 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
35485 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
35486 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35487 return true;
35488 }
35489 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
35490 V2 = V1;
35491 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
35492 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
35493 return true;
35494 }
35495 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
35496 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
35497 std::swap(V1, V2);
35498 Shuffle = X86ISD::MOVSD;
35499 SrcVT = DstVT = MVT::v2f64;
35500 return true;
35501 }
35502 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
35503 (AllowFloatDomain || !Subtarget.hasSSE41())) {
35504 Shuffle = X86ISD::MOVSS;
35505 SrcVT = DstVT = MVT::v4f32;
35506 return true;
35507 }
35508 }
35509
35510 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
35511 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
35512 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
35513 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
35514 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
35515 Subtarget)) {
35516 DstVT = MaskVT;
35517 return true;
35518 }
35519 }
35520
35521 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
35522 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
35523 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35524 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
35525 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35526 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
35527 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
35528 Subtarget)) {
35529 SrcVT = DstVT = MaskVT;
35530 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
35531 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
35532 return true;
35533 }
35534 }
35535
35536 // Attempt to match against an OR if we're performing a blend shuffle and the
35537 // non-blended source element is zero in each case.
35538 if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35539 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
35540 bool IsBlend = true;
35541 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
35542 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
35543 unsigned Scale1 = NumV1Elts / NumMaskElts;
35544 unsigned Scale2 = NumV2Elts / NumMaskElts;
35545 APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
35546 APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
35547 for (unsigned i = 0; i != NumMaskElts; ++i) {
35548 int M = Mask[i];
35549 if (M == SM_SentinelUndef)
35550 continue;
35551 if (M == SM_SentinelZero) {
35552 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35553 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35554 continue;
35555 }
35556 if (M == (int)i) {
35557 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
35558 continue;
35559 }
35560 if (M == (int)(i + NumMaskElts)) {
35561 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
35562 continue;
35563 }
35564 IsBlend = false;
35565 break;
35566 }
35567 if (IsBlend &&
35568 DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
35569 DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
35570 Shuffle = ISD::OR;
35571 SrcVT = DstVT = MaskVT.changeTypeToInteger();
35572 return true;
35573 }
35574 }
35575
35576 return false;
35577}
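// Standalone sketch (plain C++, helper name is illustrative) of the OR-blend
// test above, simplified to equal-sized operands and without the explicit
// zero sentinel: the shuffle can become a bitwise OR when every lane is
// undef, keeps V1 while V2 is provably zero there, or keeps V2 while V1 is
// provably zero there.
#include <vector>

static bool canLowerBlendToOr(const std::vector<int> &Mask,
                              const std::vector<bool> &V1LaneKnownZero,
                              const std::vector<bool> &V2LaneKnownZero) {
  int N = static_cast<int>(Mask.size());
  for (int i = 0; i != N; ++i) {
    int M = Mask[i];
    if (M == -1)
      continue;                           // undef lane
    if (M == i && V2LaneKnownZero[i])
      continue;                           // take V1; V2 contributes zero
    if (M == i + N && V1LaneKnownZero[i])
      continue;                           // take V2; V1 contributes zero
    return false;
  }
  return true;
}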
35578
35579static bool matchBinaryPermuteShuffle(
35580 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
35581 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
35582 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
35583 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
35584 unsigned NumMaskElts = Mask.size();
35585 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
35586
35587 // Attempt to match against VALIGND/VALIGNQ rotate.
35588 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
35589 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
35590 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
35591 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35592 if (!isAnyZero(Mask)) {
35593 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
35594 if (0 < Rotation) {
35595 Shuffle = X86ISD::VALIGN;
35596 if (EltSizeInBits == 64)
35597 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
35598 else
35599 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
35600 PermuteImm = Rotation;
35601 return true;
35602 }
35603 }
35604 }
35605
35606 // Attempt to match against PALIGNR byte rotate.
35607 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
35608 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
35609 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
35610 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
35611 if (0 < ByteRotation) {
35612 Shuffle = X86ISD::PALIGNR;
35613 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
35614 PermuteImm = ByteRotation;
35615 return true;
35616 }
35617 }
35618
35619 // Attempt to combine to X86ISD::BLENDI.
35620 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
35621 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
35622 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
35623 uint64_t BlendMask = 0;
35624 bool ForceV1Zero = false, ForceV2Zero = false;
35625 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
35626 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
35627 ForceV2Zero, BlendMask)) {
35628 if (MaskVT == MVT::v16i16) {
35629 // We can only use v16i16 PBLENDW if the lanes are repeated.
35630 SmallVector<int, 8> RepeatedMask;
35631 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
35632 RepeatedMask)) {
35633 assert(RepeatedMask.size() == 8 &&
35634 "Repeated mask size doesn't match!");
35635 PermuteImm = 0;
35636 for (int i = 0; i < 8; ++i)
35637 if (RepeatedMask[i] >= 8)
35638 PermuteImm |= 1 << i;
35639 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35640 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35641 Shuffle = X86ISD::BLENDI;
35642 ShuffleVT = MaskVT;
35643 return true;
35644 }
35645 } else {
35646 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35647 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35648 PermuteImm = (unsigned)BlendMask;
35649 Shuffle = X86ISD::BLENDI;
35650 ShuffleVT = MaskVT;
35651 return true;
35652 }
35653 }
35654 }
35655
35656 // Attempt to combine to INSERTPS, but only if it has elements that need to
35657 // be set to zero.
35658 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35659 MaskVT.is128BitVector() && isAnyZero(Mask) &&
35660 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35661 Shuffle = X86ISD::INSERTPS;
35662 ShuffleVT = MVT::v4f32;
35663 return true;
35664 }
35665
35666 // Attempt to combine to SHUFPD.
35667 if (AllowFloatDomain && EltSizeInBits == 64 &&
35668 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
35669 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35670 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35671 bool ForceV1Zero = false, ForceV2Zero = false;
35672 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
35673 PermuteImm, Mask, Zeroable)) {
35674 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
35675 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
35676 Shuffle = X86ISD::SHUFP;
35677 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
35678 return true;
35679 }
35680 }
35681
35682 // Attempt to combine to SHUFPS.
35683 if (AllowFloatDomain && EltSizeInBits == 32 &&
35684 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
35685 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
35686 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
35687 SmallVector<int, 4> RepeatedMask;
35688 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
35689 // Match each half of the repeated mask to determine if it's just
35690 // referencing one of the vectors, is zeroable, or is entirely undef.
35691 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
35692 int M0 = RepeatedMask[Offset];
35693 int M1 = RepeatedMask[Offset + 1];
35694
35695 if (isUndefInRange(RepeatedMask, Offset, 2)) {
35696 return DAG.getUNDEF(MaskVT);
35697 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
35698 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
35699 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
35700 return getZeroVector(MaskVT, Subtarget, DAG, DL);
35701 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
35702 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35703 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35704 return V1;
35705 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
35706 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
35707 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
35708 return V2;
35709 }
35710
35711 return SDValue();
35712 };
35713
35714 int ShufMask[4] = {-1, -1, -1, -1};
35715 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
35716 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
35717
35718 if (Lo && Hi) {
35719 V1 = Lo;
35720 V2 = Hi;
35721 Shuffle = X86ISD::SHUFP;
35722 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
35723 PermuteImm = getV4X86ShuffleImm(ShufMask);
35724 return true;
35725 }
35726 }
35727 }
35728
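The PermuteImm produced by getV4X86ShuffleImm above is the usual SHUFPS/PSHUFD control byte: four 2-bit element selectors. A rough standalone equivalent (sentinel handling simplified; the helper name is invented and the exact undef handling in LLVM may differ):

// Pack four 2-bit selectors into a SHUFPS-style control byte. Undef (-1)
// entries are simply encoded as 0 here.
static unsigned v4ShuffleImm(const int (&M)[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned((M[i] < 0 ? 0 : M[i]) & 3) << (i * 2);
  return Imm;
}
// With SHUFPS the low two result elements read the first operand (Lo above)
// and the high two read the second (Hi above), each via its 2-bit field.
// e.g. ShufMask {2,3,0,1} -> 0b01001110 = 0x4E.
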
35729 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
35730 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
35731 MaskVT.is128BitVector() &&
35732 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
35733 Shuffle = X86ISD::INSERTPS;
35734 ShuffleVT = MVT::v4f32;
35735 return true;
35736 }
35737
35738 return false;
35739}
35740
35741static SDValue combineX86ShuffleChainWithExtract(
35742 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
35743 bool HasVariableMask, bool AllowVariableCrossLaneMask,
35744 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
35745 const X86Subtarget &Subtarget);
35746
35747/// Combine an arbitrary chain of shuffles into a single instruction if
35748/// possible.
35749///
35750/// This is the leaf of the recursive combine below. When we have found some
35751/// chain of single-use x86 shuffle instructions and accumulated the combined
35752/// shuffle mask represented by them, this will try to pattern match that mask
35753/// into either a single instruction if there is a special purpose instruction
35754/// for this operation, or into a PSHUFB instruction which is a fully general
35755/// instruction but should only be used to replace chains over a certain depth.
35756static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
35757 ArrayRef<int> BaseMask, int Depth,
35758 bool HasVariableMask,
35759 bool AllowVariableCrossLaneMask,
35760 bool AllowVariablePerLaneMask,
35761 SelectionDAG &DAG,
35762 const X86Subtarget &Subtarget) {
35763 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!")((void)0);
35764 assert((Inputs.size() == 1 || Inputs.size() == 2) &&((void)0)
35765 "Unexpected number of shuffle inputs!")((void)0);
35766
35767 MVT RootVT = Root.getSimpleValueType();
35768 unsigned RootSizeInBits = RootVT.getSizeInBits();
35769 unsigned NumRootElts = RootVT.getVectorNumElements();
35770
35771 // Canonicalize shuffle input op to the requested type.
35772 // TODO: Support cases where Op is smaller than VT.
35773 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
35774 return DAG.getBitcast(VT, Op);
35775 };
35776
35777 // Find the inputs that enter the chain. Note that multiple uses are OK
35778 // here; we're not going to remove the operands we find.
35779 bool UnaryShuffle = (Inputs.size() == 1);
35780 SDValue V1 = peekThroughBitcasts(Inputs[0]);
35781 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
35782 : peekThroughBitcasts(Inputs[1]));
35783
35784 MVT VT1 = V1.getSimpleValueType();
35785 MVT VT2 = V2.getSimpleValueType();
35786 assert(VT1.getSizeInBits() == RootSizeInBits &&((void)0)
35787 VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch")((void)0);
35788
35789 SDLoc DL(Root);
35790 SDValue Res;
35791
35792 unsigned NumBaseMaskElts = BaseMask.size();
35793 if (NumBaseMaskElts == 1) {
35794 assert(BaseMask[0] == 0 && "Invalid shuffle index found!")((void)0);
35795 return CanonicalizeShuffleInput(RootVT, V1);
35796 }
35797
35798 bool OptForSize = DAG.shouldOptForSize();
35799 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
35800 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
35801 (RootVT.isFloatingPoint() && Depth >= 1) ||
35802 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
35803
35804 // Don't combine if we are an AVX512/EVEX target and the mask element size
35805 // is different from the root element size - this would prevent writemasks
35806 // from being reused.
35807 bool IsMaskedShuffle = false;
35808 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
35809 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
35810 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
35811 IsMaskedShuffle = true;
35812 }
35813 }
35814
35815 // If we are shuffling a broadcast (and not introducing zeros) then
35816 // we can just use the broadcast directly. This works for smaller broadcast
35817 // elements as well, since they already repeat across each mask element.
35818 if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
35819 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
35820 V1.getValueSizeInBits() >= RootSizeInBits) {
35821 return CanonicalizeShuffleInput(RootVT, V1);
35822 }
35823
35824 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
35825 // etc. can be simplified.
35826 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
35827 SmallVector<int> ScaledMask, IdentityMask;
35828 unsigned NumElts = VT1.getVectorNumElements();
35829 if (BaseMask.size() <= NumElts &&
35830 scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
35831 for (unsigned i = 0; i != NumElts; ++i)
35832 IdentityMask.push_back(i);
35833 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
35834 return CanonicalizeShuffleInput(RootVT, V1);
35835 }
35836 }
35837
35838 // Handle 128/256-bit lane shuffles of 512-bit vectors.
35839 if (RootVT.is512BitVector() &&
35840 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
35841 // If the upper subvectors are zeroable, then an extract+insert is cheaper
35842 // than using X86ISD::SHUF128. The insertion is free, even if it has
35843 // to zero the upper subvectors.
35844 if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
35845 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35846 return SDValue(); // Nothing to do!
35847 assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&((void)0)
35848 "Unexpected lane shuffle")((void)0);
35849 Res = CanonicalizeShuffleInput(RootVT, V1);
35850 unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
35851 bool UseZero = isAnyZero(BaseMask);
35852 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
35853 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
35854 }
35855
35856 // Narrow shuffle mask to v4x128.
35857 SmallVector<int, 4> Mask;
35858 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size")((void)0);
35859 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
35860
35861 // Try to lower to vshuf64x2/vshuf32x4.
35862 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
35863 SDValue V1, SDValue V2, SelectionDAG &DAG) {
35864 unsigned PermMask = 0;
35865 // Ensure elements came from the same Op.
35866 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
35867 for (int i = 0; i < 4; ++i) {
35868 assert(Mask[i] >= -1 && "Illegal shuffle sentinel value")((void)0);
35869 if (Mask[i] < 0)
35870 continue;
35871
35872 SDValue Op = Mask[i] >= 4 ? V2 : V1;
35873 unsigned OpIndex = i / 2;
35874 if (Ops[OpIndex].isUndef())
35875 Ops[OpIndex] = Op;
35876 else if (Ops[OpIndex] != Op)
35877 return SDValue();
35878
35879 // Convert the 128-bit shuffle mask selection values into 128-bit
35880 // selection bits defined by a vshuf64x2 instruction's immediate control
35881 // byte.
35882 PermMask |= (Mask[i] % 4) << (i * 2);
35883 }
35884
35885 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
35886 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
35887 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
35888 DAG.getTargetConstant(PermMask, DL, MVT::i8));
35889 };
35890
35891 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
35892 // doesn't work because our mask is for 128 bits and we don't have an MVT
35893 // to match that.
35894 bool PreferPERMQ =
35895 UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
35896 isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
35897 isUndefOrInRange(Mask[3], 2, 4) &&
35898 (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
35899 (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
35900
35901 if (!isAnyZero(Mask) && !PreferPERMQ) {
35902 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35903 return SDValue(); // Nothing to do!
35904 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
35905 if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
35906 return DAG.getBitcast(RootVT, V);
35907 }
35908 }
35909
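The MatchSHUF128 lambda above encodes the VSHUF64x2/VSHUF32x4 immediate as four 2-bit fields, one per destination 128-bit lane; result lanes 0-1 read the first operand and lanes 2-3 the second. A standalone sketch of just the immediate packing (helper name invented):

static unsigned shuf128Imm(const int (&LaneMask)[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    if (LaneMask[i] >= 0)
      Imm |= unsigned(LaneMask[i] % 4) << (i * 2); // lane index within its source
  return Imm;
}
// e.g. LaneMask {0,1,4,5} (low 256 bits of V1, then low 256 bits of V2)
// -> 0b01000100 = 0x44.
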
35910 // Handle 128-bit lane shuffles of 256-bit vectors.
35911 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
35912 // If the upper half is zeroable, then an extract+insert is cheaper
35913 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
35914 // zero the upper half.
35915 if (isUndefOrZero(BaseMask[1])) {
35916 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35917 return SDValue(); // Nothing to do!
35918 assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle")((void)0);
35919 Res = CanonicalizeShuffleInput(RootVT, V1);
35920 Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
35921 return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
35922 DL, 256);
35923 }
35924
35925 // If we're splatting the low subvector, an insert-subvector 'concat'
35926 // pattern is quicker than VPERM2X128.
35927 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
35928 if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
35929 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
35930 return SDValue(); // Nothing to do!
35931 Res = CanonicalizeShuffleInput(RootVT, V1);
35932 Res = extractSubVector(Res, 0, DAG, DL, 128);
35933 return concatSubVectors(Res, Res, DAG, DL);
35934 }
35935
35936 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
35937 return SDValue(); // Nothing to do!
35938
35939 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
35940 // we need to use the zeroing feature.
35941 // Prefer blends for sequential shuffles unless we are optimizing for size.
35942 if (UnaryShuffle &&
35943 !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
35944 (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
35945 unsigned PermMask = 0;
35946 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
35947 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
35948 return DAG.getNode(
35949 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
35950 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
35951 }
35952
35953 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
35954 return SDValue(); // Nothing to do!
35955
35956 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
35957 if (!UnaryShuffle && !IsMaskedShuffle) {
35958 assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&((void)0)
35959 "Unexpected shuffle sentinel value")((void)0);
35960 // Prefer blends to X86ISD::VPERM2X128.
35961 if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
35962 (BaseMask[0] == 2 && BaseMask[1] == 1))) {
35963 unsigned PermMask = 0;
35964 PermMask |= ((BaseMask[0] & 3) << 0);
35965 PermMask |= ((BaseMask[1] & 3) << 4);
35966 SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
35967 SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
35968 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
35969 CanonicalizeShuffleInput(RootVT, LHS),
35970 CanonicalizeShuffleInput(RootVT, RHS),
35971 DAG.getTargetConstant(PermMask, DL, MVT::i8));
35972 }
35973 }
35974 }
35975
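Both VPERM2X128 paths above fill the same immediate layout: bits[1:0] pick the 128-bit lane written to the low half (0/1 from the first source, 2/3 from the second), bits[5:4] do the same for the high half, and bits 3/7 zero the respective half (the 0x8 flag in the unary case). A small sketch with an invented helper name:

static unsigned vperm2x128Imm(int LoLane, int HiLane) {
  // A negative selector stands for "zero this half" (the 0x8 flag above).
  unsigned Lo = LoLane < 0 ? 0x8u : unsigned(LoLane & 0x3);
  unsigned Hi = HiLane < 0 ? 0x8u : unsigned(HiLane & 0x3);
  return Lo | (Hi << 4);
}
// vperm2x128Imm(1, 0) == 0x01 swaps the halves of a single source;
// vperm2x128Imm(0, 2) == 0x20 concatenates the low halves of two sources.
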
35976 // For masks that have been widened to 128-bit elements or more,
35977 // narrow back down to 64-bit elements.
35978 SmallVector<int, 64> Mask;
35979 if (BaseMaskEltSizeInBits > 64) {
35980 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size")((void)0);
35981 int MaskScale = BaseMaskEltSizeInBits / 64;
35982 narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
35983 } else {
35984 Mask.assign(BaseMask.begin(), BaseMask.end());
35985 }
35986
35987 // For masked shuffles, we're trying to match the root width for better
35988 // writemask folding; attempt to scale the mask.
35989 // TODO - variable shuffles might need this to be widened again.
35990 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
35991 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size")((void)0);
35992 int MaskScale = NumRootElts / Mask.size();
35993 SmallVector<int, 64> ScaledMask;
35994 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
35995 Mask = std::move(ScaledMask);
35996 }
35997
35998 unsigned NumMaskElts = Mask.size();
35999 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
36000
36001 // Determine the effective mask value type.
36002 FloatDomain &= (32 <= MaskEltSizeInBits);
36003 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
36004 : MVT::getIntegerVT(MaskEltSizeInBits);
36005 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
36006
36007 // Only allow legal mask types.
36008 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36009 return SDValue();
36010
36011 // Attempt to match the mask against known shuffle patterns.
36012 MVT ShuffleSrcVT, ShuffleVT;
36013 unsigned Shuffle, PermuteImm;
36014
36015 // Which shuffle domains are permitted?
36016 // Permit domain crossing at higher combine depths.
36017 // TODO: Should we indicate which domain is preferred if both are allowed?
36018 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
36019 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
36020 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
36021
36022 // Determine zeroable mask elements.
36023 APInt KnownUndef, KnownZero;
36024 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
36025 APInt Zeroable = KnownUndef | KnownZero;
36026
36027 if (UnaryShuffle) {
36028 // Attempt to match against broadcast-from-vector.
36029 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
36030 if ((Subtarget.hasAVX2() ||
36031 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
36032 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
36033 if (isUndefOrEqual(Mask, 0)) {
36034 if (V1.getValueType() == MaskVT &&
36035 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36036 MayFoldLoad(V1.getOperand(0))) {
36037 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36038 return SDValue(); // Nothing to do!
36039 Res = V1.getOperand(0);
36040 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36041 return DAG.getBitcast(RootVT, Res);
36042 }
36043 if (Subtarget.hasAVX2()) {
36044 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36045 return SDValue(); // Nothing to do!
36046 Res = CanonicalizeShuffleInput(MaskVT, V1);
36047 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36048 return DAG.getBitcast(RootVT, Res);
36049 }
36050 }
36051 }
36052
36053 SDValue NewV1 = V1; // Save operand in case early exit happens.
36054 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36055 DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36056 ShuffleVT) &&
36057 (!IsMaskedShuffle ||
36058 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36059 if (Depth == 0 && Root.getOpcode() == Shuffle)
36060 return SDValue(); // Nothing to do!
36061 Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36062 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
36063 return DAG.getBitcast(RootVT, Res);
36064 }
36065
36066 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36067 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
36068 PermuteImm) &&
36069 (!IsMaskedShuffle ||
36070 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36071 if (Depth == 0 && Root.getOpcode() == Shuffle)
36072 return SDValue(); // Nothing to do!
36073 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
36074 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
36075 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36076 return DAG.getBitcast(RootVT, Res);
36077 }
36078 }
36079
36080 // Attempt to combine to INSERTPS, but only if the inserted element has come
36081 // from a scalar.
36082 // TODO: Handle other insertions here as well?
36083 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
36084 Subtarget.hasSSE41() &&
36085 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
36086 if (MaskEltSizeInBits == 32) {
36087 SDValue SrcV1 = V1, SrcV2 = V2;
36088 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
36089 DAG) &&
36090 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
36091 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36092 return SDValue(); // Nothing to do!
36093 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36094 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
36095 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
36096 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36097 return DAG.getBitcast(RootVT, Res);
36098 }
36099 }
36100 if (MaskEltSizeInBits == 64 &&
36101 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
36102 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36103 V2.getScalarValueSizeInBits() <= 32) {
36104 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36105 return SDValue(); // Nothing to do!
36106 PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
36107 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36108 CanonicalizeShuffleInput(MVT::v4f32, V1),
36109 CanonicalizeShuffleInput(MVT::v4f32, V2),
36110 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36111 return DAG.getBitcast(RootVT, Res);
36112 }
36113 }
36114
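The INSERTPS control byte used in both branches above has a fixed layout: bits[7:6] select the source element, bits[5:4] the destination slot, and bits[3:0] form a zero mask. A standalone sketch (helper name invented):

static unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask = 0) {
  return ((SrcIdx & 3) << 6) | ((DstIdx & 3) << 4) | (ZMask & 0xF);
}
// The {0,2} case above is insertPSImm(/*SrcIdx=*/0, /*DstIdx=*/2) == 0x20:
// copy element 0 of the scalar operand into 32-bit slot 2 of V1.
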
36115 SDValue NewV1 = V1; // Save operands in case early exit happens.
36116 SDValue NewV2 = V2;
36117 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36118 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36119 ShuffleVT, UnaryShuffle) &&
36120 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36121 if (Depth == 0 && Root.getOpcode() == Shuffle)
36122 return SDValue(); // Nothing to do!
36123 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36124 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
36125 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
36126 return DAG.getBitcast(RootVT, Res);
36127 }
36128
36129 NewV1 = V1; // Save operands in case early exit happens.
36130 NewV2 = V2;
36131 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36132 AllowIntDomain, NewV1, NewV2, DL, DAG,
36133 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
36134 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36135 if (Depth == 0 && Root.getOpcode() == Shuffle)
36136 return SDValue(); // Nothing to do!
36137 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
36138 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
36139 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
36140 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36141 return DAG.getBitcast(RootVT, Res);
36142 }
36143
36144 // Typically from here on, we need an integer version of MaskVT.
36145 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
36146 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
36147
36148 // Annoyingly, SSE4A instructions don't map into the above match helpers.
36149 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
36150 uint64_t BitLen, BitIdx;
36151 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
36152 Zeroable)) {
36153 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
36154 return SDValue(); // Nothing to do!
36155 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36156 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
36157 DAG.getTargetConstant(BitLen, DL, MVT::i8),
36158 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36159 return DAG.getBitcast(RootVT, Res);
36160 }
36161
36162 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
36163 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
36164 return SDValue(); // Nothing to do!
36165 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36166 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
36167 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
36168 DAG.getTargetConstant(BitLen, DL, MVT::i8),
36169 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36170 return DAG.getBitcast(RootVT, Res);
36171 }
36172 }
36173
36174 // Match shuffle against TRUNCATE patterns.
36175 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
36176 // Match against a VTRUNC instruction, accounting for src/dst sizes.
36177 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
36178 Subtarget)) {
36179 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
36180 ShuffleSrcVT.getVectorNumElements();
36181 unsigned Opc =
36182 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
36183 if (Depth == 0 && Root.getOpcode() == Opc)
36184 return SDValue(); // Nothing to do!
36185 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36186 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
36187 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
36188 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
36189 return DAG.getBitcast(RootVT, Res);
36190 }
36191
36192 // Do we need a more general binary truncation pattern?
36193 if (RootSizeInBits < 512 &&
36194 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
36195 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
36196 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
36197 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
36198 if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
36199 return SDValue(); // Nothing to do!
36200 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36201 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
36202 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36203 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
36204 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36205 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
36206 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
36207 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
36208 return DAG.getBitcast(RootVT, Res);
36209 }
36210 }
36211
36212 // Don't try to re-form single instruction chains under any circumstances now
36213 // that we've done encoding canonicalization for them.
36214 if (Depth < 1)
36215 return SDValue();
36216
36217 // Depth threshold above which we can efficiently use variable mask shuffles.
36218 int VariableCrossLaneShuffleDepth =
36219 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
36220 int VariablePerLaneShuffleDepth =
36221 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
36222 AllowVariableCrossLaneMask &=
36223 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
36224 AllowVariablePerLaneMask &=
36225 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
36226 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
36227 // higher depth before combining them.
36228 bool AllowBWIVPERMV3 =
36229 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
36230
36231 bool MaskContainsZeros = isAnyZero(Mask);
36232
36233 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
36234 // If we have a single input lane-crossing shuffle then lower to VPERMV.
36235 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
36236 if (Subtarget.hasAVX2() &&
36237 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
36238 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
36239 Res = CanonicalizeShuffleInput(MaskVT, V1);
36240 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
36241 return DAG.getBitcast(RootVT, Res);
36242 }
36243 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
36244 if ((Subtarget.hasAVX512() &&
36245 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36246 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36247 (Subtarget.hasBWI() &&
36248 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36249 (Subtarget.hasVBMI() &&
36250 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
36251 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36252 V2 = DAG.getUNDEF(MaskVT);
36253 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36254 return DAG.getBitcast(RootVT, Res);
36255 }
36256 }
36257
36258 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
36259 // vector as the second source (non-VLX will pad to 512-bit shuffles).
36260 if (UnaryShuffle && AllowVariableCrossLaneMask &&
36261 ((Subtarget.hasAVX512() &&
36262 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36263 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36264 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
36265 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36266 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36267 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36268 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36269 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36270 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
36271 for (unsigned i = 0; i != NumMaskElts; ++i)
36272 if (Mask[i] == SM_SentinelZero)
36273 Mask[i] = NumMaskElts + i;
36274 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36275 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
36276 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36277 return DAG.getBitcast(RootVT, Res);
36278 }
36279
36280 // If that failed and either input is extracted then try to combine as a
36281 // shuffle with the larger type.
36282 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36283 Inputs, Root, BaseMask, Depth, HasVariableMask,
36284 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
36285 Subtarget))
36286 return WideShuffle;
36287
36288 // If we have a dual input lane-crossing shuffle then lower to VPERMV3
36289 // (non-VLX will pad to 512-bit shuffles).
36290 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
36291 ((Subtarget.hasAVX512() &&
36292 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36293 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36294 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
36295 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
36296 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36297 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36298 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36299 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36300 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36301 V2 = CanonicalizeShuffleInput(MaskVT, V2);
36302 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36303 return DAG.getBitcast(RootVT, Res);
36304 }
36305 return SDValue();
36306 }
36307
36308 // See if we can combine a single input shuffle with zeros to a bit-mask,
36309 // which is much simpler than any shuffle.
36310 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
36311 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
36312 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
36313 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
36314 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
36315 APInt UndefElts(NumMaskElts, 0);
36316 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
36317 for (unsigned i = 0; i != NumMaskElts; ++i) {
36318 int M = Mask[i];
36319 if (M == SM_SentinelUndef) {
36320 UndefElts.setBit(i);
36321 continue;
36322 }
36323 if (M == SM_SentinelZero)
36324 continue;
36325 EltBits[i] = AllOnes;
36326 }
36327 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
36328 Res = CanonicalizeShuffleInput(MaskVT, V1);
36329 unsigned AndOpcode =
36330 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
36331 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
36332 return DAG.getBitcast(RootVT, Res);
36333 }
36334
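Conceptually, the fold above observes that a unary shuffle which only keeps elements in place or zeroes them is a plain AND with a per-element all-ones/all-zeros constant. A scalar model (simplified: undef lanes are also cleared here, which is allowed since undef may take any value):

#include <array>
#include <cstddef>
#include <cstdint>

template <std::size_t N>
std::array<std::uint32_t, N> shuffleAsBitmask(const std::array<std::uint32_t, N> &V,
                                              const std::array<int, N> &Mask) {
  std::array<std::uint32_t, N> Out{};
  for (std::size_t i = 0; i != N; ++i) {
    std::uint32_t Keep = (Mask[i] == int(i)) ? 0xFFFFFFFFu : 0u;
    Out[i] = V[i] & Keep; // in-place elements survive, zero/undef lanes clear
  }
  return Out;
}
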
36335 // If we have a single input shuffle with different shuffle patterns in
36336 // the 128-bit lanes, use the variable mask to VPERMILPS.
36337 // TODO: Combine other mask types at higher depths.
36338 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
36339 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
36340 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
36341 SmallVector<SDValue, 16> VPermIdx;
36342 for (int M : Mask) {
36343 SDValue Idx =
36344 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
36345 VPermIdx.push_back(Idx);
36346 }
36347 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
36348 Res = CanonicalizeShuffleInput(MaskVT, V1);
36349 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
36350 return DAG.getBitcast(RootVT, Res);
36351 }
36352
36353 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
36354 // to VPERMIL2PD/VPERMIL2PS.
36355 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
36356 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
36357 MaskVT == MVT::v8f32)) {
36358 // VPERMIL2 Operation.
36359 // Bits[3] - Match Bit.
36360 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
36361 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
36362 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
36363 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
36364 SmallVector<int, 8> VPerm2Idx;
36365 unsigned M2ZImm = 0;
36366 for (int M : Mask) {
36367 if (M == SM_SentinelUndef) {
36368 VPerm2Idx.push_back(-1);
36369 continue;
36370 }
36371 if (M == SM_SentinelZero) {
36372 M2ZImm = 2;
36373 VPerm2Idx.push_back(8);
36374 continue;
36375 }
36376 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
36377 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
36378 VPerm2Idx.push_back(Index);
36379 }
36380 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36381 V2 = CanonicalizeShuffleInput(MaskVT, V2);
36382 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
36383 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
36384 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
36385 return DAG.getBitcast(RootVT, Res);
36386 }
36387
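A simplified mirror of the selector computation above for the PS (32-bit) case; sentinel handling is reduced to "any negative mask value becomes a zeroed element", whereas the code above keeps undef lanes undef, and the PD (64-bit) case additionally shifts each index left by one bit:

#include <vector>

// Each result element gets a per-lane selector: 0..NumEltsPerLane-1 read the
// corresponding lane of V1, the next NumEltsPerLane read V2's lane, and 8
// (together with the match/zero immediate, M2ZImm above) produces zero.
std::vector<int> vpermil2Selectors(const std::vector<int> &Mask,
                                   int NumMaskElts, int NumEltsPerLane) {
  std::vector<int> Idx;
  for (int M : Mask) {
    if (M < 0) {
      Idx.push_back(8); // simplified: treat undef and zero sentinels alike
      continue;
    }
    Idx.push_back((M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane));
  }
  return Idx;
}
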
36388 // If we have 3 or more shuffle instructions or a chain involving a variable
36389 // mask, we can replace them with a single PSHUFB instruction profitably.
36390 // Intel's manuals suggest only using PSHUFB if doing so replaces at least 5
36391 // instructions, but in practice PSHUFB tends to be *very* fast so we're
36392 // more aggressive.
36393 if (UnaryShuffle && AllowVariablePerLaneMask &&
36394 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
36395 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
36396 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
36397 SmallVector<SDValue, 16> PSHUFBMask;
36398 int NumBytes = RootVT.getSizeInBits() / 8;
36399 int Ratio = NumBytes / NumMaskElts;
36400 for (int i = 0; i < NumBytes; ++i) {
36401 int M = Mask[i / Ratio];
36402 if (M == SM_SentinelUndef) {
36403 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
36404 continue;
36405 }
36406 if (M == SM_SentinelZero) {
36407 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36408 continue;
36409 }
36410 M = Ratio * M + i % Ratio;
36411 assert((M / 16) == (i / 16) && "Lane crossing detected")((void)0);
36412 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36413 }
36414 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
36415 Res = CanonicalizeShuffleInput(ByteVT, V1);
36416 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
36417 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
36418 return DAG.getBitcast(RootVT, Res);
36419 }
36420
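A behavioural model of the PSHUFB mask built above, for one 16-byte lane: a selector byte with bit 7 set (the 0x80 used for zeroable lanes) writes zero, otherwise its low four bits index the source lane.

#include <array>
#include <cstdint>

std::array<std::uint8_t, 16> pshufb(const std::array<std::uint8_t, 16> &Src,
                                    const std::array<std::uint8_t, 16> &Sel) {
  std::array<std::uint8_t, 16> Dst{};
  for (int i = 0; i != 16; ++i)
    Dst[i] = (Sel[i] & 0x80) ? 0 : Src[Sel[i] & 0x0F];
  return Dst;
}
// The 256/512-bit forms apply the same rule independently per 128-bit lane,
// which is why the loop above asserts (M / 16) == (i / 16).
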
36421 // With XOP, if we have a 128-bit binary input shuffle we can always combine
36422 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
36423 // slower than PSHUFB on targets that support both.
36424 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
36425 Subtarget.hasXOP()) {
36426 // VPPERM Mask Operation
36427 // Bits[4:0] - Byte Index (0 - 31)
36428 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
36429 SmallVector<SDValue, 16> VPPERMMask;
36430 int NumBytes = 16;
36431 int Ratio = NumBytes / NumMaskElts;
36432 for (int i = 0; i < NumBytes; ++i) {
36433 int M = Mask[i / Ratio];
36434 if (M == SM_SentinelUndef) {
36435 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
36436 continue;
36437 }
36438 if (M == SM_SentinelZero) {
36439 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
36440 continue;
36441 }
36442 M = Ratio * M + i % Ratio;
36443 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
36444 }
36445 MVT ByteVT = MVT::v16i8;
36446 V1 = CanonicalizeShuffleInput(ByteVT, V1);
36447 V2 = CanonicalizeShuffleInput(ByteVT, V2);
36448 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
36449 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
36450 return DAG.getBitcast(RootVT, Res);
36451 }
36452
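The VPPERM selector built above follows the convention of the combined shuffle mask: byte indices 0-15 refer to the first input (V1) and 16-31 to the second, while op field 4 (encoded as 0x80) zeroes the byte; other VPPERM op values are not used here. A behavioural sketch under that convention:

#include <array>
#include <cstdint>

std::array<std::uint8_t, 16> vpperm(const std::array<std::uint8_t, 16> &V1,
                                    const std::array<std::uint8_t, 16> &V2,
                                    const std::array<std::uint8_t, 16> &Sel) {
  std::array<std::uint8_t, 16> Dst{};
  for (int i = 0; i != 16; ++i) {
    if ((Sel[i] >> 5) == 4) { // op 4: force zero
      Dst[i] = 0;
      continue;
    }
    unsigned Idx = Sel[i] & 0x1F;
    Dst[i] = Idx < 16 ? V1[Idx] : V2[Idx - 16];
  }
  return Dst;
}
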
36453 // If that failed and either input is extracted then try to combine as a
36454 // shuffle with the larger type.
36455 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36456 Inputs, Root, BaseMask, Depth, HasVariableMask,
36457 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
36458 return WideShuffle;
36459
36460 // If we have a dual input shuffle then lower to VPERMV3
36461 // (non-VLX will pad to 512-bit shuffles).
36462 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
36463 ((Subtarget.hasAVX512() &&
36464 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
36465 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
36466 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
36467 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
36468 MaskVT == MVT::v16i32)) ||
36469 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36470 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
36471 MaskVT == MVT::v32i16)) ||
36472 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36473 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
36474 MaskVT == MVT::v64i8)))) {
36475 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36476 V2 = CanonicalizeShuffleInput(MaskVT, V2);
36477 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36478 return DAG.getBitcast(RootVT, Res);
36479 }
36480
36481 // Failed to find any combines.
36482 return SDValue();
36483}
36484
36485// Combine an arbitrary chain of shuffles + extract_subvectors into a single
36486// instruction if possible.
36487//
36488// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
36489// type size to attempt to combine:
36490// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
36491// -->
36492// extract_subvector(shuffle(x,y,m2),0)
36493static SDValue combineX86ShuffleChainWithExtract(
36494 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
36495 bool HasVariableMask, bool AllowVariableCrossLaneMask,
36496 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36497 const X86Subtarget &Subtarget) {
36498 unsigned NumMaskElts = BaseMask.size();
36499 unsigned NumInputs = Inputs.size();
36500 if (NumInputs == 0)
36501 return SDValue();
36502
36503 EVT RootVT = Root.getValueType();
36504 unsigned RootSizeInBits = RootVT.getSizeInBits();
36505 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask")((void)0);
36506
36507 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
36508 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
36509
36510 // Peek through subvectors.
36511 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
36512 unsigned WideSizeInBits = RootSizeInBits;
36513 for (unsigned i = 0; i != NumInputs; ++i) {
36514 SDValue &Src = WideInputs[i];
36515 unsigned &Offset = Offsets[i];
36516 Src = peekThroughBitcasts(Src);
36517 EVT BaseVT = Src.getValueType();
36518 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
36519 Offset += Src.getConstantOperandVal(1);
36520 Src = Src.getOperand(0);
36521 }
36522 WideSizeInBits = std::max(WideSizeInBits,
36523 (unsigned)Src.getValueSizeInBits());
36524 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&((void)0)
36525 "Unexpected subvector extraction")((void)0);
36526 Offset /= BaseVT.getVectorNumElements();
36527 Offset *= NumMaskElts;
36528 }
36529
36530 // Bail if we're always extracting from the lowest subvectors;
36531 // combineX86ShuffleChain should match this for the current width.
36532 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
36533 return SDValue();
36534
36535 unsigned Scale = WideSizeInBits / RootSizeInBits;
36536 assert((WideSizeInBits % RootSizeInBits) == 0 &&((void)0)
36537 "Unexpected subvector extraction")((void)0);
36538
36539 // If the src vector types aren't the same, see if we can extend
36540 // them to match each other.
36541 // TODO: Support different scalar types?
36542 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
36543 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
36544 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
36545 Op.getValueType().getScalarType() != WideSVT;
36546 }))
36547 return SDValue();
36548
36549 for (SDValue &NewInput : WideInputs) {
36550 assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&((void)0)
36551 "Shuffle vector size mismatch")((void)0);
36552 if (WideSizeInBits > NewInput.getValueSizeInBits())
36553 NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
36554 SDLoc(NewInput), WideSizeInBits);
36555 assert(WideSizeInBits == NewInput.getValueSizeInBits() &&((void)0)
36556 "Unexpected subvector extraction")((void)0);
36557 }
36558
36559 // Create new mask for larger type.
36560 for (unsigned i = 1; i != NumInputs; ++i)
36561 Offsets[i] += i * Scale * NumMaskElts;
36562
36563 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
36564 for (int &M : WideMask) {
36565 if (M < 0)
36566 continue;
36567 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
36568 }
36569 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
36570
36571 // Remove unused/repeated shuffle source ops.
36572 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
36573 assert(!WideInputs.empty() && "Shuffle with no inputs detected")((void)0);
36574
36575 if (WideInputs.size() > 2)
36576 return SDValue();
36577
36578 // Increase depth for every upper subvector we've peeked through.
36579 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
36580
36581 // Attempt to combine wider chain.
36582 // TODO: Can we use a better Root?
36583 SDValue WideRoot = WideInputs[0];
36584 if (SDValue WideShuffle =
36585 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
36586 HasVariableMask, AllowVariableCrossLaneMask,
36587 AllowVariablePerLaneMask, DAG, Subtarget)) {
36588 WideShuffle =
36589 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
36590 return DAG.getBitcast(RootVT, WideShuffle);
36591 }
36592 return SDValue();
36593}
36594
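The core remapping in combineX86ShuffleChainWithExtract can be shown in isolation: each mask entry keeps its position within its source's narrow mask but gains that source's subvector offset, and the mask is then padded with undef up to the wide width. A sketch (the Offsets here are assumed to be precomputed in units of the narrow mask width, as in the loop above):

#include <vector>

std::vector<int> widenExtractedShuffleMask(const std::vector<int> &BaseMask,
                                           const std::vector<int> &Offsets,
                                           unsigned Scale) {
  int NumMaskElts = int(BaseMask.size());
  std::vector<int> WideMask(BaseMask);
  for (int &M : WideMask)
    if (M >= 0)
      M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
  WideMask.resize(std::size_t(NumMaskElts) * Scale, -1); // pad with undef
  return WideMask;
}
// e.g. a v4i32 shuffle {1,0,3,2} of extract_subvector(x, 4) from a v8i32 x has
// Offsets = {4}, Scale = 2 and becomes {5,4,7,6,-1,-1,-1,-1} directly on x;
// the wide shuffle's low 128 bits are then extracted as the final result.
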
36595// Canonicalize the combined shuffle mask chain with horizontal ops.
36596// NOTE: This may update the Ops and Mask.
36597static SDValue canonicalizeShuffleMaskWithHorizOp(
36598 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
36599 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
36600 const X86Subtarget &Subtarget) {
36601 if (Mask.empty() || Ops.empty())
36602 return SDValue();
36603
36604 SmallVector<SDValue> BC;
36605 for (SDValue Op : Ops)
36606 BC.push_back(peekThroughBitcasts(Op));
36607
36608 // All ops must be the same horizop + type.
36609 SDValue BC0 = BC[0];
36610 EVT VT0 = BC0.getValueType();
36611 unsigned Opcode0 = BC0.getOpcode();
36612 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
36613 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
36614 }))
36615 return SDValue();
36616
36617 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
36618 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
36619 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
36620 if (!isHoriz && !isPack)
36621 return SDValue();
36622
36623 // Do all ops have a single use?
36624 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
36625 return Op.hasOneUse() &&
36626 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
36627 });
36628
36629 int NumElts = VT0.getVectorNumElements();
36630 int NumLanes = VT0.getSizeInBits() / 128;
36631 int NumEltsPerLane = NumElts / NumLanes;
36632 int NumHalfEltsPerLane = NumEltsPerLane / 2;
36633 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
36634 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
36635
36636 if (NumEltsPerLane >= 4 &&
36637 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
36638 SmallVector<int> LaneMask, ScaledMask;
36639 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
36640 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
36641 // See if we can remove the shuffle by reordering the HOP chain so that
36642 // the HOP args are pre-shuffled.
36643 // TODO: Generalize to any sized/depth chain.
36644 // TODO: Add support for PACKSS/PACKUS.
36645 if (isHoriz) {
36646 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
36647 auto GetHOpSrc = [&](int M) {
36648 if (M == SM_SentinelUndef)
36649 return DAG.getUNDEF(VT0);
36650 if (M == SM_SentinelZero)
36651 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
36652 SDValue Src0 = BC[M / 4];
36653 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
36654 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
36655 return Src1.getOperand(M % 2);
36656 return SDValue();
36657 };
36658 SDValue M0 = GetHOpSrc(ScaledMask[0]);
36659 SDValue M1 = GetHOpSrc(ScaledMask[1]);
36660 SDValue M2 = GetHOpSrc(ScaledMask[2]);
36661 SDValue M3 = GetHOpSrc(ScaledMask[3]);
36662 if (M0 && M1 && M2 && M3) {
36663 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
36664 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
36665 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36666 }
36667 }
36668 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
36669 if (Ops.size() >= 2) {
36670 SDValue LHS, RHS;
36671 auto GetHOpSrc = [&](int M, int &OutM) {
36672 // TODO: Support SM_SentinelZero
36673 if (M < 0)
36674 return M == SM_SentinelUndef;
36675 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
36676 if (!LHS || LHS == Src) {
36677 LHS = Src;
36678 OutM = (M % 2);
36679 return true;
36680 }
36681 if (!RHS || RHS == Src) {
36682 RHS = Src;
36683 OutM = (M % 2) + 2;
36684 return true;
36685 }
36686 return false;
36687 };
36688 int PostMask[4] = {-1, -1, -1, -1};
36689 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
36690 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
36691 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
36692 GetHOpSrc(ScaledMask[3], PostMask[3])) {
36693 LHS = DAG.getBitcast(SrcVT, LHS);
36694 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
36695 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
36696 // Use SHUFPS for the permute so this will work on SSE3 targets;
36697 // shuffle combining and domain handling will simplify this later on.
36698 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
36699 Res = DAG.getBitcast(ShuffleVT, Res);
36700 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
36701 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
36702 }
36703 }
36704 }
36705 }
36706
36707 if (2 < Ops.size())
36708 return SDValue();
36709
36710 SDValue BC1 = BC[BC.size() - 1];
36711 if (Mask.size() == VT0.getVectorNumElements()) {
36712 // Canonicalize binary shuffles of horizontal ops that use the
36713 // same sources to a unary shuffle.
36714 // TODO: Try to perform this fold even if the shuffle remains.
36715 if (Ops.size() == 2) {
36716 auto ContainsOps = [](SDValue HOp, SDValue Op) {
36717 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
36718 };
36719 // Commute if all BC0's ops are contained in BC1.
36720 if (ContainsOps(BC1, BC0.getOperand(0)) &&
36721 ContainsOps(BC1, BC0.getOperand(1))) {
36722 ShuffleVectorSDNode::commuteMask(Mask);
36723 std::swap(Ops[0], Ops[1]);
36724 std::swap(BC0, BC1);
36725 }
36726
36727 // If BC1 can be represented by BC0, then convert to unary shuffle.
36728 if (ContainsOps(BC0, BC1.getOperand(0)) &&
36729 ContainsOps(BC0, BC1.getOperand(1))) {
36730 for (int &M : Mask) {
36731 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
36732 continue;
36733 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
36734 M -= NumElts + (SubLane * NumHalfEltsPerLane);
36735 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
36736 M += NumHalfEltsPerLane;
36737 }
36738 }
36739 }
36740
36741 // Canonicalize unary horizontal ops to only refer to lower halves.
36742 for (int i = 0; i != NumElts; ++i) {
36743 int &M = Mask[i];
36744 if (isUndefOrZero(M))
36745 continue;
36746 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
36747 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36748 M -= NumHalfEltsPerLane;
36749 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
36750 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
36751 M -= NumHalfEltsPerLane;
36752 }
36753 }
36754
36755 // Combine a binary shuffle of 2 similar 'Horizontal' instructions into a
36756 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
36757 // represents the LHS/RHS inputs for the lower/upper halves.
36758 SmallVector<int, 16> TargetMask128, WideMask128;
36759 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
36760 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
36761 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle")((void)0);
36762 bool SingleOp = (Ops.size() == 1);
36763 if (isPack || OneUseOps ||
36764 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
36765 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
36766 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
36767 Lo = Lo.getOperand(WideMask128[0] & 1);
36768 Hi = Hi.getOperand(WideMask128[1] & 1);
36769 if (SingleOp) {
36770 SDValue Undef = DAG.getUNDEF(SrcVT);
36771 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
36772 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
36773 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
36774 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
36775 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
36776 }
36777 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
36778 }
36779 }
36780
36781 return SDValue();
36782}
36783
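The canonicalizations above rely on the structure of the horizontal ops; for 128-bit HADDPS, the low half of the result comes from the first operand and the high half from the second, so hadd(x,x) duplicates its two sums into both halves and upper-half mask references can be folded down to the lower half. A scalar model of that layout:

#include <array>

std::array<float, 4> haddps(const std::array<float, 4> &A,
                            const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}
// haddps(X, X) == {X[0]+X[1], X[2]+X[3], X[0]+X[1], X[2]+X[3]}: elements 2..3
// repeat elements 0..1, matching the NumHalfEltsPerLane adjustment above.
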
36784// Attempt to constant fold all of the constant source ops.
36785 // Returns the folded constant vector if the entire shuffle folds to a constant.
36786// TODO: Extend this to merge multiple constant Ops and update the mask.
36787static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
36788 ArrayRef<int> Mask, SDValue Root,
36789 bool HasVariableMask,
36790 SelectionDAG &DAG,
36791 const X86Subtarget &Subtarget) {
36792 MVT VT = Root.getSimpleValueType();
36793
36794 unsigned SizeInBits = VT.getSizeInBits();
36795 unsigned NumMaskElts = Mask.size();
36796 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
36797 unsigned NumOps = Ops.size();
36798
36799 // Extract constant bits from each source op.
36800 bool OneUseConstantOp = false;
36801 SmallVector<APInt, 16> UndefEltsOps(NumOps);
36802 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
36803 for (unsigned i = 0; i != NumOps; ++i) {
36804 SDValue SrcOp = Ops[i];
36805 OneUseConstantOp |= SrcOp.hasOneUse();
36806 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
36807 RawBitsOps[i]))
36808 return SDValue();
36809 }
36810
36811 // Only fold if at least one of the constants is only used once or
36812 // the combined shuffle has included a variable mask shuffle; this
36813 // is to avoid constant pool bloat.
36814 if (!OneUseConstantOp && !HasVariableMask)
36815 return SDValue();
36816
36817 // Shuffle the constant bits according to the mask.
36818 SDLoc DL(Root);
36819 APInt UndefElts(NumMaskElts, 0);
36820 APInt ZeroElts(NumMaskElts, 0);
36821 APInt ConstantElts(NumMaskElts, 0);
36822 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
36823 APInt::getNullValue(MaskSizeInBits));
36824 for (unsigned i = 0; i != NumMaskElts; ++i) {
36825 int M = Mask[i];
36826 if (M == SM_SentinelUndef) {
36827 UndefElts.setBit(i);
36828 continue;
36829 } else if (M == SM_SentinelZero) {
36830 ZeroElts.setBit(i);
36831 continue;
36832 }
36833 assert(0 <= M && M < (int)(NumMaskElts * NumOps))((void)0);
36834
36835 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
36836 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
36837
36838 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
36839 if (SrcUndefElts[SrcMaskIdx]) {
36840 UndefElts.setBit(i);
36841 continue;
36842 }
36843
36844 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
36845 APInt &Bits = SrcEltBits[SrcMaskIdx];
36846 if (!Bits) {
36847 ZeroElts.setBit(i);
36848 continue;
36849 }
36850
36851 ConstantElts.setBit(i);
36852 ConstantBitData[i] = Bits;
36853 }
36854 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue())((void)0);
36855
36856 // Attempt to create a zero vector.
36857 if ((UndefElts | ZeroElts).isAllOnesValue())
36858 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
36859
36860 // Create the constant data.
36861 MVT MaskSVT;
36862 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
36863 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
36864 else
36865 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
36866
36867 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
36868 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36869 return SDValue();
36870
36871 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
36872 return DAG.getBitcast(VT, CstOp);
36873}
36874
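Once constant bits are known for every source operand, the shuffle itself can be evaluated at compile time; the sketch below does this on plain integers (undef/zero sentinels are both modelled as 0 here, whereas the code above tracks UndefElts and ZeroElts separately):

#include <cstdint>
#include <vector>

std::vector<std::uint64_t>
foldConstantShuffle(const std::vector<std::vector<std::uint64_t>> &Ops,
                    const std::vector<int> &Mask) {
  std::vector<std::uint64_t> Out;
  int NumElts = int(Mask.size()); // each op provides NumElts constant elements
  for (int M : Mask)
    Out.push_back(M < 0 ? 0 : Ops[M / NumElts][M % NumElts]);
  return Out;
}
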
36875namespace llvm {
36876 namespace X86 {
36877 enum {
36878 MaxShuffleCombineDepth = 8
36879 };
36880 }
36881} // namespace llvm
36882
36883/// Fully generic combining of x86 shuffle instructions.
36884///
36885/// This should be the last combine run over the x86 shuffle instructions. Once
36886/// they have been fully optimized, this will recursively consider all chains
36887/// of single-use shuffle instructions, build a generic model of the cumulative
36888/// shuffle operation, and check for simpler instructions which implement this
36889/// operation. We use this primarily for two purposes:
36890///
36891/// 1) Collapse generic shuffles to specialized single instructions when
36892/// equivalent. In most cases, this is just an encoding size win, but
36893/// sometimes we will collapse multiple generic shuffles into a single
36894/// special-purpose shuffle.
36895/// 2) Look for sequences of shuffle instructions with 3 or more total
36896/// instructions, and replace them with the slightly more expensive SSSE3
36897/// PSHUFB instruction if available. We do this as the last combining step
36898/// to ensure we avoid using PSHUFB if we can implement the shuffle with
36899/// a suitable short sequence of other instructions. The PSHUFB will either
36900/// use a register or have to read from memory and so is slightly (but only
36901/// slightly) more expensive than the other shuffle instructions.
36902///
36903/// Because this is inherently a quadratic operation (for each shuffle in
36904/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
36905/// This should never be an issue in practice as the shuffle lowering doesn't
36906/// produce sequences of more than 8 instructions.
36907///
36908/// FIXME: We will currently miss some cases where the redundant shuffling
36909/// would simplify under the threshold for PSHUFB formation because of
36910/// combine-ordering. To fix this, we should do the redundant instruction
36911/// combining in this recursive walk.
36912static SDValue combineX86ShufflesRecursively(
36913 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
36914 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
36915 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
36916 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36917 const X86Subtarget &Subtarget) {
36918 assert(RootMask.size() > 0 &&((void)0)
36919 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&((void)0)
36920 "Illegal shuffle root mask")((void)0);
36921 assert(Root.getSimpleValueType().isVector() &&((void)0)
36922 "Shuffles operate on vector types!")((void)0);
36923 unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
36924
36925 // Bound the depth of our recursive combine because this is ultimately
36926 // quadratic in nature.
36927 if (Depth >= MaxDepth)
36928 return SDValue();
36929
36930 // Directly rip through bitcasts to find the underlying operand.
36931 SDValue Op = SrcOps[SrcOpIndex];
36932 Op = peekThroughOneUseBitcasts(Op);
36933
36934 EVT VT = Op.getValueType();
36935 if (!VT.isVector() || !VT.isSimple())
36936 return SDValue(); // Bail if we hit a non-simple non-vector.
36937
36938 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&((void)0)
36939 "Can only combine shuffles upto size of the root op.")((void)0);
36940
36941 // Extract target shuffle mask and resolve sentinels and inputs.
36942 // TODO - determine Op's demanded elts from RootMask.
36943 SmallVector<int, 64> OpMask;
36944 SmallVector<SDValue, 2> OpInputs;
36945 APInt OpUndef, OpZero;
36946 APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
36947 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
36948 if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
36949 OpZero, DAG, Depth, false))
36950 return SDValue();
36951
36952 // Shuffle inputs must not be larger than the shuffle result.
36953 // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
36954 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
36955 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
36956 }))
36957 return SDValue();
36958
36959 // If the shuffle result was smaller than the root, we need to adjust the
36960 // mask indices and pad the mask with undefs.
36961 if (RootSizeInBits > VT.getSizeInBits()) {
36962 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
36963 unsigned OpMaskSize = OpMask.size();
36964 if (OpInputs.size() > 1) {
36965 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
36966 for (int &M : OpMask) {
36967 if (M < 0)
36968 continue;
36969 int EltIdx = M % OpMaskSize;
36970 int OpIdx = M / OpMaskSize;
36971 M = (PaddedMaskSize * OpIdx) + EltIdx;
36972 }
36973 }
36974 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
36975 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
36976 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
36977 }
36978
36979 SmallVector<int, 64> Mask;
36980 SmallVector<SDValue, 16> Ops;
36981
36982 // We don't need to merge masks if the root is empty.
36983 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
36984 if (EmptyRoot) {
36985 // Only resolve zeros if it will remove an input; otherwise we might end
36986 // up in an infinite loop.
36987 bool ResolveKnownZeros = true;
36988 if (!OpZero.isNullValue()) {
36989 APInt UsedInputs = APInt::getNullValue(OpInputs.size());
36990 for (int i = 0, e = OpMask.size(); i != e; ++i) {
36991 int M = OpMask[i];
36992 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
36993 continue;
36994 UsedInputs.setBit(M / OpMask.size());
36995 if (UsedInputs.isAllOnesValue()) {
36996 ResolveKnownZeros = false;
36997 break;
36998 }
36999 }
37000 }
37001 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
37002 ResolveKnownZeros);
37003
37004 Mask = OpMask;
37005 Ops.append(OpInputs.begin(), OpInputs.end());
37006 } else {
37007 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
37008
37009 // Add the inputs to the Ops list, avoiding duplicates.
37010 Ops.append(SrcOps.begin(), SrcOps.end());
37011
37012 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
37013 // Attempt to find an existing match.
37014 SDValue InputBC = peekThroughBitcasts(Input);
37015 for (int i = 0, e = Ops.size(); i < e; ++i)
37016 if (InputBC == peekThroughBitcasts(Ops[i]))
37017 return i;
37018 // Match failed - should we replace an existing Op?
37019 if (InsertionPoint >= 0) {
37020 Ops[InsertionPoint] = Input;
37021 return InsertionPoint;
37022 }
37023 // Add to the end of the Ops list.
37024 Ops.push_back(Input);
37025 return Ops.size() - 1;
37026 };
37027
37028 SmallVector<int, 2> OpInputIdx;
37029 for (SDValue OpInput : OpInputs)
37030 OpInputIdx.push_back(
37031 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
37032
37033 assert(((RootMask.size() > OpMask.size() &&
37034 RootMask.size() % OpMask.size() == 0) ||
37035 (OpMask.size() > RootMask.size() &&
37036 OpMask.size() % RootMask.size() == 0) ||
37037 OpMask.size() == RootMask.size()) &&
37038 "The smaller number of elements must divide the larger.");
37039
37040 // This function can be performance-critical, so we rely on the power-of-2
37041 // knowledge that we have about the mask sizes to replace div/rem ops with
37042 // bit-masks and shifts.
37043 assert(isPowerOf2_32(RootMask.size()) &&
37044 "Non-power-of-2 shuffle mask sizes");
37045 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
37046 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
37047 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
37048
37049 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
37050 unsigned RootRatio =
37051 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
37052 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
37053 assert((RootRatio == 1 || OpRatio == 1) &&
37054 "Must not have a ratio for both incoming and op masks!");
37055
37056 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
37057 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
37058 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
37059 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
37060 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
37061
37062 Mask.resize(MaskWidth, SM_SentinelUndef);
37063
37064 // Merge this shuffle operation's mask into our accumulated mask. Note that
37065 // this shuffle's mask will be the first applied to the input, followed by
37066 // the root mask to get us all the way to the root value arrangement. The
37067 // reason for this order is that we are recursing up the operation chain.
37068 for (unsigned i = 0; i < MaskWidth; ++i) {
37069 unsigned RootIdx = i >> RootRatioLog2;
37070 if (RootMask[RootIdx] < 0) {
37071 // This is a zero or undef lane, we're done.
37072 Mask[i] = RootMask[RootIdx];
37073 continue;
37074 }
37075
37076 unsigned RootMaskedIdx =
37077 RootRatio == 1
37078 ? RootMask[RootIdx]
37079 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
37080
37081 // Just insert the scaled root mask value if it references an input other
37082 // than the SrcOp we're currently inserting.
37083 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
37084 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
37085 Mask[i] = RootMaskedIdx;
37086 continue;
37087 }
37088
37089 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
37090 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
37091 if (OpMask[OpIdx] < 0) {
37092 // The incoming lanes are zero or undef, it doesn't matter which ones we
37093 // are using.
37094 Mask[i] = OpMask[OpIdx];
37095 continue;
37096 }
37097
37098 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
37099 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
37100 : (OpMask[OpIdx] << OpRatioLog2) +
37101 (RootMaskedIdx & (OpRatio - 1));
37102
37103 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
37104 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
37105 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
37106 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
37107
37108 Mask[i] = OpMaskedIdx;
37109 }
37110 }
37111
37112 // Remove unused/repeated shuffle source ops.
37113 resolveTargetShuffleInputsAndMask(Ops, Mask);
37114
37115 // Handle the all undef/zero/ones cases early.
37116 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
37117 return DAG.getUNDEF(Root.getValueType());
37118 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
37119 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
37120 SDLoc(Root));
37121 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
37122 none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
37123 return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
37124
37125 assert(!Ops.empty() && "Shuffle with no inputs detected");
37126 HasVariableMask |= IsOpVariableMask;
37127
37128 // Update the list of shuffle nodes that have been combined so far.
37129 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
37130 SrcNodes.end());
37131 CombinedNodes.push_back(Op.getNode());
37132
37133 // See if we can recurse into each shuffle source op (if it's a target
37134 // shuffle). The source op should only be generally combined if it either has
37135 // a single use (i.e. current Op) or all its users have already been combined;
37136 // if not, we can still combine but should prevent generation of variable
37137 // shuffles to avoid constant pool bloat.
37138 // Don't recurse if we already have more source ops than we can combine in
37139 // the remaining recursion depth.
37140 if (Ops.size() < (MaxDepth - Depth)) {
37141 for (int i = 0, e = Ops.size(); i < e; ++i) {
37142 // For empty roots, we need to resolve zeroable elements before combining
37143 // them with other shuffles.
37144 SmallVector<int, 64> ResolvedMask = Mask;
37145 if (EmptyRoot)
37146 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
37147 bool AllowCrossLaneVar = false;
37148 bool AllowPerLaneVar = false;
37149 if (Ops[i].getNode()->hasOneUse() ||
37150 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
37151 AllowCrossLaneVar = AllowVariableCrossLaneMask;
37152 AllowPerLaneVar = AllowVariablePerLaneMask;
37153 }
37154 if (SDValue Res = combineX86ShufflesRecursively(
37155 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
37156 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
37157 Subtarget))
37158 return Res;
37159 }
37160 }
37161
37162 // Attempt to constant fold all of the constant source ops.
37163 if (SDValue Cst = combineX86ShufflesConstants(
37164 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
37165 return Cst;
37166
37167 // If constant fold failed and we only have constants - then we have
37168 // multiple uses by a single non-variable shuffle - just bail.
37169 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
37170 APInt UndefElts;
37171 SmallVector<APInt> RawBits;
37172 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
37173 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
37174 RawBits);
37175 })) {
37176 return SDValue();
37177 }
37178
37179 // Canonicalize the combined shuffle mask chain with horizontal ops.
37180 // NOTE: This will update the Ops and Mask.
37181 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
37182 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
37183 return DAG.getBitcast(Root.getValueType(), HOp);
37184
37185 // Widen any subvector shuffle inputs we've collected.
37186 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
37187 return Op.getValueSizeInBits() < RootSizeInBits;
37188 })) {
37189 for (SDValue &Op : Ops)
37190 if (Op.getValueSizeInBits() < RootSizeInBits)
37191 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
37192 RootSizeInBits);
37193 // Reresolve - we might have repeated subvector sources.
37194 resolveTargetShuffleInputsAndMask(Ops, Mask);
37195 }
37196
37197 // We can only combine unary and binary shuffle mask cases.
37198 if (Ops.size() <= 2) {
37199 // Minor canonicalization of the accumulated shuffle mask to make it easier
37200 // to match below. All this does is detect masks with sequential pairs of
37201 // elements, and shrink them to the half-width mask. It does this in a loop
37202 // so it will reduce the size of the mask to the minimal width mask which
37203 // performs an equivalent shuffle.
37204 while (Mask.size() > 1) {
37205 SmallVector<int, 64> WidenedMask;
37206 if (!canWidenShuffleElements(Mask, WidenedMask))
37207 break;
37208 Mask = std::move(WidenedMask);
37209 }
37210
37211 // Canonicalization of binary shuffle masks to improve pattern matching by
37212 // commuting the inputs.
37213 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
37214 ShuffleVectorSDNode::commuteMask(Mask);
37215 std::swap(Ops[0], Ops[1]);
37216 }
37217
37218 // Finally, try to combine into a single shuffle instruction.
37219 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
37220 AllowVariableCrossLaneMask,
37221 AllowVariablePerLaneMask, DAG, Subtarget);
37222 }
37223
37224 // If that failed and any input is extracted then try to combine as a
37225 // shuffle with the larger type.
37226 return combineX86ShuffleChainWithExtract(
37227 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
37228 AllowVariablePerLaneMask, DAG, Subtarget);
37229}
37230
37231/// Helper entry wrapper to combineX86ShufflesRecursively.
37232static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
37233 const X86Subtarget &Subtarget) {
37234 return combineX86ShufflesRecursively(
37235 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
37236 /*HasVarMask*/ false,
37237 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
37238 Subtarget);
37239}
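
The mask-merging loop in combineX86ShufflesRecursively above replaces division and remainder by the (power-of-two) mask sizes with shifts and bit-masks. A minimal standalone sketch of that identity follows; it is illustrative only, is not part of X86ISelLowering.cpp, and the helper names are invented for the example.

#include <cassert>

// Sketch only: for a power-of-two N, x / N == x >> log2(N) and
// x % N == x & (N - 1). This mirrors the RootRatioLog2/OpRatioLog2 usage above.
static unsigned divPow2(unsigned X, unsigned Log2N) { return X >> Log2N; }
static unsigned remPow2(unsigned X, unsigned N) { return X & (N - 1); }

int main() {
  const unsigned N = 8, Log2N = 3; // N must be a power of two.
  for (unsigned X = 0; X < 64; ++X) {
    assert(divPow2(X, Log2N) == X / N);
    assert(remPow2(X, N) == X % N);
  }
  return 0;
}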
37240
37241 /// Get the PSHUF-style mask from a PSHUF node.
37242///
37243 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
37244/// PSHUF-style masks that can be reused with such instructions.
37245static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
37246 MVT VT = N.getSimpleValueType();
37247 SmallVector<int, 4> Mask;
37248 SmallVector<SDValue, 2> Ops;
37249 bool HaveMask =
37250 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
37251 (void)HaveMask;
37252 assert(HaveMask);
37253
37254 // If we have more than 128-bits, only the low 128-bits of shuffle mask
37255 // matter. Check that the upper masks are repeats and remove them.
37256 if (VT.getSizeInBits() > 128) {
37257 int LaneElts = 128 / VT.getScalarSizeInBits();
37258#ifndef NDEBUG
37259 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
37260 for (int j = 0; j < LaneElts; ++j)
37261 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
37262 "Mask doesn't repeat in high 128-bit lanes!");
37263#endif
37264 Mask.resize(LaneElts);
37265 }
37266
37267 switch (N.getOpcode()) {
37268 case X86ISD::PSHUFD:
37269 return Mask;
37270 case X86ISD::PSHUFLW:
37271 Mask.resize(4);
37272 return Mask;
37273 case X86ISD::PSHUFHW:
37274 Mask.erase(Mask.begin(), Mask.begin() + 4);
37275 for (int &M : Mask)
37276 M -= 4;
37277 return Mask;
37278 default:
37279 llvm_unreachable("No valid shuffle instruction found!");
37280 }
37281}
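
The 4-element masks returned by getPSHUFShuffleMask correspond to the 8-bit PSHUF-style immediate, two bits per destination element. A small standalone decoder under that assumed encoding is sketched below; it is not code from this file.

#include <array>
#include <cstdio>

// Decode an 8-bit PSHUF-style immediate into a 4-element shuffle mask:
// destination element i takes source element (Imm >> (2 * i)) & 3.
static std::array<int, 4> decodePSHUFImm8(unsigned Imm) {
  std::array<int, 4> Mask;
  for (int i = 0; i != 4; ++i)
    Mask[i] = (Imm >> (2 * i)) & 0x3;
  return Mask;
}

int main() {
  // 0x1B == 0b00011011 -> mask {3, 2, 1, 0}: a full dword reversal.
  std::array<int, 4> M = decodePSHUFImm8(0x1B);
  std::printf("%d %d %d %d\n", M[0], M[1], M[2], M[3]);
  return 0;
}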
37282
37283/// Search for a combinable shuffle across a chain ending in pshufd.
37284///
37285/// We walk up the chain and look for a combinable shuffle, skipping over
37286/// shuffles that we could hoist this shuffle's transformation past without
37287/// altering anything.
37288static SDValue
37289combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
37290 SelectionDAG &DAG) {
37291 assert(N.getOpcode() == X86ISD::PSHUFD &&
37292 "Called with something other than an x86 128-bit half shuffle!");
37293 SDLoc DL(N);
37294
37295 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
37296 // of the shuffles in the chain so that we can form a fresh chain to replace
37297 // this one.
37298 SmallVector<SDValue, 8> Chain;
37299 SDValue V = N.getOperand(0);
37300 for (; V.hasOneUse(); V = V.getOperand(0)) {
37301 switch (V.getOpcode()) {
37302 default:
37303 return SDValue(); // Nothing combined!
37304
37305 case ISD::BITCAST:
37306 // Skip bitcasts as we always know the type for the target specific
37307 // instructions.
37308 continue;
37309
37310 case X86ISD::PSHUFD:
37311 // Found another dword shuffle.
37312 break;
37313
37314 case X86ISD::PSHUFLW:
37315 // Check that the low words (being shuffled) are the identity in the
37316 // dword shuffle, and the high words are self-contained.
37317 if (Mask[0] != 0 || Mask[1] != 1 ||
37318 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
37319 return SDValue();
37320
37321 Chain.push_back(V);
37322 continue;
37323
37324 case X86ISD::PSHUFHW:
37325 // Check that the high words (being shuffled) are the identity in the
37326 // dword shuffle, and the low words are self-contained.
37327 if (Mask[2] != 2 || Mask[3] != 3 ||
37328 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
37329 return SDValue();
37330
37331 Chain.push_back(V);
37332 continue;
37333
37334 case X86ISD::UNPCKL:
37335 case X86ISD::UNPCKH:
37336 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
37337 // shuffle into a preceding word shuffle.
37338 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
37339 V.getSimpleValueType().getVectorElementType() != MVT::i16)
37340 return SDValue();
37341
37342 // Search for a half-shuffle which we can combine with.
37343 unsigned CombineOp =
37344 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
37345 if (V.getOperand(0) != V.getOperand(1) ||
37346 !V->isOnlyUserOf(V.getOperand(0).getNode()))
37347 return SDValue();
37348 Chain.push_back(V);
37349 V = V.getOperand(0);
37350 do {
37351 switch (V.getOpcode()) {
37352 default:
37353 return SDValue(); // Nothing to combine.
37354
37355 case X86ISD::PSHUFLW:
37356 case X86ISD::PSHUFHW:
37357 if (V.getOpcode() == CombineOp)
37358 break;
37359
37360 Chain.push_back(V);
37361
37362 LLVM_FALLTHROUGH;
37363 case ISD::BITCAST:
37364 V = V.getOperand(0);
37365 continue;
37366 }
37367 break;
37368 } while (V.hasOneUse());
37369 break;
37370 }
37371 // Break out of the loop if we break out of the switch.
37372 break;
37373 }
37374
37375 if (!V.hasOneUse())
37376 // We fell out of the loop without finding a viable combining instruction.
37377 return SDValue();
37378
37379 // Merge this node's mask and our incoming mask.
37380 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
37381 for (int &M : Mask)
37382 M = VMask[M];
37383 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
37384 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
37385
37386 // Rebuild the chain around this new shuffle.
37387 while (!Chain.empty()) {
37388 SDValue W = Chain.pop_back_val();
37389
37390 if (V.getValueType() != W.getOperand(0).getValueType())
37391 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
37392
37393 switch (W.getOpcode()) {
37394 default:
37395 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
37396
37397 case X86ISD::UNPCKL:
37398 case X86ISD::UNPCKH:
37399 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
37400 break;
37401
37402 case X86ISD::PSHUFD:
37403 case X86ISD::PSHUFLW:
37404 case X86ISD::PSHUFHW:
37405 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
37406 break;
37407 }
37408 }
37409 if (V.getValueType() != N.getValueType())
37410 V = DAG.getBitcast(N.getValueType(), V);
37411
37412 // Return the new chain to replace N.
37413 return V;
37414}
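
The merge step above (M = VMask[M]) composes the incoming dword mask with the mask of the shuffle found further up the chain. A tiny standalone sketch, illustrative only, of why that indexing order is the correct composition for the Out[i] = In[Mask[i]] convention:

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;
using M4 = std::array<int, 4>;

// Out[i] = In[Mask[i]], the convention used by the PSHUF-style masks above.
static V4 applyMask(const V4 &In, const M4 &Mask) {
  return {In[Mask[0]], In[Mask[1]], In[Mask[2]], In[Mask[3]]};
}

int main() {
  V4 V{10, 20, 30, 40};
  M4 VMask{2, 0, 3, 1}; // inner shuffle found up the chain
  M4 Mask{1, 1, 2, 0};  // outer (incoming) shuffle
  // Composing as done above: Composed[i] = VMask[Mask[i]].
  M4 Composed;
  for (int i = 0; i != 4; ++i)
    Composed[i] = VMask[Mask[i]];
  assert(applyMask(applyMask(V, VMask), Mask) == applyMask(V, Composed));
  return 0;
}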
37415
37416// Attempt to commute shufps LHS loads:
37417// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
37418static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
37419 SelectionDAG &DAG) {
37420 // TODO: Add vXf64 support.
37421 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
37422 return SDValue();
37423
37424 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
37425 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
37426 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
37427 return SDValue();
37428 SDValue N0 = V.getOperand(0);
37429 SDValue N1 = V.getOperand(1);
37430 unsigned Imm = V.getConstantOperandVal(2);
37431 if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
37432 MayFoldLoad(peekThroughOneUseBitcasts(N1)))
37433 return SDValue();
37434 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
37435 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
37436 DAG.getTargetConstant(Imm, DL, MVT::i8));
37437 };
37438
37439 switch (N.getOpcode()) {
37440 case X86ISD::VPERMILPI:
37441 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
37442 unsigned Imm = N.getConstantOperandVal(1);
37443 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
37444 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37445 }
37446 break;
37447 case X86ISD::SHUFP: {
37448 SDValue N0 = N.getOperand(0);
37449 SDValue N1 = N.getOperand(1);
37450 unsigned Imm = N.getConstantOperandVal(2);
37451 if (N0 == N1) {
37452 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
37453 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
37454 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
37455 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
37456 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
37457 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
37458 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
37459 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
37460 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
37461 }
37462 break;
37463 }
37464 }
37465
37466 return SDValue();
37467}
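
The nibble swap of the SHUFP immediate above only preserves the overall result because the surrounding VPERMILPI/SHUFP immediate is adjusted at the same time (the Imm ^ 0xAA / 0x0A / 0xA0 updates). Below is a brute-force standalone check of the VPERMILPI case, assuming the usual SHUFPS/VPERMILPS immediate semantics; it is a sketch, not code from this file.

#include <array>
#include <cassert>

using V4 = std::array<float, 4>;

// Model of SHUFPS: low two elements from A, high two from B, 2-bit selectors.
static V4 shufps(const V4 &A, const V4 &B, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}

// Model of VPERMILPS with an immediate: per-element 2-bit selectors.
static V4 permilps(const V4 &A, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], A[(Imm >> 4) & 3], A[(Imm >> 6) & 3]};
}

int main() {
  V4 L{1, 2, 3, 4}, R{5, 6, 7, 8};
  for (unsigned I = 0; I < 256; ++I) {
    for (unsigned P = 0; P < 256; ++P) {
      unsigned Swapped = ((I & 0x0F) << 4) | ((I & 0xF0) >> 4);
      // permilps(shufps(L,R,I),P) == permilps(shufps(R,L,Swapped), P ^ 0xAA)
      V4 Before = permilps(shufps(L, R, I), P);
      V4 After = permilps(shufps(R, L, Swapped), P ^ 0xAA);
      assert(Before == After);
    }
  }
  return 0;
}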
37468
37469// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
37470static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
37471 const SDLoc &DL) {
37472 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37473 EVT ShuffleVT = N.getValueType();
37474
37475 auto IsMergeableWithShuffle = [](SDValue Op) {
37476 // AllZeros/AllOnes constants are freely shuffled and will peek through
37477 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
37478 // merge with target shuffles if it has one use so shuffle combining is
37479 // likely to kick in.
37480 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
37481 ISD::isBuildVectorAllZeros(Op.getNode()) ||
37482 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
37483 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
37484 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
37485 };
37486 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
37487 // Ensure we only shuffle whole vector src elements, unless it's a logical
37488 // binop where we can more aggressively move shuffles from dst to src.
37489 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
37490 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
37491 };
37492
37493 unsigned Opc = N.getOpcode();
37494 switch (Opc) {
37495 // Unary and Unary+Permute Shuffles.
37496 case X86ISD::PSHUFB: {
37497 // Don't merge PSHUFB if it contains zero'd elements.
37498 SmallVector<int> Mask;
37499 SmallVector<SDValue> Ops;
37500 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
37501 Mask))
37502 break;
37503 LLVM_FALLTHROUGH;
37504 }
37505 case X86ISD::VBROADCAST:
37506 case X86ISD::MOVDDUP:
37507 case X86ISD::PSHUFD:
37508 case X86ISD::VPERMI:
37509 case X86ISD::VPERMILPI: {
37510 if (N.getOperand(0).getValueType() == ShuffleVT &&
37511 N->isOnlyUserOf(N.getOperand(0).getNode())) {
37512 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37513 unsigned SrcOpcode = N0.getOpcode();
37514 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
37515 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37516 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37517 if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
37518 SDValue LHS, RHS;
37519 Op00 = DAG.getBitcast(ShuffleVT, Op00);
37520 Op01 = DAG.getBitcast(ShuffleVT, Op01);
37521 if (N.getNumOperands() == 2) {
37522 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
37523 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
37524 } else {
37525 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
37526 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
37527 }
37528 EVT OpVT = N0.getValueType();
37529 return DAG.getBitcast(ShuffleVT,
37530 DAG.getNode(SrcOpcode, DL, OpVT,
37531 DAG.getBitcast(OpVT, LHS),
37532 DAG.getBitcast(OpVT, RHS)));
37533 }
37534 }
37535 }
37536 break;
37537 }
37538 // Binary and Binary+Permute Shuffles.
37539 case X86ISD::INSERTPS: {
37540 // Don't merge INSERTPS if it contains zero'd elements.
37541 unsigned InsertPSMask = N.getConstantOperandVal(2);
37542 unsigned ZeroMask = InsertPSMask & 0xF;
37543 if (ZeroMask != 0)
37544 break;
37545 LLVM_FALLTHROUGH;
37546 }
37547 case X86ISD::MOVSD:
37548 case X86ISD::MOVSS:
37549 case X86ISD::BLENDI:
37550 case X86ISD::SHUFP:
37551 case X86ISD::UNPCKH:
37552 case X86ISD::UNPCKL: {
37553 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
37554 N->isOnlyUserOf(N.getOperand(1).getNode())) {
37555 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
37556 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
37557 unsigned SrcOpcode = N0.getOpcode();
37558 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
37559 IsSafeToMoveShuffle(N0, SrcOpcode) &&
37560 IsSafeToMoveShuffle(N1, SrcOpcode)) {
37561 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
37562 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
37563 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
37564 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
37565 // Ensure the total number of shuffles doesn't increase by folding this
37566 // shuffle through to the source ops.
37567 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
37568 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
37569 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
37570 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
37571 SDValue LHS, RHS;
37572 Op00 = DAG.getBitcast(ShuffleVT, Op00);
37573 Op10 = DAG.getBitcast(ShuffleVT, Op10);
37574 Op01 = DAG.getBitcast(ShuffleVT, Op01);
37575 Op11 = DAG.getBitcast(ShuffleVT, Op11);
37576 if (N.getNumOperands() == 3) {
37577 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
37578 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
37579 } else {
37580 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
37581 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
37582 }
37583 EVT OpVT = N0.getValueType();
37584 return DAG.getBitcast(ShuffleVT,
37585 DAG.getNode(SrcOpcode, DL, OpVT,
37586 DAG.getBitcast(OpVT, LHS),
37587 DAG.getBitcast(OpVT, RHS)));
37588 }
37589 }
37590 }
37591 break;
37592 }
37593 }
37594 return SDValue();
37595}
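
The rewrite above relies on shuffles commuting with element-wise binops: shuffle(binop(X, Y)) == binop(shuffle(X), shuffle(Y)) whenever the shuffle is a pure per-element permutation of whole source elements. An illustrative standalone check (not part of this file), using a plain array model:

#include <array>
#include <cassert>

using V4 = std::array<int, 4>;

// Out[i] = X[M[i]]: a pure permutation of whole elements.
static V4 shuffle(const V4 &X, const std::array<int, 4> &M) {
  return {X[M[0]], X[M[1]], X[M[2]], X[M[3]]};
}
// An element-wise binop (bitwise AND here).
static V4 andv(const V4 &A, const V4 &B) {
  return {A[0] & B[0], A[1] & B[1], A[2] & B[2], A[3] & B[3]};
}

int main() {
  V4 X{1, 2, 3, 4}, Y{8, 12, 10, 6};
  std::array<int, 4> M{3, 1, 2, 0};
  assert(shuffle(andv(X, Y), M) == andv(shuffle(X, M), shuffle(Y, M)));
  return 0;
}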
37596
37597/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
37598static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
37599 SelectionDAG &DAG,
37600 const SDLoc &DL) {
37601 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
37602
37603 MVT VT = V.getSimpleValueType();
37604 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
37605 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
37606 unsigned SrcOpc0 = Src0.getOpcode();
37607 unsigned SrcOpc1 = Src1.getOpcode();
37608 EVT SrcVT0 = Src0.getValueType();
37609 EVT SrcVT1 = Src1.getValueType();
37610
37611 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
37612 return SDValue();
37613
37614 switch (SrcOpc0) {
37615 case X86ISD::MOVDDUP: {
37616 SDValue LHS = Src0.getOperand(0);
37617 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37618 SDValue Res =
37619 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
37620 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
37621 return DAG.getBitcast(VT, Res);
37622 }
37623 case X86ISD::VPERMILPI:
37624 // TODO: Handle v4f64 permutes with different low/high lane masks.
37625 if (SrcVT0 == MVT::v4f64) {
37626 uint64_t Mask = Src0.getConstantOperandVal(1);
37627 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
37628 break;
37629 }
37630 LLVM_FALLTHROUGH;
37631 case X86ISD::VSHLI:
37632 case X86ISD::VSRLI:
37633 case X86ISD::VSRAI:
37634 case X86ISD::PSHUFD:
37635 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
37636 SDValue LHS = Src0.getOperand(0);
37637 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
37638 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
37639 V.getOperand(2));
37640 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
37641 return DAG.getBitcast(VT, Res);
37642 }
37643 break;
37644 }
37645
37646 return SDValue();
37647}
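
For reference, the VPERM2X128 immediate handled above selects each 128-bit half of the result with one nibble: values 0-3 pick the low/high half of the first or second source, and bit 3 of the nibble zeroes that half (the same decoding the FindSubVector128 lambda later in this file assumes). A hedged standalone sketch of that decoding, not taken from the LLVM tree:

#include <cstdio>
#include <string>

struct Half { int Src; int HiLo; bool Zero; }; // Src: 0/1, HiLo: 0 = lo, 1 = hi

// Assumed nibble semantics: bit 3 zeroes the half, bit 1 picks the source,
// bit 0 picks that source's low or high 128-bit half.
static Half decodeNibble(unsigned Nibble) {
  if (Nibble & 0x8)
    return {-1, -1, true};
  return {(Nibble & 0x2) ? 1 : 0, (int)(Nibble & 0x1), false};
}

static std::string describe(Half H) {
  if (H.Zero)
    return "zero";
  return "src" + std::to_string(H.Src) + (H.HiLo ? ".hi" : ".lo");
}

int main() {
  unsigned Imm = 0x31; // a common pattern: result = { src0.hi, src1.hi }
  std::printf("lo half = %s, hi half = %s\n",
              describe(decodeNibble(Imm & 0xF)).c_str(),
              describe(decodeNibble((Imm >> 4) & 0xF)).c_str());
  return 0;
}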
37648
37649/// Try to combine x86 target specific shuffles.
37650static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
37651 TargetLowering::DAGCombinerInfo &DCI,
37652 const X86Subtarget &Subtarget) {
37653 SDLoc DL(N);
37654 MVT VT = N.getSimpleValueType();
37655 SmallVector<int, 4> Mask;
37656 unsigned Opcode = N.getOpcode();
37657
37658 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
37659 return R;
37660
37661 if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
37662 return R;
37663
37664 // Handle specific target shuffles.
37665 switch (Opcode) {
37666 case X86ISD::MOVDDUP: {
37667 SDValue Src = N.getOperand(0);
37668 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
37669 if (VT == MVT::v2f64 && Src.hasOneUse() &&
37670 ISD::isNormalLoad(Src.getNode())) {
37671 LoadSDNode *LN = cast<LoadSDNode>(Src);
37672 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
37673 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
37674 DCI.CombineTo(N.getNode(), Movddup);
37675 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37676 DCI.recursivelyDeleteUnusedNodes(LN);
37677 return N; // Return N so it doesn't get rechecked!
37678 }
37679 }
37680
37681 return SDValue();
37682 }
37683 case X86ISD::VBROADCAST: {
37684 SDValue Src = N.getOperand(0);
37685 SDValue BC = peekThroughBitcasts(Src);
37686 EVT SrcVT = Src.getValueType();
37687 EVT BCVT = BC.getValueType();
37688
37689 // If broadcasting from another shuffle, attempt to simplify it.
37690 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
37691 if (isTargetShuffle(BC.getOpcode()) &&
37692 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
37693 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
37694 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
37695 SM_SentinelUndef);
37696 for (unsigned i = 0; i != Scale; ++i)
37697 DemandedMask[i] = i;
37698 if (SDValue Res = combineX86ShufflesRecursively(
37699 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
37700 X86::MaxShuffleCombineDepth,
37701 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
37702 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
37703 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37704 DAG.getBitcast(SrcVT, Res));
37705 }
37706
37707 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
37708 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
37709 if (Src.getOpcode() == ISD::BITCAST &&
37710 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
37711 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
37712 FixedVectorType::isValidElementType(
37713 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
37714 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
37715 VT.getVectorNumElements());
37716 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
37717 }
37718
37719 // Reduce broadcast source vector to lowest 128-bits.
37720 if (SrcVT.getSizeInBits() > 128)
37721 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
37722 extract128BitVector(Src, 0, DAG, DL));
37723
37724 // broadcast(scalar_to_vector(x)) -> broadcast(x).
37725 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
37726 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
37727
37728 // Share broadcast with the longest vector and extract low subvector (free).
37729 // Ensure the same SDValue from the SDNode use is being used.
37730 for (SDNode *User : Src->uses())
37731 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
37732 Src == User->getOperand(0) &&
37733 User->getValueSizeInBits(0).getFixedSize() >
37734 VT.getFixedSizeInBits()) {
37735 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
37736 VT.getSizeInBits());
37737 }
37738
37739 // vbroadcast(scalarload X) -> vbroadcast_load X
37740 // For float loads, extract other uses of the scalar from the broadcast.
37741 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
37742 ISD::isNormalLoad(Src.getNode())) {
37743 LoadSDNode *LN = cast<LoadSDNode>(Src);
37744 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37745 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37746 SDValue BcastLd =
37747 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37748 LN->getMemoryVT(), LN->getMemOperand());
37749 // If the load value is used only by N, replace it via CombineTo N.
37750 bool NoReplaceExtract = Src.hasOneUse();
37751 DCI.CombineTo(N.getNode(), BcastLd);
37752 if (NoReplaceExtract) {
37753 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37754 DCI.recursivelyDeleteUnusedNodes(LN);
37755 } else {
37756 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
37757 DAG.getIntPtrConstant(0, DL));
37758 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
37759 }
37760 return N; // Return N so it doesn't get rechecked!
37761 }
37762
37763 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
37764 // i16. So shrink it ourselves if we can make a broadcast_load.
37765 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
37766 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
37767 assert(Subtarget.hasAVX2() && "Expected AVX2");
37768 SDValue TruncIn = Src.getOperand(0);
37769
37770 // If this is a truncate of a non extending load we can just narrow it to
37771 // use a broadcast_load.
37772 if (ISD::isNormalLoad(TruncIn.getNode())) {
37773 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
37774 // Unless it's volatile or atomic.
37775 if (LN->isSimple()) {
37776 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37777 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37778 SDValue BcastLd = DAG.getMemIntrinsicNode(
37779 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37780 LN->getPointerInfo(), LN->getOriginalAlign(),
37781 LN->getMemOperand()->getFlags());
37782 DCI.CombineTo(N.getNode(), BcastLd);
37783 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37784 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37785 return N; // Return N so it doesn't get rechecked!
37786 }
37787 }
37788
37789 // If this is a truncate of an i16 extload, we can directly replace it.
37790 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
37791 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
37792 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
37793 if (LN->getMemoryVT().getSizeInBits() == 16) {
37794 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37795 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37796 SDValue BcastLd =
37797 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37798 LN->getMemoryVT(), LN->getMemOperand());
37799 DCI.CombineTo(N.getNode(), BcastLd);
37800 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37801 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37802 return N; // Return N so it doesn't get rechecked!
37803 }
37804 }
37805
37806 // If this is a truncate of a load that has been shifted right, we can
37807 // offset the pointer and use a narrower load.
37808 if (TruncIn.getOpcode() == ISD::SRL &&
37809 TruncIn.getOperand(0).hasOneUse() &&
37810 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
37811 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
37812 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
37813 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
37814 // Make sure the shift amount and the load size are divisible by 16.
37815 // Don't do this if the load is volatile or atomic.
37816 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
37817 LN->isSimple()) {
37818 unsigned Offset = ShiftAmt / 8;
37819 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37820 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
37821 TypeSize::Fixed(Offset), DL);
37822 SDValue Ops[] = { LN->getChain(), Ptr };
37823 SDValue BcastLd = DAG.getMemIntrinsicNode(
37824 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
37825 LN->getPointerInfo().getWithOffset(Offset),
37826 LN->getOriginalAlign(),
37827 LN->getMemOperand()->getFlags());
37828 DCI.CombineTo(N.getNode(), BcastLd);
37829 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37830 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
37831 return N; // Return N so it doesn't get rechecked!
37832 }
37833 }
37834 }
37835
37836 // vbroadcast(vzload X) -> vbroadcast_load X
37837 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
37838 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
37839 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
37840 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37841 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
37842 SDValue BcastLd =
37843 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
37844 LN->getMemoryVT(), LN->getMemOperand());
37845 DCI.CombineTo(N.getNode(), BcastLd);
37846 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37847 DCI.recursivelyDeleteUnusedNodes(LN);
37848 return N; // Return N so it doesn't get rechecked!
37849 }
37850 }
37851
37852 // vbroadcast(vector load X) -> vbroadcast_load
37853 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
37854 SrcVT == MVT::v4i32) &&
37855 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
37856 LoadSDNode *LN = cast<LoadSDNode>(Src);
37857 // Unless the load is volatile or atomic.
37858 if (LN->isSimple()) {
37859 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37860 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37861 SDValue BcastLd = DAG.getMemIntrinsicNode(
37862 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
37863 LN->getPointerInfo(), LN->getOriginalAlign(),
37864 LN->getMemOperand()->getFlags());
37865 DCI.CombineTo(N.getNode(), BcastLd);
37866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
37867 DCI.recursivelyDeleteUnusedNodes(LN);
37868 return N; // Return N so it doesn't get rechecked!
37869 }
37870 }
37871
37872 return SDValue();
37873 }
37874 case X86ISD::VZEXT_MOVL: {
37875 SDValue N0 = N.getOperand(0);
37876
37877 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
37878 // the load is volatile.
37879 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
37880 auto *LN = cast<LoadSDNode>(N0);
37881 if (SDValue VZLoad =
37882 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
37883 DCI.CombineTo(N.getNode(), VZLoad);
37884 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37885 DCI.recursivelyDeleteUnusedNodes(LN);
37886 return N;
37887 }
37888 }
37889
37890 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
37891 // and can just use a VZEXT_LOAD.
37892 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
37893 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
37894 auto *LN = cast<MemSDNode>(N0);
37895 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
37896 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37897 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37898 SDValue VZLoad =
37899 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
37900 LN->getMemoryVT(), LN->getMemOperand());
37901 DCI.CombineTo(N.getNode(), VZLoad);
37902 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
37903 DCI.recursivelyDeleteUnusedNodes(LN);
37904 return N;
37905 }
37906 }
37907
37908 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
37909 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
37910 // if the upper bits of the i64 are zero.
37911 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37912 N0.getOperand(0).hasOneUse() &&
37913 N0.getOperand(0).getValueType() == MVT::i64) {
37914 SDValue In = N0.getOperand(0);
37915 APInt Mask = APInt::getHighBitsSet(64, 32);
37916 if (DAG.MaskedValueIsZero(In, Mask)) {
37917 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
37918 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
37919 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
37920 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
37921 return DAG.getBitcast(VT, Movl);
37922 }
37923 }
37924
37925 // Load a scalar integer constant directly to XMM instead of transferring an
37926 // immediate value from GPR.
37927 // vzext_movl (scalar_to_vector C) --> load [C,0...]
37928 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37929 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
37930 // Create a vector constant - scalar constant followed by zeros.
37931 EVT ScalarVT = N0.getOperand(0).getValueType();
37932 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
37933 unsigned NumElts = VT.getVectorNumElements();
37934 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
37935 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
37936 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
37937
37938 // Load the vector constant from constant pool.
37939 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
37940 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
37941 MachinePointerInfo MPI =
37942 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
37943 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
37944 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
37945 MachineMemOperand::MOLoad);
37946 }
37947 }
37948
37949 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
37950 // insert into a zero vector. This helps get VZEXT_MOVL closer to
37951 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
37952 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
37953 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
37954 SDValue V = peekThroughOneUseBitcasts(N0);
37955
37956 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
37957 isNullConstant(V.getOperand(2))) {
37958 SDValue In = V.getOperand(1);
37959 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
37960 In.getValueSizeInBits() /
37961 VT.getScalarSizeInBits());
37962 In = DAG.getBitcast(SubVT, In);
37963 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
37964 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
37965 getZeroVector(VT, Subtarget, DAG, DL), Movl,
37966 V.getOperand(2));
37967 }
37968 }
37969
37970 return SDValue();
37971 }
37972 case X86ISD::BLENDI: {
37973 SDValue N0 = N.getOperand(0);
37974 SDValue N1 = N.getOperand(1);
37975
37976 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
37977 // TODO: Handle MVT::v16i16 repeated blend mask.
37978 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
37979 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
37980 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
37981 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
37982 SrcVT.getScalarSizeInBits() >= 32) {
37983 unsigned BlendMask = N.getConstantOperandVal(2);
37984 unsigned Size = VT.getVectorNumElements();
37985 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
37986 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
37987 return DAG.getBitcast(
37988 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
37989 N1.getOperand(0),
37990 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
37991 }
37992 }
37993 return SDValue();
37994 }
37995 case X86ISD::VPERMI: {
37996 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
37997 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
37998 SDValue N0 = N.getOperand(0);
37999 SDValue N1 = N.getOperand(1);
38000 unsigned EltSizeInBits = VT.getScalarSizeInBits();
38001 if (N0.getOpcode() == ISD::BITCAST &&
38002 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
38003 SDValue Src = N0.getOperand(0);
38004 EVT SrcVT = Src.getValueType();
38005 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
38006 return DAG.getBitcast(VT, Res);
38007 }
38008 return SDValue();
38009 }
38010 case X86ISD::VPERM2X128: {
38011 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
38012 SDValue LHS = N->getOperand(0);
38013 SDValue RHS = N->getOperand(1);
38014 if (LHS.getOpcode() == ISD::BITCAST &&
38015 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
38016 EVT SrcVT = LHS.getOperand(0).getValueType();
38017 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
38018 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
38019 DAG.getBitcast(SrcVT, LHS),
38020 DAG.getBitcast(SrcVT, RHS),
38021 N->getOperand(2)));
38022 }
38023 }
38024
38025 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
38026 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
38027 return Res;
38028
38029 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
38030 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
38031 auto FindSubVector128 = [&](unsigned Idx) {
38032 if (Idx > 3)
38033 return SDValue();
38034 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
38035 SmallVector<SDValue> SubOps;
38036 if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
38037 return SubOps[Idx & 1];
38038 unsigned NumElts = Src.getValueType().getVectorNumElements();
38039 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
38040 Src.getOperand(1).getValueSizeInBits() == 128 &&
38041 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
38042 return Src.getOperand(1);
38043 }
38044 return SDValue();
38045 };
38046 unsigned Imm = N.getConstantOperandVal(2);
38047 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
38048 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
38049 MVT SubVT = VT.getHalfNumVectorElementsVT();
38050 SubLo = DAG.getBitcast(SubVT, SubLo);
38051 SubHi = DAG.getBitcast(SubVT, SubHi);
38052 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
38053 }
38054 }
38055 return SDValue();
38056 }
38057 case X86ISD::PSHUFD:
38058 case X86ISD::PSHUFLW:
38059 case X86ISD::PSHUFHW:
38060 Mask = getPSHUFShuffleMask(N);
38061 assert(Mask.size() == 4);
38062 break;
38063 case X86ISD::MOVSD:
38064 case X86ISD::MOVSS: {
38065 SDValue N0 = N.getOperand(0);
38066 SDValue N1 = N.getOperand(1);
38067
38068 // Canonicalize scalar FPOps:
38069 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
38070 // If commutable, allow OP(N1[0], N0[0]).
38071 unsigned Opcode1 = N1.getOpcode();
38072 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
38073 Opcode1 == ISD::FDIV) {
38074 SDValue N10 = N1.getOperand(0);
38075 SDValue N11 = N1.getOperand(1);
38076 if (N10 == N0 ||
38077 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
38078 if (N10 != N0)
38079 std::swap(N10, N11);
38080 MVT SVT = VT.getVectorElementType();
38081 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
38082 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
38083 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
38084 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
38085 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
38086 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
38087 }
38088 }
38089
38090 return SDValue();
38091 }
38092 case X86ISD::INSERTPS: {
38093 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
38094 SDValue Op0 = N.getOperand(0);
38095 SDValue Op1 = N.getOperand(1);
38096 unsigned InsertPSMask = N.getConstantOperandVal(2);
38097 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
38098 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
38099 unsigned ZeroMask = InsertPSMask & 0xF;
38100
38101 // If we zero out all elements from Op0 then we don't need to reference it.
38102 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
38103 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
38104 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38105
38106 // If we zero out the element from Op1 then we don't need to reference it.
38107 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
38108 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38109 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38110
38111 // Attempt to merge insertps Op1 with an inner target shuffle node.
38112 SmallVector<int, 8> TargetMask1;
38113 SmallVector<SDValue, 2> Ops1;
38114 APInt KnownUndef1, KnownZero1;
38115 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
38116 KnownZero1)) {
38117 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
38118 // Zero/UNDEF insertion - zero out element and remove dependency.
38119 InsertPSMask |= (1u << DstIdx);
38120 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38121 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38122 }
38123 // Update insertps mask srcidx and reference the source input directly.
38124 int M = TargetMask1[SrcIdx];
38125 assert(0 <= M && M < 8 && "Shuffle index out of range");
38126 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
38127 Op1 = Ops1[M < 4 ? 0 : 1];
38128 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38129 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38130 }
38131
38132 // Attempt to merge insertps Op0 with an inner target shuffle node.
38133 SmallVector<int, 8> TargetMask0;
38134 SmallVector<SDValue, 2> Ops0;
38135 APInt KnownUndef0, KnownZero0;
38136 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
38137 KnownZero0)) {
38138 bool Updated = false;
38139 bool UseInput00 = false;
38140 bool UseInput01 = false;
38141 for (int i = 0; i != 4; ++i) {
38142 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
38143 // No change if element is already zero or the inserted element.
38144 continue;
38145 } else if (KnownUndef0[i] || KnownZero0[i]) {
38146 // If the target mask is undef/zero then we must zero the element.
38147 InsertPSMask |= (1u << i);
38148 Updated = true;
38149 continue;
38150 }
38151
38152 // The input vector element must be inline.
38153 int M = TargetMask0[i];
38154 if (M != i && M != (i + 4))
38155 return SDValue();
38156
38157 // Determine which inputs of the target shuffle we're using.
38158 UseInput00 |= (0 <= M && M < 4);
38159 UseInput01 |= (4 <= M);
38160 }
38161
38162 // If we're not using both inputs of the target shuffle then use the
38163 // referenced input directly.
38164 if (UseInput00 && !UseInput01) {
38165 Updated = true;
38166 Op0 = Ops0[0];
38167 } else if (!UseInput00 && UseInput01) {
38168 Updated = true;
38169 Op0 = Ops0[1];
38170 }
38171
38172 if (Updated)
38173 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38174 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38175 }
38176
38177 // If we're inserting an element from a vbroadcast load, fold the
38178 // load into the X86insertps instruction. We need to convert the scalar
38179 // load to a vector and clear the source lane of the INSERTPS control.
38180 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
38181 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
38182 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
38183 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
38184 MemIntr->getBasePtr(),
38185 MemIntr->getMemOperand());
38186 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
38187 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
38188 Load),
38189 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
38190 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
38191 return Insert;
38192 }
38193 }
38194
38195 return SDValue();
38196 }
38197 default:
38198 return SDValue();
38199 }
38200
38201 // Nuke no-op shuffles that show up after combining.
38202 if (isNoopShuffleMask(Mask))
38203 return N.getOperand(0);
38204
38205 // Look for simplifications involving one or two shuffle instructions.
38206 SDValue V = N.getOperand(0);
38207 switch (N.getOpcode()) {
38208 default:
38209 break;
38210 case X86ISD::PSHUFLW:
38211 case X86ISD::PSHUFHW:
38212 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
38213
38214 // See if this reduces to a PSHUFD which is no more expensive and can
38215 // combine with more operations. Note that it has to at least flip the
38216 // dwords as otherwise it would have been removed as a no-op.
38217 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
38218 int DMask[] = {0, 1, 2, 3};
38219 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
38220 DMask[DOffset + 0] = DOffset + 1;
38221 DMask[DOffset + 1] = DOffset + 0;
38222 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
38223 V = DAG.getBitcast(DVT, V);
38224 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
38225 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
38226 return DAG.getBitcast(VT, V);
38227 }
38228
38229 // Look for shuffle patterns which can be implemented as a single unpack.
38230 // FIXME: This doesn't handle the location of the PSHUFD generically, and
38231 // only works when we have a PSHUFD followed by two half-shuffles.
38232 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
38233 (V.getOpcode() == X86ISD::PSHUFLW ||
38234 V.getOpcode() == X86ISD::PSHUFHW) &&
38235 V.getOpcode() != N.getOpcode() &&
38236 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
38237 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
38238 if (D.getOpcode() == X86ISD::PSHUFD) {
38239 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38240 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
38241 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38242 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38243 int WordMask[8];
38244 for (int i = 0; i < 4; ++i) {
38245 WordMask[i + NOffset] = Mask[i] + NOffset;
38246 WordMask[i + VOffset] = VMask[i] + VOffset;
38247 }
38248 // Map the word mask through the DWord mask.
38249 int MappedMask[8];
38250 for (int i = 0; i < 8; ++i)
38251 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
38252 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
38253 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
38254 // We can replace all three shuffles with an unpack.
38255 V = DAG.getBitcast(VT, D.getOperand(0));
38256 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
38257 : X86ISD::UNPCKH,
38258 DL, VT, V, V);
38259 }
38260 }
38261 }
38262
38263 break;
38264
38265 case X86ISD::PSHUFD:
38266 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
38267 return NewN;
38268
38269 break;
38270 }
38271
38272 return SDValue();
38273}
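
The INSERTPS handling above decodes the immediate as SrcIdx = bits [7:6], DstIdx = bits [5:4], and a 4-bit zero mask in bits [3:0]. A small standalone model written directly from those field definitions follows; it is illustrative only, not code from this file.

#include <array>
#include <cassert>

// Sketch of INSERTPS behaviour: insert Src[SrcIdx] into Dst[DstIdx], then
// zero every lane whose bit is set in the low 4 bits of the immediate.
static std::array<float, 4> insertps(std::array<float, 4> Dst,
                                     const std::array<float, 4> &Src,
                                     unsigned Imm) {
  unsigned SrcIdx = (Imm >> 6) & 0x3;
  unsigned DstIdx = (Imm >> 4) & 0x3;
  unsigned ZeroMask = Imm & 0xF;
  Dst[DstIdx] = Src[SrcIdx];
  for (unsigned i = 0; i != 4; ++i)
    if (ZeroMask & (1u << i))
      Dst[i] = 0.0f;
  return Dst;
}

int main() {
  std::array<float, 4> A{1, 2, 3, 4}, B{5, 6, 7, 8};
  // Take B[2], place it in A[1], and zero lane 3: Imm = (2<<6) | (1<<4) | 0b1000.
  std::array<float, 4> R = insertps(A, B, (2u << 6) | (1u << 4) | 0x8);
  assert(R == (std::array<float, 4>{1, 7, 3, 0}));
  return 0;
}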
38274
38275/// Checks if the shuffle mask takes subsequent elements
38276/// alternately from two vectors.
38277/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
38278static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
38279
38280 int ParitySrc[2] = {-1, -1};
38281 unsigned Size = Mask.size();
38282 for (unsigned i = 0; i != Size; ++i) {
38283 int M = Mask[i];
38284 if (M < 0)
38285 continue;
38286
38287 // Make sure we are using the matching element from the input.
38288 if ((M % Size) != i)
38289 return false;
38290
38291 // Make sure we use the same input for all elements of the same parity.
38292 int Src = M / Size;
38293 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
38294 return false;
38295 ParitySrc[i % 2] = Src;
38296 }
38297
38298 // Make sure each input is used.
38299 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
38300 return false;
38301
38302 Op0Even = ParitySrc[0] == 0;
38303 return true;
38304}
38305
38306 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
38307 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
38308/// are written to the parameters \p Opnd0 and \p Opnd1.
38309///
38310/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
38311/// so it is easier to generically match. We also insert dummy vector shuffle
38312/// nodes for the operands which explicitly discard the lanes which are unused
38313/// by this operation to try to flow through the rest of the combiner
38314/// the fact that they're unused.
38315static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
38316 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
38317 bool &IsSubAdd) {
38318
38319 EVT VT = N->getValueType(0);
38320 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38321 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
38322 !VT.getSimpleVT().isFloatingPoint())
38323 return false;
38324
38325 // We only handle target-independent shuffles.
38326 // FIXME: It would be easy and harmless to use the target shuffle mask
38327 // extraction tool to support more.
38328 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38329 return false;
38330
38331 SDValue V1 = N->getOperand(0);
38332 SDValue V2 = N->getOperand(1);
38333
38334 // Make sure we have an FADD and an FSUB.
38335 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
38336 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
38337 V1.getOpcode() == V2.getOpcode())
38338 return false;
38339
38340 // If there are other uses of these operations we can't fold them.
38341 if (!V1->hasOneUse() || !V2->hasOneUse())
38342 return false;
38343
38344 // Ensure that both operations have the same operands. Note that we can
38345 // commute the FADD operands.
38346 SDValue LHS, RHS;
38347 if (V1.getOpcode() == ISD::FSUB) {
38348 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
38349 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
38350 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
38351 return false;
38352 } else {
38353 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
38354 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
38355 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
38356 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
38357 return false;
38358 }
38359
38360 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38361 bool Op0Even;
38362 if (!isAddSubOrSubAddMask(Mask, Op0Even))
38363 return false;
38364
38365 // It's a subadd if the vector in the even parity is an FADD.
38366 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
38367 : V2->getOpcode() == ISD::FADD;
38368
38369 Opnd0 = LHS;
38370 Opnd1 = RHS;
38371 return true;
38372}
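
Putting the pieces together: a mask such as <0, 5, 2, 7> over (FSUB(A,B), FADD(A,B)) takes even lanes from the subtraction and odd lanes from the addition, which is exactly the ADDSUB lane pattern. An illustrative standalone check, not part of this file:

#include <array>
#include <cassert>

using V4 = std::array<double, 4>;

int main() {
  V4 A{1, 2, 3, 4}, B{10, 20, 30, 40};
  V4 Sub{A[0] - B[0], A[1] - B[1], A[2] - B[2], A[3] - B[3]};
  V4 Add{A[0] + B[0], A[1] + B[1], A[2] + B[2], A[3] + B[3]};
  // Mask <0, 5, 2, 7>: even lanes from Sub, odd lanes from Add (indices >= 4
  // refer to the second shuffle operand) -- exactly the ADDSUB pattern.
  V4 Shuffled{Sub[0], Add[1], Sub[2], Add[3]};
  V4 AddSub{A[0] - B[0], A[1] + B[1], A[2] - B[2], A[3] + B[3]};
  assert(Shuffled == AddSub);
  return 0;
}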
38373
38374/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
38375static SDValue combineShuffleToFMAddSub(SDNode *N,
38376 const X86Subtarget &Subtarget,
38377 SelectionDAG &DAG) {
38378 // We only handle target-independent shuffles.
38379 // FIXME: It would be easy and harmless to use the target shuffle mask
38380 // extraction tool to support more.
38381 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
38382 return SDValue();
38383
38384 MVT VT = N->getSimpleValueType(0);
38385 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38386 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
38387 return SDValue();
38388
38389 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
38390 SDValue Op0 = N->getOperand(0);
38391 SDValue Op1 = N->getOperand(1);
38392 SDValue FMAdd = Op0, FMSub = Op1;
38393 if (FMSub.getOpcode() != X86ISD::FMSUB)
38394 std::swap(FMAdd, FMSub);
38395
38396 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
38397 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
38398 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
38399 FMAdd.getOperand(2) != FMSub.getOperand(2))
38400 return SDValue();
38401
38402 // Check for correct shuffle mask.
38403 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
38404 bool Op0Even;
38405 if (!isAddSubOrSubAddMask(Mask, Op0Even))
38406 return SDValue();
38407
38408 // FMAddSub takes zeroth operand from FMSub node.
38409 SDLoc DL(N);
38410 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
38411 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38412 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
38413 FMAdd.getOperand(2));
38414}
38415
38416/// Try to combine a shuffle into a target-specific add-sub or
38417/// mul-add-sub node.
38418static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
38419 const X86Subtarget &Subtarget,
38420 SelectionDAG &DAG) {
38421 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
38422 return V;
38423
38424 SDValue Opnd0, Opnd1;
38425 bool IsSubAdd;
38426 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
38427 return SDValue();
38428
38429 MVT VT = N->getSimpleValueType(0);
38430 SDLoc DL(N);
38431
38432 // Try to generate X86ISD::FMADDSUB node here.
38433 SDValue Opnd2;
38434 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
38435 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
38436 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
38437 }
38438
38439 if (IsSubAdd)
38440 return SDValue();
38441
38442 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
38443 // the ADDSUB idiom has been successfully recognized. There are no known
38444 // X86 targets with 512-bit ADDSUB instructions!
38445 if (VT.is512BitVector())
38446 return SDValue();
38447
38448 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
38449}
38450
38451// We are looking for a shuffle where both sources are concatenated with undef
38452// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
38453// if we can express this as a single-source shuffle, that's preferable.
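// For example (illustrative): with a v8i32 result where each source is
// (concat_vectors tN, undef) and each tN is v4i32, a mask element of 8 (the
// first element of the second source) is remapped to 8 - 8/2 = 4, which picks
// t2's first element out of the new (concat_vectors t1, t2) source.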
38454static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
38455 const X86Subtarget &Subtarget) {
38456 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
38457 return SDValue();
38458
38459 EVT VT = N->getValueType(0);
38460
38461 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
38462 if (!VT.is128BitVector() && !VT.is256BitVector())
38463 return SDValue();
38464
38465 if (VT.getVectorElementType() != MVT::i32 &&
38466 VT.getVectorElementType() != MVT::i64 &&
38467 VT.getVectorElementType() != MVT::f32 &&
38468 VT.getVectorElementType() != MVT::f64)
38469 return SDValue();
38470
38471 SDValue N0 = N->getOperand(0);
38472 SDValue N1 = N->getOperand(1);
38473
38474 // Check that both sources are concats with undef.
38475 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
38476 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
38477 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
38478 !N1.getOperand(1).isUndef())
38479 return SDValue();
38480
38481 // Construct the new shuffle mask. Elements from the first source retain their
38482 // index, but elements from the second source no longer need to skip an undef.
38483 SmallVector<int, 8> Mask;
38484 int NumElts = VT.getVectorNumElements();
38485
38486 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
38487 for (int Elt : SVOp->getMask())
38488 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
38489
38490 SDLoc DL(N);
38491 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
38492 N1.getOperand(0));
38493 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
38494}
38495
38496/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
38497/// low half of each source vector and does not set any high half elements in
38498/// the destination vector, narrow the shuffle to half its original size.
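/// For example (illustrative): a v8f32 shuffle whose mask only references
/// elements 0-3 of each source and leaves the upper four result elements undef
/// can be done as a v4f32 shuffle of the low halves followed by a free
/// xmm->ymm subregister insert.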
38499static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
38500 if (!Shuf->getValueType(0).isSimple())
38501 return SDValue();
38502 MVT VT = Shuf->getSimpleValueType(0);
38503 if (!VT.is256BitVector() && !VT.is512BitVector())
38504 return SDValue();
38505
38506 // See if we can ignore all of the high elements of the shuffle.
38507 ArrayRef<int> Mask = Shuf->getMask();
38508 if (!isUndefUpperHalf(Mask))
38509 return SDValue();
38510
38511 // Check if the shuffle mask accesses only the low half of each input vector
38512 // (half-index output is 0 or 2).
38513 int HalfIdx1, HalfIdx2;
38514 SmallVector<int, 8> HalfMask(Mask.size() / 2);
38515 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
38516 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
38517 return SDValue();
38518
38519 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
38520 // The trick is knowing that all of the insert/extract are actually free
38521 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
38522 // of narrow inputs into a narrow output, and that is always cheaper than
38523 // the wide shuffle that we started with.
38524 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
38525 Shuf->getOperand(1), HalfMask, HalfIdx1,
38526 HalfIdx2, false, DAG, /*UseConcat*/true);
38527}
38528
38529static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
38530 TargetLowering::DAGCombinerInfo &DCI,
38531 const X86Subtarget &Subtarget) {
38532 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
38533 if (SDValue V = narrowShuffle(Shuf, DAG))
38534 return V;
38535
38536 // If we have legalized the vector types, look for blends of FADD and FSUB
38537 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
38538 SDLoc dl(N);
38539 EVT VT = N->getValueType(0);
38540 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38541 if (TLI.isTypeLegal(VT))
38542 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
38543 return AddSub;
38544
38545 // Attempt to combine into a vector load/broadcast.
38546 if (SDValue LD = combineToConsecutiveLoads(
38547 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
38548 return LD;
38549
38550 // For AVX2, we sometimes want to combine
38551 // (vector_shuffle <mask> (concat_vectors t1, undef)
38552 // (concat_vectors t2, undef))
38553 // Into:
38554 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
38555 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
38556 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
38557 return ShufConcat;
38558
38559 if (isTargetShuffle(N->getOpcode())) {
38560 SDValue Op(N, 0);
38561 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
38562 return Shuffle;
38563
38564 // Try recursively combining arbitrary sequences of x86 shuffle
38565 // instructions into higher-order shuffles. We do this after combining
38566 // specific PSHUF instruction sequences into their minimal form so that we
38567 // can evaluate how many specialized shuffle instructions are involved in
38568 // a particular chain.
38569 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
38570 return Res;
38571
38572 // Simplify source operands based on shuffle mask.
38573 // TODO - merge this into combineX86ShufflesRecursively.
38574 APInt KnownUndef, KnownZero;
38575 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
38576 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
38577 DCI))
38578 return SDValue(N, 0);
38579 }
38580
38581 return SDValue();
38582}
38583
38584// Simplify variable target shuffle masks based on the demanded elements.
38585// TODO: Handle DemandedBits in mask indices as well?
38586bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
38587 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
38588 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
38589 // If we're demanding all elements, don't bother trying to simplify the mask.
38590 unsigned NumElts = DemandedElts.getBitWidth();
38591 if (DemandedElts.isAllOnesValue())
38592 return false;
38593
38594 SDValue Mask = Op.getOperand(MaskIndex);
38595 if (!Mask.hasOneUse())
38596 return false;
38597
38598 // Attempt to generically simplify the variable shuffle mask.
38599 APInt MaskUndef, MaskZero;
38600 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
38601 Depth + 1))
38602 return true;
38603
38604 // Attempt to extract+simplify a (constant pool load) shuffle mask.
38605 // TODO: Support other types from getTargetShuffleMaskIndices?
38606 SDValue BC = peekThroughOneUseBitcasts(Mask);
38607 EVT BCVT = BC.getValueType();
38608 auto *Load = dyn_cast<LoadSDNode>(BC);
38609 if (!Load)
38610 return false;
38611
38612 const Constant *C = getTargetConstantFromNode(Load);
38613 if (!C)
38614 return false;
38615
38616 Type *CTy = C->getType();
38617 if (!CTy->isVectorTy() ||
38618 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
38619 return false;
38620
38621 // Handle scaling for i64 elements on 32-bit targets.
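// For example (illustrative): on a 32-bit target a v2i64 shuffle mask constant
// may be stored as four i32 elements, so NumCstElts == 2 * NumElts and each
// demanded shuffle element maps onto Scale == 2 constant elements.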
38622 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
38623 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
38624 return false;
38625 unsigned Scale = NumCstElts / NumElts;
38626
38627 // Simplify mask if we have an undemanded element that is not undef.
38628 bool Simplified = false;
38629 SmallVector<Constant *, 32> ConstVecOps;
38630 for (unsigned i = 0; i != NumCstElts; ++i) {
38631 Constant *Elt = C->getAggregateElement(i);
38632 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
38633 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
38634 Simplified = true;
38635 continue;
38636 }
38637 ConstVecOps.push_back(Elt);
38638 }
38639 if (!Simplified)
38640 return false;
38641
38642 // Generate new constant pool entry + legalize immediately for the load.
38643 SDLoc DL(Op);
38644 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
38645 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
38646 SDValue NewMask = TLO.DAG.getLoad(
38647 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
38648 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
38649 Load->getAlign());
38650 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
38651}
38652
38653bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
38654 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
38655 TargetLoweringOpt &TLO, unsigned Depth) const {
38656 int NumElts = DemandedElts.getBitWidth();
38657 unsigned Opc = Op.getOpcode();
38658 EVT VT = Op.getValueType();
38659
38660 // Handle special case opcodes.
38661 switch (Opc) {
38662 case X86ISD::PMULDQ:
38663 case X86ISD::PMULUDQ: {
38664 APInt LHSUndef, LHSZero;
38665 APInt RHSUndef, RHSZero;
38666 SDValue LHS = Op.getOperand(0);
38667 SDValue RHS = Op.getOperand(1);
38668 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
38669 Depth + 1))
38670 return true;
38671 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
38672 Depth + 1))
38673 return true;
38674 // Multiply by zero.
38675 KnownZero = LHSZero | RHSZero;
38676 break;
38677 }
38678 case X86ISD::VSHL:
38679 case X86ISD::VSRL:
38680 case X86ISD::VSRA: {
38681 // We only need the bottom 64-bits of the (128-bit) shift amount.
38682 SDValue Amt = Op.getOperand(1);
38683 MVT AmtVT = Amt.getSimpleValueType();
38684 assert(AmtVT.is128BitVector() && "Unexpected value type");
38685
38686 // If the shift amount is reused only as an SSE shift amount, then we know
38687 // that only the bottom 64-bits are ever used.
38688 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
38689 unsigned UseOpc = Use->getOpcode();
38690 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
38691 UseOpc == X86ISD::VSRA) &&
38692 Use->getOperand(0) != Amt;
38693 });
38694
38695 APInt AmtUndef, AmtZero;
38696 unsigned NumAmtElts = AmtVT.getVectorNumElements();
38697 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
38698 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
38699 Depth + 1, AssumeSingleUse))
38700 return true;
38701 LLVM_FALLTHROUGH;
38702 }
38703 case X86ISD::VSHLI:
38704 case X86ISD::VSRLI:
38705 case X86ISD::VSRAI: {
38706 SDValue Src = Op.getOperand(0);
38707 APInt SrcUndef;
38708 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
38709 Depth + 1))
38710 return true;
38711
38712 // Aggressively peek through ops to get at the demanded elts.
38713 if (!DemandedElts.isAllOnesValue())
38714 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38715 Src, DemandedElts, TLO.DAG, Depth + 1))
38716 return TLO.CombineTo(
38717 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
38718 break;
38719 }
38720 case X86ISD::KSHIFTL: {
38721 SDValue Src = Op.getOperand(0);
38722 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38723 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38724 unsigned ShiftAmt = Amt->getZExtValue();
38725
38726 if (ShiftAmt == 0)
38727 return TLO.CombineTo(Op, Src);
38728
38729 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
38730 // single shift. We can do this if the bottom bits (which are shifted
38731 // out) are never demanded.
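// For example (illustrative): if the low 5 result elements are not demanded,
// KSHIFTL(KSHIFTR(X, 3), 5) can be folded to a single KSHIFTL(X, 2).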
38732 if (Src.getOpcode() == X86ISD::KSHIFTR) {
38733 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
38734 unsigned C1 = Src.getConstantOperandVal(1);
38735 unsigned NewOpc = X86ISD::KSHIFTL;
38736 int Diff = ShiftAmt - C1;
38737 if (Diff < 0) {
38738 Diff = -Diff;
38739 NewOpc = X86ISD::KSHIFTR;
38740 }
38741
38742 SDLoc dl(Op);
38743 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38744 return TLO.CombineTo(
38745 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38746 }
38747 }
38748
38749 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
38750 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38751 Depth + 1))
38752 return true;
38753
38754 KnownUndef <<= ShiftAmt;
38755 KnownZero <<= ShiftAmt;
38756 KnownZero.setLowBits(ShiftAmt);
38757 break;
38758 }
38759 case X86ISD::KSHIFTR: {
38760 SDValue Src = Op.getOperand(0);
38761 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
38762 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
38763 unsigned ShiftAmt = Amt->getZExtValue();
38764
38765 if (ShiftAmt == 0)
38766 return TLO.CombineTo(Op, Src);
38767
38768 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
38769 // single shift. We can do this if the top bits (which are shifted
38770 // out) are never demanded.
38771 if (Src.getOpcode() == X86ISD::KSHIFTL) {
38772 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
38773 unsigned C1 = Src.getConstantOperandVal(1);
38774 unsigned NewOpc = X86ISD::KSHIFTR;
38775 int Diff = ShiftAmt - C1;
38776 if (Diff < 0) {
38777 Diff = -Diff;
38778 NewOpc = X86ISD::KSHIFTL;
38779 }
38780
38781 SDLoc dl(Op);
38782 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
38783 return TLO.CombineTo(
38784 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
38785 }
38786 }
38787
38788 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
38789 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
38790 Depth + 1))
38791 return true;
38792
38793 KnownUndef.lshrInPlace(ShiftAmt);
38794 KnownZero.lshrInPlace(ShiftAmt);
38795 KnownZero.setHighBits(ShiftAmt);
38796 break;
38797 }
38798 case X86ISD::CVTSI2P:
38799 case X86ISD::CVTUI2P: {
38800 SDValue Src = Op.getOperand(0);
38801 MVT SrcVT = Src.getSimpleValueType();
38802 APInt SrcUndef, SrcZero;
38803 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38804 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38805 Depth + 1))
38806 return true;
38807 break;
38808 }
38809 case X86ISD::PACKSS:
38810 case X86ISD::PACKUS: {
38811 SDValue N0 = Op.getOperand(0);
38812 SDValue N1 = Op.getOperand(1);
38813
38814 APInt DemandedLHS, DemandedRHS;
38815 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38816
38817 APInt LHSUndef, LHSZero;
38818 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38819 Depth + 1))
38820 return true;
38821 APInt RHSUndef, RHSZero;
38822 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38823 Depth + 1))
38824 return true;
38825
38826 // TODO - pass on known zero/undef.
38827
38828 // Aggressively peek through ops to get at the demanded elts.
38829 // TODO - we should do this for all target/faux shuffles ops.
38830 if (!DemandedElts.isAllOnesValue()) {
38831 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38832 TLO.DAG, Depth + 1);
38833 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38834 TLO.DAG, Depth + 1);
38835 if (NewN0 || NewN1) {
38836 NewN0 = NewN0 ? NewN0 : N0;
38837 NewN1 = NewN1 ? NewN1 : N1;
38838 return TLO.CombineTo(Op,
38839 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38840 }
38841 }
38842 break;
38843 }
38844 case X86ISD::HADD:
38845 case X86ISD::HSUB:
38846 case X86ISD::FHADD:
38847 case X86ISD::FHSUB: {
38848 SDValue N0 = Op.getOperand(0);
38849 SDValue N1 = Op.getOperand(1);
38850
38851 APInt DemandedLHS, DemandedRHS;
38852 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38853
38854 APInt LHSUndef, LHSZero;
38855 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
38856 Depth + 1))
38857 return true;
38858 APInt RHSUndef, RHSZero;
38859 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
38860 Depth + 1))
38861 return true;
38862
38863 // TODO - pass on known zero/undef.
38864
38865 // Aggressively peek through ops to get at the demanded elts.
38866 // TODO: Handle repeated operands.
38867 if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
38868 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
38869 TLO.DAG, Depth + 1);
38870 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
38871 TLO.DAG, Depth + 1);
38872 if (NewN0 || NewN1) {
38873 NewN0 = NewN0 ? NewN0 : N0;
38874 NewN1 = NewN1 ? NewN1 : N1;
38875 return TLO.CombineTo(Op,
38876 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
38877 }
38878 }
38879 break;
38880 }
38881 case X86ISD::VTRUNC:
38882 case X86ISD::VTRUNCS:
38883 case X86ISD::VTRUNCUS: {
38884 SDValue Src = Op.getOperand(0);
38885 MVT SrcVT = Src.getSimpleValueType();
38886 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38887 APInt SrcUndef, SrcZero;
38888 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
38889 Depth + 1))
38890 return true;
38891 KnownZero = SrcZero.zextOrTrunc(NumElts);
38892 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
38893 break;
38894 }
38895 case X86ISD::BLENDV: {
38896 APInt SelUndef, SelZero;
38897 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
38898 SelZero, TLO, Depth + 1))
38899 return true;
38900
38901 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
38902 APInt LHSUndef, LHSZero;
38903 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
38904 LHSZero, TLO, Depth + 1))
38905 return true;
38906
38907 APInt RHSUndef, RHSZero;
38908 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
38909 RHSZero, TLO, Depth + 1))
38910 return true;
38911
38912 KnownZero = LHSZero & RHSZero;
38913 KnownUndef = LHSUndef & RHSUndef;
38914 break;
38915 }
38916 case X86ISD::VZEXT_MOVL: {
38917 // If upper demanded elements are already zero then we have nothing to do.
38918 SDValue Src = Op.getOperand(0);
38919 APInt DemandedUpperElts = DemandedElts;
38920 DemandedUpperElts.clearLowBits(1);
38921 if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
38922 return TLO.CombineTo(Op, Src);
38923 break;
38924 }
38925 case X86ISD::VBROADCAST: {
38926 SDValue Src = Op.getOperand(0);
38927 MVT SrcVT = Src.getSimpleValueType();
38928 if (!SrcVT.isVector())
38929 break;
38930 // Don't bother broadcasting if we just need the 0'th element.
38931 if (DemandedElts == 1) {
38932 if (Src.getValueType() != VT)
38933 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
38934 SDLoc(Op));
38935 return TLO.CombineTo(Op, Src);
38936 }
38937 APInt SrcUndef, SrcZero;
38938 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
38939 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
38940 Depth + 1))
38941 return true;
38942 // Aggressively peek through src to get at the demanded elt.
38943 // TODO - we should do this for all target/faux shuffles ops.
38944 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
38945 Src, SrcElts, TLO.DAG, Depth + 1))
38946 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
38947 break;
38948 }
38949 case X86ISD::VPERMV:
38950 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
38951 Depth))
38952 return true;
38953 break;
38954 case X86ISD::PSHUFB:
38955 case X86ISD::VPERMV3:
38956 case X86ISD::VPERMILPV:
38957 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
38958 Depth))
38959 return true;
38960 break;
38961 case X86ISD::VPPERM:
38962 case X86ISD::VPERMIL2:
38963 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
38964 Depth))
38965 return true;
38966 break;
38967 }
38968
38969 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
38970 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
38971 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
38972 if ((VT.is256BitVector() || VT.is512BitVector()) &&
38973 DemandedElts.lshr(NumElts / 2) == 0) {
38974 unsigned SizeInBits = VT.getSizeInBits();
38975 unsigned ExtSizeInBits = SizeInBits / 2;
38976
38977 // See if 512-bit ops only use the bottom 128-bits.
38978 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
38979 ExtSizeInBits = SizeInBits / 4;
38980
38981 switch (Opc) {
38982 // Scalar broadcast.
38983 case X86ISD::VBROADCAST: {
38984 SDLoc DL(Op);
38985 SDValue Src = Op.getOperand(0);
38986 if (Src.getValueSizeInBits() > ExtSizeInBits)
38987 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
38988 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38989 ExtSizeInBits / VT.getScalarSizeInBits());
38990 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
38991 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
38992 TLO.DAG, DL, ExtSizeInBits));
38993 }
38994 case X86ISD::VBROADCAST_LOAD: {
38995 SDLoc DL(Op);
38996 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
38997 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
38998 ExtSizeInBits / VT.getScalarSizeInBits());
38999 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39000 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39001 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
39002 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
39003 MemIntr->getMemOperand());
39004 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39005 Bcst.getValue(1));
39006 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39007 TLO.DAG, DL, ExtSizeInBits));
39008 }
39009 // Subvector broadcast.
39010 case X86ISD::SUBV_BROADCAST_LOAD: {
39011 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
39012 EVT MemVT = MemIntr->getMemoryVT();
39013 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
39014 SDLoc DL(Op);
39015 SDValue Ld =
39016 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
39017 MemIntr->getBasePtr(), MemIntr->getMemOperand());
39018 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39019 Ld.getValue(1));
39020 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
39021 TLO.DAG, DL, ExtSizeInBits));
39022 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
39023 SDLoc DL(Op);
39024 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39025 ExtSizeInBits / VT.getScalarSizeInBits());
39026 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39027 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39028 SDValue Bcst =
39029 TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
39030 Ops, MemVT, MemIntr->getMemOperand());
39031 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39032 Bcst.getValue(1));
39033 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39034 TLO.DAG, DL, ExtSizeInBits));
39035 }
39036 break;
39037 }
39038 // Byte shifts by immediate.
39039 case X86ISD::VSHLDQ:
39040 case X86ISD::VSRLDQ:
39041 // Shift by uniform.
39042 case X86ISD::VSHL:
39043 case X86ISD::VSRL:
39044 case X86ISD::VSRA:
39045 // Shift by immediate.
39046 case X86ISD::VSHLI:
39047 case X86ISD::VSRLI:
39048 case X86ISD::VSRAI: {
39049 SDLoc DL(Op);
39050 SDValue Ext0 =
39051 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
39052 SDValue ExtOp =
39053 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
39054 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39055 SDValue Insert =
39056 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39057 return TLO.CombineTo(Op, Insert);
39058 }
39059 case X86ISD::VPERMI: {
39060 // Simplify PERMPD/PERMQ to extract_subvector.
39061 // TODO: This should be done in shuffle combining.
39062 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
39063 SmallVector<int, 4> Mask;
39064 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
39065 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
39066 SDLoc DL(Op);
39067 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
39068 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39069 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
39070 return TLO.CombineTo(Op, Insert);
39071 }
39072 }
39073 break;
39074 }
39075 case X86ISD::VPERM2X128: {
39076 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
39077 SDLoc DL(Op);
39078 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
39079 if (LoMask & 0x8)
39080 return TLO.CombineTo(
39081 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
39082 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
39083 unsigned SrcIdx = (LoMask & 0x2) >> 1;
39084 SDValue ExtOp =
39085 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
39086 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39087 SDValue Insert =
39088 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39089 return TLO.CombineTo(Op, Insert);
39090 }
39091 // Zero upper elements.
39092 case X86ISD::VZEXT_MOVL:
39093 // Target unary shuffles by immediate:
39094 case X86ISD::PSHUFD:
39095 case X86ISD::PSHUFLW:
39096 case X86ISD::PSHUFHW:
39097 case X86ISD::VPERMILPI:
39098 // (Non-Lane Crossing) Target Shuffles.
39099 case X86ISD::VPERMILPV:
39100 case X86ISD::VPERMIL2:
39101 case X86ISD::PSHUFB:
39102 case X86ISD::UNPCKL:
39103 case X86ISD::UNPCKH:
39104 case X86ISD::BLENDI:
39105 // Integer ops.
39106 case X86ISD::AVG:
39107 case X86ISD::PACKSS:
39108 case X86ISD::PACKUS:
39109 // Horizontal Ops.
39110 case X86ISD::HADD:
39111 case X86ISD::HSUB:
39112 case X86ISD::FHADD:
39113 case X86ISD::FHSUB: {
39114 SDLoc DL(Op);
39115 SmallVector<SDValue, 4> Ops;
39116 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
39117 SDValue SrcOp = Op.getOperand(i);
39118 EVT SrcVT = SrcOp.getValueType();
39119 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
39120 "Unsupported vector size");
39121 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
39122 ExtSizeInBits)
39123 : SrcOp);
39124 }
39125 MVT ExtVT = VT.getSimpleVT();
39126 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
39127 ExtSizeInBits / ExtVT.getScalarSizeInBits());
39128 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
39129 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39130 SDValue Insert =
39131 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39132 return TLO.CombineTo(Op, Insert);
39133 }
39134 }
39135 }
39136
39137 // Get target/faux shuffle mask.
39138 APInt OpUndef, OpZero;
39139 SmallVector<int, 64> OpMask;
39140 SmallVector<SDValue, 2> OpInputs;
39141 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
39142 OpZero, TLO.DAG, Depth, false))
39143 return false;
39144
39145 // Shuffle inputs must be the same size as the result.
39146 if (OpMask.size() != (unsigned)NumElts ||
39147 llvm::any_of(OpInputs, [VT](SDValue V) {
39148 return VT.getSizeInBits() != V.getValueSizeInBits() ||
39149 !V.getValueType().isVector();
39150 }))
39151 return false;
39152
39153 KnownZero = OpZero;
39154 KnownUndef = OpUndef;
39155
39156 // Check if shuffle mask can be simplified to undef/zero/identity.
39157 int NumSrcs = OpInputs.size();
39158 for (int i = 0; i != NumElts; ++i)
39159 if (!DemandedElts[i])
39160 OpMask[i] = SM_SentinelUndef;
39161
39162 if (isUndefInRange(OpMask, 0, NumElts)) {
39163 KnownUndef.setAllBits();
39164 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
39165 }
39166 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
39167 KnownZero.setAllBits();
39168 return TLO.CombineTo(
39169 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
39170 }
39171 for (int Src = 0; Src != NumSrcs; ++Src)
39172 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
39173 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
39174
39175 // Attempt to simplify inputs.
39176 for (int Src = 0; Src != NumSrcs; ++Src) {
39177 // TODO: Support inputs of different types.
39178 if (OpInputs[Src].getValueType() != VT)
39179 continue;
39180
39181 int Lo = Src * NumElts;
39182 APInt SrcElts = APInt::getNullValue(NumElts);
39183 for (int i = 0; i != NumElts; ++i)
39184 if (DemandedElts[i]) {
39185 int M = OpMask[i] - Lo;
39186 if (0 <= M && M < NumElts)
39187 SrcElts.setBit(M);
39188 }
39189
39190 // TODO - Propagate input undef/zero elts.
39191 APInt SrcUndef, SrcZero;
39192 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
39193 TLO, Depth + 1))
39194 return true;
39195 }
39196
39197 // If we don't demand all elements, then attempt to combine to a simpler
39198 // shuffle.
39199 // We need to convert the depth to something combineX86ShufflesRecursively
39200 // can handle - so pretend its Depth == 0 again, and reduce the max depth
39201 // to match. This prevents combineX86ShuffleChain from returning a
39202 // combined shuffle that's the same as the original root, causing an
39203 // infinite loop.
39204 if (!DemandedElts.isAllOnesValue()) {
39205 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
39206
39207 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
39208 for (int i = 0; i != NumElts; ++i)
39209 if (DemandedElts[i])
39210 DemandedMask[i] = i;
39211
39212 SDValue NewShuffle = combineX86ShufflesRecursively(
39213 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
39214 /*HasVarMask*/ false,
39215 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
39216 Subtarget);
39217 if (NewShuffle)
39218 return TLO.CombineTo(Op, NewShuffle);
39219 }
39220
39221 return false;
39222}
39223
39224bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
39225 SDValue Op, const APInt &OriginalDemandedBits,
39226 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
39227 unsigned Depth) const {
39228 EVT VT = Op.getValueType();
39229 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
39230 unsigned Opc = Op.getOpcode();
39231 switch(Opc) {
39232 case X86ISD::VTRUNC: {
39233 KnownBits KnownOp;
39234 SDValue Src = Op.getOperand(0);
39235 MVT SrcVT = Src.getSimpleValueType();
39236
39237 // Simplify the input, using demanded bit information.
39238 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
39239 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
39240 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
39241 return true;
39242 break;
39243 }
39244 case X86ISD::PMULDQ:
39245 case X86ISD::PMULUDQ: {
39246 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
39247 KnownBits KnownOp;
39248 SDValue LHS = Op.getOperand(0);
39249 SDValue RHS = Op.getOperand(1);
39250 // FIXME: Can we bound this better?
39251 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
39252 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
39253 TLO, Depth + 1))
39254 return true;
39255 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
39256 TLO, Depth + 1))
39257 return true;
39258
39259 // Aggressively peek through ops to get at the demanded low bits.
39260 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
39261 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39262 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
39263 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39264 if (DemandedLHS || DemandedRHS) {
39265 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
39266 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
39267 return TLO.CombineTo(
39268 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
39269 }
39270 break;
39271 }
39272 case X86ISD::VSHLI: {
39273 SDValue Op0 = Op.getOperand(0);
39274
39275 unsigned ShAmt = Op.getConstantOperandVal(1);
39276 if (ShAmt >= BitWidth)
39277 break;
39278
39279 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
39280
39281 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39282 // single shift. We can do this if the bottom bits (which are shifted
39283 // out) are never demanded.
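// For example (illustrative): if the low 6 result bits are not demanded,
// VSHLI(VSRLI(X, 4), 6) can be folded to a single VSHLI(X, 2).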
39284 if (Op0.getOpcode() == X86ISD::VSRLI &&
39285 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
39286 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
39287 if (Shift2Amt < BitWidth) {
39288 int Diff = ShAmt - Shift2Amt;
39289 if (Diff == 0)
39290 return TLO.CombineTo(Op, Op0.getOperand(0));
39291
39292 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
39293 SDValue NewShift = TLO.DAG.getNode(
39294 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
39295 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
39296 return TLO.CombineTo(Op, NewShift);
39297 }
39298 }
39299
39300 // If we are only demanding sign bits then we can use the shift source directly.
39301 unsigned NumSignBits =
39302 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
39303 unsigned UpperDemandedBits =
39304 BitWidth - OriginalDemandedBits.countTrailingZeros();
39305 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39306 return TLO.CombineTo(Op, Op0);
39307
39308 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39309 TLO, Depth + 1))
39310 return true;
39311
39312 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39313 Known.Zero <<= ShAmt;
39314 Known.One <<= ShAmt;
39315
39316 // Low bits known zero.
39317 Known.Zero.setLowBits(ShAmt);
39318 return false;
39319 }
39320 case X86ISD::VSRLI: {
39321 unsigned ShAmt = Op.getConstantOperandVal(1);
39322 if (ShAmt >= BitWidth)
39323 break;
39324
39325 APInt DemandedMask = OriginalDemandedBits << ShAmt;
39326
39327 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
39328 OriginalDemandedElts, Known, TLO, Depth + 1))
39329 return true;
39330
39331 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39332 Known.Zero.lshrInPlace(ShAmt);
39333 Known.One.lshrInPlace(ShAmt);
39334
39335 // High bits known zero.
39336 Known.Zero.setHighBits(ShAmt);
39337 return false;
39338 }
39339 case X86ISD::VSRAI: {
39340 SDValue Op0 = Op.getOperand(0);
39341 SDValue Op1 = Op.getOperand(1);
39342
39343 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
39344 if (ShAmt >= BitWidth)
39345 break;
39346
39347 APInt DemandedMask = OriginalDemandedBits << ShAmt;
39348
39349 // If we just want the sign bit then we don't need to shift it.
39350 if (OriginalDemandedBits.isSignMask())
39351 return TLO.CombineTo(Op, Op0);
39352
39353 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
39354 if (Op0.getOpcode() == X86ISD::VSHLI &&
39355 Op.getOperand(1) == Op0.getOperand(1)) {
39356 SDValue Op00 = Op0.getOperand(0);
39357 unsigned NumSignBits =
39358 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
39359 if (ShAmt < NumSignBits)
39360 return TLO.CombineTo(Op, Op00);
39361 }
39362
39363 // If any of the demanded bits are produced by the sign extension, we also
39364 // demand the input sign bit.
39365 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
39366 DemandedMask.setSignBit();
39367
39368 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39369 TLO, Depth + 1))
39370 return true;
39371
39372 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39373 Known.Zero.lshrInPlace(ShAmt);
39374 Known.One.lshrInPlace(ShAmt);
39375
39376 // If the input sign bit is known to be zero, or if none of the top bits
39377 // are demanded, turn this into an unsigned shift right.
39378 if (Known.Zero[BitWidth - ShAmt - 1] ||
39379 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
39380 return TLO.CombineTo(
39381 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
39382
39383 // High bits are known one.
39384 if (Known.One[BitWidth - ShAmt - 1])
39385 Known.One.setHighBits(ShAmt);
39386 return false;
39387 }
39388 case X86ISD::PEXTRB:
39389 case X86ISD::PEXTRW: {
39390 SDValue Vec = Op.getOperand(0);
39391 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
39392 MVT VecVT = Vec.getSimpleValueType();
39393 unsigned NumVecElts = VecVT.getVectorNumElements();
39394
39395 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
39396 unsigned Idx = CIdx->getZExtValue();
39397 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
39398
39399 // If we demand no bits from the vector then we must have demanded
39400 // bits from the implicit zext - simplify to zero.
39401 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
39402 if (DemandedVecBits == 0)
39403 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39404
39405 APInt KnownUndef, KnownZero;
39406 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
39407 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
39408 KnownZero, TLO, Depth + 1))
39409 return true;
39410
39411 KnownBits KnownVec;
39412 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
39413 KnownVec, TLO, Depth + 1))
39414 return true;
39415
39416 if (SDValue V = SimplifyMultipleUseDemandedBits(
39417 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
39418 return TLO.CombineTo(
39419 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
39420
39421 Known = KnownVec.zext(BitWidth);
39422 return false;
39423 }
39424 break;
39425 }
39426 case X86ISD::PINSRB:
39427 case X86ISD::PINSRW: {
39428 SDValue Vec = Op.getOperand(0);
39429 SDValue Scl = Op.getOperand(1);
39430 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39431 MVT VecVT = Vec.getSimpleValueType();
39432
39433 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
39434 unsigned Idx = CIdx->getZExtValue();
39435 if (!OriginalDemandedElts[Idx])
39436 return TLO.CombineTo(Op, Vec);
39437
39438 KnownBits KnownVec;
39439 APInt DemandedVecElts(OriginalDemandedElts);
39440 DemandedVecElts.clearBit(Idx);
39441 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
39442 KnownVec, TLO, Depth + 1))
39443 return true;
39444
39445 KnownBits KnownScl;
39446 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
39447 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
39448 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
39449 return true;
39450
39451 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
39452 Known = KnownBits::commonBits(KnownVec, KnownScl);
39453 return false;
39454 }
39455 break;
39456 }
39457 case X86ISD::PACKSS:
39458 // PACKSS saturates to MIN/MAX integer values. So if we just want the
39459 // sign bit then we can just ask for the source operands' sign bits.
39460 // TODO - add known bits handling.
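// For example (illustrative): each PACKSS result element is narrowed from a
// source element twice as wide, so demanding only the result's sign bit maps
// to demanding only the sign bit of the wider source element (SignMask below).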
39461 if (OriginalDemandedBits.isSignMask()) {
39462 APInt DemandedLHS, DemandedRHS;
39463 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
39464
39465 KnownBits KnownLHS, KnownRHS;
39466 APInt SignMask = APInt::getSignMask(BitWidth * 2);
39467 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
39468 KnownLHS, TLO, Depth + 1))
39469 return true;
39470 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
39471 KnownRHS, TLO, Depth + 1))
39472 return true;
39473
39474 // Attempt to avoid multi-use ops if we don't need anything from them.
39475 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
39476 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
39477 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
39478 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
39479 if (DemandedOp0 || DemandedOp1) {
39480 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
39481 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
39482 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
39483 }
39484 }
39485 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
39486 break;
39487 case X86ISD::VBROADCAST: {
39488 SDValue Src = Op.getOperand(0);
39489 MVT SrcVT = Src.getSimpleValueType();
39490 APInt DemandedElts = APInt::getOneBitSet(
39491 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
39492 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
39493 TLO, Depth + 1))
39494 return true;
39495 // If we don't need the upper bits, attempt to narrow the broadcast source.
39496 // Don't attempt this on AVX512 as it might affect broadcast folding.
39497 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
39498 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
39499 OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
39500 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
39501 SDValue NewSrc =
39502 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
39503 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
39504 SDValue NewBcst =
39505 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
39506 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
39507 }
39508 break;
39509 }
39510 case X86ISD::PCMPGT:
39511 // icmp sgt(0, R) == ashr(R, BitWidth-1).
39512 // iff we only need the sign bit then we can use R directly.
39513 if (OriginalDemandedBits.isSignMask() &&
39514 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39515 return TLO.CombineTo(Op, Op.getOperand(1));
39516 break;
39517 case X86ISD::MOVMSK: {
39518 SDValue Src = Op.getOperand(0);
39519 MVT SrcVT = Src.getSimpleValueType();
39520 unsigned SrcBits = SrcVT.getScalarSizeInBits();
39521 unsigned NumElts = SrcVT.getVectorNumElements();
39522
39523 // If we don't need the sign bits at all just return zero.
39524 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
39525 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39526
39527 // Only demand the vector elements of the sign bits we need.
39528 APInt KnownUndef, KnownZero;
39529 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
39530 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
39531 TLO, Depth + 1))
39532 return true;
39533
39534 Known.Zero = KnownZero.zextOrSelf(BitWidth);
39535 Known.Zero.setHighBits(BitWidth - NumElts);
39536
39537 // MOVMSK only uses the MSB from each vector element.
39538 KnownBits KnownSrc;
39539 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
39540 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
39541 Depth + 1))
39542 return true;
39543
39544 if (KnownSrc.One[SrcBits - 1])
39545 Known.One.setLowBits(NumElts);
39546 else if (KnownSrc.Zero[SrcBits - 1])
39547 Known.Zero.setLowBits(NumElts);
39548
39549 // Attempt to avoid multi-use ops if we don't need anything from them.
39550 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
39551 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
39552 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
39553 return false;
39554 }
39555 case X86ISD::BEXTR:
39556 case X86ISD::BEXTRI: {
39557 SDValue Op0 = Op.getOperand(0);
39558 SDValue Op1 = Op.getOperand(1);
39559
39560 // Only bottom 16-bits of the control bits are required.
39561 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
39562 // NOTE: SimplifyDemandedBits won't do this for constants.
39563 uint64_t Val1 = Cst1->getZExtValue();
39564 uint64_t MaskedVal1 = Val1 & 0xFFFF;
39565 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
39566 SDLoc DL(Op);
39567 return TLO.CombineTo(
39568 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
39569 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
39570 }
39571
39572 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
39573 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
39574
39575 // If the length is 0, the result is 0.
39576 if (Length == 0) {
39577 Known.setAllZero();
39578 return false;
39579 }
39580
39581 if ((Shift + Length) <= BitWidth) {
39582 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
39583 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
39584 return true;
39585
39586 Known = Known.extractBits(Length, Shift);
39587 Known = Known.zextOrTrunc(BitWidth);
39588 return false;
39589 }
39590 } else {
39591 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
39592 KnownBits Known1;
39593 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
39594 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
39595 return true;
39596
39597 // If the length is 0, replace with 0.
39598 KnownBits LengthBits = Known1.extractBits(8, 8);
39599 if (LengthBits.isZero())
39600 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
39601 }
39602
39603 break;
39604 }
39605 case X86ISD::PDEP: {
39606 SDValue Op0 = Op.getOperand(0);
39607 SDValue Op1 = Op.getOperand(1);
39608
39609 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
39610 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
39611
39612 // If the demanded bits have leading zeroes, we don't demand those from the
39613 // mask.
39614 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
39615 return true;
39616
39617 // The number of possible 1s in the mask determines the number of LSBs of
39618 // operand 0 used. Undemanded bits from the mask don't matter so filter
39619 // them before counting.
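// For example (illustrative): with a PDEP mask of 0b1010 at most two mask bits
// can be set, so only the two lowest bits of operand 0 can reach the result.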
39620 KnownBits Known2;
39621 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
39622 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
39623 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
39624 return true;
39625
39626 // Zeroes are retained from the mask, but not ones.
39627 Known.One.clearAllBits();
39628 // The result will have at least as many trailing zeros as the non-mask
39629 // operand since bits can only map to the same or higher bit position.
39630 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
39631 return false;
39632 }
39633 }
39634
39635 return TargetLowering::SimplifyDemandedBitsForTargetNode(
39636 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
39637}
39638
39639SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39640 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
39641 SelectionDAG &DAG, unsigned Depth) const {
39642 int NumElts = DemandedElts.getBitWidth();
39643 unsigned Opc = Op.getOpcode();
39644 EVT VT = Op.getValueType();
39645
39646 switch (Opc) {
39647 case X86ISD::PINSRB:
39648 case X86ISD::PINSRW: {
39649 // If we don't demand the inserted element, return the base vector.
39650 SDValue Vec = Op.getOperand(0);
39651 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
39652 MVT VecVT = Vec.getSimpleValueType();
39653 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
39654 !DemandedElts[CIdx->getZExtValue()])
39655 return Vec;
39656 break;
39657 }
39658 case X86ISD::VSHLI: {
39659 // If we are only demanding sign bits then we can use the shift source
39660 // directly.
39661 SDValue Op0 = Op.getOperand(0);
39662 unsigned ShAmt = Op.getConstantOperandVal(1);
39663 unsigned BitWidth = DemandedBits.getBitWidth();
39664 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
39665 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
39666 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39667 return Op0;
39668 break;
39669 }
39670 case X86ISD::VSRAI:
39671 // iff we only need the sign bit then we can use the source directly.
39672 // TODO: generalize where we only demand extended signbits.
39673 if (DemandedBits.isSignMask())
39674 return Op.getOperand(0);
39675 break;
39676 case X86ISD::PCMPGT:
39677 // icmp sgt(0, R) == ashr(R, BitWidth-1).
39678 // iff we only need the sign bit then we can use R directly.
39679 if (DemandedBits.isSignMask() &&
39680 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
39681 return Op.getOperand(1);
39682 break;
39683 }
39684
39685 APInt ShuffleUndef, ShuffleZero;
39686 SmallVector<int, 16> ShuffleMask;
39687 SmallVector<SDValue, 2> ShuffleOps;
39688 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
39689 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
39690 // If all the demanded elts are from one operand and are inline,
39691 // then we can use the operand directly.
39692 int NumOps = ShuffleOps.size();
39693 if (ShuffleMask.size() == (unsigned)NumElts &&
39694 llvm::all_of(ShuffleOps, [VT](SDValue V) {
39695 return VT.getSizeInBits() == V.getValueSizeInBits();
39696 })) {
39697
39698 if (DemandedElts.isSubsetOf(ShuffleUndef))
39699 return DAG.getUNDEF(VT);
39700 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
39701 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
39702
39703 // Bitmask that indicates which ops have only been accessed 'inline'.
39704 APInt IdentityOp = APInt::getAllOnesValue(NumOps);
39705 for (int i = 0; i != NumElts; ++i) {
39706 int M = ShuffleMask[i];
39707 if (!DemandedElts[i] || ShuffleUndef[i])
39708 continue;
39709 int OpIdx = M / NumElts;
39710 int EltIdx = M % NumElts;
39711 if (M < 0 || EltIdx != i) {
39712 IdentityOp.clearAllBits();
39713 break;
39714 }
39715 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
39716 if (IdentityOp == 0)
39717 break;
39718 }
39719 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
39720 "Multiple identity shuffles detected");
39721
39722 if (IdentityOp != 0)
39723 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
39724 }
39725 }
39726
39727 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
39728 Op, DemandedBits, DemandedElts, DAG, Depth);
39729}
39730
39731// Helper to peek through bitops/trunc/setcc to determine size of source vector.
39732// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
39733static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
39734 bool AllowTruncate) {
39735 switch (Src.getOpcode()) {
39736 case ISD::TRUNCATE:
39737 if (!AllowTruncate)
39738 return false;
39739 LLVM_FALLTHROUGH;
39740 case ISD::SETCC:
39741 return Src.getOperand(0).getValueSizeInBits() == Size;
39742 case ISD::AND:
39743 case ISD::XOR:
39744 case ISD::OR:
39745 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
39746 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
39747 }
39748 return false;
39749}
39750
39751// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
39752static unsigned getAltBitOpcode(unsigned Opcode) {
39753 switch(Opcode) {
39754 case ISD::AND: return X86ISD::FAND;
39755 case ISD::OR: return X86ISD::FOR;
39756 case ISD::XOR: return X86ISD::FXOR;
39757 case X86ISD::ANDNP: return X86ISD::FANDN;
39758 }
39759 llvm_unreachable("Unknown bitwise opcode");
39760}
39761
39762// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
39763static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
39764 const SDLoc &DL) {
39765 EVT SrcVT = Src.getValueType();
39766 if (SrcVT != MVT::v4i1)
39767 return SDValue();
39768
39769 switch (Src.getOpcode()) {
39770 case ISD::SETCC:
39771 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
39772 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
39773 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
39774 SDValue Op0 = Src.getOperand(0);
39775 if (ISD::isNormalLoad(Op0.getNode()))
39776 return DAG.getBitcast(MVT::v4f32, Op0);
39777 if (Op0.getOpcode() == ISD::BITCAST &&
39778 Op0.getOperand(0).getValueType() == MVT::v4f32)
39779 return Op0.getOperand(0);
39780 }
39781 break;
39782 case ISD::AND:
39783 case ISD::XOR:
39784 case ISD::OR: {
39785 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
39786 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
39787 if (Op0 && Op1)
39788 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
39789 Op1);
39790 break;
39791 }
39792 }
39793 return SDValue();
39794}
39795
39796// Helper to push sign extension of vXi1 SETCC result through bitops.
39797static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
39798 SDValue Src, const SDLoc &DL) {
39799 switch (Src.getOpcode()) {
39800 case ISD::SETCC:
39801 case ISD::TRUNCATE:
39802 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39803 case ISD::AND:
39804 case ISD::XOR:
39805 case ISD::OR:
39806 return DAG.getNode(
39807 Src.getOpcode(), DL, SExtVT,
39808 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
39809 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
39810 }
39811 llvm_unreachable("Unexpected node type for vXi1 sign extension");
39812}
39813
39814// Try to match patterns such as
39815// (i16 bitcast (v16i1 x))
39816// ->
39817 // (i16 movmsk (v16i8 sext (v16i1 x)))
39818// before the illegal vector is scalarized on subtargets that don't have legal
39819// vxi1 types.
39820static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
39821 const SDLoc &DL,
39822 const X86Subtarget &Subtarget) {
39823 EVT SrcVT = Src.getValueType();
39824 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
39825 return SDValue();
39826
39827 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
39828 // legalization destroys the v4i32 type.
39829 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
39830 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
39831 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
39832 DAG.getBitcast(MVT::v4f32, V));
39833 return DAG.getZExtOrTrunc(V, DL, VT);
39834 }
39835 }
39836
39837 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
39838 // movmskb even with avx512. This will be better than truncating to vXi1 and
39839 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
39840 // vpcmpeqb/vpcmpgtb.
39841 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
39842 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
39843 Src.getOperand(0).getValueType() == MVT::v32i8 ||
39844 Src.getOperand(0).getValueType() == MVT::v64i8);
39845
39846 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
39847 // directly with vpmovmskb/vmovmskps/vmovmskpd.
39848 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
39849 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
39850 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
39851 EVT CmpVT = Src.getOperand(0).getValueType();
39852 EVT EltVT = CmpVT.getVectorElementType();
39853 if (CmpVT.getSizeInBits() <= 256 &&
39854 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
39855 PreferMovMsk = true;
39856 }
39857
39858 // With AVX512 vxi1 types are legal and we prefer using k-regs.
39859 // MOVMSK is supported in SSE2 or later.
39860 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
39861 return SDValue();
39862
39863 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
39864 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
39865 // v8i16 and v16i16.
39866 // For these two cases, we can shuffle the upper element bytes to a
39867 // consecutive sequence at the start of the vector and treat the results as
39868 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
39869 // for v16i16 this is not the case, because the shuffle is expensive, so we
39870 // avoid sign-extending to this type entirely.
39871 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
39872 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
39873 MVT SExtVT;
39874 bool PropagateSExt = false;
39875 switch (SrcVT.getSimpleVT().SimpleTy) {
39876 default:
39877 return SDValue();
39878 case MVT::v2i1:
39879 SExtVT = MVT::v2i64;
39880 break;
39881 case MVT::v4i1:
39882 SExtVT = MVT::v4i32;
39883 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
39884 // sign-extend to a 256-bit operation to avoid truncation.
39885 if (Subtarget.hasAVX() &&
39886 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
39887 SExtVT = MVT::v4i64;
39888 PropagateSExt = true;
39889 }
39890 break;
39891 case MVT::v8i1:
39892 SExtVT = MVT::v8i16;
39893 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
39894 // sign-extend to a 256-bit operation to match the compare.
39895 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
39896 // 256-bit because the shuffle is cheaper than sign extending the result of
39897 // the compare.
39898 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
39899 checkBitcastSrcVectorSize(Src, 512, true))) {
39900 SExtVT = MVT::v8i32;
39901 PropagateSExt = true;
39902 }
39903 break;
39904 case MVT::v16i1:
39905 SExtVT = MVT::v16i8;
39906 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
39907 // it is not profitable to sign-extend to 256-bit because this will
39908 // require an extra cross-lane shuffle which is more expensive than
39909 // truncating the result of the compare to 128-bits.
39910 break;
39911 case MVT::v32i1:
39912 SExtVT = MVT::v32i8;
39913 break;
39914 case MVT::v64i1:
39915 // If we have AVX512F but not AVX512BW, and the input is a truncate from
39916 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
39917 if (Subtarget.hasAVX512()) {
39918 if (Subtarget.hasBWI())
39919 return SDValue();
39920 SExtVT = MVT::v64i8;
39921 break;
39922 }
39923 // Split if this is a <64 x i8> comparison result.
39924 if (checkBitcastSrcVectorSize(Src, 512, false)) {
39925 SExtVT = MVT::v64i8;
39926 break;
39927 }
39928 return SDValue();
39929 };
39930
39931 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
39932 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
39933
39934 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
39935 V = getPMOVMSKB(DL, V, DAG, Subtarget);
39936 } else {
39937 if (SExtVT == MVT::v8i16)
39938 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
39939 DAG.getUNDEF(MVT::v8i16));
39940 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
39941 }
39942
39943 EVT IntVT =
39944 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
39945 V = DAG.getZExtOrTrunc(V, DL, IntVT);
39946 return DAG.getBitcast(VT, V);
39947}
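// Illustrative sketch of the v16i1 case handled above:
//   (i16 bitcast (v16i1 setcc (v16i8 a), (v16i8 b), seteq))
// becomes, roughly,
//   (i16 trunc (i32 movmsk (v16i8 sign_extend (v16i1 setcc a, b, seteq))))
// i.e. a single PCMPEQB + PMOVMSKB rather than scalarizing sixteen i1 values.
// In source form the same mask computation would look something like this
// (hypothetical) SSE2-intrinsic helper:
//   static inline unsigned short EqMask16(__m128i a, __m128i b) {
//     return (unsigned short)_mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
//   }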
39948
39949// Convert a vXi1 constant build vector to the same width scalar integer.
39950static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
39951 EVT SrcVT = Op.getValueType();
39952 assert(SrcVT.getVectorElementType() == MVT::i1 &&
39953 "Expected a vXi1 vector");
39954 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
39955 "Expected a constant build vector");
39956
39957 APInt Imm(SrcVT.getVectorNumElements(), 0);
39958 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
39959 SDValue In = Op.getOperand(Idx);
39960 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
39961 Imm.setBit(Idx);
39962 }
39963 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
39964 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
39965}
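// For illustration: a constant (v8i1 <1,0,1,1,0,0,0,1>) becomes the i8
// constant 0x8D here (bit Idx is set exactly when element Idx is 1; undef
// elements are treated as 0), which is the scalar value the bitcast combines
// below fold such constant vectors into.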
39966
39967static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
39968 TargetLowering::DAGCombinerInfo &DCI,
39969 const X86Subtarget &Subtarget) {
39970 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
39971
39972 if (!DCI.isBeforeLegalizeOps())
39973 return SDValue();
39974
39975 // Only do this if we have k-registers.
39976 if (!Subtarget.hasAVX512())
39977 return SDValue();
39978
39979 EVT DstVT = N->getValueType(0);
39980 SDValue Op = N->getOperand(0);
39981 EVT SrcVT = Op.getValueType();
39982
39983 if (!Op.hasOneUse())
39984 return SDValue();
39985
39986 // Look for logic ops.
39987 if (Op.getOpcode() != ISD::AND &&
39988 Op.getOpcode() != ISD::OR &&
39989 Op.getOpcode() != ISD::XOR)
39990 return SDValue();
39991
39992 // Make sure we have a bitcast between mask registers and a scalar type.
39993 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
39994 DstVT.isScalarInteger()) &&
39995 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
39996 SrcVT.isScalarInteger()))
39997 return SDValue();
39998
39999 SDValue LHS = Op.getOperand(0);
40000 SDValue RHS = Op.getOperand(1);
40001
40002 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
40003 LHS.getOperand(0).getValueType() == DstVT)
40004 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
40005 DAG.getBitcast(DstVT, RHS));
40006
40007 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
40008 RHS.getOperand(0).getValueType() == DstVT)
40009 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40010 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
40011
40012 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
40013 // Most of these have to move a constant from the scalar domain anyway.
40014 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
40015 RHS = combinevXi1ConstantToInteger(RHS, DAG);
40016 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40017 DAG.getBitcast(DstVT, LHS), RHS);
40018 }
40019
40020 return SDValue();
40021}
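// Rough example of the effect with AVX512 k-registers:
//   (v16i1 bitcast (i16 and (i16 bitcast (v16i1 X)), Y))
// is rewritten to
//   (v16i1 and X, (v16i1 bitcast Y))
// so the AND can stay in the mask domain (e.g. KANDW) instead of bouncing the
// mask through a GPR and back.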
40022
40023static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
40024 const X86Subtarget &Subtarget) {
40025 SDLoc DL(BV);
40026 unsigned NumElts = BV->getNumOperands();
40027 SDValue Splat = BV->getSplatValue();
40028
40029 // Build MMX element from integer GPR or SSE float values.
40030 auto CreateMMXElement = [&](SDValue V) {
40031 if (V.isUndef())
40032 return DAG.getUNDEF(MVT::x86mmx);
40033 if (V.getValueType().isFloatingPoint()) {
40034 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
40035 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
40036 V = DAG.getBitcast(MVT::v2i64, V);
40037 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
40038 }
40039 V = DAG.getBitcast(MVT::i32, V);
40040 } else {
40041 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
40042 }
40043 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
40044 };
40045
40046 // Convert build vector ops to MMX data in the bottom elements.
40047 SmallVector<SDValue, 8> Ops;
40048
40049 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40050
40051 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
40052 if (Splat) {
40053 if (Splat.isUndef())
40054 return DAG.getUNDEF(MVT::x86mmx);
40055
40056 Splat = CreateMMXElement(Splat);
40057
40058 if (Subtarget.hasSSE1()) {
40059 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
40060 if (NumElts == 8)
40061 Splat = DAG.getNode(
40062 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40063 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
40064 TLI.getPointerTy(DAG.getDataLayout())),
40065 Splat, Splat);
40066
40067 // Use PSHUFW to repeat 16-bit elements.
40068 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
40069 return DAG.getNode(
40070 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40071 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
40072 TLI.getPointerTy(DAG.getDataLayout())),
40073 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
40074 }
40075 Ops.append(NumElts, Splat);
40076 } else {
40077 for (unsigned i = 0; i != NumElts; ++i)
40078 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
40079 }
40080
40081 // Use tree of PUNPCKLs to build up general MMX vector.
40082 while (Ops.size() > 1) {
40083 unsigned NumOps = Ops.size();
40084 unsigned IntrinOp =
40085 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
40086 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
40087 : Intrinsic::x86_mmx_punpcklbw));
40088 SDValue Intrin = DAG.getTargetConstant(
40089 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
40090 for (unsigned i = 0; i != NumOps; i += 2)
40091 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
40092 Ops[i], Ops[i + 1]);
40093 Ops.resize(NumOps / 2);
40094 }
40095
40096 return Ops[0];
40097}
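// Sketch of the splat path above under SSE1: a splatted (v8i8 build_vector x)
// is lowered approximately as
//   t0 = MMX_MOVW2D x          ; scalar into the low bits of an MMX register
//   t1 = punpcklbw t0, t0      ; duplicate the byte into the low 16-bit word
//   t2 = pshufw t1, 0x00       ; broadcast that word to all four words
// while non-splat inputs fall through to the PUNPCKL tree at the end.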
40098
40099 // Recursive function that attempts to determine whether a bool vector node
40100 // was originally a vector/float/double that got truncated/extended/bitcast
40101 // to/from a scalar integer. If so, replace the scalar ops with bool vector
40102 // equivalents back down the chain.
40103static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
40104 SelectionDAG &DAG,
40105 const X86Subtarget &Subtarget) {
40106 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40107 unsigned Opc = V.getOpcode();
40108 switch (Opc) {
40109 case ISD::BITCAST: {
40110 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
40111 SDValue Src = V.getOperand(0);
40112 EVT SrcVT = Src.getValueType();
40113 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
40114 return DAG.getBitcast(VT, Src);
40115 break;
40116 }
40117 case ISD::TRUNCATE: {
40118 // If we find a suitable source, a truncated scalar becomes a subvector.
40119 SDValue Src = V.getOperand(0);
40120 EVT NewSrcVT =
40121 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
40122 if (TLI.isTypeLegal(NewSrcVT))
40123 if (SDValue N0 =
40124 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40125 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
40126 DAG.getIntPtrConstant(0, DL));
40127 break;
40128 }
40129 case ISD::ANY_EXTEND:
40130 case ISD::ZERO_EXTEND: {
40131 // If we find a suitable source, an extended scalar becomes a subvector.
40132 SDValue Src = V.getOperand(0);
40133 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
40134 Src.getScalarValueSizeInBits());
40135 if (TLI.isTypeLegal(NewSrcVT))
40136 if (SDValue N0 =
40137 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40138 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40139 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
40140 : DAG.getConstant(0, DL, VT),
40141 N0, DAG.getIntPtrConstant(0, DL));
40142 break;
40143 }
40144 case ISD::OR: {
40145 // If we find suitable sources, we can just move an OR to the vector domain.
40146 SDValue Src0 = V.getOperand(0);
40147 SDValue Src1 = V.getOperand(1);
40148 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40149 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
40150 return DAG.getNode(Opc, DL, VT, N0, N1);
40151 break;
40152 }
40153 case ISD::SHL: {
40154 // If we find a suitable source, a SHL becomes a KSHIFTL.
40155 SDValue Src0 = V.getOperand(0);
40156 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
40157 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
40158 break;
40159
40160 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
40161 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40162 return DAG.getNode(
40163 X86ISD::KSHIFTL, DL, VT, N0,
40164 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
40165 break;
40166 }
40167 }
40168 return SDValue();
40169}
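// Illustrative case the recursion above can handle:
//   (v16i1 bitcast (i16 or (i16 bitcast (v16i1 A)),
//                          (i16 shl (i16 bitcast (v16i1 B)), 4)))
// can be rebuilt entirely in the mask domain as
//   (v16i1 or A, (X86ISD::KSHIFTL B, 4))
// avoiding the k-register -> GPR -> k-register round trip.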
40170
40171static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
40172 TargetLowering::DAGCombinerInfo &DCI,
40173 const X86Subtarget &Subtarget) {
40174 SDValue N0 = N->getOperand(0);
40175 EVT VT = N->getValueType(0);
40176 EVT SrcVT = N0.getValueType();
40177 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40178
40179 // Try to match patterns such as
40180 // (i16 bitcast (v16i1 x))
40181 // ->
40182 // (i16 movmsk (v16i8 sext (v16i1 x)))
40183 // before the setcc result is scalarized on subtargets that don't have legal
40184 // vxi1 types.
40185 if (DCI.isBeforeLegalize()) {
40186 SDLoc dl(N);
40187 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
40188 return V;
40189
40190 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40191 // type, widen both sides to avoid a trip through memory.
40192 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
40193 Subtarget.hasAVX512()) {
40194 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
40195 N0 = DAG.getBitcast(MVT::v8i1, N0);
40196 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
40197 DAG.getIntPtrConstant(0, dl));
40198 }
40199
40200 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40201 // type, widen both sides to avoid a trip through memory.
40202 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
40203 Subtarget.hasAVX512()) {
40204 // Use zeros for the widening if we already have some zeroes. This can
40205 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
40206 // stream of this.
40207 // FIXME: It might make sense to detect a concat_vectors with a mix of
40208 // zeroes and undef and turn it into insert_subvector for i1 vectors as
40209 // a separate combine. What we can't do is canonicalize the operands of
40210 // such a concat or we'll get into a loop with SimplifyDemandedBits.
40211 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
40212 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
40213 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
40214 SrcVT = LastOp.getValueType();
40215 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40216 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
40217 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
40218 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40219 N0 = DAG.getBitcast(MVT::i8, N0);
40220 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40221 }
40222 }
40223
40224 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40225 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
40226 Ops[0] = N0;
40227 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40228 N0 = DAG.getBitcast(MVT::i8, N0);
40229 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40230 }
40231 } else {
40232 // If we're bitcasting from iX to vXi1, see if the integer originally
40233 // began as a vXi1 and whether we can remove the bitcast entirely.
40234 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
40235 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
40236 if (SDValue V =
40237 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
40238 return V;
40239 }
40240 }
40241
40242 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
40243 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
40244 // due to insert_subvector legalization on KNL. By promoting the copy to i16
40245 // we can help with known bits propagation from the vXi1 domain to the
40246 // scalar domain.
40247 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
40248 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40249 N0.getOperand(0).getValueType() == MVT::v16i1 &&
40250 isNullConstant(N0.getOperand(1)))
40251 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
40252 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
40253
40254 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
40255 // and the vbroadcast_load are both integer or both fp. In some cases this
40256 // will remove the bitcast entirely.
40257 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
40258 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
40259 auto *BCast = cast<MemIntrinsicSDNode>(N0);
40260 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
40261 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
40262 // Don't swap i8/i16 since we don't have fp types of that size.
40263 if (MemSize >= 32) {
40264 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
40265 : MVT::getIntegerVT(MemSize);
40266 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
40267 : MVT::getIntegerVT(SrcVTSize);
40268 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
40269
40270 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
40271 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
40272 SDValue ResNode =
40273 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
40274 MemVT, BCast->getMemOperand());
40275 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
40276 return DAG.getBitcast(VT, ResNode);
40277 }
40278 }
40279
40280 // Since MMX types are special and don't usually play with other vector types,
40281 // it's better to handle them early to be sure we emit efficient code by
40282 // avoiding store-load conversions.
40283 if (VT == MVT::x86mmx) {
40284 // Detect MMX constant vectors.
40285 APInt UndefElts;
40286 SmallVector<APInt, 1> EltBits;
40287 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
40288 SDLoc DL(N0);
40289 // Handle zero-extension of i32 with MOVD.
40290 if (EltBits[0].countLeadingZeros() >= 32)
40291 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
40292 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
40293 // Else, bitcast to a double.
40294 // TODO - investigate supporting sext 32-bit immediates on x86_64.
40295 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
40296 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
40297 }
40298
40299 // Detect bitcasts to x86mmx low word.
40300 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40301 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
40302 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
40303 bool LowUndef = true, AllUndefOrZero = true;
40304 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
40305 SDValue Op = N0.getOperand(i);
40306 LowUndef &= Op.isUndef() || (i >= e/2);
40307 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
40308 }
40309 if (AllUndefOrZero) {
40310 SDValue N00 = N0.getOperand(0);
40311 SDLoc dl(N00);
40312 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
40313 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
40314 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
40315 }
40316 }
40317
40318 // Detect bitcasts of 64-bit build vectors and convert to a
40319 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
40320 // lowest element.
40321 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40322 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
40323 SrcVT == MVT::v8i8))
40324 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
40325
40326 // Detect bitcasts between element or subvector extraction to x86mmx.
40327 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
40328 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
40329 isNullConstant(N0.getOperand(1))) {
40330 SDValue N00 = N0.getOperand(0);
40331 if (N00.getValueType().is128BitVector())
40332 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
40333 DAG.getBitcast(MVT::v2i64, N00));
40334 }
40335
40336 // Detect bitcasts from FP_TO_SINT to x86mmx.
40337 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
40338 SDLoc DL(N0);
40339 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
40340 DAG.getUNDEF(MVT::v2i32));
40341 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
40342 DAG.getBitcast(MVT::v2i64, Res));
40343 }
40344 }
40345
40346 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
40347 // most of these to scalar anyway.
40348 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
40349 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
40350 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
40351 return combinevXi1ConstantToInteger(N0, DAG);
40352 }
40353
40354 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40355 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40356 isa<ConstantSDNode>(N0)) {
40357 auto *C = cast<ConstantSDNode>(N0);
40358 if (C->isAllOnesValue())
40359 return DAG.getConstant(1, SDLoc(N0), VT);
40360 if (C->isNullValue())
40361 return DAG.getConstant(0, SDLoc(N0), VT);
40362 }
40363
40364 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
40365 // Turn it into a sign bit compare that produces a k-register. This avoids
40366 // a trip through a GPR.
40367 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
40368 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
40369 isPowerOf2_32(VT.getVectorNumElements())) {
40370 unsigned NumElts = VT.getVectorNumElements();
40371 SDValue Src = N0;
40372
40373 // Peek through truncate.
40374 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
40375 Src = N0.getOperand(0);
40376
40377 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
40378 SDValue MovmskIn = Src.getOperand(0);
40379 MVT MovmskVT = MovmskIn.getSimpleValueType();
40380 unsigned MovMskElts = MovmskVT.getVectorNumElements();
40381
40382 // We allow extra bits of the movmsk to be used since they are known zero.
40383 // We can't convert a VPMOVMSKB without avx512bw.
40384 if (MovMskElts <= NumElts &&
40385 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
40386 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
40387 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
40388 SDLoc dl(N);
40389 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
40390 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
40391 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
40392 if (EVT(CmpVT) == VT)
40393 return Cmp;
40394
40395 // Pad with zeroes up to original VT to replace the zeroes that were
40396 // being used from the MOVMSK.
40397 unsigned NumConcats = NumElts / MovMskElts;
40398 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
40399 Ops[0] = Cmp;
40400 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
40401 }
40402 }
40403 }
40404
40405 // Try to remove bitcasts from input and output of mask arithmetic to
40406 // remove GPR<->K-register crossings.
40407 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
40408 return V;
40409
40410 // Convert a bitcasted integer logic operation that has one bitcasted
40411 // floating-point operand into a floating-point logic operation. This may
40412 // create a load of a constant, but that is cheaper than materializing the
40413 // constant in an integer register and transferring it to an SSE register or
40414 // transferring the SSE operand to integer register and back.
40415 unsigned FPOpcode;
40416 switch (N0.getOpcode()) {
40417 case ISD::AND: FPOpcode = X86ISD::FAND; break;
40418 case ISD::OR: FPOpcode = X86ISD::FOR; break;
40419 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
40420 default: return SDValue();
40421 }
40422
40423 // Check if we have a bitcast from another integer type as well.
40424 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
40425 (Subtarget.hasSSE2() && VT == MVT::f64) ||
40426 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
40427 TLI.isTypeLegal(VT))))
40428 return SDValue();
40429
40430 SDValue LogicOp0 = N0.getOperand(0);
40431 SDValue LogicOp1 = N0.getOperand(1);
40432 SDLoc DL0(N0);
40433
40434 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
40435 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
40436 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
40437 LogicOp0.getOperand(0).getValueType() == VT &&
40438 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
40439 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
40440 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40441 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
40442 }
40443 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
40444 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
40445 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
40446 LogicOp1.getOperand(0).getValueType() == VT &&
40447 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
40448 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
40449 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
40450 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
40451 }
40452
40453 return SDValue();
40454}
40455
40456 // Given an ABS node, detect the following pattern:
40457// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
40458// This is useful as it is the input into a SAD pattern.
40459static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
40460 SDValue AbsOp1 = Abs->getOperand(0);
40461 if (AbsOp1.getOpcode() != ISD::SUB)
40462 return false;
40463
40464 Op0 = AbsOp1.getOperand(0);
40465 Op1 = AbsOp1.getOperand(1);
40466
40467 // Check if the operands of the sub are zero-extended from vectors of i8.
40468 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
40469 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
40470 Op1.getOpcode() != ISD::ZERO_EXTEND ||
40471 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
40472 return false;
40473
40474 return true;
40475}
40476
40477// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
40478// to these zexts.
40479static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
40480 const SDValue &Zext1, const SDLoc &DL,
40481 const X86Subtarget &Subtarget) {
40482 // Find the appropriate width for the PSADBW.
40483 EVT InVT = Zext0.getOperand(0).getValueType();
40484 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
40485
40486 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
40487 // fill in the missing vector elements with 0.
40488 unsigned NumConcat = RegSize / InVT.getSizeInBits();
40489 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
40490 Ops[0] = Zext0.getOperand(0);
40491 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
40492 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40493 Ops[0] = Zext1.getOperand(0);
40494 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
40495
40496 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
40497 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
40498 ArrayRef<SDValue> Ops) {
40499 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
40500 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
40501 };
40502 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
40503 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
40504 PSADBWBuilder);
40505}
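// For example, with Zext0/Zext1 taken from <4 x i8> inputs, each input is
// placed in the low 4 bytes of a zero-filled v16i8 and a single
//   (v2i64 X86ISD::PSADBW SadOp0, SadOp1)
// is emitted; each 64-bit lane holds the sum of absolute differences of its
// 8 byte pairs, so here only the low lane is non-zero.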
40506
40507 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
40508// PHMINPOSUW.
40509static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
40510 const X86Subtarget &Subtarget) {
40511 // Bail without SSE41.
40512 if (!Subtarget.hasSSE41())
40513 return SDValue();
40514
40515 EVT ExtractVT = Extract->getValueType(0);
40516 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
40517 return SDValue();
40518
40519 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
40520 ISD::NodeType BinOp;
40521 SDValue Src = DAG.matchBinOpReduction(
40522 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
40523 if (!Src)
40524 return SDValue();
40525
40526 EVT SrcVT = Src.getValueType();
40527 EVT SrcSVT = SrcVT.getScalarType();
40528 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
40529 return SDValue();
40530
40531 SDLoc DL(Extract);
40532 SDValue MinPos = Src;
40533
40534 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
40535 while (SrcVT.getSizeInBits() > 128) {
40536 SDValue Lo, Hi;
40537 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
40538 SrcVT = Lo.getValueType();
40539 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
40540 }
40541 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
40542 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
40543 "Unexpected value type");
40544
40545 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
40546 // to flip the value accordingly.
40547 SDValue Mask;
40548 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
40549 if (BinOp == ISD::SMAX)
40550 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
40551 else if (BinOp == ISD::SMIN)
40552 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
40553 else if (BinOp == ISD::UMAX)
40554 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
40555
40556 if (Mask)
40557 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40558
40559 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
40560 // shuffling each upper element down and insert zeros. This means that the
40561 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
40562 // ready for the PHMINPOS.
40563 if (ExtractVT == MVT::i8) {
40564 SDValue Upper = DAG.getVectorShuffle(
40565 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
40566 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
40567 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
40568 }
40569
40570 // Perform the PHMINPOS on a v8i16 vector.
40571 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
40572 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
40573 MinPos = DAG.getBitcast(SrcVT, MinPos);
40574
40575 if (Mask)
40576 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
40577
40578 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
40579 DAG.getIntPtrConstant(0, DL));
40580}
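// Sketch of the cases above: a UMIN reduction of v8i16 maps directly onto
//   (v8i16 X86ISD::PHMINPOS x)
// with the result in element 0. SMIN/SMAX/UMAX reuse the same instruction by
// XORing with a bias mask before and after (e.g. for SMAX the 0x7FFF mask
// makes the largest signed value the smallest unsigned one), and v16i8
// reductions first fold byte pairs with UMIN so each 16-bit lane holds a
// zero-extended byte result.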
40581
40582// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
40583static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
40584 const X86Subtarget &Subtarget) {
40585 // Bail without SSE2.
40586 if (!Subtarget.hasSSE2())
40587 return SDValue();
40588
40589 EVT ExtractVT = Extract->getValueType(0);
40590 unsigned BitWidth = ExtractVT.getSizeInBits();
40591 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
40592 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
40593 return SDValue();
40594
40595 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
40596 ISD::NodeType BinOp;
40597 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
40598 if (!Match && ExtractVT == MVT::i1)
40599 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
40600 if (!Match)
40601 return SDValue();
40602
40603 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
40604 // which we can't support here for now.
40605 if (Match.getScalarValueSizeInBits() != BitWidth)
40606 return SDValue();
40607
40608 SDValue Movmsk;
40609 SDLoc DL(Extract);
40610 EVT MatchVT = Match.getValueType();
40611 unsigned NumElts = MatchVT.getVectorNumElements();
40612 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
40613 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40614
40615 if (ExtractVT == MVT::i1) {
40616 // Special case for (pre-legalization) vXi1 reductions.
40617 if (NumElts > 64 || !isPowerOf2_32(NumElts))
40618 return SDValue();
40619 if (TLI.isTypeLegal(MatchVT)) {
40620 // If this is a legal AVX512 predicate type then we can just bitcast.
40621 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40622 Movmsk = DAG.getBitcast(MovmskVT, Match);
40623 } else {
40624 // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
40625 // PCMPEQQ (SSE41+), use PCMPEQD instead.
40626 if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
40627 Match.getOpcode() == ISD::SETCC &&
40628 ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
40629 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
40630 ISD::CondCode::SETEQ) {
40631 SDValue Vec = Match.getOperand(0);
40632 if (Vec.getValueType().getScalarType() == MVT::i64 &&
40633 (2 * NumElts) <= MaxElts) {
40634 NumElts *= 2;
40635 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
40636 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
40637 Match = DAG.getSetCC(
40638 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
40639 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
40640 }
40641 }
40642
40643 // Use combineBitcastvxi1 to create the MOVMSK.
40644 while (NumElts > MaxElts) {
40645 SDValue Lo, Hi;
40646 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40647 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40648 NumElts /= 2;
40649 }
40650 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
40651 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
40652 }
40653 if (!Movmsk)
40654 return SDValue();
40655 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
40656 } else {
40657 // FIXME: Better handling of k-registers or 512-bit vectors?
40658 unsigned MatchSizeInBits = Match.getValueSizeInBits();
40659 if (!(MatchSizeInBits == 128 ||
40660 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
40661 return SDValue();
40662
40663 // Make sure this isn't a vector of 1 element. The perf win from using
40664 // MOVMSK diminishes with fewer elements in the reduction, but it is
40665 // generally better to get the comparison over to the GPRs as soon as
40666 // possible to reduce the number of vector ops.
40667 if (Match.getValueType().getVectorNumElements() < 2)
40668 return SDValue();
40669
40670 // Check that we are extracting a reduction of all sign bits.
40671 if (DAG.ComputeNumSignBits(Match) != BitWidth)
40672 return SDValue();
40673
40674 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
40675 SDValue Lo, Hi;
40676 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
40677 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
40678 MatchSizeInBits = Match.getValueSizeInBits();
40679 }
40680
40681 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
40682 MVT MaskSrcVT;
40683 if (64 == BitWidth || 32 == BitWidth)
40684 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
40685 MatchSizeInBits / BitWidth);
40686 else
40687 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
40688
40689 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
40690 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
40691 NumElts = MaskSrcVT.getVectorNumElements();
40692 }
40693 assert((NumElts <= 32 || NumElts == 64) &&
40694 "Not expecting more than 64 elements");
40695
40696 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
40697 if (BinOp == ISD::XOR) {
40698 // parity -> (PARITY(MOVMSK X))
40699 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
40700 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
40701 }
40702
40703 SDValue CmpC;
40704 ISD::CondCode CondCode;
40705 if (BinOp == ISD::OR) {
40706 // any_of -> MOVMSK != 0
40707 CmpC = DAG.getConstant(0, DL, CmpVT);
40708 CondCode = ISD::CondCode::SETNE;
40709 } else {
40710 // all_of -> MOVMSK == ((1 << NumElts) - 1)
40711 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
40712 DL, CmpVT);
40713 CondCode = ISD::CondCode::SETEQ;
40714 }
40715
40716 // The setcc produces an i8 of 0/1, so extend that to the result width and
40717 // negate to get the final 0/-1 mask value.
40718 EVT SetccVT =
40719 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
40720 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
40721 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
40722 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
40723 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
40724}
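// Summarizing the lowering above: once the predicate vector is reduced to a
// single MOVMSK value M,
//   any_of -> setcc (M != 0)
//   all_of -> setcc (M == (1 << NumElts) - 1)
//   parity -> PARITY M
// with the any_of/all_of setcc result then extended and negated to give the
// final 0/-1 value, as noted in the comment above.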
40725
40726static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
40727 const X86Subtarget &Subtarget) {
40728 // PSADBW is only supported on SSE2 and up.
40729 if (!Subtarget.hasSSE2())
40730 return SDValue();
40731
40732 EVT ExtractVT = Extract->getValueType(0);
40733 // Verify the type we're extracting is either i32 or i64.
40734 // FIXME: Could support other types, but this is what we have coverage for.
40735 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
40736 return SDValue();
40737
40738 EVT VT = Extract->getOperand(0).getValueType();
40739 if (!isPowerOf2_32(VT.getVectorNumElements()))
40740 return SDValue();
40741
40742 // Match shuffle + add pyramid.
40743 ISD::NodeType BinOp;
40744 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
40745
40746 // The operand is expected to be zero extended from i8
40747 // (verified in detectZextAbsDiff).
40748 // In order to convert to i64 and above, an additional any/zero/sign
40749 // extend is expected.
40750 // The zero extend from 32 bits has no mathematical effect on the result,
40751 // and the sign extend is effectively a zero extend
40752 // (it only extends the sign bit, which is zero),
40753 // so it is correct to skip the sign/zero extend instruction.
40754 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
40755 Root.getOpcode() == ISD::ZERO_EXTEND ||
40756 Root.getOpcode() == ISD::ANY_EXTEND))
40757 Root = Root.getOperand(0);
40758
40759 // If there was a match, we want Root to be a select that is the root of an
40760 // abs-diff pattern.
40761 if (!Root || Root.getOpcode() != ISD::ABS)
40762 return SDValue();
40763
40764 // Check whether we have an abs-diff pattern feeding into the select.
40765 SDValue Zext0, Zext1;
40766 if (!detectZextAbsDiff(Root, Zext0, Zext1))
40767 return SDValue();
40768
40769 // Create the SAD instruction.
40770 SDLoc DL(Extract);
40771 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
40772
40773 // If the original vector was wider than 8 elements, sum over the results
40774 // in the SAD vector.
40775 unsigned Stages = Log2_32(VT.getVectorNumElements());
40776 EVT SadVT = SAD.getValueType();
40777 if (Stages > 3) {
40778 unsigned SadElems = SadVT.getVectorNumElements();
40779
40780 for(unsigned i = Stages - 3; i > 0; --i) {
40781 SmallVector<int, 16> Mask(SadElems, -1);
40782 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
40783 Mask[j] = MaskEnd + j;
40784
40785 SDValue Shuffle =
40786 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
40787 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
40788 }
40789 }
40790
40791 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
40792 // Return the lowest ExtractSizeInBits bits.
40793 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
40794 SadVT.getSizeInBits() / ExtractSizeInBits);
40795 SAD = DAG.getBitcast(ResVT, SAD);
40796 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
40797 Extract->getOperand(1));
40798}
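// Example of the full rewrite: an i32 add-reduction of
//   (abs (sub (zext <32 x i8> a to <32 x i32>), (zext <32 x i8> b)))
// becomes a PSADBW over the byte inputs (split into 128-bit halves on targets
// without wider vector support), the shuffle+ADD pyramid above to sum the
// per-lane partial results, and an extract of the low element in i32 form.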
40799
40800// Attempt to peek through a target shuffle and extract the scalar from the
40801// source.
40802static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
40803 TargetLowering::DAGCombinerInfo &DCI,
40804 const X86Subtarget &Subtarget) {
40805 if (DCI.isBeforeLegalizeOps())
40806 return SDValue();
40807
40808 SDLoc dl(N);
40809 SDValue Src = N->getOperand(0);
40810 SDValue Idx = N->getOperand(1);
40811
40812 EVT VT = N->getValueType(0);
40813 EVT SrcVT = Src.getValueType();
40814 EVT SrcSVT = SrcVT.getVectorElementType();
40815 unsigned SrcEltBits = SrcSVT.getSizeInBits();
40816 unsigned NumSrcElts = SrcVT.getVectorNumElements();
40817
40818 // Don't attempt this for boolean mask vectors or unknown extraction indices.
40819 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
40820 return SDValue();
40821
40822 const APInt &IdxC = N->getConstantOperandAPInt(1);
40823 if (IdxC.uge(NumSrcElts))
40824 return SDValue();
40825
40826 SDValue SrcBC = peekThroughBitcasts(Src);
40827
40828 // Handle extract(bitcast(broadcast(scalar_value))).
40829 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
40830 SDValue SrcOp = SrcBC.getOperand(0);
40831 EVT SrcOpVT = SrcOp.getValueType();
40832 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
40833 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
40834 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
40835 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
40836 // TODO support non-zero offsets.
40837 if (Offset == 0) {
40838 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
40839 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
40840 return SrcOp;
40841 }
40842 }
40843 }
40844
40845 // If we're extracting a single element from a broadcast load and there are
40846 // no other users, just create a single load.
40847 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
40848 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
40849 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
40850 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
40851 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
40852 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
40853 MemIntr->getBasePtr(),
40854 MemIntr->getPointerInfo(),
40855 MemIntr->getOriginalAlign(),
40856 MemIntr->getMemOperand()->getFlags());
40857 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40858 return Load;
40859 }
40860 }
40861
40862 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
40863 // TODO: Move to DAGCombine?
40864 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
40865 SrcBC.getValueType().isInteger() &&
40866 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
40867 SrcBC.getScalarValueSizeInBits() ==
40868 SrcBC.getOperand(0).getValueSizeInBits()) {
40869 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
40870 if (IdxC.ult(Scale)) {
40871 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
40872 SDValue Scl = SrcBC.getOperand(0);
40873 EVT SclVT = Scl.getValueType();
40874 if (Offset) {
40875 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
40876 DAG.getShiftAmountConstant(Offset, SclVT, dl));
40877 }
40878 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
40879 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
40880 return Scl;
40881 }
40882 }
40883
40884 // Handle extract(truncate(x)) for 0'th index.
40885 // TODO: Treat this as a faux shuffle?
40886 // TODO: When can we use this for general indices?
40887 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
40888 (SrcVT.getSizeInBits() % 128) == 0) {
40889 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
40890 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
40891 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
40892 Idx);
40893 }
40894
40895 // We can only legally extract other elements from 128-bit vectors and in
40896 // certain circumstances, depending on SSE-level.
40897 // TODO: Investigate float/double extraction if it will be just stored.
40898 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
40899 unsigned Idx) {
40900 EVT VecSVT = VecVT.getScalarType();
40901 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
40902 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
40903 VecSVT == MVT::i64)) {
40904 unsigned EltSizeInBits = VecSVT.getSizeInBits();
40905 unsigned NumEltsPerLane = 128 / EltSizeInBits;
40906 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
40907 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
40908 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
40909 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
40910 Idx &= (NumEltsPerLane - 1);
40911 }
40912 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
40913 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
40914 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
40915 DAG.getBitcast(VecVT, Vec),
40916 DAG.getIntPtrConstant(Idx, dl));
40917 }
40918 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
40919 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
40920 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
40921 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
40922 DAG.getTargetConstant(Idx, dl, MVT::i8));
40923 }
40924 return SDValue();
40925 };
40926
40927 // Resolve the target shuffle inputs and mask.
40928 SmallVector<int, 16> Mask;
40929 SmallVector<SDValue, 2> Ops;
40930 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
40931 return SDValue();
40932
40933 // Shuffle inputs must be the same size as the result.
40934 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
40935 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
40936 }))
40937 return SDValue();
40938
40939 // Attempt to narrow/widen the shuffle mask to the correct size.
40940 if (Mask.size() != NumSrcElts) {
40941 if ((NumSrcElts % Mask.size()) == 0) {
40942 SmallVector<int, 16> ScaledMask;
40943 int Scale = NumSrcElts / Mask.size();
40944 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
40945 Mask = std::move(ScaledMask);
40946 } else if ((Mask.size() % NumSrcElts) == 0) {
40947 // Simplify Mask based on demanded element.
40948 int ExtractIdx = (int)IdxC.getZExtValue();
40949 int Scale = Mask.size() / NumSrcElts;
40950 int Lo = Scale * ExtractIdx;
40951 int Hi = Scale * (ExtractIdx + 1);
40952 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
40953 if (i < Lo || Hi <= i)
40954 Mask[i] = SM_SentinelUndef;
40955
40956 SmallVector<int, 16> WidenedMask;
40957 while (Mask.size() > NumSrcElts &&
40958 canWidenShuffleElements(Mask, WidenedMask))
40959 Mask = std::move(WidenedMask);
40960 }
40961 }
40962
40963 // If narrowing/widening failed, see if we can extract+zero-extend.
40964 int ExtractIdx;
40965 EVT ExtractVT;
40966 if (Mask.size() == NumSrcElts) {
40967 ExtractIdx = Mask[IdxC.getZExtValue()];
40968 ExtractVT = SrcVT;
40969 } else {
40970 unsigned Scale = Mask.size() / NumSrcElts;
40971 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
40972 return SDValue();
40973 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
40974 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
40975 return SDValue();
40976 ExtractIdx = Mask[ScaledIdx];
40977 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
40978 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
40979 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
40980 "Failed to widen vector type");
40981 }
40982
40983 // If the shuffle source element is undef/zero then we can just accept it.
40984 if (ExtractIdx == SM_SentinelUndef)
40985 return DAG.getUNDEF(VT);
40986
40987 if (ExtractIdx == SM_SentinelZero)
40988 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
40989 : DAG.getConstant(0, dl, VT);
40990
40991 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
40992 ExtractIdx = ExtractIdx % Mask.size();
40993 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
40994 return DAG.getZExtOrTrunc(V, dl, VT);
40995
40996 return SDValue();
40997}
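// Rough example: extracting lane 5 of (v8i16 shuffle<8,9,10,11,4,5,6,7> A, B)
// resolves through the shuffle mask to lane 5 of A, so with SSE2 the combine
// emits a single PEXTRW from A instead of materializing the shuffle first.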
40998
40999/// Extracting a scalar FP value from vector element 0 is free, so extract each
41000/// operand first, then perform the math as a scalar op.
41001static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
41002 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
41003 SDValue Vec = ExtElt->getOperand(0);
41004 SDValue Index = ExtElt->getOperand(1);
41005 EVT VT = ExtElt->getValueType(0);
41006 EVT VecVT = Vec.getValueType();
41007
41008 // TODO: If this is a unary/expensive/expand op, allow extraction from a
41009 // non-zero element because the shuffle+scalar op will be cheaper?
41010 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
41011 return SDValue();
41012
41013 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
41014 // extract, the condition code), so deal with those as a special-case.
41015 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
41016 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
41017 if (OpVT != MVT::f32 && OpVT != MVT::f64)
41018 return SDValue();
41019
41020 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
41021 SDLoc DL(ExtElt);
41022 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41023 Vec.getOperand(0), Index);
41024 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41025 Vec.getOperand(1), Index);
41026 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
41027 }
41028
41029 if (VT != MVT::f32 && VT != MVT::f64)
41030 return SDValue();
41031
41032 // Vector FP selects don't fit the pattern of FP math ops (because the
41033 // condition has a different type and we have to change the opcode), so deal
41034 // with those here.
41035 // FIXME: This is restricted to pre type legalization by ensuring the setcc
41036 // has i1 elements. If we loosen this we need to convert vector bool to a
41037 // scalar bool.
41038 if (Vec.getOpcode() == ISD::VSELECT &&
41039 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
41040 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
41041 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
41042 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
41043 SDLoc DL(ExtElt);
41044 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
41045 Vec.getOperand(0).getValueType().getScalarType(),
41046 Vec.getOperand(0), Index);
41047 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41048 Vec.getOperand(1), Index);
41049 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41050 Vec.getOperand(2), Index);
41051 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
41052 }
41053
41054 // TODO: This switch could include FNEG and the x86-specific FP logic ops
41055 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
41056 // missed load folding and fma+fneg combining.
41057 switch (Vec.getOpcode()) {
41058 case ISD::FMA: // Begin 3 operands
41059 case ISD::FMAD:
41060 case ISD::FADD: // Begin 2 operands
41061 case ISD::FSUB:
41062 case ISD::FMUL:
41063 case ISD::FDIV:
41064 case ISD::FREM:
41065 case ISD::FCOPYSIGN:
41066 case ISD::FMINNUM:
41067 case ISD::FMAXNUM:
41068 case ISD::FMINNUM_IEEE:
41069 case ISD::FMAXNUM_IEEE:
41070 case ISD::FMAXIMUM:
41071 case ISD::FMINIMUM:
41072 case X86ISD::FMAX:
41073 case X86ISD::FMIN:
41074 case ISD::FABS: // Begin 1 operand
41075 case ISD::FSQRT:
41076 case ISD::FRINT:
41077 case ISD::FCEIL:
41078 case ISD::FTRUNC:
41079 case ISD::FNEARBYINT:
41080 case ISD::FROUND:
41081 case ISD::FFLOOR:
41082 case X86ISD::FRCP:
41083 case X86ISD::FRSQRT: {
41084 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
41085 SDLoc DL(ExtElt);
41086 SmallVector<SDValue, 4> ExtOps;
41087 for (SDValue Op : Vec->ops())
41088 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
41089 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
41090 }
41091 default:
41092 return SDValue();
41093 }
41094 llvm_unreachable("All opcodes should return within switch");
41095}
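// The simplest instance of the scalarization above:
//   (f32 extract_vector_elt (v4f32 fadd X, Y), 0)
//   --> (f32 fadd (extract_vector_elt X, 0), (extract_vector_elt Y, 0))
// which is profitable because extracting element 0 of an XMM register is
// free, as noted in the function comment.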
41096
41097/// Try to convert a vector reduction sequence composed of binops and shuffles
41098/// into horizontal ops.
41099static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
41100 const X86Subtarget &Subtarget) {
41101 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
41102
41103 // We need at least SSE2 to do anything here.
41104 if (!Subtarget.hasSSE2())
41105 return SDValue();
41106
41107 ISD::NodeType Opc;
41108 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
41109 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
41110 if (!Rdx)
41111 return SDValue();
41112
41113 SDValue Index = ExtElt->getOperand(1);
41114 assert(isNullConstant(Index) &&
41115 "Reduction doesn't end in an extract from index 0");
41116
41117 EVT VT = ExtElt->getValueType(0);
41118 EVT VecVT = Rdx.getValueType();
41119 if (VecVT.getScalarType() != VT)
41120 return SDValue();
41121
41122 SDLoc DL(ExtElt);
41123
41124 // vXi8 mul reduction - promote to vXi16 mul reduction.
41125 if (Opc == ISD::MUL) {
41126 unsigned NumElts = VecVT.getVectorNumElements();
41127 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
41128 return SDValue();
41129 if (VecVT.getSizeInBits() >= 128) {
41130 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
41131 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41132 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41133 Lo = DAG.getBitcast(WideVT, Lo);
41134 Hi = DAG.getBitcast(WideVT, Hi);
41135 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
41136 while (Rdx.getValueSizeInBits() > 128) {
41137 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41138 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
41139 }
41140 } else {
41141 if (VecVT == MVT::v4i8)
41142 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41143 DAG.getUNDEF(MVT::v4i8));
41144 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41145 DAG.getUNDEF(MVT::v8i8));
41146 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
41147 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
41148 }
41149 if (NumElts >= 8)
41150 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41151 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41152 {4, 5, 6, 7, -1, -1, -1, -1}));
41153 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41154 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41155 {2, 3, -1, -1, -1, -1, -1, -1}));
41156 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41157 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41158 {1, -1, -1, -1, -1, -1, -1, -1}));
41159 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41160 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41161 }
41162
41163 // vXi8 add reduction - sub 128-bit vector.
41164 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
41165 if (VecVT == MVT::v4i8) {
41166 // Pad with zero.
41167 if (Subtarget.hasSSE41()) {
41168 Rdx = DAG.getBitcast(MVT::i32, Rdx);
41169 Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
41170 DAG.getConstant(0, DL, MVT::v4i32), Rdx,
41171 DAG.getIntPtrConstant(0, DL));
41172 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41173 } else {
41174 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41175 DAG.getConstant(0, DL, VecVT));
41176 }
41177 }
41178 if (Rdx.getValueType() == MVT::v8i8) {
41179 // Pad with undef.
41180 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41181 DAG.getUNDEF(MVT::v8i8));
41182 }
41183 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41184 DAG.getConstant(0, DL, MVT::v16i8));
41185 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41186 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41187 }
41188
41189 // Must be a >=128-bit vector with pow2 elements.
41190 if ((VecVT.getSizeInBits() % 128) != 0 ||
41191 !isPowerOf2_32(VecVT.getVectorNumElements()))
41192 return SDValue();
41193
41194 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
41195 if (VT == MVT::i8) {
41196 while (Rdx.getValueSizeInBits() > 128) {
41197 SDValue Lo, Hi;
41198 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41199 VecVT = Lo.getValueType();
41200 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
41201 }
41202 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
41203
41204 SDValue Hi = DAG.getVectorShuffle(
41205 MVT::v16i8, DL, Rdx, Rdx,
41206 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
41207 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
41208 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41209 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
41210 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41211 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41212 }
41213
41214 // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
41215 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
41216 return SDValue();
41217
41218 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
41219
41220 // 256-bit horizontal instructions operate on 128-bit chunks rather than
41221 // across the whole vector, so we need an extract + hop preliminary stage.
41222 // This is the only step where the operands of the hop are not the same value.
41223 // TODO: We could extend this to handle 512-bit or even longer vectors.
41224 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
41225 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
41226 unsigned NumElts = VecVT.getVectorNumElements();
41227 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
41228 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
41229 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
41230 VecVT = Rdx.getValueType();
41231 }
41232 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
41233 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
41234 return SDValue();
41235
41236 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
41237 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
41238 for (unsigned i = 0; i != ReductionSteps; ++i)
41239 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
41240
41241 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41242}
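The vXi8 paths above lean on PSADBW against a zero vector: a sum of absolute differences with zero is simply a horizontal byte sum. A minimal standalone sketch of that trick, assuming SSE2 intrinsics from <immintrin.h> and an x86-64 host (illustrative only, not part of X86ISelLowering.cpp):

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static uint8_t sumBytesViaPSADBW(__m128i V) {
  __m128i Zero = _mm_setzero_si128();
  __m128i Sad = _mm_sad_epu8(V, Zero);     // two 64-bit partial byte sums
  uint64_t Lo = (uint64_t)_mm_cvtsi128_si64(Sad);
  uint64_t Hi = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(Sad, 8));
  return (uint8_t)(Lo + Hi);               // i8 reduction result (mod 256)
}

int main() {
  uint8_t Bytes[16];
  for (int i = 0; i != 16; ++i)
    Bytes[i] = (uint8_t)(i + 1);
  __m128i V = _mm_loadu_si128((const __m128i *)Bytes);
  printf("%u\n", sumBytesViaPSADBW(V));    // prints 136 = 1 + 2 + ... + 16
  return 0;
}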
41243
41244/// Detect vector gather/scatter index generation and convert it from being a
41245/// bunch of shuffles and extracts into a somewhat faster sequence.
41246/// For i686, the best sequence is apparently storing the value and loading
41247/// scalars back, while for x64 we should use 64-bit extracts and shifts.
41248static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
41249 TargetLowering::DAGCombinerInfo &DCI,
41250 const X86Subtarget &Subtarget) {
41251 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
41252 return NewOp;
41253
41254 SDValue InputVector = N->getOperand(0);
41255 SDValue EltIdx = N->getOperand(1);
41256 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
41257
41258 EVT SrcVT = InputVector.getValueType();
41259 EVT VT = N->getValueType(0);
41260 SDLoc dl(InputVector);
41261 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
41262 unsigned NumSrcElts = SrcVT.getVectorNumElements();
41263
41264 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
41265 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41266
41267 // Integer Constant Folding.
41268 if (CIdx && VT.isInteger()) {
41269 APInt UndefVecElts;
41270 SmallVector<APInt, 16> EltBits;
41271 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
41272 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
41273 EltBits, true, false)) {
41274 uint64_t Idx = CIdx->getZExtValue();
41275 if (UndefVecElts[Idx])
41276 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41277 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
41278 dl, VT);
41279 }
41280 }
41281
41282 if (IsPextr) {
41283 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41284 if (TLI.SimplifyDemandedBits(
41285 SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
41286 return SDValue(N, 0);
41287
41288 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
41289 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
41290 InputVector.getOpcode() == X86ISD::PINSRW) &&
41291 InputVector.getOperand(2) == EltIdx) {
41292 assert(SrcVT == InputVector.getOperand(0).getValueType() &&((void)0)
41293 "Vector type mismatch")((void)0);
41294 SDValue Scl = InputVector.getOperand(1);
41295 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
41296 return DAG.getZExtOrTrunc(Scl, dl, VT);
41297 }
41298
41299 // TODO - Remove this once we can handle the implicit zero-extension of
41300 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
41301 // combineBasicSADPattern.
41302 return SDValue();
41303 }
41304
41305 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
41306 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41307 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
41308 SDValue MMXSrc = InputVector.getOperand(0);
41309
41310 // The bitcast source is a direct mmx result.
41311 if (MMXSrc.getValueType() == MVT::x86mmx)
41312 return DAG.getBitcast(VT, InputVector);
41313 }
41314
41315 // Detect mmx to i32 conversion through a v2i32 elt extract.
41316 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41317 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
41318 SDValue MMXSrc = InputVector.getOperand(0);
41319
41320 // The bitcast source is a direct mmx result.
41321 if (MMXSrc.getValueType() == MVT::x86mmx)
41322 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
41323 }
41324
41325 // Check whether this extract is the root of a sum of absolute differences
41326 // pattern. This has to be done here because we really want it to happen
41327 // pre-legalization.
41328 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
41329 return SAD;
41330
41331 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
41332 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
41333 return Cmp;
41334
41335 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
41336 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
41337 return MinMax;
41338
41339 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
41340 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
41341 return V;
41342
41343 if (SDValue V = scalarizeExtEltFP(N, DAG))
41344 return V;
41345
41346 // Attempt to extract an i1 element by using MOVMSK to extract the signbits
41347 // and then testing the relevant element.
41348 //
41349 // Note that we only combine extracts on the *same* result number, i.e.
41350 // t0 = merge_values a0, a1, a2, a3
41351 // i1 = extract_vector_elt t0, Constant:i64<2>
41352 // i1 = extract_vector_elt t0, Constant:i64<3>
41353 // but not
41354 // i1 = extract_vector_elt t0:1, Constant:i64<2>
41355 // since the latter would need its own MOVMSK.
41356 if (CIdx && SrcVT.getScalarType() == MVT::i1) {
41357 SmallVector<SDNode *, 16> BoolExtracts;
41358 unsigned ResNo = InputVector.getResNo();
41359 auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
41360 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41361 isa<ConstantSDNode>(Use->getOperand(1)) &&
41362 Use->getOperand(0).getResNo() == ResNo &&
41363 Use->getValueType(0) == MVT::i1) {
41364 BoolExtracts.push_back(Use);
41365 return true;
41366 }
41367 return false;
41368 };
41369 if (all_of(InputVector->uses(), IsBoolExtract) &&
41370 BoolExtracts.size() > 1) {
41371 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
41372 if (SDValue BC =
41373 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
41374 for (SDNode *Use : BoolExtracts) {
41375 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
41376 unsigned MaskIdx = Use->getConstantOperandVal(1);
41377 APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
41378 SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
41379 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
41380 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
41381 DCI.CombineTo(Use, Res);
41382 }
41383 return SDValue(N, 0);
41384 }
41385 }
41386 }
41387
41388 return SDValue();
41389}
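The vXi1 clause near the end of combineExtractVectorElt rewrites several boolean extracts as one shared MOVMSK plus a per-lane bit test. A rough scalar illustration of the "((movmsk X) & Mask) == Mask" shape, assuming SSE2 intrinsics; the helper name below is invented for the example:

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <immintrin.h>
#include <cstdio>

// Hypothetical helper: test one "boolean" byte lane through a shared MOVMSK.
static bool extractBoolLane(__m128i BoolVec, unsigned Lane) {
  int Bits = _mm_movemask_epi8(BoolVec);   // one sign bit per byte lane
  int Mask = 1 << Lane;
  return (Bits & Mask) == Mask;            // ((movmsk X) & Mask) == Mask
}

int main() {
  // Lane 5 holds an all-ones byte (a true i1); every other lane is zero.
  __m128i V = _mm_insert_epi16(_mm_setzero_si128(), 0xFF00, 2);
  printf("%d %d\n", extractBoolLane(V, 5), extractBoolLane(V, 4)); // 1 0
  return 0;
}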
41390
41391/// If a vector select has an operand that is -1 or 0, try to simplify the
41392/// select to a bitwise logic operation.
41393/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
41394static SDValue
41395combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
41396 TargetLowering::DAGCombinerInfo &DCI,
41397 const X86Subtarget &Subtarget) {
41398 SDValue Cond = N->getOperand(0);
41399 SDValue LHS = N->getOperand(1);
41400 SDValue RHS = N->getOperand(2);
41401 EVT VT = LHS.getValueType();
41402 EVT CondVT = Cond.getValueType();
41403 SDLoc DL(N);
41404 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41405
41406 if (N->getOpcode() != ISD::VSELECT)
41407 return SDValue();
41408
41409 assert(CondVT.isVector() && "Vector select expects a vector selector!")((void)0);
41410
41411 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
41412 // TODO: Can we assert that both operands are not zeros (because that should
41413 // get simplified at node creation time)?
41414 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
41415 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
41416
41417 // If both inputs are 0/undef, create a complete zero vector.
41418 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
41419 if (TValIsAllZeros && FValIsAllZeros) {
41420 if (VT.isFloatingPoint())
41421 return DAG.getConstantFP(0.0, DL, VT);
41422 return DAG.getConstant(0, DL, VT);
41423 }
41424
41425 // To use the condition operand as a bitwise mask, it must have elements that
41426 // are the same size as the select elements. I.e., the condition operand must
41427 // have already been promoted from the IR select condition type <N x i1>.
41428 // Don't check if the types themselves are equal because that excludes
41429 // vector floating-point selects.
41430 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
41431 return SDValue();
41432
41433 // Try to invert the condition if true value is not all 1s and false value is
41434 // not all 0s. Only do this if the condition has one use.
41435 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
41436 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
41437 // Check if the selector will be produced by CMPP*/PCMP*.
41438 Cond.getOpcode() == ISD::SETCC &&
41439 // Check if SETCC has already been promoted.
41440 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
41441 CondVT) {
41442 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
41443
41444 if (TValIsAllZeros || FValIsAllOnes) {
41445 SDValue CC = Cond.getOperand(2);
41446 ISD::CondCode NewCC = ISD::getSetCCInverse(
41447 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
41448 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
41449 NewCC);
41450 std::swap(LHS, RHS);
41451 TValIsAllOnes = FValIsAllOnes;
41452 FValIsAllZeros = TValIsAllZeros;
41453 }
41454 }
41455
41456 // Cond value must be 'sign splat' to be converted to a logical op.
41457 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
41458 return SDValue();
41459
41460 // vselect Cond, 111..., 000... -> Cond
41461 if (TValIsAllOnes && FValIsAllZeros)
41462 return DAG.getBitcast(VT, Cond);
41463
41464 if (!TLI.isTypeLegal(CondVT))
41465 return SDValue();
41466
41467 // vselect Cond, 111..., X -> or Cond, X
41468 if (TValIsAllOnes) {
41469 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41470 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
41471 return DAG.getBitcast(VT, Or);
41472 }
41473
41474 // vselect Cond, X, 000... -> and Cond, X
41475 if (FValIsAllZeros) {
41476 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
41477 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
41478 return DAG.getBitcast(VT, And);
41479 }
41480
41481 // vselect Cond, 000..., X -> andn Cond, X
41482 if (TValIsAllZeros) {
41483 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
41484 SDValue AndN;
41485 // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used.
41486 if (CondVT.getScalarType() == MVT::i1)
41487 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
41488 CastRHS);
41489 else
41490 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
41491 return DAG.getBitcast(VT, AndN);
41492 }
41493
41494 return SDValue();
41495}
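With a sign-splat condition, a vector select is plain bitwise blending, so an all-ones or all-zeros arm degenerates to OR, AND, or ANDN exactly as the combine above emits. A per-element scalar check of those identities (a sketch, not taken from the source):

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <cstdint>
#include <cassert>

// Per-element view of vselect with a zero/all-bits condition element M.
static uint32_t blend(uint32_t M, uint32_t T, uint32_t F) {
  return (M & T) | (~M & F);
}

int main() {
  for (uint32_t M : {0u, ~0u}) {
    uint32_t X = 0x12345678u;
    assert(blend(M, ~0u, X) == (M | X));   // vselect Cond, 111..., X -> or
    assert(blend(M, X, 0u) == (M & X));    // vselect Cond, X, 000... -> and
    assert(blend(M, 0u, X) == (~M & X));   // vselect Cond, 000..., X -> andn
  }
  return 0;
}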
41496
41497/// If both arms of a vector select are concatenated vectors, split the select,
41498/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
41499/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
41500/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
41501static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
41502 const X86Subtarget &Subtarget) {
41503 unsigned Opcode = N->getOpcode();
41504 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
41505 return SDValue();
41506
41507 // TODO: Split 512-bit vectors too?
41508 EVT VT = N->getValueType(0);
41509 if (!VT.is256BitVector())
41510 return SDValue();
41511
41512 // TODO: Split as long as any 2 of the 3 operands are concatenated?
41513 SDValue Cond = N->getOperand(0);
41514 SDValue TVal = N->getOperand(1);
41515 SDValue FVal = N->getOperand(2);
41516 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
41517 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
41518 !collectConcatOps(TVal.getNode(), CatOpsT) ||
41519 !collectConcatOps(FVal.getNode(), CatOpsF))
41520 return SDValue();
41521
41522 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
41523 ArrayRef<SDValue> Ops) {
41524 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
41525 };
41526 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
41527 makeBlend, /*CheckBWI*/ false);
41528}
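narrowVectorSelect relies on the fact that a 256-bit blend of concatenated halves equals the concatenation of two 128-bit blends. A small intrinsic sketch of that equivalence, assuming AVX2 and <immintrin.h> (illustrative only):

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <immintrin.h>
#include <cassert>

int main() {
  __m256i T = _mm256_set1_epi8(1), F = _mm256_set1_epi8(2);
  __m256i C = _mm256_set1_epi8((char)0x80);          // sign bit set: pick T
  __m256i Wide = _mm256_blendv_epi8(F, T, C);

  __m128i Lo = _mm_blendv_epi8(_mm256_castsi256_si128(F),
                               _mm256_castsi256_si128(T),
                               _mm256_castsi256_si128(C));
  __m128i Hi = _mm_blendv_epi8(_mm256_extracti128_si256(F, 1),
                               _mm256_extracti128_si256(T, 1),
                               _mm256_extracti128_si256(C, 1));
  __m256i Narrow = _mm256_set_m128i(Hi, Lo);

  // The wide blend and the concatenation of the two narrow blends agree.
  __m256i Eq = _mm256_cmpeq_epi8(Wide, Narrow);
  assert(_mm256_movemask_epi8(Eq) == -1);
  return 0;
}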
41529
41530static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
41531 SDValue Cond = N->getOperand(0);
41532 SDValue LHS = N->getOperand(1);
41533 SDValue RHS = N->getOperand(2);
41534 SDLoc DL(N);
41535
41536 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
41537 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
41538 if (!TrueC || !FalseC)
41539 return SDValue();
41540
41541 // Don't do this for crazy integer types.
41542 EVT VT = N->getValueType(0);
41543 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
41544 return SDValue();
41545
41546 // We're going to use the condition bit in math or logic ops. We could allow
41547 // this with a wider condition value (post-legalization it becomes an i8),
41548 // but if nothing is creating selects that late, it doesn't matter.
41549 if (Cond.getValueType() != MVT::i1)
41550 return SDValue();
41551
41552 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
41553 // 3, 5, or 9 with i32/i64, so those get transformed too.
41554 // TODO: For constants that overflow or do not differ by power-of-2 or small
41555 // multiplier, convert to 'and' + 'add'.
41556 const APInt &TrueVal = TrueC->getAPIntValue();
41557 const APInt &FalseVal = FalseC->getAPIntValue();
41558 bool OV;
41559 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
41560 if (OV)
41561 return SDValue();
41562
41563 APInt AbsDiff = Diff.abs();
41564 if (AbsDiff.isPowerOf2() ||
41565 ((VT == MVT::i32 || VT == MVT::i64) &&
41566 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
41567
41568 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
41569 // of the condition can usually be folded into a compare predicate, but even
41570 // without that, the sequence should be cheaper than a CMOV alternative.
41571 if (TrueVal.slt(FalseVal)) {
41572 Cond = DAG.getNOT(DL, Cond, MVT::i1);
41573 std::swap(TrueC, FalseC);
41574 }
41575
41576 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
41577 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
41578
41579 // Multiply condition by the difference if non-one.
41580 if (!AbsDiff.isOneValue())
41581 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
41582
41583 // Add the base if non-zero.
41584 if (!FalseC->isNullValue())
41585 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
41586
41587 return R;
41588 }
41589
41590 return SDValue();
41591}
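combineSelectOfTwoConstants turns a two-constant select into "zext(Cond) * (TC - FC) + FC", which the backend can then lower as a shift or LEA rather than a CMOV. A scalar sanity check of the formula; the helper name is invented for the example:

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <cstdint>
#include <cassert>

// Hypothetical helper: select Cond, TC, FC as zext(Cond) * (TC - FC) + FC.
static uint32_t selectTwoConstants(bool Cond, uint32_t TC, uint32_t FC) {
  return (uint32_t)Cond * (TC - FC) + FC; // TC - FC == 8: a shift plus an add
}

int main() {
  assert(selectTwoConstants(true, 11, 3) == 11);  // 1 * 8 + 3
  assert(selectTwoConstants(false, 11, 3) == 3);  // 0 * 8 + 3
  return 0;
}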
41592
41593/// If this is a *dynamic* select (non-constant condition) and we can match
41594/// this node with one of the variable blend instructions, restructure the
41595/// condition so that blends can use the high (sign) bit of each element.
41596/// This function will also call SimplifyDemandedBits on already created
41597/// BLENDV to perform additional simplifications.
41598static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
41599 TargetLowering::DAGCombinerInfo &DCI,
41600 const X86Subtarget &Subtarget) {
41601 SDValue Cond = N->getOperand(0);
41602 if ((N->getOpcode() != ISD::VSELECT &&
41603 N->getOpcode() != X86ISD::BLENDV) ||
41604 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
41605 return SDValue();
41606
41607 // Don't optimize before the condition has been transformed to a legal type
41608 // and don't ever optimize vector selects that map to AVX512 mask-registers.
41609 unsigned BitWidth = Cond.getScalarValueSizeInBits();
41610 if (BitWidth < 8 || BitWidth > 64)
41611 return SDValue();
41612
41613 // We can only handle the cases where VSELECT is directly legal on the
41614 // subtarget. We custom lower VSELECT nodes with constant conditions and
41615 // this makes it hard to see whether a dynamic VSELECT will correctly
41616 // lower, so we both check the operation's status and explicitly handle the
41617 // cases where a *dynamic* blend will fail even though a constant-condition
41618 // blend could be custom lowered.
41619 // FIXME: We should find a better way to handle this class of problems.
41620 // Potentially, we should combine constant-condition vselect nodes
41621 // pre-legalization into shuffles and not mark as many types as custom
41622 // lowered.
41623 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41624 EVT VT = N->getValueType(0);
41625 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
41626 return SDValue();
41627 // FIXME: We don't support i16-element blends currently. We could and
41628 // should support them by making *all* the bits in the condition be set
41629 // rather than just the high bit and using an i8-element blend.
41630 if (VT.getVectorElementType() == MVT::i16)
41631 return SDValue();
41632 // Dynamic blending was only available from SSE4.1 onward.
41633 if (VT.is128BitVector() && !Subtarget.hasSSE41())
41634 return SDValue();
41635 // Byte blends are only available with AVX2.
41636 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
41637 return SDValue();
41638 // There are no 512-bit blend instructions that use sign bits.
41639 if (VT.is512BitVector())
41640 return SDValue();
41641
41642 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
41643 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
41644 UI != UE; ++UI)
41645 if ((UI->getOpcode() != ISD::VSELECT &&
41646 UI->getOpcode() != X86ISD::BLENDV) ||
41647 UI.getOperandNo() != 0)
41648 return false;
41649
41650 return true;
41651 };
41652
41653 APInt DemandedBits(APInt::getSignMask(BitWidth));
41654
41655 if (OnlyUsedAsSelectCond(Cond)) {
41656 KnownBits Known;
41657 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
41658 !DCI.isBeforeLegalizeOps());
41659 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
41660 return SDValue();
41661
41662 // If we changed the computation somewhere in the DAG, this change will
41663 // affect all users of Cond. Update all the nodes so that we do not use
41664 // the generic VSELECT anymore. Otherwise, we may perform wrong
41665 // optimizations as we messed with the actual expectation for the vector
41666 // boolean values.
41667 for (SDNode *U : Cond->uses()) {
41668 if (U->getOpcode() == X86ISD::BLENDV)
41669 continue;
41670
41671 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
41672 Cond, U->getOperand(1), U->getOperand(2));
41673 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
41674 DCI.AddToWorklist(U);
41675 }
41676 DCI.CommitTargetLoweringOpt(TLO);
41677 return SDValue(N, 0);
41678 }
41679
41680 // Otherwise we can still at least try to simplify multiple use bits.
41681 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
41682 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
41683 N->getOperand(1), N->getOperand(2));
41684
41685 return SDValue();
41686}
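The reason combineVSelectToBLENDV only demands APInt::getSignMask() is that the variable blend instructions read nothing but the sign bit of each condition element. A quick SSE4.1 demonstration with _mm_blendv_epi8 (illustrative sketch, not from the source):

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <immintrin.h>
#include <cstdio>

int main() {
  __m128i A = _mm_set1_epi8(1), B = _mm_set1_epi8(2);
  __m128i AllOnes  = _mm_set1_epi8((char)0xFF);  // "true" as all bits set
  __m128i SignOnly = _mm_set1_epi8((char)0x80);  // only the sign bit set
  __m128i R0 = _mm_blendv_epi8(A, B, AllOnes);
  __m128i R1 = _mm_blendv_epi8(A, B, SignOnly);
  // Both masks select B in every lane: PBLENDVB ignores the low bits.
  printf("%d %d\n", _mm_extract_epi8(R0, 0), _mm_extract_epi8(R1, 0)); // 2 2
  return 0;
}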
41687
41688// Try to match:
41689// (or (and (M, (sub 0, X)), (pandn M, X)))
41690// which is a special case of:
41691// (select M, (sub 0, X), X)
41692// Per:
41693// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
41694// We know that, if fNegate is 0 or 1:
41695// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
41696//
41697// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
41698// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
41699// ( M ? -X : X) == ((X ^ M ) + (M & 1))
41700// This lets us transform our vselect to:
41701// (add (xor X, M), (and M, 1))
41702// And further to:
41703// (sub (xor X, M), M)
41704static SDValue combineLogicBlendIntoConditionalNegate(
41705 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
41706 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
41707 EVT MaskVT = Mask.getValueType();
41708 assert(MaskVT.isInteger() &&((void)0)
41709 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&((void)0)
41710 "Mask must be zero/all-bits")((void)0);
41711
41712 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
41713 return SDValue();
41714 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
41715 return SDValue();
41716
41717 auto IsNegV = [](SDNode *N, SDValue V) {
41718 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
41719 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
41720 };
41721
41722 SDValue V;
41723 if (IsNegV(Y.getNode(), X))
41724 V = X;
41725 else if (IsNegV(X.getNode(), Y))
41726 V = Y;
41727 else
41728 return SDValue();
41729
41730 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
41731 SDValue SubOp2 = Mask;
41732
41733 // If the negate was on the false side of the select, then
41734 // the operands of the SUB need to be swapped. PR 27251.
41735 // This is because the pattern being matched above is
41736 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
41737 // but if the pattern matched was
41738 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
41739 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
41740 // pattern also needs to be a negation of the replacement pattern above.
41741 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
41742 // sub accomplishes the negation of the replacement pattern.
41743 if (V == Y)
41744 std::swap(SubOp1, SubOp2);
41745
41746 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
41747 return DAG.getBitcast(VT, Res);
41748}
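The conditional-negate identity used above is easy to verify in scalar form: for a mask M that is either 0 or all-ones, (M ? -X : X) == (X ^ M) - M. A short check (not part of the source):

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <cstdint>
#include <cassert>

static int32_t condNegate(int32_t X, int32_t M) {
  return (X ^ M) - M;                      // M must be 0 or -1 (all bits set)
}

int main() {
  for (int32_t X : {0, 1, -7, 123456}) {
    assert(condNegate(X, 0) == X);         // mask clear: keep X
    assert(condNegate(X, -1) == -X);       // mask set: negate X
  }
  return 0;
}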
41749
41750/// Do target-specific dag combines on SELECT and VSELECT nodes.
41751static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
41752 TargetLowering::DAGCombinerInfo &DCI,
41753 const X86Subtarget &Subtarget) {
41754 SDLoc DL(N);
41755 SDValue Cond = N->getOperand(0);
41756 SDValue LHS = N->getOperand(1);
41757 SDValue RHS = N->getOperand(2);
41758
41759 // Try simplification again because we use this function to optimize
41760 // BLENDV nodes that are not handled by the generic combiner.
41761 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
41762 return V;
41763
41764 EVT VT = LHS.getValueType();
41765 EVT CondVT = Cond.getValueType();
41766 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41767 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
41768
41769 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
41770 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
41771 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
41772 if (CondVT.isVector() && CondVT.isInteger() &&
41773 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
41774 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
41775 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
41776 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
41777 DL, DAG, Subtarget))
41778 return V;
41779
41780 // Convert vselects with constant condition into shuffles.
41781 if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
41782 SmallVector<int, 64> Mask;
41783 if (createShuffleMaskFromVSELECT(Mask, Cond))
41784 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
41785 }
41786
41787 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
41788 // by forcing the unselected elements to zero.
41789 // TODO: Can we handle more shuffles with this?
41790 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
41791 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
41792 LHS.hasOneUse() && RHS.hasOneUse()) {
41793 MVT SimpleVT = VT.getSimpleVT();
41794 SmallVector<SDValue, 1> LHSOps, RHSOps;
41795 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
41796 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
41797 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
41798 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
41799 int NumElts = VT.getVectorNumElements();
41800 for (int i = 0; i != NumElts; ++i) {
41801 if (CondMask[i] < NumElts)
41802 RHSMask[i] = 0x80;
41803 else
41804 LHSMask[i] = 0x80;
41805 }
41806 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
41807 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
41808 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
41809 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
41810 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
41811 }
41812 }
41813
41814 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
41815 // instructions match the semantics of the common C idiom x<y?x:y but not
41816 // x<=y?x:y, because of how they handle negative zero (which can be
41817 // ignored in unsafe-math mode).
41818 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
41819 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
41820 VT != MVT::f80 && VT != MVT::f128 &&
41821 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
41822 (Subtarget.hasSSE2() ||
41823 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
41824 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41825
41826 unsigned Opcode = 0;
41827 // Check for x CC y ? x : y.
41828 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
41829 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
41830 switch (CC) {
41831 default: break;
41832 case ISD::SETULT:
41833 // Converting this to a min would handle NaNs incorrectly, and swapping
41834 // the operands would cause it to handle comparisons between positive
41835 // and negative zero incorrectly.
41836 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41837 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41838 !(DAG.isKnownNeverZeroFloat(LHS) ||
41839 DAG.isKnownNeverZeroFloat(RHS)))
41840 break;
41841 std::swap(LHS, RHS);
41842 }
41843 Opcode = X86ISD::FMIN;
41844 break;
41845 case ISD::SETOLE:
41846 // Converting this to a min would handle comparisons between positive
41847 // and negative zero incorrectly.
41848 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41849 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41850 break;
41851 Opcode = X86ISD::FMIN;
41852 break;
41853 case ISD::SETULE:
41854 // Converting this to a min would handle both negative zeros and NaNs
41855 // incorrectly, but we can swap the operands to fix both.
41856 std::swap(LHS, RHS);
41857 LLVM_FALLTHROUGH[[gnu::fallthrough]];
41858 case ISD::SETOLT:
41859 case ISD::SETLT:
41860 case ISD::SETLE:
41861 Opcode = X86ISD::FMIN;
41862 break;
41863
41864 case ISD::SETOGE:
41865 // Converting this to a max would handle comparisons between positive
41866 // and negative zero incorrectly.
41867 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41868 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
41869 break;
41870 Opcode = X86ISD::FMAX;
41871 break;
41872 case ISD::SETUGT:
41873 // Converting this to a max would handle NaNs incorrectly, and swapping
41874 // the operands would cause it to handle comparisons between positive
41875 // and negative zero incorrectly.
41876 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
41877 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41878 !(DAG.isKnownNeverZeroFloat(LHS) ||
41879 DAG.isKnownNeverZeroFloat(RHS)))
41880 break;
41881 std::swap(LHS, RHS);
41882 }
41883 Opcode = X86ISD::FMAX;
41884 break;
41885 case ISD::SETUGE:
41886 // Converting this to a max would handle both negative zeros and NaNs
41887 // incorrectly, but we can swap the operands to fix both.
41888 std::swap(LHS, RHS);
41889 LLVM_FALLTHROUGH[[gnu::fallthrough]];
41890 case ISD::SETOGT:
41891 case ISD::SETGT:
41892 case ISD::SETGE:
41893 Opcode = X86ISD::FMAX;
41894 break;
41895 }
41896 // Check for x CC y ? y : x -- a min/max with reversed arms.
41897 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
41898 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
41899 switch (CC) {
41900 default: break;
41901 case ISD::SETOGE:
41902 // Converting this to a min would handle comparisons between positive
41903 // and negative zero incorrectly, and swapping the operands would
41904 // cause it to handle NaNs incorrectly.
41905 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41906 !(DAG.isKnownNeverZeroFloat(LHS) ||
41907 DAG.isKnownNeverZeroFloat(RHS))) {
41908 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41909 break;
41910 std::swap(LHS, RHS);
41911 }
41912 Opcode = X86ISD::FMIN;
41913 break;
41914 case ISD::SETUGT:
41915 // Converting this to a min would handle NaNs incorrectly.
41916 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41917 break;
41918 Opcode = X86ISD::FMIN;
41919 break;
41920 case ISD::SETUGE:
41921 // Converting this to a min would handle both negative zeros and NaNs
41922 // incorrectly, but we can swap the operands to fix both.
41923 std::swap(LHS, RHS);
41924 LLVM_FALLTHROUGH[[gnu::fallthrough]];
41925 case ISD::SETOGT:
41926 case ISD::SETGT:
41927 case ISD::SETGE:
41928 Opcode = X86ISD::FMIN;
41929 break;
41930
41931 case ISD::SETULT:
41932 // Converting this to a max would handle NaNs incorrectly.
41933 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41934 break;
41935 Opcode = X86ISD::FMAX;
41936 break;
41937 case ISD::SETOLE:
41938 // Converting this to a max would handle comparisons between positive
41939 // and negative zero incorrectly, and swapping the operands would
41940 // cause it to handle NaNs incorrectly.
41941 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
41942 !DAG.isKnownNeverZeroFloat(LHS) &&
41943 !DAG.isKnownNeverZeroFloat(RHS)) {
41944 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
41945 break;
41946 std::swap(LHS, RHS);
41947 }
41948 Opcode = X86ISD::FMAX;
41949 break;
41950 case ISD::SETULE:
41951 // Converting this to a max would handle both negative zeros and NaNs
41952 // incorrectly, but we can swap the operands to fix both.
41953 std::swap(LHS, RHS);
41954 LLVM_FALLTHROUGH[[gnu::fallthrough]];
41955 case ISD::SETOLT:
41956 case ISD::SETLT:
41957 case ISD::SETLE:
41958 Opcode = X86ISD::FMAX;
41959 break;
41960 }
41961 }
41962
41963 if (Opcode)
41964 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
41965 }
41966
41967 // Some mask scalar intrinsics rely on checking if only one bit is set
41968 // and implement it in C code like this:
41969 // A[0] = (U & 1) ? A[0] : W[0];
41970 // This creates some redundant instructions that break pattern matching.
41971 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
41972 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
41973 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
41974 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
41975 SDValue AndNode = Cond.getOperand(0);
41976 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
41977 isNullConstant(Cond.getOperand(1)) &&
41978 isOneConstant(AndNode.getOperand(1))) {
41979 // LHS and RHS are swapped because
41980 // setcc outputs 1 when the AND resulted in 0, and vice versa.
41981 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
41982 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
41983 }
41984 }
41985
41986 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
41987 // lowering on KNL. In this case we convert it to
41988 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
41989 // The same situation applies to all vectors of i8 and i16 without BWI.
41990 // Make sure we extend these even before type legalization gets a chance to
41991 // split wide vectors.
41992 // Since SKX these selects have a proper lowering.
41993 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
41994 CondVT.getVectorElementType() == MVT::i1 &&
41995 (VT.getVectorElementType() == MVT::i8 ||
41996 VT.getVectorElementType() == MVT::i16)) {
41997 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
41998 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
41999 }
42000
42001 // AVX512 - Extend select with zero to merge with target shuffle.
42002 // select(mask, extract_subvector(shuffle(x)), zero) -->
42003 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
42004 // TODO - support non target shuffles as well.
42005 if (Subtarget.hasAVX512() && CondVT.isVector() &&
42006 CondVT.getVectorElementType() == MVT::i1) {
42007 auto SelectableOp = [&TLI](SDValue Op) {
42008 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42009 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
42010 isNullConstant(Op.getOperand(1)) &&
42011 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
42012 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
42013 };
42014
42015 bool SelectableLHS = SelectableOp(LHS);
42016 bool SelectableRHS = SelectableOp(RHS);
42017 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
42018 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
42019
42020 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
42021 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
42022 : RHS.getOperand(0).getValueType();
42023 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
42024 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
42025 VT.getSizeInBits());
42026 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
42027 VT.getSizeInBits());
42028 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
42029 DAG.getUNDEF(SrcCondVT), Cond,
42030 DAG.getIntPtrConstant(0, DL));
42031 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
42032 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
42033 }
42034 }
42035
42036 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
42037 return V;
42038
42039 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
42040 Cond.hasOneUse()) {
42041 EVT CondVT = Cond.getValueType();
42042 SDValue Cond0 = Cond.getOperand(0);
42043 SDValue Cond1 = Cond.getOperand(1);
42044 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42045
42046 // Canonicalize min/max:
42047 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
42048 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
42049 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
42050 // the need for an extra compare against zero. e.g.
42051 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
42052 // subl %esi, %edi
42053 // testl %edi, %edi
42054 // movl $0, %eax
42055 // cmovgl %edi, %eax
42056 // =>
42057 // xorl %eax, %eax
42058 // subl %esi, $edi
42059 // cmovsl %eax, %edi
42060 //
42061 // We can also canonicalize
42062 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
42063 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
42064 // This allows the use of a test instruction for the compare.
42065 if (LHS == Cond0 && RHS == Cond1) {
42066 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
42067 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
42068 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
42069 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42070 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42071 }
42072 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
42073 ISD::CondCode NewCC = ISD::SETUGE;
42074 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42075 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42076 }
42077 }
42078
42079 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
42080 // fold eq + gt/lt nested selects into ge/le selects
42081 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
42082 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
42083 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
42084 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
42085 // .. etc ..
42086 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
42087 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
42088 SDValue InnerSetCC = RHS.getOperand(0);
42089 ISD::CondCode InnerCC =
42090 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
42091 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
42092 Cond0 == InnerSetCC.getOperand(0) &&
42093 Cond1 == InnerSetCC.getOperand(1)) {
42094 ISD::CondCode NewCC;
42095 switch (CC == ISD::SETEQ ? InnerCC : CC) {
42096 case ISD::SETGT: NewCC = ISD::SETGE; break;
42097 case ISD::SETLT: NewCC = ISD::SETLE; break;
42098 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
42099 case ISD::SETULT: NewCC = ISD::SETULE; break;
42100 default: NewCC = ISD::SETCC_INVALID; break;
42101 }
42102 if (NewCC != ISD::SETCC_INVALID) {
42103 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
42104 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
42105 }
42106 }
42107 }
42108 }
42109
42110 // Check if the first operand is all zeros and Cond type is vXi1.
42111 // If this is an AVX512 target we can improve the use of zero masking by
42112 // swapping the operands and inverting the condition.
42113 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
42114 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
42115 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
42116 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
42117 // Invert the cond to not(cond) : xor(op,allones)=not(op)
42118 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
42119 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
42120 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
42121 }
42122
42123 // Early exit check
42124 if (!TLI.isTypeLegal(VT))
42125 return SDValue();
42126
42127 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
42128 return V;
42129
42130 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
42131 return V;
42132
42133 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
42134 return V;
42135
42136 // select(~Cond, X, Y) -> select(Cond, Y, X)
42137 if (CondVT.getScalarType() != MVT::i1) {
42138 if (SDValue CondNot = IsNOT(Cond, DAG))
42139 return DAG.getNode(N->getOpcode(), DL, VT,
42140 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
42141 // pcmpgt(X, -1) -> pcmpgt(0, X) so that select/blendv can just use the sign bit.
42142 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
42143 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
42144 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
42145 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
42146 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
42147 }
42148 }
42149
42150 // Try to optimize vXi1 selects if both operands are either all constants or
42151 // bitcasts from scalar integer type. In that case we can convert the operands
42152 // to integer and use an integer select which will be converted to a CMOV.
42153 // We need to take a little bit of care to avoid creating an i64 type after
42154 // type legalization.
42155 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
42156 VT.getVectorElementType() == MVT::i1 &&
42157 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
42158 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
42159 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
42160 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
42161
42162 if ((LHSIsConst ||
42163 (LHS.getOpcode() == ISD::BITCAST &&
42164 LHS.getOperand(0).getValueType() == IntVT)) &&
42165 (RHSIsConst ||
42166 (RHS.getOpcode() == ISD::BITCAST &&
42167 RHS.getOperand(0).getValueType() == IntVT))) {
42168 if (LHSIsConst)
42169 LHS = combinevXi1ConstantToInteger(LHS, DAG);
42170 else
42171 LHS = LHS.getOperand(0);
42172
42173 if (RHSIsConst)
42174 RHS = combinevXi1ConstantToInteger(RHS, DAG);
42175 else
42176 RHS = RHS.getOperand(0);
42177
42178 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
42179 return DAG.getBitcast(VT, Select);
42180 }
42181 }
42182
42183 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
42184 // single bits, then invert the predicate and swap the select operands.
42185 // This can lower using a vector shift bit-hack rather than mask and compare.
42186 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
42187 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
42188 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
42189 Cond.getOperand(0).getOpcode() == ISD::AND &&
42190 isNullOrNullSplat(Cond.getOperand(1)) &&
42191 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
42192 Cond.getOperand(0).getValueType() == VT) {
42193 // The 'and' mask must be composed of power-of-2 constants.
42194 SDValue And = Cond.getOperand(0);
42195 auto *C = isConstOrConstSplat(And.getOperand(1));
42196 if (C && C->getAPIntValue().isPowerOf2()) {
42197 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
42198 SDValue NotCond =
42199 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
42200 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
42201 }
42202
42203 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
42204 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
42205 // 16-bit lacks a proper blendv.
42206 unsigned EltBitWidth = VT.getScalarSizeInBits();
42207 bool CanShiftBlend =
42208 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
42209 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
42210 (Subtarget.hasXOP()));
42211 if (CanShiftBlend &&
42212 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
42213 return C->getAPIntValue().isPowerOf2();
42214 })) {
42215 // Create a left-shift constant to get the mask bits over to the sign-bit.
42216 SDValue Mask = And.getOperand(1);
42217 SmallVector<int, 32> ShlVals;
42218 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
42219 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
42220 ShlVals.push_back(EltBitWidth - 1 -
42221 MaskVal->getAPIntValue().exactLogBase2());
42222 }
42223 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
42224 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
42225 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
42226 SDValue NewCond =
42227 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
42228 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
42229 }
42230 }
42231
42232 return SDValue();
42233}
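The final combine in combineSelect swaps an "AND with a power-of-2 mask, compare with zero" condition for a left shift that parks the tested bit in the sign position, which is exactly what a sign-bit blend consumes. A scalar sketch of that bit-hack; the helper name is invented for the example:

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <cstdint>
#include <cassert>

// Hypothetical helper: test bit Log2C of X through the sign bit of a shift.
static bool bitSetViaShift(uint32_t X, unsigned Log2C) {
  return (int32_t)(X << (31u - Log2C)) < 0; // same as (X & (1u << Log2C)) != 0
}

int main() {
  for (uint32_t X : {0u, 4u, 5u, 0xFFFFFFFBu}) {
    assert(bitSetViaShift(X, 2) == ((X & 4u) != 0));
    assert(bitSetViaShift(X, 31) == ((X & 0x80000000u) != 0));
  }
  return 0;
}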
42234
42235/// Combine:
42236/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
42237/// to:
42238/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
42239/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
42240/// Note that this is only legal for some op/cc combinations.
42241static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
42242 SelectionDAG &DAG,
42243 const X86Subtarget &Subtarget) {
42244 // This combine only operates on CMP-like nodes.
42245 if (!(Cmp.getOpcode() == X86ISD::CMP ||
42246 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42247 return SDValue();
42248
42249 // Can't replace the cmp if it has more uses than the one we're looking at.
42250 // FIXME: We would like to be able to handle this, but would need to make sure
42251 // all uses were updated.
42252 if (!Cmp.hasOneUse())
42253 return SDValue();
42254
42255 // This only applies to variations of the common case:
42256 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
42257 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
42258 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
42259 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
42260 // Using the proper condcodes (see below), overflow is checked for.
42261
42262 // FIXME: We can generalize both constraints:
42263 // - XOR/OR/AND (if they were made to survive AtomicExpand)
42264 // - LHS != 1
42265 // if the result is compared.
42266
42267 SDValue CmpLHS = Cmp.getOperand(0);
42268 SDValue CmpRHS = Cmp.getOperand(1);
42269 EVT CmpVT = CmpLHS.getValueType();
42270
42271 if (!CmpLHS.hasOneUse())
42272 return SDValue();
42273
42274 unsigned Opc = CmpLHS.getOpcode();
42275 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
42276 return SDValue();
42277
42278 SDValue OpRHS = CmpLHS.getOperand(2);
42279 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
42280 if (!OpRHSC)
42281 return SDValue();
42282
42283 APInt Addend = OpRHSC->getAPIntValue();
42284 if (Opc == ISD::ATOMIC_LOAD_SUB)
42285 Addend = -Addend;
42286
42287 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
42288 if (!CmpRHSC)
42289 return SDValue();
42290
42291 APInt Comparison = CmpRHSC->getAPIntValue();
42292 APInt NegAddend = -Addend;
42293
42294 // See if we can adjust the CC to make the comparison match the negated
42295 // addend.
42296 if (Comparison != NegAddend) {
42297 APInt IncComparison = Comparison + 1;
42298 if (IncComparison == NegAddend) {
42299 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
42300 Comparison = IncComparison;
42301 CC = X86::COND_AE;
42302 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
42303 Comparison = IncComparison;
42304 CC = X86::COND_L;
42305 }
42306 }
42307 APInt DecComparison = Comparison - 1;
42308 if (DecComparison == NegAddend) {
42309 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
42310 Comparison = DecComparison;
42311 CC = X86::COND_A;
42312 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
42313 Comparison = DecComparison;
42314 CC = X86::COND_LE;
42315 }
42316 }
42317 }
42318
42319 // If the addend is the negation of the comparison value, then we can do
42320 // a full comparison by emitting the atomic arithmetic as a locked sub.
42321 if (Comparison == NegAddend) {
42322 // The CC is fine, but we need to rewrite the LHS of the comparison as an
42323 // atomic sub.
42324 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
42325 auto AtomicSub = DAG.getAtomic(
42326 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
42327 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
42328 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
42329 AN->getMemOperand());
42330 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
42331 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42332 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42333 return LockOp;
42334 }
42335
42336 // We can handle comparisons with zero in a number of cases by manipulating
42337 // the CC used.
42338 if (!Comparison.isNullValue())
42339 return SDValue();
42340
42341 if (CC == X86::COND_S && Addend == 1)
42342 CC = X86::COND_LE;
42343 else if (CC == X86::COND_NS && Addend == 1)
42344 CC = X86::COND_G;
42345 else if (CC == X86::COND_G && Addend == -1)
42346 CC = X86::COND_GE;
42347 else if (CC == X86::COND_LE && Addend == -1)
42348 CC = X86::COND_L;
42349 else
42350 return SDValue();
42351
42352 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
42353 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
42354 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
42355 return LockOp;
42356}
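The source-level shape this atomic combine targets is something like "if (counter.fetch_add(1) < 0)": the sign test on the returned old value is equivalent to a <= 0 test on the result of the locked add (for non-overflowing values, which the condcode choice accounts for), so the add's EFLAGS can be reused. A hedged sketch of that equivalence, not taken from the source:

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <atomic>
#include <cassert>

int main() {
  std::atomic<long> Counter{-1};
  // The CMP of the returned old value against 0 can be folded into the
  // EFLAGS of the locked add itself, because for non-overflowing old values
  // old < 0 is the same test as old + 1 <= 0 (COND_S becomes COND_LE).
  bool WasNegative = Counter.fetch_add(1) < 0;
  assert(WasNegative && Counter.load() == 0);

  for (long Old : {-3L, -1L, 0L, 7L})
    assert((Old < 0) == (Old + 1 <= 0));
  return 0;
}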
42357
42358// Check whether a boolean test is testing a boolean value generated by
42359// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
42360// code.
42361//
42362// Simplify the following patterns:
42363// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
42364// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
42365// to (Op EFLAGS Cond)
42366//
42367// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
42368// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
42369// to (Op EFLAGS !Cond)
42370//
42371// where Op could be BRCOND or CMOV.
42372//
42373static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
42374 // This combine only operates on CMP-like nodes.
42375 if (!(Cmp.getOpcode() == X86ISD::CMP ||
42376 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42377 return SDValue();
42378
42379 // Quit if not used as a boolean value.
42380 if (CC != X86::COND_E && CC != X86::COND_NE)
42381 return SDValue();
42382
42383 // Check CMP operands. One of them should be 0 or 1 and the other should be
42384 // a SetCC or extended from it.
42385 SDValue Op1 = Cmp.getOperand(0);
42386 SDValue Op2 = Cmp.getOperand(1);
42387
42388 SDValue SetCC;
42389 const ConstantSDNode* C = nullptr;
42390 bool needOppositeCond = (CC == X86::COND_E);
42391 bool checkAgainstTrue = false; // Is it a comparison against 1?
42392
42393 if ((C = dyn_cast<ConstantSDNode>(Op1)))
42394 SetCC = Op2;
42395 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
42396 SetCC = Op1;
42397 else // Quit if neither operand is a constant.
42398 return SDValue();
42399
42400 if (C->getZExtValue() == 1) {
42401 needOppositeCond = !needOppositeCond;
42402 checkAgainstTrue = true;
42403 } else if (C->getZExtValue() != 0)
42404 // Quit if the constant is neither 0 nor 1.
42405 return SDValue();
42406
42407 bool truncatedToBoolWithAnd = false;
42408 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
42409 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
42410 SetCC.getOpcode() == ISD::TRUNCATE ||
42411 SetCC.getOpcode() == ISD::AND) {
42412 if (SetCC.getOpcode() == ISD::AND) {
42413 int OpIdx = -1;
42414 if (isOneConstant(SetCC.getOperand(0)))
42415 OpIdx = 1;
42416 if (isOneConstant(SetCC.getOperand(1)))
42417 OpIdx = 0;
42418 if (OpIdx < 0)
42419 break;
42420 SetCC = SetCC.getOperand(OpIdx);
42421 truncatedToBoolWithAnd = true;
42422 } else
42423 SetCC = SetCC.getOperand(0);
42424 }
42425
42426 switch (SetCC.getOpcode()) {
42427 case X86ISD::SETCC_CARRY:
42428 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
42429 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
42430 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
42431 // truncated to i1 using 'and'.
42432 if (checkAgainstTrue && !truncatedToBoolWithAnd)
42433 break;
42434 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&((void)0)
42435 "Invalid use of SETCC_CARRY!")((void)0);
42436 LLVM_FALLTHROUGH[[gnu::fallthrough]];
42437 case X86ISD::SETCC:
42438 // Set the condition code or opposite one if necessary.
42439 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
42440 if (needOppositeCond)
42441 CC = X86::GetOppositeBranchCondition(CC);
42442 return SetCC.getOperand(1);
42443 case X86ISD::CMOV: {
42444 // Check whether false/true value has canonical one, i.e. 0 or 1.
42445 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
42446 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
42447 // Quit if true value is not a constant.
42448 if (!TVal)
42449 return SDValue();
42450 // Quit if false value is not a constant.
42451 if (!FVal) {
42452 SDValue Op = SetCC.getOperand(0);
42453 // Skip 'zext' or 'trunc' node.
42454 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
42455 Op.getOpcode() == ISD::TRUNCATE)
42456 Op = Op.getOperand(0);
42457 // A special case for rdrand/rdseed, where 0 is set if false cond is
42458 // found.
42459 if ((Op.getOpcode() != X86ISD::RDRAND &&
42460 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
42461 return SDValue();
42462 }
42463 // Quit if false value is not the constant 0 or 1.
42464 bool FValIsFalse = true;
42465 if (FVal && FVal->getZExtValue() != 0) {
42466 if (FVal->getZExtValue() != 1)
42467 return SDValue();
42468 // If FVal is 1, opposite cond is needed.
42469 needOppositeCond = !needOppositeCond;
42470 FValIsFalse = false;
42471 }
42472 // Quit if TVal is not the constant opposite of FVal.
42473 if (FValIsFalse && TVal->getZExtValue() != 1)
42474 return SDValue();
42475 if (!FValIsFalse && TVal->getZExtValue() != 0)
42476 return SDValue();
42477 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
42478 if (needOppositeCond)
42479 CC = X86::GetOppositeBranchCondition(CC);
42480 return SetCC.getOperand(3);
42481 }
42482 }
42483
42484 return SDValue();
42485}
42486
42487/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
42488/// Match:
42489/// (X86or (X86setcc) (X86setcc))
42490/// (X86cmp (and (X86setcc) (X86setcc)), 0)
42491static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
42492 X86::CondCode &CC1, SDValue &Flags,
42493 bool &isAnd) {
42494 if (Cond->getOpcode() == X86ISD::CMP) {
42495 if (!isNullConstant(Cond->getOperand(1)))
42496 return false;
42497
42498 Cond = Cond->getOperand(0);
42499 }
42500
42501 isAnd = false;
42502
42503 SDValue SetCC0, SetCC1;
42504 switch (Cond->getOpcode()) {
42505 default: return false;
42506 case ISD::AND:
42507 case X86ISD::AND:
42508 isAnd = true;
42509 LLVM_FALLTHROUGH[[gnu::fallthrough]];
42510 case ISD::OR:
42511 case X86ISD::OR:
42512 SetCC0 = Cond->getOperand(0);
42513 SetCC1 = Cond->getOperand(1);
42514 break;
42515 };
42516
42517 // Make sure we have SETCC nodes, using the same flags value.
42518 if (SetCC0.getOpcode() != X86ISD::SETCC ||
42519 SetCC1.getOpcode() != X86ISD::SETCC ||
42520 SetCC0->getOperand(1) != SetCC1->getOperand(1))
42521 return false;
42522
42523 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
42524 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
42525 Flags = SetCC0->getOperand(1);
42526 return true;
42527}
42528
42529// When legalizing carry, we create carries via add X, -1
42530// If that comes from an actual carry, via setcc, we use the
42531// carry directly.
42532static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
42533 if (EFLAGS.getOpcode() == X86ISD::ADD) {
42534 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
42535 SDValue Carry = EFLAGS.getOperand(0);
42536 while (Carry.getOpcode() == ISD::TRUNCATE ||
42537 Carry.getOpcode() == ISD::ZERO_EXTEND ||
42538 Carry.getOpcode() == ISD::SIGN_EXTEND ||
42539 Carry.getOpcode() == ISD::ANY_EXTEND ||
42540 (Carry.getOpcode() == ISD::AND &&
42541 isOneConstant(Carry.getOperand(1))))
42542 Carry = Carry.getOperand(0);
42543 if (Carry.getOpcode() == X86ISD::SETCC ||
42544 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
42545 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
42546 uint64_t CarryCC = Carry.getConstantOperandVal(0);
42547 SDValue CarryOp1 = Carry.getOperand(1);
42548 if (CarryCC == X86::COND_B)
42549 return CarryOp1;
42550 if (CarryCC == X86::COND_A) {
42551 // Try to convert COND_A into COND_B in an attempt to facilitate
42552 // materializing "setb reg".
42553 //
42554 // Do not flip "e > c", where "c" is a constant, because the Cmp
42555 // instruction cannot take an immediate as its first operand.
42556 //
42557 if (CarryOp1.getOpcode() == X86ISD::SUB &&
42558 CarryOp1.getNode()->hasOneUse() &&
42559 CarryOp1.getValueType().isInteger() &&
42560 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
42561 SDValue SubCommute =
42562 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
42563 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
42564 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
42565 }
42566 }
42567 // If this is a check of the z flag of an add with 1, switch to the
42568 // C flag.
42569 if (CarryCC == X86::COND_E &&
42570 CarryOp1.getOpcode() == X86ISD::ADD &&
42571 isOneConstant(CarryOp1.getOperand(1)))
42572 return CarryOp1;
42573 }
42574 }
42575 }
42576
42577 return SDValue();
42578}
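The "add X, -1" idiom works because adding all-ones to an unsigned value produces a carry-out exactly when the value is nonzero, so a 0/1 setcc result is reproduced verbatim in CF. A scalar check using the GCC/Clang __builtin_add_overflow builtin (illustrative only, not from the source):

// Illustrative sketch, not part of X86ISelLowering.cpp.
#include <cstdint>
#include <cassert>

static bool carryOfAddAllOnes(uint32_t X) {
  uint32_t Ignored;
  // For unsigned operands, "overflow" of the addition is the carry-out.
  return __builtin_add_overflow(X, UINT32_MAX, &Ignored);
}

int main() {
  assert(carryOfAddAllOnes(0) == false);   // setcc produced 0 -> CF is 0
  assert(carryOfAddAllOnes(1) == true);    // setcc produced 1 -> CF is 1
  assert(carryOfAddAllOnes(42) == true);   // any nonzero value carries too
  return 0;
}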
42579
42580/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
42581/// to avoid the inversion.
42582static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
42583 SelectionDAG &DAG,
42584 const X86Subtarget &Subtarget) {
42585 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
42586 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
42587 EFLAGS.getOpcode() != X86ISD::TESTP)
42588 return SDValue();
42589
42590 // PTEST/TESTP sets EFLAGS as:
42591 // TESTZ: ZF = (Op0 & Op1) == 0
42592 // TESTC: CF = (~Op0 & Op1) == 0
42593 // TESTNZC: ZF == 0 && CF == 0
42594 EVT VT = EFLAGS.getValueType();
42595 SDValue Op0 = EFLAGS.getOperand(0);
42596 SDValue Op1 = EFLAGS.getOperand(1);
42597 EVT OpVT = Op0.getValueType();
42598
42599 // TEST*(~X,Y) == TEST*(X,Y)
42600 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
42601 X86::CondCode InvCC;
42602 switch (CC) {
42603 case X86::COND_B:
42604 // testc -> testz.
42605 InvCC = X86::COND_E;
42606 break;
42607 case X86::COND_AE:
42608 // !testc -> !testz.
42609 InvCC = X86::COND_NE;
42610 break;
42611 case X86::COND_E:
42612 // testz -> testc.
42613 InvCC = X86::COND_B;
42614 break;
42615 case X86::COND_NE:
42616 // !testz -> !testc.
42617 InvCC = X86::COND_AE;
42618 break;
42619 case X86::COND_A:
42620 case X86::COND_BE:
42621 // testnzc -> testnzc (no change).
42622 InvCC = CC;
42623 break;
42624 default:
42625 InvCC = X86::COND_INVALID;
42626 break;
42627 }
42628
42629 if (InvCC != X86::COND_INVALID) {
42630 CC = InvCC;
42631 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42632 DAG.getBitcast(OpVT, NotOp0), Op1);
42633 }
42634 }
42635
42636 if (CC == X86::COND_E || CC == X86::COND_NE) {
42637 // TESTZ(X,~Y) == TESTC(Y,X)
42638 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
42639 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42640 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42641 DAG.getBitcast(OpVT, NotOp1), Op0);
42642 }
42643
42644 if (Op0 == Op1) {
42645 SDValue BC = peekThroughBitcasts(Op0);
42646 EVT BCVT = BC.getValueType();
42647 assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
42648 "Unexpected vector type");
42649
42650 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
42651 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
42652 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42653 DAG.getBitcast(OpVT, BC.getOperand(0)),
42654 DAG.getBitcast(OpVT, BC.getOperand(1)));
42655 }
42656
42657 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
42658 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
42659 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
42660 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
42661 DAG.getBitcast(OpVT, BC.getOperand(0)),
42662 DAG.getBitcast(OpVT, BC.getOperand(1)));
42663 }
42664
42665 // If every element is an all-sign value, see if we can use MOVMSK to
42666 // more efficiently extract the sign bits and compare that.
42667 // TODO: Handle TESTC with comparison inversion.
42668 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
42669 // MOVMSK combines to make sure it's never worse than PTEST?
42670 unsigned EltBits = BCVT.getScalarSizeInBits();
42671 if (DAG.ComputeNumSignBits(BC) == EltBits) {
42672 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
42673 APInt SignMask = APInt::getSignMask(EltBits);
42674 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42675 if (SDValue Res =
42676 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
42677 // For vXi16 cases we need to use pmovmskb and extract every other
42678 // sign bit.
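// On little-endian lane order each i16 element contributes two bytes and its
// sign bit lives in the high (odd-indexed) byte, so masking the byte-wise
// movmsk result with 0xAAAA... keeps exactly one bit per i16 element.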
42679 SDLoc DL(EFLAGS);
42680 if (EltBits == 16) {
42681 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
42682 Res = DAG.getBitcast(MovmskVT, Res);
42683 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42684 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
42685 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42686 } else {
42687 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
42688 }
42689 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
42690 DAG.getConstant(0, DL, MVT::i32));
42691 }
42692 }
42693 }
42694
42695 // TESTZ(-1,X) == TESTZ(X,X)
42696 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
42697 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
42698
42699 // TESTZ(X,-1) == TESTZ(X,X)
42700 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
42701 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
42702 }
42703
42704 return SDValue();
42705}
42706
42707// Attempt to simplify the MOVMSK input based on the comparison type.
42708static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
42709 SelectionDAG &DAG,
42710 const X86Subtarget &Subtarget) {
42711 // Handle eq/ne against zero (any_of).
42712 // Handle eq/ne against -1 (all_of).
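// MOVMSK packs one sign bit per vector element into the low bits of a scalar,
// so comparing it against 0 asks "is any element's sign bit set" (any_of) and
// comparing it against (1 << NumElts) - 1 asks "are all of them set" (all_of).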
42713 if (!(CC == X86::COND_E || CC == X86::COND_NE))
42714 return SDValue();
42715 if (EFLAGS.getValueType() != MVT::i32)
42716 return SDValue();
42717 unsigned CmpOpcode = EFLAGS.getOpcode();
42718 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
42719 return SDValue();
42720 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
42721 if (!CmpConstant)
42722 return SDValue();
42723 const APInt &CmpVal = CmpConstant->getAPIntValue();
42724
42725 SDValue CmpOp = EFLAGS.getOperand(0);
42726 unsigned CmpBits = CmpOp.getValueSizeInBits();
42727 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
42728
42729 // Peek through any truncate.
42730 if (CmpOp.getOpcode() == ISD::TRUNCATE)
42731 CmpOp = CmpOp.getOperand(0);
42732
42733 // Bail if we don't find a MOVMSK.
42734 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
42735 return SDValue();
42736
42737 SDValue Vec = CmpOp.getOperand(0);
42738 MVT VecVT = Vec.getSimpleValueType();
42739 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
42740 "Unexpected MOVMSK operand");
42741 unsigned NumElts = VecVT.getVectorNumElements();
42742 unsigned NumEltBits = VecVT.getScalarSizeInBits();
42743
42744 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
42745 bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
42746 CmpVal.isMask(NumElts);
42747 if (!IsAnyOf && !IsAllOf)
42748 return SDValue();
42749
42750 // See if we can peek through to a vector with a wider element type, if the
42751 // signbits extend down to all the sub-elements as well.
42752 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
42753 // potential SimplifyDemandedBits/Elts cases.
42754 if (Vec.getOpcode() == ISD::BITCAST) {
42755 SDValue BC = peekThroughBitcasts(Vec);
42756 MVT BCVT = BC.getSimpleValueType();
42757 unsigned BCNumElts = BCVT.getVectorNumElements();
42758 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
42759 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
42760 BCNumEltBits > NumEltBits &&
42761 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
42762 SDLoc DL(EFLAGS);
42763 unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
42764 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
42765 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
42766 DAG.getConstant(CmpMask, DL, MVT::i32));
42767 }
42768 }
42769
42770 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
42771 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
42772 if (IsAllOf && Subtarget.hasSSE41()) {
42773 SDValue BC = peekThroughBitcasts(Vec);
42774 if (BC.getOpcode() == X86ISD::PCMPEQ &&
42775 ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
42776 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
42777 SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
42778 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
42779 }
42780 }
42781
42782 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
42783 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
42784 // sign bits prior to the comparison with zero unless we know that
42785 // the vXi16 splats the sign bit down to the lower i8 half.
42786 // TODO: Handle all_of patterns.
42787 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
42788 SDValue VecOp0 = Vec.getOperand(0);
42789 SDValue VecOp1 = Vec.getOperand(1);
42790 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
42791 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
42792 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
42793 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
42794 SDLoc DL(EFLAGS);
42795 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
42796 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42797 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
42798 if (!SignExt0) {
42799 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
42800 DAG.getConstant(0xAAAA, DL, MVT::i16));
42801 }
42802 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42803 DAG.getConstant(0, DL, MVT::i16));
42804 }
42805 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
42806 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
42807 if (CmpBits >= 16 && Subtarget.hasInt256() &&
42808 VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42809 VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42810 VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
42811 VecOp0.getConstantOperandAPInt(1) == 0 &&
42812 VecOp1.getConstantOperandAPInt(1) == 8 &&
42813 (IsAnyOf || (SignExt0 && SignExt1))) {
42814 SDLoc DL(EFLAGS);
42815 SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
42816 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42817 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
42818 if (!SignExt0 || !SignExt1) {
42819 assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
42820 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
42821 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
42822 }
42823 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42824 DAG.getConstant(CmpMask, DL, MVT::i32));
42825 }
42826 }
42827
42828 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
42829 SmallVector<int, 32> ShuffleMask;
42830 SmallVector<SDValue, 2> ShuffleInputs;
42831 if (NumElts <= CmpBits &&
42832 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
42833 ShuffleMask, DAG) &&
42834 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
42835 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
42836 unsigned NumShuffleElts = ShuffleMask.size();
42837 APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
42838 for (int M : ShuffleMask) {
42839 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
42840 DemandedElts.setBit(M);
42841 }
42842 if (DemandedElts.isAllOnesValue()) {
42843 SDLoc DL(EFLAGS);
42844 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
42845 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
42846 Result =
42847 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
42848 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
42849 EFLAGS.getOperand(1));
42850 }
42851 }
42852
42853 return SDValue();
42854}
42855
42856/// Optimize an EFLAGS definition used according to the condition code \p CC
42857/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
42858/// uses of chain values.
42859static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
42860 SelectionDAG &DAG,
42861 const X86Subtarget &Subtarget) {
42862 if (CC == X86::COND_B)
42863 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
42864 return Flags;
42865
42866 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
42867 return R;
42868
42869 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
42870 return R;
42871
42872 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
42873 return R;
42874
42875 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
42876}
42877
42878/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
42879static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
42880 TargetLowering::DAGCombinerInfo &DCI,
42881 const X86Subtarget &Subtarget) {
42882 SDLoc DL(N);
42883
42884 SDValue FalseOp = N->getOperand(0);
42885 SDValue TrueOp = N->getOperand(1);
42886 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
42887 SDValue Cond = N->getOperand(3);
42888
42889 // cmov X, X, ?, ? --> X
42890 if (TrueOp == FalseOp)
42891 return TrueOp;
42892
42893 // Try to simplify the EFLAGS and condition code operands.
42894 // We can't always do this as FCMOV only supports a subset of X86 cond.
42895 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
42896 if (!(FalseOp.getValueType() == MVT::f80 ||
42897 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
42898 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
42899 !Subtarget.hasCMov() || hasFPCMov(CC)) {
42900 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
42901 Flags};
42902 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
42903 }
42904 }
42905
42906 // If this is a select between two integer constants, try to do some
42907 // optimizations. Note that the operands are ordered the opposite of SELECT
42908 // operands.
42909 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
42910 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
42911 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
42912 // larger than FalseC (the false value).
42913 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
42914 CC = X86::GetOppositeBranchCondition(CC);
42915 std::swap(TrueC, FalseC);
42916 std::swap(TrueOp, FalseOp);
42917 }
42918
42919 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
42920 // This is efficient for any integer data type (including i8/i16) and
42921 // shift amount.
42922 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
42923 Cond = getSETCC(CC, Cond, DL, DAG);
42924
42925 // Zero extend the condition if needed.
42926 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
42927
42928 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
42929 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
42930 DAG.getConstant(ShAmt, DL, MVT::i8));
42931 return Cond;
42932 }
42933
42934 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
42935 // for any integer data type, including i8/i16.
42936 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
42937 Cond = getSETCC(CC, Cond, DL, DAG);
42938
42939 // Zero extend the condition if needed.
42940 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
42941 FalseC->getValueType(0), Cond);
42942 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42943 SDValue(FalseC, 0));
42944 return Cond;
42945 }
42946
42947 // Optimize cases that will turn into an LEA instruction. This requires
42948 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
42949 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
42950 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
42951 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
42952 "Implicit constant truncation");
42953
42954 bool isFastMultiplier = false;
42955 if (Diff.ult(10)) {
42956 switch (Diff.getZExtValue()) {
42957 default: break;
42958 case 1: // result = add base, cond
42959 case 2: // result = lea base( , cond*2)
42960 case 3: // result = lea base(cond, cond*2)
42961 case 4: // result = lea base( , cond*4)
42962 case 5: // result = lea base(cond, cond*4)
42963 case 8: // result = lea base( , cond*8)
42964 case 9: // result = lea base(cond, cond*8)
42965 isFastMultiplier = true;
42966 break;
42967 }
42968 }
42969
42970 if (isFastMultiplier) {
42971 Cond = getSETCC(CC, Cond, DL, DAG);
42972 // Zero extend the condition if needed.
42973 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
42974 Cond);
42975 // Scale the condition by the difference.
42976 if (Diff != 1)
42977 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
42978 DAG.getConstant(Diff, DL, Cond.getValueType()));
42979
42980 // Add the base if non-zero.
42981 if (FalseC->getAPIntValue() != 0)
42982 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
42983 SDValue(FalseC, 0));
42984 return Cond;
42985 }
42986 }
42987 }
42988 }
42989
42990 // Handle these cases:
42991 // (select (x != c), e, c) -> (select (x != c), e, x),
42992 // (select (x == c), c, e) -> (select (x == c), x, e)
42993 // where c is an integer constant, and the "select" is the combination
42994 // of CMOV and CMP.
42995 //
42996 // The rationale for this change is that a conditional-move from a constant
42997 // needs two instructions, whereas a conditional-move from a register needs
42998 // only one instruction.
42999 //
43000 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
43001 // some instruction-combining opportunities. This opt needs to be
43002 // postponed as late as possible.
43003 //
43004 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
43005 // the DCI.xxxx conditions are provided to postpone the optimization as
43006 // late as possible.
43007
43008 ConstantSDNode *CmpAgainst = nullptr;
43009 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
43010 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
43011 !isa<ConstantSDNode>(Cond.getOperand(0))) {
43012
43013 if (CC == X86::COND_NE &&
43014 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
43015 CC = X86::GetOppositeBranchCondition(CC);
43016 std::swap(TrueOp, FalseOp);
43017 }
43018
43019 if (CC == X86::COND_E &&
43020 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
43021 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
43022 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
43023 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43024 }
43025 }
43026 }
43027
43028 // Fold and/or of setcc's to double CMOV:
43029 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
43030 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
43031 //
43032 // This combine lets us generate:
43033 // cmovcc1 (jcc1 if we don't have CMOV)
43034 // cmovcc2 (same)
43035 // instead of:
43036 // setcc1
43037 // setcc2
43038 // and/or
43039 // cmovne (jne if we don't have CMOV)
43040 // When we can't use the CMOV instruction, it might increase branch
43041 // mispredicts.
43042 // When we can use CMOV, or when there is no mispredict, this improves
43043 // throughput and reduces register pressure.
43044 //
43045 if (CC == X86::COND_NE) {
43046 SDValue Flags;
43047 X86::CondCode CC0, CC1;
43048 bool isAndSetCC;
43049 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
43050 if (isAndSetCC) {
43051 std::swap(FalseOp, TrueOp);
43052 CC0 = X86::GetOppositeBranchCondition(CC0);
43053 CC1 = X86::GetOppositeBranchCondition(CC1);
43054 }
43055
43056 SDValue LOps[] = {FalseOp, TrueOp,
43057 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
43058 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
43059 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
43060 Flags};
43061 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43062 return CMOV;
43063 }
43064 }
43065
43066 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
43067 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
43068 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
43069 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
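// In both forms the two sides agree: when X == 0 the inner CMOV selects the
// constant C1-C2 and the trailing ADD restores C1; when X != 0 it selects
// CTTZ(X) and the ADD yields CTTZ(X)+C2, matching the original cmov operands.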
43070 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
43071 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
43072 SDValue Add = TrueOp;
43073 SDValue Const = FalseOp;
43074 // Canonicalize the condition code for easier matching and output.
43075 if (CC == X86::COND_E)
43076 std::swap(Add, Const);
43077
43078 // We might have replaced the constant in the cmov with the LHS of the
43079 // compare. If so change it to the RHS of the compare.
43080 if (Const == Cond.getOperand(0))
43081 Const = Cond.getOperand(1);
43082
43083 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
43084 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
43085 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
43086 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
43087 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
43088 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
43089 EVT VT = N->getValueType(0);
43090 // This should constant fold.
43091 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
43092 SDValue CMov =
43093 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
43094 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
43095 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
43096 }
43097 }
43098
43099 return SDValue();
43100}
43101
43102/// Different mul shrinking modes.
43103enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
43104
43105static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
43106 EVT VT = N->getOperand(0).getValueType();
43107 if (VT.getScalarSizeInBits() != 32)
43108 return false;
43109
43110 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
43111 unsigned SignBits[2] = {1, 1};
43112 bool IsPositive[2] = {false, false};
43113 for (unsigned i = 0; i < 2; i++) {
43114 SDValue Opd = N->getOperand(i);
43115
43116 SignBits[i] = DAG.ComputeNumSignBits(Opd);
43117 IsPositive[i] = DAG.SignBitIsZero(Opd);
43118 }
43119
43120 bool AllPositive = IsPositive[0] && IsPositive[1];
43121 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
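// For a 32-bit element, N known sign bits leave 32 - N + 1 significant signed
// bits, so >= 25 sign bits means the value fits in a signed i8, and with a
// known-zero sign bit >= 24 means it fits in an unsigned 8-bit range; likewise
// >= 17 and >= 16 correspond to the signed/unsigned i16 ranges checked below.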
43122 // When ranges are from -128 ~ 127, use MULS8 mode.
43123 if (MinSignBits >= 25)
43124 Mode = ShrinkMode::MULS8;
43125 // When ranges are from 0 ~ 255, use MULU8 mode.
43126 else if (AllPositive && MinSignBits >= 24)
43127 Mode = ShrinkMode::MULU8;
43128 // When ranges are from -32768 ~ 32767, use MULS16 mode.
43129 else if (MinSignBits >= 17)
43130 Mode = ShrinkMode::MULS16;
43131 // When ranges are from 0 ~ 65535, use MULU16 mode.
43132 else if (AllPositive && MinSignBits >= 16)
43133 Mode = ShrinkMode::MULU16;
43134 else
43135 return false;
43136 return true;
43137}
43138
43139/// When the operands of vector mul are extended from smaller-sized values,
43140/// like i8 and i16, the type of the mul may be shrunk to generate more
43141/// efficient code. Two typical patterns are handled:
43142/// Pattern1:
43143/// %2 = sext/zext <N x i8> %1 to <N x i32>
43144/// %4 = sext/zext <N x i8> %3 to <N x i32>
43145/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43146/// %5 = mul <N x i32> %2, %4
43147///
43148/// Pattern2:
43149/// %2 = zext/sext <N x i16> %1 to <N x i32>
43150/// %4 = zext/sext <N x i16> %3 to <N x i32>
43151/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43152/// %5 = mul <N x i32> %2, %4
43153///
43154/// There are four mul shrinking modes:
43155/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
43156/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
43157/// generate pmullw+sext32 for it (MULS8 mode).
43158/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
43159/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
43160/// generate pmullw+zext32 for it (MULU8 mode).
43161/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
43162/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
43163/// generate pmullw+pmulhw for it (MULS16 mode).
43164/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
43165/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
43166/// generate pmullw+pmulhuw for it (MULU16 mode).
43167static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
43168 const X86Subtarget &Subtarget) {
43169 // Check for legality
43171 // pmullw/pmulhw are not supported before SSE2.
43171 if (!Subtarget.hasSSE2())
43172 return SDValue();
43173
43174 // Check for profitability
43175 // pmulld is supported since SSE41. It is better to use pmulld
43176 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
43177 // the expansion.
43178 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
43179 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
43180 return SDValue();
43181
43182 ShrinkMode Mode;
43183 if (!canReduceVMulWidth(N, DAG, Mode))
43184 return SDValue();
43185
43186 SDLoc DL(N);
43187 SDValue N0 = N->getOperand(0);
43188 SDValue N1 = N->getOperand(1);
43189 EVT VT = N->getOperand(0).getValueType();
43190 unsigned NumElts = VT.getVectorNumElements();
43191 if ((NumElts % 2) != 0)
43192 return SDValue();
43193
43194 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
43195
43196 // Shrink the operands of mul.
43197 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
43198 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
43199
43200 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
43201 // lower part is needed.
43202 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
43203 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
43204 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
43205 : ISD::SIGN_EXTEND,
43206 DL, VT, MulLo);
43207
43208 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
43209 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
43210 // the higher part is also needed.
43211 SDValue MulHi =
43212 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
43213 ReducedVT, NewN0, NewN1);
43214
43215 // Repack the lower part and higher part result of mul into a wider
43216 // result.
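// Interleaving MulLo (low 16 bits of each product) with MulHi (high 16 bits)
// element by element and bitcasting to i32 reconstructs each full 32-bit
// product as lo | (hi << 16) under little-endian lane ordering.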
43217 // Generate shuffle functioning as punpcklwd.
43218 SmallVector<int, 16> ShuffleMask(NumElts);
43219 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43220 ShuffleMask[2 * i] = i;
43221 ShuffleMask[2 * i + 1] = i + NumElts;
43222 }
43223 SDValue ResLo =
43224 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43225 ResLo = DAG.getBitcast(ResVT, ResLo);
43226 // Generate shuffle functioning as punpckhwd.
43227 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43228 ShuffleMask[2 * i] = i + NumElts / 2;
43229 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
43230 }
43231 SDValue ResHi =
43232 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43233 ResHi = DAG.getBitcast(ResVT, ResHi);
43234 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
43235}
43236
43237static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
43238 EVT VT, const SDLoc &DL) {
43239
43240 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
43241 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43242 DAG.getConstant(Mult, DL, VT));
43243 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
43244 DAG.getConstant(Shift, DL, MVT::i8));
43245 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43246 N->getOperand(0));
43247 return Result;
43248 };
43249
43250 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
43251 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43252 DAG.getConstant(Mul1, DL, VT));
43253 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
43254 DAG.getConstant(Mul2, DL, VT));
43255 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43256 N->getOperand(0));
43257 return Result;
43258 };
43259
43260 switch (MulAmt) {
43261 default:
43262 break;
43263 case 11:
43264 // mul x, 11 => add ((shl (mul x, 5), 1), x)
43265 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
43266 case 21:
43267 // mul x, 21 => add ((shl (mul x, 5), 2), x)
43268 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
43269 case 41:
43270 // mul x, 41 => add ((shl (mul x, 5), 3), x)
43271 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
43272 case 22:
43273 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
43274 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43275 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
43276 case 19:
43277 // mul x, 19 => add ((shl (mul x, 9), 1), x)
43278 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
43279 case 37:
43280 // mul x, 37 => add ((shl (mul x, 9), 2), x)
43281 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
43282 case 73:
43283 // mul x, 73 => add ((shl (mul x, 9), 3), x)
43284 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
43285 case 13:
43286 // mul x, 13 => add ((shl (mul x, 3), 2), x)
43287 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
43288 case 23:
43289 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
43290 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
43291 case 26:
43292 // mul x, 26 => add ((mul (mul x, 5), 5), x)
43293 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
43294 case 28:
43295 // mul x, 28 => add ((mul (mul x, 9), 3), x)
43296 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
43297 case 29:
43298 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
43299 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43300 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
43301 }
43302
43303 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
43304 // by a single LEA.
43305 // First check if this is a sum of two powers of 2 because that's easy. Then
43306 // count the trailing zeros up to the lowest set bit.
43307 // TODO: We can do this even without LEA at a cost of two shifts and an add.
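// E.g. MulAmt == 20: 20 & 19 == 16 and cttz(20) == 2, so x*20 becomes
// (x << 4) + (x << 2), where the scaled term can be folded into an LEA.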
43308 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
43309 unsigned ScaleShift = countTrailingZeros(MulAmt);
43310 if (ScaleShift >= 1 && ScaleShift < 4) {
43311 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
43312 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43313 DAG.getConstant(ShiftAmt, DL, MVT::i8));
43314 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43315 DAG.getConstant(ScaleShift, DL, MVT::i8));
43316 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
43317 }
43318 }
43319
43320 return SDValue();
43321}
43322
43323// If the upper 17 bits of each element are zero then we can use PMADDWD,
43324// which is always at least as quick as PMULLD, except on KNL.
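// With the top 17 bits of each i32 known zero, each element viewed as two i16
// lanes has a zero high lane and a non-negative low lane, so PMADDWD's signed
// 16x16 multiply-and-add of the lane pairs reproduces the full i32 product.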
43325static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
43326 const X86Subtarget &Subtarget) {
43327 if (!Subtarget.hasSSE2())
43328 return SDValue();
43329
43330 if (Subtarget.isPMADDWDSlow())
43331 return SDValue();
43332
43333 EVT VT = N->getValueType(0);
43334
43335 // Only support vXi32 vectors.
43336 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
43337 return SDValue();
43338
43339 // Make sure the type is legal or will be widened to a legal type.
43340 if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
43341 return SDValue();
43342
43343 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
43344
43345 // Without BWI, we would need to split v32i16.
43346 if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
43347 return SDValue();
43348
43349 SDValue N0 = N->getOperand(0);
43350 SDValue N1 = N->getOperand(1);
43351
43352 // If we are zero extending two steps without SSE4.1, it's better to reduce
43353 // the vmul width instead.
43354 if (!Subtarget.hasSSE41() &&
43355 (N0.getOpcode() == ISD::ZERO_EXTEND &&
43356 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
43357 (N1.getOpcode() == ISD::ZERO_EXTEND &&
43358 N1.getOperand(0).getScalarValueSizeInBits() <= 8))
43359 return SDValue();
43360
43361 APInt Mask17 = APInt::getHighBitsSet(32, 17);
43362 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
43363 !DAG.MaskedValueIsZero(N0, Mask17))
43364 return SDValue();
43365
43366 // Use SplitOpsAndApply to handle AVX splitting.
43367 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43368 ArrayRef<SDValue> Ops) {
43369 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43370 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
43371 };
43372 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
43373 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
43374 PMADDWDBuilder);
43375}
43376
43377static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
43378 const X86Subtarget &Subtarget) {
43379 if (!Subtarget.hasSSE2())
43380 return SDValue();
43381
43382 EVT VT = N->getValueType(0);
43383
43384 // Only support vXi64 vectors.
43385 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
43386 VT.getVectorNumElements() < 2 ||
43387 !isPowerOf2_32(VT.getVectorNumElements()))
43388 return SDValue();
43389
43390 SDValue N0 = N->getOperand(0);
43391 SDValue N1 = N->getOperand(1);
43392
43393 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
43394 // 32-bits. We can lower with this if the sign bits stretch that far.
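// More than 32 sign bits on an i64 operand means it is the sign-extension of
// its low 32 bits, and the product of two such values fits in 64 bits, so the
// i64 multiply is exactly the 32x32->64 signed multiply that PMULDQ performs.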
43395 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
43396 DAG.ComputeNumSignBits(N1) > 32) {
43397 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43398 ArrayRef<SDValue> Ops) {
43399 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
43400 };
43401 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43402 PMULDQBuilder, /*CheckBWI*/false);
43403 }
43404
43405 // If the upper bits are zero we can use a single pmuludq.
43406 APInt Mask = APInt::getHighBitsSet(64, 32);
43407 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
43408 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43409 ArrayRef<SDValue> Ops) {
43410 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
43411 };
43412 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
43413 PMULUDQBuilder, /*CheckBWI*/false);
43414 }
43415
43416 return SDValue();
43417}
43418
43419/// Optimize a single multiply with constant into two operations in order to
43420/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
43421static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
43422 TargetLowering::DAGCombinerInfo &DCI,
43423 const X86Subtarget &Subtarget) {
43424 EVT VT = N->getValueType(0);
43425
43426 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
43427 return V;
43428
43429 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
43430 return V;
43431
43432 if (DCI.isBeforeLegalize() && VT.isVector())
43433 return reduceVMULWidth(N, DAG, Subtarget);
43434
43435 if (!MulConstantOptimization)
43436 return SDValue();
43437 // An imul is usually smaller than the alternative sequence.
43438 if (DAG.getMachineFunction().getFunction().hasMinSize())
43439 return SDValue();
43440
43441 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
43442 return SDValue();
43443
43444 if (VT != MVT::i64 && VT != MVT::i32)
43445 return SDValue();
43446
43447 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
43448 if (!C)
43449 return SDValue();
43450 if (isPowerOf2_64(C->getZExtValue()))
43451 return SDValue();
43452
43453 int64_t SignMulAmt = C->getSExtValue();
43454 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
43455 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
43456
43457 SDLoc DL(N);
43458 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
43459 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43460 DAG.getConstant(AbsMulAmt, DL, VT));
43461 if (SignMulAmt < 0)
43462 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43463 NewMul);
43464
43465 return NewMul;
43466 }
43467
43468 uint64_t MulAmt1 = 0;
43469 uint64_t MulAmt2 = 0;
43470 if ((AbsMulAmt % 9) == 0) {
43471 MulAmt1 = 9;
43472 MulAmt2 = AbsMulAmt / 9;
43473 } else if ((AbsMulAmt % 5) == 0) {
43474 MulAmt1 = 5;
43475 MulAmt2 = AbsMulAmt / 5;
43476 } else if ((AbsMulAmt % 3) == 0) {
43477 MulAmt1 = 3;
43478 MulAmt2 = AbsMulAmt / 3;
43479 }
43480
43481 SDValue NewMul;
43482 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
43483 if (MulAmt2 &&
43484 (isPowerOf2_64(MulAmt2) ||
43485 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
43486
43487 if (isPowerOf2_64(MulAmt2) &&
43488 !(SignMulAmt >= 0 && N->hasOneUse() &&
43489 N->use_begin()->getOpcode() == ISD::ADD))
43490 // If the second multiplier is pow2, issue it first. We want the multiply by
43491 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
43492 // is an add. Only do this for positive multiply amounts since the
43493 // negate would prevent it from being used as an address mode anyway.
43494 std::swap(MulAmt1, MulAmt2);
43495
43496 if (isPowerOf2_64(MulAmt1))
43497 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43498 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
43499 else
43500 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43501 DAG.getConstant(MulAmt1, DL, VT));
43502
43503 if (isPowerOf2_64(MulAmt2))
43504 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
43505 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
43506 else
43507 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
43508 DAG.getConstant(MulAmt2, DL, VT));
43509
43510 // Negate the result.
43511 if (SignMulAmt < 0)
43512 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
43513 NewMul);
43514 } else if (!Subtarget.slowLEA())
43515 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
43516
43517 if (!NewMul) {
43518 assert(C->getZExtValue() != 0 &&
43519 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
43520 "Both cases that could cause potential overflows should have "
43521 "already been handled.");
43522 if (isPowerOf2_64(AbsMulAmt - 1)) {
43523 // (mul x, 2^N + 1) => (add (shl x, N), x)
43524 NewMul = DAG.getNode(
43525 ISD::ADD, DL, VT, N->getOperand(0),
43526 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43527 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
43528 MVT::i8)));
43529 // To negate, subtract the number from zero
43530 if (SignMulAmt < 0)
43531 NewMul = DAG.getNode(ISD::SUB, DL, VT,
43532 DAG.getConstant(0, DL, VT), NewMul);
43533 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
43534 // (mul x, 2^N - 1) => (sub (shl x, N), x)
43535 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43536 DAG.getConstant(Log2_64(AbsMulAmt + 1),
43537 DL, MVT::i8));
43538 // To negate, reverse the operands of the subtract.
43539 if (SignMulAmt < 0)
43540 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
43541 else
43542 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43543 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
43544 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
43545 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43546 DAG.getConstant(Log2_64(AbsMulAmt - 2),
43547 DL, MVT::i8));
43548 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43549 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
43550 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
43551 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
43552 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43553 DAG.getConstant(Log2_64(AbsMulAmt + 2),
43554 DL, MVT::i8));
43555 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43556 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
43557 }
43558 }
43559
43560 return NewMul;
43561}
43562
43563// Try to form a MULHU or MULHS node by looking for
43564// (srl (mul ext, ext), 16)
43565// TODO: This is X86 specific because we want to be able to handle wide types
43566// before type legalization. But we can only do it if the vector will be
43567// legalized via widening/splitting. Type legalization can't handle promotion
43568// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
43569// combiner.
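// For i16 inputs, the low 16 bits of (ext(a) * ext(b)) >> 16 are precisely the
// upper half of the 16x16 product, which is what MULHS/MULHU compute, so the
// widened multiply plus shift collapses to a vXi16 high-multiply plus extend.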
43570static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
43571 const X86Subtarget &Subtarget) {
43572 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
43573 "SRL or SRA node is required here!");
43574 SDLoc DL(N);
43575
43576 // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
43577 // the multiply.
43578 if (!Subtarget.hasSSE41())
43579 return SDValue();
43580
43581 // The operation feeding into the shift must be a multiply.
43582 SDValue ShiftOperand = N->getOperand(0);
43583 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
43584 return SDValue();
43585
43586 // Input type should be at least vXi32.
43587 EVT VT = N->getValueType(0);
43588 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
43589 return SDValue();
43590
43591 // Need a shift by 16.
43592 APInt ShiftAmt;
43593 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
43594 ShiftAmt != 16)
43595 return SDValue();
43596
43597 SDValue LHS = ShiftOperand.getOperand(0);
43598 SDValue RHS = ShiftOperand.getOperand(1);
43599
43600 unsigned ExtOpc = LHS.getOpcode();
43601 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
43602 RHS.getOpcode() != ExtOpc)
43603 return SDValue();
43604
43605 // Peek through the extends.
43606 LHS = LHS.getOperand(0);
43607 RHS = RHS.getOperand(0);
43608
43609 // Ensure the input types match.
43610 EVT MulVT = LHS.getValueType();
43611 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
43612 return SDValue();
43613
43614 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
43615 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
43616
43617 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
43618 return DAG.getNode(ExtOpc, DL, VT, Mulh);
43619}
43620
43621static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
43622 SDValue N0 = N->getOperand(0);
43623 SDValue N1 = N->getOperand(1);
43624 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
43625 EVT VT = N0.getValueType();
43626
43627 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
43628 // since the result of setcc_c is all zero's or all ones.
43629 if (VT.isInteger() && !VT.isVector() &&
43630 N1C && N0.getOpcode() == ISD::AND &&
43631 N0.getOperand(1).getOpcode() == ISD::Constant) {
43632 SDValue N00 = N0.getOperand(0);
43633 APInt Mask = N0.getConstantOperandAPInt(1);
43634 Mask <<= N1C->getAPIntValue();
43635 bool MaskOK = false;
43636 // We can handle cases concerning bit-widening nodes containing setcc_c if
43637 // we carefully interrogate the mask to make sure we are semantics
43638 // preserving.
43639 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
43640 // of the underlying setcc_c operation if the setcc_c was zero extended.
43641 // Consider the following example:
43642 // zext(setcc_c) -> i32 0x0000FFFF
43643 // c1 -> i32 0x0000FFFF
43644 // c2 -> i32 0x00000001
43645 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
43646 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
43647 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
43648 MaskOK = true;
43649 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
43650 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43651 MaskOK = true;
43652 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
43653 N00.getOpcode() == ISD::ANY_EXTEND) &&
43654 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
43655 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
43656 }
43657 if (MaskOK && Mask != 0) {
43658 SDLoc DL(N);
43659 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
43660 }
43661 }
43662
43663 // Hardware support for vector shifts is sparse, which makes us scalarize the
43664 // vector operations in many cases. Also, on Sandy Bridge, ADD is faster than
43665 // shl.
43666 // (shl V, 1) -> add V,V
43667 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
43668 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
43669 assert(N0.getValueType().isVector() && "Invalid vector shift type");
43670 // We shift all of the values by one. In many cases we do not have
43671 // hardware support for this operation. This is better expressed as an ADD
43672 // of two values.
43673 if (N1SplatC->isOne())
43674 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
43675 }
43676
43677 return SDValue();
43678}
43679
43680static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
43681 const X86Subtarget &Subtarget) {
43682 SDValue N0 = N->getOperand(0);
43683 SDValue N1 = N->getOperand(1);
43684 EVT VT = N0.getValueType();
43685 unsigned Size = VT.getSizeInBits();
43686
43687 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43688 return V;
43689
43690 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
43691 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
43692 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
43693 // depending on sign of (SarConst - [56,48,32,24,16])
43694
43695 // sexts in X86 are MOVs. The MOVs have the same code size
43696 // as above SHIFTs (only SHIFT on 1 has lower code size).
43697 // However, the MOVs have two advantages over a SHIFT:
43698 // 1. MOVs can write to a register that differs from the source
43699 // 2. MOVs accept memory operands
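// E.g. on i32, (sra (shl x, 24), 26) only keeps the low 8 bits of x
// sign-extended, so it can be emitted as (sra (sext_inreg x, i8), 26 - 24).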
43700
43701 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
43702 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
43703 N0.getOperand(1).getOpcode() != ISD::Constant)
43704 return SDValue();
43705
43706 SDValue N00 = N0.getOperand(0);
43707 SDValue N01 = N0.getOperand(1);
43708 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
43709 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
43710 EVT CVT = N1.getValueType();
43711
43712 if (SarConst.isNegative())
43713 return SDValue();
43714
43715 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
43716 unsigned ShiftSize = SVT.getSizeInBits();
43717 // Skip types without a corresponding sext/zext and ShlConst values
43718 // that are not one of [56,48,32,24,16].
43719 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
43720 continue;
43721 SDLoc DL(N);
43722 SDValue NN =
43723 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
43724 SarConst = SarConst - (Size - ShiftSize);
43725 if (SarConst == 0)
43726 return NN;
43727 else if (SarConst.isNegative())
43728 return DAG.getNode(ISD::SHL, DL, VT, NN,
43729 DAG.getConstant(-SarConst, DL, CVT));
43730 else
43731 return DAG.getNode(ISD::SRA, DL, VT, NN,
43732 DAG.getConstant(SarConst, DL, CVT));
43733 }
43734 return SDValue();
43735}
43736
43737static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
43738 TargetLowering::DAGCombinerInfo &DCI,
43739 const X86Subtarget &Subtarget) {
43740 SDValue N0 = N->getOperand(0);
43741 SDValue N1 = N->getOperand(1);
43742 EVT VT = N0.getValueType();
43743
43744 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
43745 return V;
43746
43747 // Only do this on the last DAG combine as it can interfere with other
43748 // combines.
43749 if (!DCI.isAfterLegalizeDAG())
43750 return SDValue();
43751
43752 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
43753 // TODO: This is a generic DAG combine that became an x86-only combine to
43754 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
43755 // and-not ('andn').
43756 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
43757 return SDValue();
43758
43759 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
43760 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
43761 if (!ShiftC || !AndC)
43762 return SDValue();
43763
43764 // If we can shrink the constant mask below 8-bits or 32-bits, then this
43765 // transform should reduce code size. It may also enable secondary transforms
43766 // from improved known-bits analysis or instruction selection.
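// E.g. srl (and X, 0x3F00), 8 becomes and (srl X, 8), 0x3F: the shifted mask
// now fits in a signed 8-bit immediate, giving a shorter AND encoding.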
43767 APInt MaskVal = AndC->getAPIntValue();
43768
43769 // If this can be matched by a zero extend, don't optimize.
43770 if (MaskVal.isMask()) {
43771 unsigned TO = MaskVal.countTrailingOnes();
43772 if (TO >= 8 && isPowerOf2_32(TO))
43773 return SDValue();
43774 }
43775
43776 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
43777 unsigned OldMaskSize = MaskVal.getMinSignedBits();
43778 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
43779 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
43780 (OldMaskSize > 32 && NewMaskSize <= 32)) {
43781 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
43782 SDLoc DL(N);
43783 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
43784 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
43785 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
43786 }
43787 return SDValue();
43788}
43789
43790static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
43791 const X86Subtarget &Subtarget) {
43792 unsigned Opcode = N->getOpcode();
43793 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
43794
43795 SDLoc DL(N);
43796 EVT VT = N->getValueType(0);
43797 SDValue N0 = N->getOperand(0);
43798 SDValue N1 = N->getOperand(1);
43799 EVT SrcVT = N0.getValueType();
43800
43801 SDValue BC0 =
43802 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
43803 SDValue BC1 =
43804 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
43805
43806 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
43807 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
43808 // truncation trees that help us avoid lane crossing shuffles.
43809 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
43810 // TODO: We don't handle vXf64 shuffles yet.
43811 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
43812 BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43813 BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43814 BC0.getOperand(0) == BC1.getOperand(0) &&
43815 BC0.getOperand(0).getValueType().is256BitVector() &&
43816 BC0.getConstantOperandAPInt(1) == 0 &&
43817 BC1.getConstantOperandAPInt(1) ==
43818 BC0.getValueType().getVectorNumElements()) {
43819 SmallVector<SDValue> ShuffleOps;
43820 SmallVector<int> ShuffleMask, ScaledMask;
43821 SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
43822 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
43823 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
43824 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
43825 // shuffle to a v4X64 width - we can probably relax this in the future.
43826 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
43827 ShuffleOps[0].getValueType().is256BitVector() &&
43828 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
43829 SDValue Lo, Hi;
43830 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43831 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
43832 Lo = DAG.getBitcast(SrcVT, Lo);
43833 Hi = DAG.getBitcast(SrcVT, Hi);
43834 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
43835 Res = DAG.getBitcast(ShufVT, Res);
43836 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
43837 return DAG.getBitcast(VT, Res);
43838 }
43839 }
43840 }
43841
43842 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
43843 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
43844 // If either/both ops are a shuffle that can scale to v2x64,
43845 // then see if we can perform this as a v4x32 post shuffle.
43846 SmallVector<SDValue> Ops0, Ops1;
43847 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
43848 bool IsShuf0 =
43849 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43850 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43851 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43852 bool IsShuf1 =
43853 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43854 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
43855 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
43856 if (IsShuf0 || IsShuf1) {
43857 if (!IsShuf0) {
43858 Ops0.assign({BC0});
43859 ScaledMask0.assign({0, 1});
43860 }
43861 if (!IsShuf1) {
43862 Ops1.assign({BC1});
43863 ScaledMask1.assign({0, 1});
43864 }
43865
43866 SDValue LHS, RHS;
43867 int PostShuffle[4] = {-1, -1, -1, -1};
43868 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
43869 if (M < 0)
43870 return true;
43871 Idx = M % 2;
43872 SDValue Src = Ops[M / 2];
43873 if (!LHS || LHS == Src) {
43874 LHS = Src;
43875 return true;
43876 }
43877 if (!RHS || RHS == Src) {
43878 Idx += 2;
43879 RHS = Src;
43880 return true;
43881 }
43882 return false;
43883 };
43884 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
43885 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
43886 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
43887 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
43888 LHS = DAG.getBitcast(SrcVT, LHS);
43889 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
43890 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
43891 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
43892 Res = DAG.getBitcast(ShufVT, Res);
43893 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
43894 return DAG.getBitcast(VT, Res);
43895 }
43896 }
43897 }
43898
43899 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
43900 if (VT.is256BitVector() && Subtarget.hasInt256()) {
43901 SmallVector<int> Mask0, Mask1;
43902 SmallVector<SDValue> Ops0, Ops1;
43903 SmallVector<int, 2> ScaledMask0, ScaledMask1;
43904 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
43905 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
43906 !Ops0.empty() && !Ops1.empty() &&
43907 all_of(Ops0,
43908 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43909 all_of(Ops1,
43910 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
43911 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
43912 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
43913 SDValue Op00 = peekThroughBitcasts(Ops0.front());
43914 SDValue Op10 = peekThroughBitcasts(Ops1.front());
43915 SDValue Op01 = peekThroughBitcasts(Ops0.back());
43916 SDValue Op11 = peekThroughBitcasts(Ops1.back());
43917 if ((Op00 == Op11) && (Op01 == Op10)) {
43918 std::swap(Op10, Op11);
43919 ShuffleVectorSDNode::commuteMask(ScaledMask1);
43920 }
43921 if ((Op00 == Op10) && (Op01 == Op11)) {
43922 const int Map[4] = {0, 2, 1, 3};
43923 SmallVector<int, 4> ShuffleMask(
43924 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
43925 Map[ScaledMask1[1]]});
43926 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
43927 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
43928 DAG.getBitcast(SrcVT, Op01));
43929 Res = DAG.getBitcast(ShufVT, Res);
43930 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
43931 return DAG.getBitcast(VT, Res);
43932 }
43933 }
43934 }
43935
43936 return SDValue();
43937}
43938
43939static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
43940 TargetLowering::DAGCombinerInfo &DCI,
43941 const X86Subtarget &Subtarget) {
43942 unsigned Opcode = N->getOpcode();
43943 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
43944 "Unexpected pack opcode");
43945
43946 EVT VT = N->getValueType(0);
43947 SDValue N0 = N->getOperand(0);
43948 SDValue N1 = N->getOperand(1);
43949 unsigned NumDstElts = VT.getVectorNumElements();
43950 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
43951 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
43952 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
43953 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
43954 "Unexpected PACKSS/PACKUS input type");
43955
43956 bool IsSigned = (X86ISD::PACKSS == Opcode);
43957
43958 // Constant Folding.
43959 APInt UndefElts0, UndefElts1;
43960 SmallVector<APInt, 32> EltBits0, EltBits1;
43961 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
43962 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
43963 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
43964 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
43965 unsigned NumLanes = VT.getSizeInBits() / 128;
43966 unsigned NumSrcElts = NumDstElts / 2;
43967 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
43968 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
43969
43970 APInt Undefs(NumDstElts, 0);
43971 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
43972 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
43973 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
43974 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
43975 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
43976 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
43977
43978 if (UndefElts[SrcIdx]) {
43979 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
43980 continue;
43981 }
43982
43983 APInt &Val = EltBits[SrcIdx];
43984 if (IsSigned) {
43985 // PACKSS: Truncate signed value with signed saturation.
43986 // Source values less than dst minint are saturated to minint.
43987 // Source values greater than dst maxint are saturated to maxint.
43988 if (Val.isSignedIntN(DstBitsPerElt))
43989 Val = Val.trunc(DstBitsPerElt);
43990 else if (Val.isNegative())
43991 Val = APInt::getSignedMinValue(DstBitsPerElt);
43992 else
43993 Val = APInt::getSignedMaxValue(DstBitsPerElt);
43994 } else {
43995 // PACKUS: Truncate signed value with unsigned saturation.
43996 // Source values less than zero are saturated to zero.
43997 // Source values greater than dst maxuint are saturated to maxuint.
43998 if (Val.isIntN(DstBitsPerElt))
43999 Val = Val.trunc(DstBitsPerElt);
44000 else if (Val.isNegative())
44001 Val = APInt::getNullValue(DstBitsPerElt);
44002 else
44003 Val = APInt::getAllOnesValue(DstBitsPerElt);
44004 }
44005 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
44006 }
44007 }
44008
44009 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
44010 }
44011
44012 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
44013 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44014 return V;
44015
44016 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
44017 // truncate to create a larger truncate.
44018 if (Subtarget.hasAVX512() &&
44019 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
44020 N0.getOperand(0).getValueType() == MVT::v8i32) {
44021 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
44022 (!IsSigned &&
44023 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
44024 if (Subtarget.hasVLX())
44025 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
44026
44027 // Widen input to v16i32 so we can truncate that.
44028 SDLoc dl(N);
44029 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
44030 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
44031 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
44032 }
44033 }
44034
44035 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
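// For example, PACKUSWB(zext(v8i8 X), zext(v8i8 Y)) can never saturate, so the
// whole node is just CONCAT_VECTORS(X, Y) producing the v16i8 result directly.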
44036 if (VT.is128BitVector()) {
44037 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
44038 SDValue Src0, Src1;
44039 if (N0.getOpcode() == ExtOpc &&
44040 N0.getOperand(0).getValueType().is64BitVector() &&
44041 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44042 Src0 = N0.getOperand(0);
44043 }
44044 if (N1.getOpcode() == ExtOpc &&
44045 N1.getOperand(0).getValueType().is64BitVector() &&
44046 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44047 Src1 = N1.getOperand(0);
44048 }
44049 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
44050 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
44051 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
44052 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
44053 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
44054 }
44055 }
44056
44057 // Attempt to combine as shuffle.
44058 SDValue Op(N, 0);
44059 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44060 return Res;
44061
44062 return SDValue();
44063}
44064
44065static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
44066 TargetLowering::DAGCombinerInfo &DCI,
44067 const X86Subtarget &Subtarget) {
44068 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
44069 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
44070 "Unexpected horizontal add/sub opcode");
44071
44072 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
44073 // For slow-hop targets, if we have a hop with a single op, see if we already
44074 // have another user that we can reuse and shuffle the result.
44075 MVT VT = N->getSimpleValueType(0);
44076 SDValue LHS = N->getOperand(0);
44077 SDValue RHS = N->getOperand(1);
44078 if (VT.is128BitVector() && LHS == RHS) {
44079 for (SDNode *User : LHS->uses()) {
44080 if (User != N && User->getOpcode() == N->getOpcode()) {
44081 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
44082 if (User->getOperand(0) == LHS && !User->getOperand(1).isUndef()) {
44083 return DAG.getBitcast(
44084 VT,
44085 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44086 DAG.getBitcast(ShufVT, SDValue(User, 0)),
44087 DAG.getUNDEF(ShufVT), {0, 1, 0, 1}));
44088 }
44089 if (User->getOperand(1) == LHS && !User->getOperand(0).isUndef()) {
44090 return DAG.getBitcast(
44091 VT,
44092 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44093 DAG.getBitcast(ShufVT, SDValue(User, 0)),
44094 DAG.getUNDEF(ShufVT), {2, 3, 2, 3}));
44095 }
44096 }
44097 }
44098 }
44099
44100 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
44101 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
44102 LHS.getOpcode() == RHS.getOpcode() &&
44103 LHS.getValueType() == RHS.getValueType()) {
44104 SDValue LHS0 = LHS.getOperand(0);
44105 SDValue RHS0 = LHS.getOperand(1);
44106 SDValue LHS1 = RHS.getOperand(0);
44107 SDValue RHS1 = RHS.getOperand(1);
44108 if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
44109 (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
44110 SDLoc DL(N);
44111 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
44112 LHS0.isUndef() ? RHS0 : LHS0,
44113 LHS1.isUndef() ? RHS1 : LHS1);
44114 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
44115 Res = DAG.getBitcast(ShufVT, Res);
44116 SDValue NewLHS =
44117 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44118 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
44119 SDValue NewRHS =
44120 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44121 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
44122 DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
44123 DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
44124 return SDValue(N, 0);
44125 }
44126 }
44127 }
44128
44129 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
44130 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44131 return V;
44132
44133 return SDValue();
44134}
44135
44136static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
44137 TargetLowering::DAGCombinerInfo &DCI,
44138 const X86Subtarget &Subtarget) {
44139 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
44140 X86ISD::VSRL == N->getOpcode()) &&
44141 "Unexpected shift opcode");
44142 EVT VT = N->getValueType(0);
44143 SDValue N0 = N->getOperand(0);
44144 SDValue N1 = N->getOperand(1);
44145
44146 // Shift zero -> zero.
44147 if (ISD::isBuildVectorAllZeros(N0.getNode()))
44148 return DAG.getConstant(0, SDLoc(N), VT);
44149
44150 // Detect constant shift amounts.
44151 APInt UndefElts;
44152 SmallVector<APInt, 32> EltBits;
44153 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
44154 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
44155 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
44156 EltBits[0].getZExtValue(), DAG);
44157 }
44158
44159 APInt KnownUndef, KnownZero;
44160 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44161 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
44162 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
44163 KnownZero, DCI))
44164 return SDValue(N, 0);
44165
44166 return SDValue();
44167}
44168
44169static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
44170 TargetLowering::DAGCombinerInfo &DCI,
44171 const X86Subtarget &Subtarget) {
44172 unsigned Opcode = N->getOpcode();
44173 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
44174 X86ISD::VSRLI == Opcode) &&
44175 "Unexpected shift opcode");
44176 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
44177 EVT VT = N->getValueType(0);
44178 SDValue N0 = N->getOperand(0);
44179 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44180 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
44181 "Unexpected value type");
44182 assert(N->getOperand(1).getValueType() == MVT::i8 &&
44183 "Unexpected shift amount type");
44184
44185 // (shift undef, X) -> 0
44186 if (N0.isUndef())
44187 return DAG.getConstant(0, SDLoc(N), VT);
44188
44189 // Out of range logical bit shifts are guaranteed to be zero.
44190 // Out of range arithmetic bit shifts splat the sign bit.
44191 unsigned ShiftVal = N->getConstantOperandVal(1);
44192 if (ShiftVal >= NumBitsPerElt) {
44193 if (LogicalShift)
44194 return DAG.getConstant(0, SDLoc(N), VT);
44195 ShiftVal = NumBitsPerElt - 1;
44196 }
44197
44198 // (shift X, 0) -> X
44199 if (!ShiftVal)
44200 return N0;
44201
44202 // (shift 0, C) -> 0
44203 if (ISD::isBuildVectorAllZeros(N0.getNode()))
44204 // N0 is all zeros or undef. We guarantee that the bits shifted into the
44205 // result are all zeros, not undef.
44206 return DAG.getConstant(0, SDLoc(N), VT);
44207
44208 // (VSRAI -1, C) -> -1
44209 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
44210 // N0 is all ones or undef. We guarantee that the bits shifted into the
44211 // result are all ones, not undef.
44212 return DAG.getConstant(-1, SDLoc(N), VT);
44213
44214 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
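// For example, with v8i16 elements VSRLI(VSRLI(X, 10), 12) has a combined
// shift of 22 >= 16 and folds to zero, while VSRAI(VSRAI(X, 10), 12) clamps
// to VSRAI(X, 15), which just splats the sign bit.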
44215 if (Opcode == N0.getOpcode()) {
44216 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
44217 unsigned NewShiftVal = ShiftVal + ShiftVal2;
44218 if (NewShiftVal >= NumBitsPerElt) {
44219 // Out of range logical bit shifts are guaranteed to be zero.
44220 // Out of range arithmetic bit shifts splat the sign bit.
44221 if (LogicalShift)
44222 return DAG.getConstant(0, SDLoc(N), VT);
44223 NewShiftVal = NumBitsPerElt - 1;
44224 }
44225 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
44226 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
44227 }
44228
44229 // We can decode 'whole byte' logical bit shifts as shuffles.
44230 if (LogicalShift && (ShiftVal % 8) == 0) {
44231 SDValue Op(N, 0);
44232 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44233 return Res;
44234 }
44235
44236 // Constant Folding.
44237 APInt UndefElts;
44238 SmallVector<APInt, 32> EltBits;
44239 if (N->isOnlyUserOf(N0.getNode()) &&
44240 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
44241 assert(EltBits.size() == VT.getVectorNumElements() &&
44242 "Unexpected shift value type");
44243 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
44244 // created an undef input due to no input bits being demanded, but user
44245 // still expects 0 in other bits.
44246 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
44247 APInt &Elt = EltBits[i];
44248 if (UndefElts[i])
44249 Elt = 0;
44250 else if (X86ISD::VSHLI == Opcode)
44251 Elt <<= ShiftVal;
44252 else if (X86ISD::VSRAI == Opcode)
44253 Elt.ashrInPlace(ShiftVal);
44254 else
44255 Elt.lshrInPlace(ShiftVal);
44256 }
44257 // Reset undef elements since they were zeroed above.
44258 UndefElts = 0;
44259 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
44260 }
44261
44262 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44263 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44264 APInt::getAllOnesValue(NumBitsPerElt), DCI))
44265 return SDValue(N, 0);
44266
44267 return SDValue();
44268}
44269
44270static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
44271 TargetLowering::DAGCombinerInfo &DCI,
44272 const X86Subtarget &Subtarget) {
44273 EVT VT = N->getValueType(0);
44274 assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
44275 (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
44276 N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
44277 "Unexpected vector insertion");
44278
44279 if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
44280 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44281 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44282 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44283 APInt::getAllOnesValue(NumBitsPerElt), DCI))
44284 return SDValue(N, 0);
44285 }
44286
44287 // Attempt to combine insertion patterns to a shuffle.
44288 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
44289 SDValue Op(N, 0);
44290 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44291 return Res;
44292 }
44293
44294 return SDValue();
44295}
44296
44297/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
44298/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
44299/// OR -> CMPNEQSS.
44300static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
44301 TargetLowering::DAGCombinerInfo &DCI,
44302 const X86Subtarget &Subtarget) {
44303 unsigned opcode;
44304
44305 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
44306 // we're requiring SSE2 for both.
44307 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
44308 SDValue N0 = N->getOperand(0);
44309 SDValue N1 = N->getOperand(1);
44310 SDValue CMP0 = N0.getOperand(1);
44311 SDValue CMP1 = N1.getOperand(1);
44312 SDLoc DL(N);
44313
44314 // The SETCCs should both refer to the same CMP.
44315 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
44316 return SDValue();
44317
44318 SDValue CMP00 = CMP0->getOperand(0);
44319 SDValue CMP01 = CMP0->getOperand(1);
44320 EVT VT = CMP00.getValueType();
44321
44322 if (VT == MVT::f32 || VT == MVT::f64) {
44323 bool ExpectingFlags = false;
44324 // Check for any users that want flags:
44325 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
44326 !ExpectingFlags && UI != UE; ++UI)
44327 switch (UI->getOpcode()) {
44328 default:
44329 case ISD::BR_CC:
44330 case ISD::BRCOND:
44331 case ISD::SELECT:
44332 ExpectingFlags = true;
44333 break;
44334 case ISD::CopyToReg:
44335 case ISD::SIGN_EXTEND:
44336 case ISD::ZERO_EXTEND:
44337 case ISD::ANY_EXTEND:
44338 break;
44339 }
44340
44341 if (!ExpectingFlags) {
44342 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
44343 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
44344
44345 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
44346 X86::CondCode tmp = cc0;
44347 cc0 = cc1;
44348 cc1 = tmp;
44349 }
44350
44351 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
44352 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
44353 // FIXME: need symbolic constants for these magic numbers.
44354 // See X86ATTInstPrinter.cpp:printSSECC().
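// In the SSE compare predicate encoding, immediate 0 selects CMPEQ and
// immediate 4 selects CMPNEQ, which are the two cases chosen below.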
44355 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
44356 if (Subtarget.hasAVX512()) {
44357 SDValue FSetCC =
44358 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
44359 DAG.getTargetConstant(x86cc, DL, MVT::i8));
44360 // Need to fill with zeros to ensure the bitcast will produce zeroes
44361 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
44362 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
44363 DAG.getConstant(0, DL, MVT::v16i1),
44364 FSetCC, DAG.getIntPtrConstant(0, DL));
44365 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
44366 N->getSimpleValueType(0));
44367 }
44368 SDValue OnesOrZeroesF =
44369 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
44370 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
44371
44372 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
44373 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
44374
44375 if (is64BitFP && !Subtarget.is64Bit()) {
44376 // On a 32-bit target, we cannot bitcast the 64-bit float to a
44377 // 64-bit integer, since that's not a legal type. Since
44378 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
44379 // bits, but can do this little dance to extract the lowest 32 bits
44380 // and work with those going forward.
44381 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
44382 OnesOrZeroesF);
44383 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
44384 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
44385 Vector32, DAG.getIntPtrConstant(0, DL));
44386 IntVT = MVT::i32;
44387 }
44388
44389 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
44390 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
44391 DAG.getConstant(1, DL, IntVT));
44392 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
44393 ANDed);
44394 return OneBitOfTruth;
44395 }
44396 }
44397 }
44398 }
44399 return SDValue();
44400}
44401
44402/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
44403static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
44404 assert(N->getOpcode() == ISD::AND);
44405
44406 MVT VT = N->getSimpleValueType(0);
44407 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
44408 return SDValue();
44409
44410 SDValue X, Y;
44411 SDValue N0 = N->getOperand(0);
44412 SDValue N1 = N->getOperand(1);
44413
44414 auto GetNot = [&VT, &DAG](SDValue V) {
44415 // Basic X = NOT(Y) detection.
44416 if (SDValue Not = IsNOT(V, DAG))
44417 return Not;
44418 // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
44419 if (V.getOpcode() == X86ISD::VBROADCAST) {
44420 SDValue Src = V.getOperand(0);
44421 EVT SrcVT = Src.getValueType();
44422 if (!SrcVT.isVector())
44423 return SDValue();
44424 if (SDValue Not = IsNOT(Src, DAG))
44425 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
44426 DAG.getBitcast(SrcVT, Not));
44427 }
44428 return SDValue();
44429 };
44430
44431 if (SDValue Not = GetNot(N0)) {
44432 X = Not;
44433 Y = N1;
44434 } else if (SDValue Not = GetNot(N1)) {
44435 X = Not;
44436 Y = N0;
44437 } else
44438 return SDValue();
44439
44440 X = DAG.getBitcast(VT, X);
44441 Y = DAG.getBitcast(VT, Y);
44442 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
44443}
44444
44445// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
44446// logical operations, like in the example below.
44447// or (and (truncate x, truncate y)),
44448// (xor (truncate z, build_vector (constants)))
44449// Given a target type \p VT, we generate
44450// or (and x, y), (xor z, zext(build_vector (constants)))
44451 // given x, y and z are of type \p VT. We can do so if each operand is either
44452 // a truncate from VT, a build_vector of constants (second operand only), or
44453 // can itself be recursively promoted.
44454static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
44455 unsigned Depth) {
44456 // Limit recursion to avoid excessive compile times.
44457 if (Depth >= SelectionDAG::MaxRecursionDepth)
44458 return SDValue();
44459
44460 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
44461 N->getOpcode() != ISD::OR)
44462 return SDValue();
44463
44464 SDValue N0 = N->getOperand(0);
44465 SDValue N1 = N->getOperand(1);
44466 SDLoc DL(N);
44467
44468 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44469 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
44470 return SDValue();
44471
44472 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
44473 N0 = NN0;
44474 else {
44475 // The Left side has to be a trunc.
44476 if (N0.getOpcode() != ISD::TRUNCATE)
44477 return SDValue();
44478
44479 // The type of the truncated inputs.
44480 if (N0.getOperand(0).getValueType() != VT)
44481 return SDValue();
44482
44483 N0 = N0.getOperand(0);
44484 }
44485
44486 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
44487 N1 = NN1;
44488 else {
44489 // The right side has to be a 'trunc' or a constant vector.
44490 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
44491 N1.getOperand(0).getValueType() == VT;
44492 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
44493 return SDValue();
44494
44495 if (RHSTrunc)
44496 N1 = N1.getOperand(0);
44497 else
44498 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
44499 }
44500
44501 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
44502}
44503
44504// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
44505// register. In most cases we actually compare or select YMM-sized registers
44506// and mixing the two types creates horrible code. This method optimizes
44507// some of the transition sequences.
44508// Even with AVX-512 this is still useful for removing casts around logical
44509// operations on vXi1 mask types.
44510static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44511 const X86Subtarget &Subtarget) {
44512 EVT VT = N->getValueType(0);
44513 assert(VT.isVector() && "Expected vector type");
44514
44515 SDLoc DL(N);
44516 assert((N->getOpcode() == ISD::ANY_EXTEND ||
44517 N->getOpcode() == ISD::ZERO_EXTEND ||
44518 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
44519
44520 SDValue Narrow = N->getOperand(0);
44521 EVT NarrowVT = Narrow.getValueType();
44522
44523 // Generate the wide operation.
44524 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
44525 if (!Op)
44526 return SDValue();
44527 switch (N->getOpcode()) {
44528 default: llvm_unreachable("Unexpected opcode");
44529 case ISD::ANY_EXTEND:
44530 return Op;
44531 case ISD::ZERO_EXTEND:
44532 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
44533 case ISD::SIGN_EXTEND:
44534 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
44535 Op, DAG.getValueType(NarrowVT));
44536 }
44537}
44538
44539static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
44540 unsigned FPOpcode;
44541 switch (Opcode) {
44542 default: llvm_unreachable("Unexpected input node for FP logic conversion");
44543 case ISD::AND: FPOpcode = X86ISD::FAND; break;
44544 case ISD::OR: FPOpcode = X86ISD::FOR; break;
44545 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44546 }
44547 return FPOpcode;
44548}
44549
44550/// If both input operands of a logic op are being cast from floating point
44551/// types, try to convert this into a floating point logic node to avoid
44552/// unnecessary moves from SSE to integer registers.
44553static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
44554 const X86Subtarget &Subtarget) {
44555 EVT VT = N->getValueType(0);
44556 SDValue N0 = N->getOperand(0);
44557 SDValue N1 = N->getOperand(1);
44558 SDLoc DL(N);
44559
44560 if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
44561 return SDValue();
44562
44563 SDValue N00 = N0.getOperand(0);
44564 SDValue N10 = N1.getOperand(0);
44565 EVT N00Type = N00.getValueType();
44566 EVT N10Type = N10.getValueType();
44567
44568 // Ensure that both types are the same and are legal scalar fp types.
44569 if (N00Type != N10Type ||
44570 !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
44571 (Subtarget.hasSSE2() && N00Type == MVT::f64)))
44572 return SDValue();
44573
44574 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
44575 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
44576 return DAG.getBitcast(VT, FPLogic);
44577}
44578
44579// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
44580// to reduce XMM->GPR traffic.
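// For example, AND(MOVMSK(v4f32 A), MOVMSK(v4f32 B)) becomes MOVMSK(FAND(A, B)),
// keeping the bitwise op in the vector domain with a single XMM->GPR transfer.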
44581static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
44582 unsigned Opc = N->getOpcode();
44583 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
44584 "Unexpected bit opcode");
44585
44586 SDValue N0 = N->getOperand(0);
44587 SDValue N1 = N->getOperand(1);
44588
44589 // Both operands must be single use MOVMSK.
44590 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
44591 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
44592 return SDValue();
44593
44594 SDValue Vec0 = N0.getOperand(0);
44595 SDValue Vec1 = N1.getOperand(0);
44596 EVT VecVT0 = Vec0.getValueType();
44597 EVT VecVT1 = Vec1.getValueType();
44598
44599 // Both MOVMSK operands must be from vectors of the same size and same element
44600 // size, but it's OK for an fp/int diff.
44601 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
44602 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
44603 return SDValue();
44604
44605 SDLoc DL(N);
44606 unsigned VecOpc =
44607 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
44608 SDValue Result =
44609 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
44610 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
44611}
44612
44613/// If this is a zero/all-bits result that is bitwise-anded with a low bits
44614 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
44615/// with a shift-right to eliminate loading the vector constant mask value.
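/// For example, if Op0 is a vXi32 compare result (each element all-ones or
/// zero) and the mask is a splat of 0x1, the AND can be replaced by
/// VSRLI(Op0, 31), avoiding the load of the constant mask.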
44616static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
44617 const X86Subtarget &Subtarget) {
44618 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
44619 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
44620 EVT VT0 = Op0.getValueType();
44621 EVT VT1 = Op1.getValueType();
44622
44623 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
44624 return SDValue();
44625
44626 APInt SplatVal;
44627 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
44628 !SplatVal.isMask())
44629 return SDValue();
44630
44631 // Don't prevent creation of ANDN.
44632 if (isBitwiseNot(Op0))
44633 return SDValue();
44634
44635 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
44636 return SDValue();
44637
44638 unsigned EltBitWidth = VT0.getScalarSizeInBits();
44639 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
44640 return SDValue();
44641
44642 SDLoc DL(N);
44643 unsigned ShiftVal = SplatVal.countTrailingOnes();
44644 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
44645 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
44646 return DAG.getBitcast(N->getValueType(0), Shift);
44647}
44648
44649// Get the index node from the lowered DAG of a GEP IR instruction with one
44650// indexing dimension.
44651static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
44652 if (Ld->isIndexed())
44653 return SDValue();
44654
44655 SDValue Base = Ld->getBasePtr();
44656
44657 if (Base.getOpcode() != ISD::ADD)
44658 return SDValue();
44659
44660 SDValue ShiftedIndex = Base.getOperand(0);
44661
44662 if (ShiftedIndex.getOpcode() != ISD::SHL)
44663 return SDValue();
44664
44665 return ShiftedIndex.getOperand(0);
44666
44667}
44668
44669static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
44670 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
44671 switch (VT.getSizeInBits()) {
44672 default: return false;
44673 case 64: return Subtarget.is64Bit() ? true : false;
44674 case 32: return true;
44675 }
44676 }
44677 return false;
44678}
44679
44680 // This function recognizes cases where the X86 bzhi instruction can replace an
44681 // 'and-load' sequence.
44682// In case of loading integer value from an array of constants which is defined
44683// as follows:
44684//
44685// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
44686//
44687// then applying a bitwise and on the result with another input.
44688 // It's equivalent to performing bzhi (zero high bits) on the input, using the
44689 // same index as the load.
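// Worked example for a 32-bit type: with array[i] == (1u << i) - 1, the IR
// "x & array[i]" loads a mask of the i lowest bits, which is what bzhi
// computes; the combine below emits and(x, srl(-1, 32 - i)), which is then
// selected as a single bzhi.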
44690static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
44691 const X86Subtarget &Subtarget) {
44692 MVT VT = Node->getSimpleValueType(0);
44693 SDLoc dl(Node);
44694
44695 // Check if subtarget has BZHI instruction for the node's type
44696 if (!hasBZHI(Subtarget, VT))
44697 return SDValue();
44698
44699 // Try matching the pattern for both operands.
44700 for (unsigned i = 0; i < 2; i++) {
44701 SDValue N = Node->getOperand(i);
44702 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
44703
44704 // Bail out if the operand is not a load instruction.
44705 if (!Ld)
44706 return SDValue();
44707
44708 const Value *MemOp = Ld->getMemOperand()->getValue();
44709
44710 if (!MemOp)
44711 return SDValue();
44712
44713 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
44714 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
44715 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
44716
44717 Constant *Init = GV->getInitializer();
44718 Type *Ty = Init->getType();
44719 if (!isa<ConstantDataArray>(Init) ||
44720 !Ty->getArrayElementType()->isIntegerTy() ||
44721 Ty->getArrayElementType()->getScalarSizeInBits() !=
44722 VT.getSizeInBits() ||
44723 Ty->getArrayNumElements() >
44724 Ty->getArrayElementType()->getScalarSizeInBits())
44725 continue;
44726
44727 // Check if the array's constant elements are suitable to our case.
44728 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
44729 bool ConstantsMatch = true;
44730 for (uint64_t j = 0; j < ArrayElementCount; j++) {
44731 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
44732 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
44733 ConstantsMatch = false;
44734 break;
44735 }
44736 }
44737 if (!ConstantsMatch)
44738 continue;
44739
44740 // Do the transformation (For 32-bit type):
44741 // -> (and (load arr[idx]), inp)
44742 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
44743 // that will be replaced with one bzhi instruction.
44744 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
44745 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
44746
44747 // Get the Node which indexes into the array.
44748 SDValue Index = getIndexFromUnindexedLoad(Ld);
44749 if (!Index)
44750 return SDValue();
44751 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
44752
44753 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
44754 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
44755
44756 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
44757 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
44758
44759 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
44760 }
44761 }
44762 }
44763 }
44764 return SDValue();
44765}
44766
44767// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
44768// Where C is a mask containing the same number of bits as the setcc and
44769 // where the setcc will freely zero the upper bits of the k-register. We can
44770 // replace the undef in the concat with 0s and remove the AND. This mainly
44771 // helps with v2i1/v4i1 setcc being cast to scalar.
44772static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
44773 const X86Subtarget &Subtarget) {
44774 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
44775
44776 EVT VT = N->getValueType(0);
44777
44778 // Make sure this is an AND with constant. We will check the value of the
44779 // constant later.
44780 if (!isa<ConstantSDNode>(N->getOperand(1)))
44781 return SDValue();
44782
44783 // This is implied by the ConstantSDNode.
44784 assert(!VT.isVector() && "Expected scalar VT!");
44785
44786 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
44787 !N->getOperand(0).hasOneUse() ||
44788 !N->getOperand(0).getOperand(0).hasOneUse())
44789 return SDValue();
44790
44791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44792 SDValue Src = N->getOperand(0).getOperand(0);
44793 EVT SrcVT = Src.getValueType();
44794 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
44795 !TLI.isTypeLegal(SrcVT))
44796 return SDValue();
44797
44798 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
44799 return SDValue();
44800
44801 // We only care about the first subvector of the concat; we expect the
44802 // other subvectors to be ignored due to the AND if we make the change.
44803 SDValue SubVec = Src.getOperand(0);
44804 EVT SubVecVT = SubVec.getValueType();
44805
44806 // First subvector should be a setcc with a legal result type. The RHS of the
44807 // AND should be a mask with this many bits.
44808 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
44809 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
44810 return SDValue();
44811
44812 EVT SetccVT = SubVec.getOperand(0).getValueType();
44813 if (!TLI.isTypeLegal(SetccVT) ||
44814 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
44815 return SDValue();
44816
44817 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
44818 return SDValue();
44819
44820 // We passed all the checks. Rebuild the concat_vectors with zeroes
44821 // and cast it back to VT.
44822 SDLoc dl(N);
44823 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
44824 DAG.getConstant(0, dl, SubVecVT));
44825 Ops[0] = SubVec;
44826 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
44827 Ops);
44828 return DAG.getBitcast(VT, Concat);
44829}
44830
44831static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
44832 TargetLowering::DAGCombinerInfo &DCI,
44833 const X86Subtarget &Subtarget) {
44834 EVT VT = N->getValueType(0);
44835
44836 // If this is SSE1 only convert to FAND to avoid scalarization.
44837 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
44838 return DAG.getBitcast(
44839 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
44840 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
44841 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
44842 }
44843
44844 // Use a 32-bit and+zext if upper bits known zero.
44845 if (VT == MVT::i64 && Subtarget.is64Bit() &&
44846 !isa<ConstantSDNode>(N->getOperand(1))) {
44847 APInt HiMask = APInt::getHighBitsSet(64, 32);
44848 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
44849 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
44850 SDLoc dl(N);
44851 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
44852 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
44853 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
44854 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
44855 }
44856 }
44857
44858 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
44859 // TODO: Support multiple SrcOps.
44860 if (VT == MVT::i1) {
44861 SmallVector<SDValue, 2> SrcOps;
44862 SmallVector<APInt, 2> SrcPartials;
44863 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
44864 SrcOps.size() == 1) {
44865 SDLoc dl(N);
44866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44867 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
44868 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44869 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
44870 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
44871 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
44872 if (Mask) {
44873 assert(SrcPartials[0].getBitWidth() == NumElts &&
44874 "Unexpected partial reduction mask");
44875 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
44876 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
44877 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
44878 }
44879 }
44880 }
44881
44882 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
44883 return V;
44884
44885 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
44886 return R;
44887
44888 if (DCI.isBeforeLegalizeOps())
44889 return SDValue();
44890
44891 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
44892 return R;
44893
44894 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
44895 return FPLogic;
44896
44897 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
44898 return R;
44899
44900 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
44901 return ShiftRight;
44902
44903 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
44904 return R;
44905
44906 // Attempt to recursively combine a bitmask AND with shuffles.
44907 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
44908 SDValue Op(N, 0);
44909 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44910 return Res;
44911 }
44912
44913 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
44914 if ((VT.getScalarSizeInBits() % 8) == 0 &&
44915 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44916 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
44917 SDValue BitMask = N->getOperand(1);
44918 SDValue SrcVec = N->getOperand(0).getOperand(0);
44919 EVT SrcVecVT = SrcVec.getValueType();
44920
44921 // Check that the constant bitmask masks whole bytes.
44922 APInt UndefElts;
44923 SmallVector<APInt, 64> EltBits;
44924 if (VT == SrcVecVT.getScalarType() &&
44925 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
44926 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
44927 llvm::all_of(EltBits, [](const APInt &M) {
44928 return M.isNullValue() || M.isAllOnesValue();
44929 })) {
44930 unsigned NumElts = SrcVecVT.getVectorNumElements();
44931 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
44932 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
44933
44934 // Create a root shuffle mask from the byte mask and the extracted index.
44935 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
44936 for (unsigned i = 0; i != Scale; ++i) {
44937 if (UndefElts[i])
44938 continue;
44939 int VecIdx = Scale * Idx + i;
44940 ShuffleMask[VecIdx] =
44941 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
44942 }
44943
44944 if (SDValue Shuffle = combineX86ShufflesRecursively(
44945 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
44946 X86::MaxShuffleCombineDepth,
44947 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
44948 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
44949 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
44950 N->getOperand(0).getOperand(1));
44951 }
44952 }
44953
44954 return SDValue();
44955}
44956
44957// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
44958static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
44959 const X86Subtarget &Subtarget) {
44960 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
44961
44962 MVT VT = N->getSimpleValueType(0);
44963 if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
44964 return SDValue();
44965
44966 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
44967 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
44968 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
44969 return SDValue();
44970
44971 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
44972 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
44973 bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
44974 Subtarget.hasVLX();
44975 if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
44976 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
44977 return SDValue();
44978
44979 // Attempt to extract constant byte masks.
44980 APInt UndefElts0, UndefElts1;
44981 SmallVector<APInt, 32> EltBits0, EltBits1;
44982 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
44983 false, false))
44984 return SDValue();
44985 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
44986 false, false))
44987 return SDValue();
44988
44989 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
44990 // TODO - add UNDEF elts support.
44991 if (UndefElts0[i] || UndefElts1[i])
44992 return SDValue();
44993 if (EltBits0[i] != ~EltBits1[i])
44994 return SDValue();
44995 }
44996
44997 SDLoc DL(N);
44998
44999 if (UseVPTERNLOG) {
45000 // Emit a VPTERNLOG node directly.
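// Immediate 0xCA is the ternary-logic truth table for (A & B) | (~A & C);
// with operand A = the byte mask, B = X and C = Y, this computes
// (X & mask) | (Y & ~mask) in a single instruction.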
45001 SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
45002 SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
45003 SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
45004 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
45005 return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
45006 }
45007
45008 SDValue X = N->getOperand(0);
45009 SDValue Y =
45010 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
45011 DAG.getBitcast(VT, N1.getOperand(0)));
45012 return DAG.getNode(ISD::OR, DL, VT, X, Y);
45013}
45014
45015// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
45016static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
45017 if (N->getOpcode() != ISD::OR)
45018 return false;
45019
45020 SDValue N0 = N->getOperand(0);
45021 SDValue N1 = N->getOperand(1);
45022
45023 // Canonicalize AND to LHS.
45024 if (N1.getOpcode() == ISD::AND)
45025 std::swap(N0, N1);
45026
45027 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
45028 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
45029 return false;
45030
45031 Mask = N1.getOperand(0);
45032 X = N1.getOperand(1);
45033
45034 // Check to see if the mask appeared in both the AND and ANDNP.
45035 if (N0.getOperand(0) == Mask)
45036 Y = N0.getOperand(1);
45037 else if (N0.getOperand(1) == Mask)
45038 Y = N0.getOperand(0);
45039 else
45040 return false;
45041
45042 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
45043 // ANDNP combine lets other combines happen that prevent matching.
45044 return true;
45045}
45046
45047// Try to fold:
45048// (or (and (m, y), (pandn m, x)))
45049// into:
45050// (vselect m, x, y)
45051// As a special case, try to fold:
45052// (or (and (m, (sub 0, x)), (pandn m, x)))
45053// into:
45054// (sub (xor X, M), M)
45055static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
45056 const X86Subtarget &Subtarget) {
45057 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
45058
45059 EVT VT = N->getValueType(0);
45060 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
45061 (VT.is256BitVector() && Subtarget.hasInt256())))
45062 return SDValue();
45063
45064 SDValue X, Y, Mask;
45065 if (!matchLogicBlend(N, X, Y, Mask))
45066 return SDValue();
45067
45068 // Validate that X, Y, and Mask are bitcasts, and see through them.
45069 Mask = peekThroughBitcasts(Mask);
45070 X = peekThroughBitcasts(X);
45071 Y = peekThroughBitcasts(Y);
45072
45073 EVT MaskVT = Mask.getValueType();
45074 unsigned EltBits = MaskVT.getScalarSizeInBits();
45075
45076 // TODO: Attempt to handle floating point cases as well?
45077 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
45078 return SDValue();
45079
45080 SDLoc DL(N);
45081
45082 // Attempt to combine to conditional negate: (sub (xor X, M), M)
45083 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
45084 DAG, Subtarget))
45085 return Res;
45086
45087 // PBLENDVB is only available on SSE 4.1.
45088 if (!Subtarget.hasSSE41())
45089 return SDValue();
45090
45091 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
45092 if (Subtarget.hasVLX())
45093 return SDValue();
45094
45095 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
45096
45097 X = DAG.getBitcast(BlendVT, X);
45098 Y = DAG.getBitcast(BlendVT, Y);
45099 Mask = DAG.getBitcast(BlendVT, Mask);
45100 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
45101 return DAG.getBitcast(VT, Mask);
45102}
45103
45104// Helper function for combineOrCmpEqZeroToCtlzSrl
45105// Transforms:
45106// seteq(cmp x, 0)
45107// into:
45108// srl(ctlz x), log2(bitsize(x))
45109// Input pattern is checked by caller.
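// For a 32-bit compare, ctlz(x) == 32 exactly when x == 0, so
// srl(ctlz(x), 5) produces 1 for x == 0 and 0 otherwise, matching the
// original seteq(x, 0).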
45110static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
45111 SelectionDAG &DAG) {
45112 SDValue Cmp = Op.getOperand(1);
45113 EVT VT = Cmp.getOperand(0).getValueType();
45114 unsigned Log2b = Log2_32(VT.getSizeInBits());
45115 SDLoc dl(Op);
45116 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
45117 // The result of the shift is true or false, and on X86, the 32-bit
45118 // encoding of shr and lzcnt is more desirable.
45119 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
45120 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
45121 DAG.getConstant(Log2b, dl, MVT::i8));
45122 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
45123}
45124
45125// Try to transform:
45126// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
45127// into:
45128 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
45129// Will also attempt to match more generic cases, eg:
45130// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
45131// Only applies if the target supports the FastLZCNT feature.
45132static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
45133 TargetLowering::DAGCombinerInfo &DCI,
45134 const X86Subtarget &Subtarget) {
45135 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
45136 return SDValue();
45137
45138 auto isORCandidate = [](SDValue N) {
45139 return (N->getOpcode() == ISD::OR && N->hasOneUse());
45140 };
45141
45142 // Check the zero extend is extending to 32-bit or more. The code generated by
45143 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
45144 // instructions to clear the upper bits.
45145 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
45146 !isORCandidate(N->getOperand(0)))
45147 return SDValue();
45148
45149 // Check the node matches: setcc(eq, cmp 0)
45150 auto isSetCCCandidate = [](SDValue N) {
45151 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
45152 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
45153 N->getOperand(1).getOpcode() == X86ISD::CMP &&
45154 isNullConstant(N->getOperand(1).getOperand(1)) &&
45155 N->getOperand(1).getValueType().bitsGE(MVT::i32);
45156 };
45157
45158 SDNode *OR = N->getOperand(0).getNode();
45159 SDValue LHS = OR->getOperand(0);
45160 SDValue RHS = OR->getOperand(1);
45161
45162 // Save nodes matching or(or, setcc(eq, cmp 0)).
45163 SmallVector<SDNode *, 2> ORNodes;
45164 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
45165 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
45166 ORNodes.push_back(OR);
45167 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
45168 LHS = OR->getOperand(0);
45169 RHS = OR->getOperand(1);
45170 }
45171
45172 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
45173 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
45174 !isORCandidate(SDValue(OR, 0)))
45175 return SDValue();
45176
45177 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
45178 // to
45179 // or(srl(ctlz),srl(ctlz)).
45180 // The dag combiner can then fold it into:
45181 // srl(or(ctlz, ctlz)).
45182 EVT VT = OR->getValueType(0);
45183 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
45184 SDValue Ret, NewRHS;
45185 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
45186 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
45187
45188 if (!Ret)
45189 return SDValue();
45190
45191 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
45192 while (ORNodes.size() > 0) {
45193 OR = ORNodes.pop_back_val();
45194 LHS = OR->getOperand(0);
45195 RHS = OR->getOperand(1);
45196 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
45197 if (RHS->getOpcode() == ISD::OR)
45198 std::swap(LHS, RHS);
45199 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
45200 if (!NewRHS)
45201 return SDValue();
45202 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
45203 }
45204
45205 if (Ret)
45206 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
45207
45208 return Ret;
45209}
45210
45211static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
45212 TargetLowering::DAGCombinerInfo &DCI,
45213 const X86Subtarget &Subtarget) {
45214 SDValue N0 = N->getOperand(0);
45215 SDValue N1 = N->getOperand(1);
45216 EVT VT = N->getValueType(0);
45217
45218 // If this is SSE1 only convert to FOR to avoid scalarization.
45219 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
45220 return DAG.getBitcast(MVT::v4i32,
45221 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
45222 DAG.getBitcast(MVT::v4f32, N0),
45223 DAG.getBitcast(MVT::v4f32, N1)));
45224 }
45225
45226 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
45227 // TODO: Support multiple SrcOps.
45228 if (VT == MVT::i1) {
45229 SmallVector<SDValue, 2> SrcOps;
45230 SmallVector<APInt, 2> SrcPartials;
45231 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
45232 SrcOps.size() == 1) {
45233 SDLoc dl(N);
45234 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45235 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
45236 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45237 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
45238 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
45239 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
45240 if (Mask) {
45241 assert(SrcPartials[0].getBitWidth() == NumElts &&
45242 "Unexpected partial reduction mask");
45243 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
45244 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
45245 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
45246 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
45247 }
45248 }
45249 }
45250
45251 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
45252 return R;
45253
45254 if (DCI.isBeforeLegalizeOps())
45255 return SDValue();
45256
45257 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
45258 return R;
45259
45260 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
45261 return FPLogic;
45262
45263 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
45264 return R;
45265
45266 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
45267 return R;
45268
45269 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
45270 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
45271 // iff the upper elements of the non-shifted arg are zero.
45272 // KUNPCK requires 16+ bool vector elements.
45273 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
45274 unsigned NumElts = VT.getVectorNumElements();
45275 unsigned HalfElts = NumElts / 2;
45276 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
45277 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
45278 N1.getConstantOperandAPInt(1) == HalfElts &&
45279 DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
45280 SDLoc dl(N);
45281 return DAG.getNode(
45282 ISD::CONCAT_VECTORS, dl, VT,
45283 extractSubVector(N0, 0, DAG, dl, HalfElts),
45284 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
45285 }
45286 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
45287 N0.getConstantOperandAPInt(1) == HalfElts &&
45288 DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
45289 SDLoc dl(N);
45290 return DAG.getNode(
45291 ISD::CONCAT_VECTORS, dl, VT,
45292 extractSubVector(N1, 0, DAG, dl, HalfElts),
45293 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
45294 }
45295 }
45296
45297 // Attempt to recursively combine an OR of shuffles.
45298 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
45299 SDValue Op(N, 0);
45300 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
45301 return Res;
45302 }
45303
45304 return SDValue();
45305}
45306
45307/// Try to turn tests against the signbit in the form of:
45308/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
45309/// into:
45310/// SETGT(X, -1)
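///
/// For i32, XOR(TRUNCATE(SRL(X, 31)), 1) is 1 exactly when the sign bit of X
/// is clear, i.e. when X > -1, which is what SETGT(X, -1) computes.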
45311static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
45312 // This is only worth doing if the output type is i8 or i1.
45313 EVT ResultType = N->getValueType(0);
45314 if (ResultType != MVT::i8 && ResultType != MVT::i1)
45315 return SDValue();
45316
45317 SDValue N0 = N->getOperand(0);
45318 SDValue N1 = N->getOperand(1);
45319
45320 // We should be performing an xor against a truncated shift.
45321 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
45322 return SDValue();
45323
45324 // Make sure we are performing an xor against one.
45325 if (!isOneConstant(N1))
45326 return SDValue();
45327
45328 // SetCC on x86 zero extends so only act on this if it's a logical shift.
45329 SDValue Shift = N0.getOperand(0);
45330 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
45331 return SDValue();
45332
45333 // Make sure we are truncating from one of i16, i32 or i64.
45334 EVT ShiftTy = Shift.getValueType();
45335 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
45336 return SDValue();
45337
45338 // Make sure the shift amount extracts the sign bit.
45339 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
45340 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
45341 return SDValue();
45342
45343 // Create a greater-than comparison against -1.
45344 // N.B. Using SETGE against 0 works but we want a canonical looking
45345 // comparison; using SETGT matches up with what TranslateX86CC does.
45346 SDLoc DL(N);
45347 SDValue ShiftOp = Shift.getOperand(0);
45348 EVT ShiftOpTy = ShiftOp.getValueType();
45349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45350 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
45351 *DAG.getContext(), ResultType);
45352 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
45353 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
45354 if (SetCCResultType != ResultType)
45355 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
45356 return Cond;
45357}
45358
45359/// Turn vector tests of the signbit in the form of:
45360/// xor (sra X, elt_size(X)-1), -1
45361/// into:
45362/// pcmpgt X, -1
45363///
45364/// This should be called before type legalization because the pattern may not
45365/// persist after that.
45366static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
45367 const X86Subtarget &Subtarget) {
45368 EVT VT = N->getValueType(0);
45369 if (!VT.isSimple())
45370 return SDValue();
45371
45372 switch (VT.getSimpleVT().SimpleTy) {
45373 default: return SDValue();
45374 case MVT::v16i8:
45375 case MVT::v8i16:
45376 case MVT::v4i32:
45377 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
45378 case MVT::v32i8:
45379 case MVT::v16i16:
45380 case MVT::v8i32:
45381 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
45382 }
45383
45384 // There must be a shift right algebraic before the xor, and the xor must be a
45385 // 'not' operation.
45386 SDValue Shift = N->getOperand(0);
45387 SDValue Ones = N->getOperand(1);
45388 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
45389 !ISD::isBuildVectorAllOnes(Ones.getNode()))
45390 return SDValue();
45391
45392 // The shift should be smearing the sign bit across each vector element.
45393 auto *ShiftAmt =
45394 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
45395 if (!ShiftAmt ||
45396 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
45397 return SDValue();
45398
45399 // Create a greater-than comparison against -1. We don't use the more obvious
45400 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
45401 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
45402}
45403
45404/// Detect patterns of truncation with unsigned saturation:
45405///
45406/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
45407/// Return the source value x to be truncated or SDValue() if the pattern was
45408/// not matched.
45409///
45410/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
45411/// where C1 >= 0 and C2 is unsigned max of destination type.
45412///
45413/// (truncate (smax (smin (x, C2), C1)) to dest_type)
45414/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
45415///
45416/// These two patterns are equivalent to:
45417/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
45418/// So return the smax(x, C1) value to be truncated or SDValue() if the
45419/// pattern was not matched.
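///
/// For example, when truncating v8i32 to v8i16, umin(x, 65535) matches
/// pattern 1 (65535 is the 16-bit unsigned max) and x is returned.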
45420static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45421 const SDLoc &DL) {
45422 EVT InVT = In.getValueType();
45423
45424 // Saturation with truncation. We truncate from InVT to VT.
45425 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
45426 "Unexpected types for truncate operation");
45427
45428 // Match min/max and return limit value as a parameter.
45429 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
45430 if (V.getOpcode() == Opcode &&
45431 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
45432 return V.getOperand(0);
45433 return SDValue();
45434 };
45435
45436 APInt C1, C2;
45437 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
45438 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
45439 // the element size of the destination type.
45440 if (C2.isMask(VT.getScalarSizeInBits()))
45441 return UMin;
45442
45443 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
45444 if (MatchMinMax(SMin, ISD::SMAX, C1))
45445 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
45446 return SMin;
45447
45448 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
45449 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
45450 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
45451 C2.uge(C1)) {
45452 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
45453 }
45454
45455 return SDValue();
45456}
45457
45458/// Detect patterns of truncation with signed saturation:
45459/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
45460/// signed_max_of_dest_type)) to dest_type)
45461/// or:
45462/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
45463/// signed_min_of_dest_type)) to dest_type).
45464/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
45465/// Return the source value to be truncated or SDValue() if the pattern was not
45466/// matched.
45467static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
45468 unsigned NumDstBits = VT.getScalarSizeInBits();
45469 unsigned NumSrcBits = In.getScalarValueSizeInBits();
45470 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
45471
45472 auto MatchMinMax = [](SDValue V, unsigned Opcode,
45473 const APInt &Limit) -> SDValue {
45474 APInt C;
45475 if (V.getOpcode() == Opcode &&
45476 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
45477 return V.getOperand(0);
45478 return SDValue();
45479 };
45480
45481 APInt SignedMax, SignedMin;
45482 if (MatchPackUS) {
45483 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
45484 SignedMin = APInt(NumSrcBits, 0);
45485 } else {
45486 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
45487 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
45488 }
45489
45490 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
45491 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
45492 return SMax;
45493
45494 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
45495 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
45496 return SMin;
45497
45498 return SDValue();
45499}
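
A scalar model of the two saturation ranges matched above, as a standalone sketch (illustrative only; i32 to i8 is assumed for the example):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  // Signed saturation: clamp to the destination's signed range before truncating.
  static int8_t ssatToI8(int32_t x) { return static_cast<int8_t>(std::min(std::max(x, -128), 127)); }
  // MatchPackUS variant: clamp to [0, unsigned max of the destination] instead.
  static uint8_t packusToU8(int32_t x) { return static_cast<uint8_t>(std::min(std::max(x, 0), 255)); }

  int main() {
    assert(ssatToI8(1000) == 127 && ssatToI8(-1000) == -128 && ssatToI8(5) == 5);
    assert(packusToU8(1000) == 255 && packusToU8(-7) == 0 && packusToU8(5) == 5);
  }
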
45500
45501static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
45502 SelectionDAG &DAG,
45503 const X86Subtarget &Subtarget) {
45504 if (!Subtarget.hasSSE2() || !VT.isVector())
45505 return SDValue();
45506
45507 EVT SVT = VT.getVectorElementType();
45508 EVT InVT = In.getValueType();
45509 EVT InSVT = InVT.getVectorElementType();
45510
45511 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
45512 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
45513 // and concatenate at the same time. Then we can use a final vpmovuswb to
45514 // clip to 0-255.
45515 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
45516 InVT == MVT::v16i32 && VT == MVT::v16i8) {
45517 if (auto USatVal = detectSSatPattern(In, VT, true)) {
45518 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
45519 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
45520 DL, DAG, Subtarget);
45521 assert(Mid && "Failed to pack!");
45522 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
45523 }
45524 }
45525
45526 // vXi32 truncate instructions are available with AVX512F.
45527 // vXi16 truncate instructions are only available with AVX512BW.
45528 // For 256-bit or smaller vectors, we require VLX.
45529 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
45530 // If the result type is 256 bits or larger and we have disabled 512-bit
45531 // registers, we should go ahead and use the pack instructions if possible.
45532 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
45533 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
45534 (InVT.getSizeInBits() > 128) &&
45535 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
45536 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
45537
45538 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
45539 VT.getSizeInBits() >= 64 &&
45540 (SVT == MVT::i8 || SVT == MVT::i16) &&
45541 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
45542 if (auto USatVal = detectSSatPattern(In, VT, true)) {
45543 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
45544 // Only do this when the result is at least 64 bits or we'll be leaving
45545 // dangling PACKSSDW nodes.
45546 if (SVT == MVT::i8 && InSVT == MVT::i32) {
45547 EVT MidVT = VT.changeVectorElementType(MVT::i16);
45548 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
45549 DAG, Subtarget);
45550 assert(Mid && "Failed to pack!");
45551 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
45552 Subtarget);
45553 assert(V && "Failed to pack!");
45554 return V;
45555 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
45556 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
45557 Subtarget);
45558 }
45559 if (auto SSatVal = detectSSatPattern(In, VT))
45560 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
45561 Subtarget);
45562 }
45563
45564 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45565 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
45566 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
45567 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
45568 unsigned TruncOpc = 0;
45569 SDValue SatVal;
45570 if (auto SSatVal = detectSSatPattern(In, VT)) {
45571 SatVal = SSatVal;
45572 TruncOpc = X86ISD::VTRUNCS;
45573 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
45574 SatVal = USatVal;
45575 TruncOpc = X86ISD::VTRUNCUS;
45576 }
45577 if (SatVal) {
45578 unsigned ResElts = VT.getVectorNumElements();
45579 // If the input type is less than 512 bits and we don't have VLX, we need
45580 // to widen to 512 bits.
45581 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
45582 unsigned NumConcats = 512 / InVT.getSizeInBits();
45583 ResElts *= NumConcats;
45584 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
45585 ConcatOps[0] = SatVal;
45586 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
45587 NumConcats * InVT.getVectorNumElements());
45588 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
45589 }
45590 // Widen the result if it's narrower than 128 bits.
45591 if (ResElts * SVT.getSizeInBits() < 128)
45592 ResElts = 128 / SVT.getSizeInBits();
45593 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
45594 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
45595 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45596 DAG.getIntPtrConstant(0, DL));
45597 }
45598 }
45599
45600 return SDValue();
45601}
45602
45603/// This function detects the AVG pattern between vectors of unsigned i8/i16,
45604 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
45605/// X86ISD::AVG instruction.
45606static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
45607 const X86Subtarget &Subtarget,
45608 const SDLoc &DL) {
45609 if (!VT.isVector())
45610 return SDValue();
45611 EVT InVT = In.getValueType();
45612 unsigned NumElems = VT.getVectorNumElements();
45613
45614 EVT ScalarVT = VT.getVectorElementType();
45615 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
45616 return SDValue();
45617
45618 // InScalarVT is the intermediate type in the AVG pattern, and it should be
45619 // wider than the original input type (i8/i16).
45620 EVT InScalarVT = InVT.getVectorElementType();
45621 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
45622 return SDValue();
45623
45624 if (!Subtarget.hasSSE2())
45625 return SDValue();
45626
45627 // Detect the following pattern:
45628 //
45629 // %1 = zext <N x i8> %a to <N x i32>
45630 // %2 = zext <N x i8> %b to <N x i32>
45631 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
45632 // %4 = add nuw nsw <N x i32> %3, %2
45633 // %5 = lshr <N x i32> %4, <i32 1 x N>
45634 // %6 = trunc <N x i32> %5 to <N x i8>
45635 //
45636 // In AVX512, the last instruction can also be a trunc store.
45637 if (In.getOpcode() != ISD::SRL)
45638 return SDValue();
45639
45640 // A lambda checking whether the given SDValue is a constant vector and each
45641 // element is in the range [Min, Max].
45642 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
45643 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
45644 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
45645 });
45646 };
45647
45648 // Check if each element of the vector is right-shifted by one.
45649 SDValue LHS = In.getOperand(0);
45650 SDValue RHS = In.getOperand(1);
45651 if (!IsConstVectorInRange(RHS, 1, 1))
45652 return SDValue();
45653 if (LHS.getOpcode() != ISD::ADD)
45654 return SDValue();
45655
45656 // Detect a pattern of a + b + 1 where the order doesn't matter.
45657 SDValue Operands[3];
45658 Operands[0] = LHS.getOperand(0);
45659 Operands[1] = LHS.getOperand(1);
45660
45661 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45662 ArrayRef<SDValue> Ops) {
45663 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
45664 };
45665
45666 auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
45667 // Pad to a power-of-2 vector, split+apply and extract the original vector.
45668 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
45669 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
45670 if (NumElemsPow2 != NumElems) {
45671 SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45672 SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
45673 for (unsigned i = 0; i != NumElems; ++i) {
45674 SDValue Idx = DAG.getIntPtrConstant(i, DL);
45675 Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
45676 Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
45677 }
45678 Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
45679 Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
45680 }
45681 SDValue Res =
45682 SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
45683 if (NumElemsPow2 == NumElems)
45684 return Res;
45685 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
45686 DAG.getIntPtrConstant(0, DL));
45687 };
45688
45689 // Take care of the case when one of the operands is a constant vector whose
45690 // elements are in the range [1, 256].
45691 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
45692 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
45693 Operands[0].getOperand(0).getValueType() == VT) {
45694 // The pattern is detected. Subtract one from the constant vector, then
45695 // demote it and emit X86ISD::AVG instruction.
45696 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
45697 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
45698 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
45699 return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
45700 }
45701
45702 // Matches 'add-like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
45703 // Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
45704 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
45705 if (ISD::ADD == V.getOpcode()) {
45706 Op0 = V.getOperand(0);
45707 Op1 = V.getOperand(1);
45708 return true;
45709 }
45710 if (ISD::ZERO_EXTEND != V.getOpcode())
45711 return false;
45712 V = V.getOperand(0);
45713 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
45714 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
45715 return false;
45716 Op0 = V.getOperand(0);
45717 Op1 = V.getOperand(1);
45718 return true;
45719 };
45720
45721 SDValue Op0, Op1;
45722 if (FindAddLike(Operands[0], Op0, Op1))
45723 std::swap(Operands[0], Operands[1]);
45724 else if (!FindAddLike(Operands[1], Op0, Op1))
45725 return SDValue();
45726 Operands[2] = Op0;
45727 Operands[1] = Op1;
45728
45729 // Now we have three operands of two additions. Check that one of them is a
45730 // constant vector with ones, and the other two can be promoted from i8/i16.
45731 for (int i = 0; i < 3; ++i) {
45732 if (!IsConstVectorInRange(Operands[i], 1, 1))
45733 continue;
45734 std::swap(Operands[i], Operands[2]);
45735
45736 // Check if Operands[0] and Operands[1] are results of type promotion.
45737 for (int j = 0; j < 2; ++j)
45738 if (Operands[j].getValueType() != VT) {
45739 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
45740 Operands[j].getOperand(0).getValueType() != VT)
45741 return SDValue();
45742 Operands[j] = Operands[j].getOperand(0);
45743 }
45744
45745 // The pattern is detected, emit X86ISD::AVG instruction(s).
45746 return AVGSplitter(Operands[0], Operands[1]);
45747 }
45748
45749 return SDValue();
45750}
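
The c = (a + b + 1) / 2 computation named in the doc comment can be modeled with a short standalone C++ sketch (illustrative only, not part of the LLVM source):

  #include <cassert>
  #include <cstdint>

  // Widening first (the zext in the pattern) keeps a + b + 1 from overflowing the
  // narrow element type before the shift-right-by-one and the truncate back to i8.
  static uint8_t avgRoundUp(uint8_t a, uint8_t b) {
    return static_cast<uint8_t>((static_cast<uint32_t>(a) + b + 1) >> 1);
  }

  int main() {
    assert(avgRoundUp(254, 255) == 255); // 8-bit arithmetic alone would overflow here
    assert(avgRoundUp(0, 1) == 1);       // the +1 rounds halves upward
    assert(avgRoundUp(10, 20) == 15);
  }
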
45751
45752static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
45753 TargetLowering::DAGCombinerInfo &DCI,
45754 const X86Subtarget &Subtarget) {
45755 LoadSDNode *Ld = cast<LoadSDNode>(N);
45756 EVT RegVT = Ld->getValueType(0);
45757 EVT MemVT = Ld->getMemoryVT();
45758 SDLoc dl(Ld);
45759 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45760
45761 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
45762 // into two 16-byte operations. Also split non-temporal aligned loads on
45763 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
45764 ISD::LoadExtType Ext = Ld->getExtensionType();
45765 bool Fast;
45766 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
45767 Ext == ISD::NON_EXTLOAD &&
45768 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
45769 Ld->getAlignment() >= 16) ||
45770 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
45771 *Ld->getMemOperand(), &Fast) &&
45772 !Fast))) {
45773 unsigned NumElems = RegVT.getVectorNumElements();
45774 if (NumElems < 2)
45775 return SDValue();
45776
45777 unsigned HalfOffset = 16;
45778 SDValue Ptr1 = Ld->getBasePtr();
45779 SDValue Ptr2 =
45780 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
45781 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
45782 NumElems / 2);
45783 SDValue Load1 =
45784 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
45785 Ld->getOriginalAlign(),
45786 Ld->getMemOperand()->getFlags());
45787 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
45788 Ld->getPointerInfo().getWithOffset(HalfOffset),
45789 Ld->getOriginalAlign(),
45790 Ld->getMemOperand()->getFlags());
45791 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
45792 Load1.getValue(1), Load2.getValue(1));
45793
45794 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
45795 return DCI.CombineTo(N, NewVec, TF, true);
45796 }
45797
45798 // Bool vector load - attempt to cast to an integer, as we have good
45799 // (vXiY *ext(vXi1 bitcast(iX))) handling.
45800 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
45801 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
45802 unsigned NumElts = RegVT.getVectorNumElements();
45803 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45804 if (TLI.isTypeLegal(IntVT)) {
45805 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
45806 Ld->getPointerInfo(),
45807 Ld->getOriginalAlign(),
45808 Ld->getMemOperand()->getFlags());
45809 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
45810 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
45811 }
45812 }
45813
45814 // If we also broadcast this as a subvector to a wider type, then just extract
45815 // the lowest subvector.
45816 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
45817 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
45818 SDValue Ptr = Ld->getBasePtr();
45819 SDValue Chain = Ld->getChain();
45820 for (SDNode *User : Ptr->uses()) {
45821 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
45822 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
45823 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
45824 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
45825 MemVT.getSizeInBits() &&
45826 !User->hasAnyUseOfValue(1) &&
45827 User->getValueSizeInBits(0).getFixedSize() >
45828 RegVT.getFixedSizeInBits()) {
45829 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
45830 RegVT.getSizeInBits());
45831 Extract = DAG.getBitcast(RegVT, Extract);
45832 return DCI.CombineTo(N, Extract, SDValue(User, 1));
45833 }
45834 }
45835 }
45836
45837 // Cast ptr32 and ptr64 pointers to the default address space before a load.
45838 unsigned AddrSpace = Ld->getAddressSpace();
45839 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
45840 AddrSpace == X86AS::PTR32_UPTR) {
45841 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
45842 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
45843 SDValue Cast =
45844 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
45845 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
45846 Ld->getOriginalAlign(),
45847 Ld->getMemOperand()->getFlags());
45848 }
45849 }
45850
45851 return SDValue();
45852}
45853
45854/// If V is a build vector of boolean constants and exactly one of those
45855/// constants is true, return the operand index of that true element.
45856/// Otherwise, return -1.
45857static int getOneTrueElt(SDValue V) {
45858 // This needs to be a build vector of booleans.
45859 // TODO: Checking for the i1 type matches the IR definition for the mask,
45860 // but the mask check could be loosened to i8 or other types. That might
45861 // also require checking more than 'allOnesValue'; eg, the x86 HW
45862 // instructions only require that the MSB is set for each mask element.
45863 // The ISD::MSTORE comments/definition do not specify how the mask operand
45864 // is formatted.
45865 auto *BV = dyn_cast<BuildVectorSDNode>(V);
45866 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
45867 return -1;
45868
45869 int TrueIndex = -1;
45870 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
45871 for (unsigned i = 0; i < NumElts; ++i) {
45872 const SDValue &Op = BV->getOperand(i);
45873 if (Op.isUndef())
45874 continue;
45875 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
45876 if (!ConstNode)
45877 return -1;
45878 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
45879 // If we already found a one, this is too many.
45880 if (TrueIndex >= 0)
45881 return -1;
45882 TrueIndex = i;
45883 }
45884 }
45885 return TrueIndex;
45886}
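
The expected behaviour of the contract above, as a standalone sketch over plain booleans (illustrative only; the helper name is made up for the example):

  #include <cassert>
  #include <initializer_list>

  // Exactly one true element yields its index; zero or several yield -1.
  static int oneTrueIndex(std::initializer_list<bool> Mask) {
    int TrueIndex = -1, i = 0;
    for (bool B : Mask) {
      if (B) {
        if (TrueIndex >= 0)
          return -1; // more than one true element
        TrueIndex = i;
      }
      ++i;
    }
    return TrueIndex;
  }

  int main() {
    assert(oneTrueIndex({false, false, true, false}) == 2);
    assert(oneTrueIndex({true, false, true, false}) == -1);
    assert(oneTrueIndex({false, false, false, false}) == -1);
  }
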
45887
45888/// Given a masked memory load/store operation, return true if it has one mask
45889/// bit set. If it has one mask bit set, then also return the memory address of
45890/// the scalar element to load/store, the vector index to insert/extract that
45891/// scalar element, and the alignment for the scalar memory access.
45892static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
45893 SelectionDAG &DAG, SDValue &Addr,
45894 SDValue &Index, Align &Alignment,
45895 unsigned &Offset) {
45896 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
45897 if (TrueMaskElt < 0)
45898 return false;
45899
45900 // Get the address of the one scalar element that is specified by the mask
45901 // using the appropriate offset from the base pointer.
45902 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
45903 Offset = 0;
45904 Addr = MaskedOp->getBasePtr();
45905 if (TrueMaskElt != 0) {
45906 Offset = TrueMaskElt * EltVT.getStoreSize();
45907 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
45908 SDLoc(MaskedOp));
45909 }
45910
45911 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
45912 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
45913 EltVT.getStoreSize());
45914 return true;
45915}
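
The offset arithmetic above reduces to base + index * element store size; a minimal standalone check (illustrative values only):

  #include <cassert>
  #include <cstdint>

  // For a masked op on <4 x i32> whose mask selects element 2, the scalar access
  // lands at base + 2 * sizeof(i32) and the vector insert/extract index is 2.
  int main() {
    const unsigned TrueMaskElt = 2;
    const unsigned EltStoreSize = sizeof(uint32_t);
    const uint64_t Base = 0x1000;
    assert(Base + TrueMaskElt * EltStoreSize == 0x1008);
  }
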
45916
45917/// If exactly one element of the mask is set for a non-extending masked load,
45918/// it is a scalar load and vector insert.
45919/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
45920/// mask have already been optimized in IR, so we don't bother with those here.
45921static SDValue
45922reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45923 TargetLowering::DAGCombinerInfo &DCI,
45924 const X86Subtarget &Subtarget) {
45925 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45926 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
45927 // However, some target hooks may need to be added to know when the transform
45928 // is profitable. Endianness would also have to be considered.
45929
45930 SDValue Addr, VecIndex;
45931 Align Alignment;
45932 unsigned Offset;
45933 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
45934 return SDValue();
45935
45936 // Load the one scalar element that is specified by the mask using the
45937 // appropriate offset from the base pointer.
45938 SDLoc DL(ML);
45939 EVT VT = ML->getValueType(0);
45940 EVT EltVT = VT.getVectorElementType();
45941
45942 EVT CastVT = VT;
45943 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
45944 EltVT = MVT::f64;
45945 CastVT = VT.changeVectorElementType(EltVT);
45946 }
45947
45948 SDValue Load =
45949 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
45950 ML->getPointerInfo().getWithOffset(Offset),
45951 Alignment, ML->getMemOperand()->getFlags());
45952
45953 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
45954
45955 // Insert the loaded element into the appropriate place in the vector.
45956 SDValue Insert =
45957 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
45958 Insert = DAG.getBitcast(VT, Insert);
45959 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
45960}
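
A scalar model of the rewrite performed above, as a standalone sketch (illustrative only; fixed-size int arrays stand in for the vector and memory operands):

  #include <array>
  #include <cassert>

  // With a single true mask lane, the masked load is the pass-through vector
  // with just that lane refreshed from memory: one scalar load + one insert.
  static std::array<int, 4> maskedLoadOneLane(const std::array<int, 4> &Mem,
                                              const std::array<int, 4> &PassThru,
                                              int TrueIndex) {
    std::array<int, 4> Result = PassThru;
    Result[TrueIndex] = Mem[TrueIndex];
    return Result;
  }

  int main() {
    std::array<int, 4> Mem{10, 11, 12, 13}, Pass{1, 2, 3, 4};
    assert(maskedLoadOneLane(Mem, Pass, 2) == (std::array<int, 4>{1, 2, 12, 4}));
  }
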
45961
45962static SDValue
45963combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
45964 TargetLowering::DAGCombinerInfo &DCI) {
45965 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
45966 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
45967 return SDValue();
45968
45969 SDLoc DL(ML);
45970 EVT VT = ML->getValueType(0);
45971
45972 // If we are loading the first and last elements of a vector, it is safe and
45973 // always faster to load the whole vector. Replace the masked load with a
45974 // vector load and select.
45975 unsigned NumElts = VT.getVectorNumElements();
45976 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
45977 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
45978 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
45979 if (LoadFirstElt && LoadLastElt) {
45980 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
45981 ML->getMemOperand());
45982 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
45983 ML->getPassThru());
45984 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
45985 }
45986
45987 // Convert a masked load with a constant mask into a masked load and a select.
45988 // This allows the select operation to use a faster kind of select instruction
45989 // (for example, vblendvps -> vblendps).
45990
45991 // Don't try this if the pass-through operand is already undefined. That would
45992 // cause an infinite loop because that's what we're about to create.
45993 if (ML->getPassThru().isUndef())
45994 return SDValue();
45995
45996 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
45997 return SDValue();
45998
45999 // The new masked load has an undef pass-through operand. The select uses the
46000 // original pass-through operand.
46001 SDValue NewML = DAG.getMaskedLoad(
46002 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
46003 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
46004 ML->getAddressingMode(), ML->getExtensionType());
46005 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
46006 ML->getPassThru());
46007
46008 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
46009}
46010
46011static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
46012 TargetLowering::DAGCombinerInfo &DCI,
46013 const X86Subtarget &Subtarget) {
46014 auto *Mld = cast<MaskedLoadSDNode>(N);
46015
46016 // TODO: Expanding load with constant mask may be optimized as well.
46017 if (Mld->isExpandingLoad())
46018 return SDValue();
46019
46020 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
46021 if (SDValue ScalarLoad =
46022 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
46023 return ScalarLoad;
46024
46025 // TODO: Do some AVX512 subsets benefit from this transform?
46026 if (!Subtarget.hasAVX512())
46027 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
46028 return Blend;
46029 }
46030
46031 // If the mask value has been legalized to a non-boolean vector, try to
46032 // simplify ops leading up to it. We only demand the MSB of each lane.
46033 SDValue Mask = Mld->getMask();
46034 if (Mask.getScalarValueSizeInBits() != 1) {
46035 EVT VT = Mld->getValueType(0);
46036 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46037 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46038 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46039 if (N->getOpcode() != ISD::DELETED_NODE)
46040 DCI.AddToWorklist(N);
46041 return SDValue(N, 0);
46042 }
46043 if (SDValue NewMask =
46044 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46045 return DAG.getMaskedLoad(
46046 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
46047 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
46048 Mld->getAddressingMode(), Mld->getExtensionType());
46049 }
46050
46051 return SDValue();
46052}
46053
46054/// If exactly one element of the mask is set for a non-truncating masked store,
46055/// it is a vector extract and scalar store.
46056/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
46057/// mask have already been optimized in IR, so we don't bother with those here.
46058static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
46059 SelectionDAG &DAG,
46060 const X86Subtarget &Subtarget) {
46061 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
46062 // However, some target hooks may need to be added to know when the transform
46063 // is profitable. Endianness would also have to be considered.
46064
46065 SDValue Addr, VecIndex;
46066 Align Alignment;
46067 unsigned Offset;
46068 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
46069 return SDValue();
46070
46071 // Extract the one scalar element that is actually being stored.
46072 SDLoc DL(MS);
46073 SDValue Value = MS->getValue();
46074 EVT VT = Value.getValueType();
46075 EVT EltVT = VT.getVectorElementType();
46076 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
46077 EltVT = MVT::f64;
46078 EVT CastVT = VT.changeVectorElementType(EltVT);
46079 Value = DAG.getBitcast(CastVT, Value);
46080 }
46081 SDValue Extract =
46082 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
46083
46084 // Store that element at the appropriate offset from the base pointer.
46085 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
46086 MS->getPointerInfo().getWithOffset(Offset),
46087 Alignment, MS->getMemOperand()->getFlags());
46088}
46089
46090static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
46091 TargetLowering::DAGCombinerInfo &DCI,
46092 const X86Subtarget &Subtarget) {
46093 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
46094 if (Mst->isCompressingStore())
46095 return SDValue();
46096
46097 EVT VT = Mst->getValue().getValueType();
46098 SDLoc dl(Mst);
46099 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46100
46101 if (Mst->isTruncatingStore())
46102 return SDValue();
46103
46104 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
46105 return ScalarStore;
46106
46107 // If the mask value has been legalized to a non-boolean vector, try to
46108 // simplify ops leading up to it. We only demand the MSB of each lane.
46109 SDValue Mask = Mst->getMask();
46110 if (Mask.getScalarValueSizeInBits() != 1) {
46111 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46112 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46113 if (N->getOpcode() != ISD::DELETED_NODE)
46114 DCI.AddToWorklist(N);
46115 return SDValue(N, 0);
46116 }
46117 if (SDValue NewMask =
46118 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46119 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
46120 Mst->getBasePtr(), Mst->getOffset(), NewMask,
46121 Mst->getMemoryVT(), Mst->getMemOperand(),
46122 Mst->getAddressingMode());
46123 }
46124
46125 SDValue Value = Mst->getValue();
46126 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
46127 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
46128 Mst->getMemoryVT())) {
46129 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
46130 Mst->getBasePtr(), Mst->getOffset(), Mask,
46131 Mst->getMemoryVT(), Mst->getMemOperand(),
46132 Mst->getAddressingMode(), true);
46133 }
46134
46135 return SDValue();
46136}
46137
46138static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
46139 TargetLowering::DAGCombinerInfo &DCI,
46140 const X86Subtarget &Subtarget) {
46141 StoreSDNode *St = cast<StoreSDNode>(N);
46142 EVT StVT = St->getMemoryVT();
46143 SDLoc dl(St);
46144 SDValue StoredVal = St->getValue();
46145 EVT VT = StoredVal.getValueType();
46146 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46147
46148 // Convert a store of vXi1 into a store of iX and a bitcast.
46149 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
46150 VT.getVectorElementType() == MVT::i1) {
46151
46152 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46153 StoredVal = DAG.getBitcast(NewVT, StoredVal);
46154
46155 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46156 St->getPointerInfo(), St->getOriginalAlign(),
46157 St->getMemOperand()->getFlags());
46158 }
46159
46160 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
46161 // This will avoid a copy to k-register.
46162 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
46163 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
46164 StoredVal.getOperand(0).getValueType() == MVT::i8) {
46165 SDValue Val = StoredVal.getOperand(0);
46166 // We must store zeros to the unused bits.
46167 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
46168 return DAG.getStore(St->getChain(), dl, Val,
46169 St->getBasePtr(), St->getPointerInfo(),
46170 St->getOriginalAlign(),
46171 St->getMemOperand()->getFlags());
46172 }
46173
46174 // Widen v1i1/v2i1/v4i1 stores to v8i1.
46175 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
46176 Subtarget.hasAVX512()) {
46177 unsigned NumConcats = 8 / VT.getVectorNumElements();
46178 // We must store zeros to the unused bits.
46179 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
46180 Ops[0] = StoredVal;
46181 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
46182 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46183 St->getPointerInfo(), St->getOriginalAlign(),
46184 St->getMemOperand()->getFlags());
46185 }
46186
46187 // Turn vXi1 stores of constants into a scalar store.
46188 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
46189 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
46190 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
46191 // If it's a v64i1 store without 64-bit support, we need two stores.
46192 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
46193 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
46194 StoredVal->ops().slice(0, 32));
46195 Lo = combinevXi1ConstantToInteger(Lo, DAG);
46196 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
46197 StoredVal->ops().slice(32, 32));
46198 Hi = combinevXi1ConstantToInteger(Hi, DAG);
46199
46200 SDValue Ptr0 = St->getBasePtr();
46201 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
46202
46203 SDValue Ch0 =
46204 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
46205 St->getOriginalAlign(),
46206 St->getMemOperand()->getFlags());
46207 SDValue Ch1 =
46208 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
46209 St->getPointerInfo().getWithOffset(4),
46210 St->getOriginalAlign(),
46211 St->getMemOperand()->getFlags());
46212 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
46213 }
46214
46215 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
46216 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46217 St->getPointerInfo(), St->getOriginalAlign(),
46218 St->getMemOperand()->getFlags());
46219 }
46220
46221 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
46222 // Sandy Bridge, perform two 16-byte stores.
46223 bool Fast;
46224 if (VT.is256BitVector() && StVT == VT &&
46225 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
46226 *St->getMemOperand(), &Fast) &&
46227 !Fast) {
46228 unsigned NumElems = VT.getVectorNumElements();
46229 if (NumElems < 2)
46230 return SDValue();
46231
46232 return splitVectorStore(St, DAG);
46233 }
46234
46235 // Split under-aligned vector non-temporal stores.
46236 if (St->isNonTemporal() && StVT == VT &&
46237 St->getAlignment() < VT.getStoreSize()) {
46238 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
46239 // vectors or the legalizer can scalarize it to use MOVNTI.
46240 if (VT.is256BitVector() || VT.is512BitVector()) {
46241 unsigned NumElems = VT.getVectorNumElements();
46242 if (NumElems < 2)
46243 return SDValue();
46244 return splitVectorStore(St, DAG);
46245 }
46246
46247 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
46248 // to use MOVNTI.
46249 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
46250 MVT NTVT = Subtarget.hasSSE4A()
46251 ? MVT::v2f64
46252 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
46253 return scalarizeVectorStore(St, NTVT, DAG);
46254 }
46255 }
46256
46257 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
46258 // supported, but AVX512F is, by extending to v16i32 and truncating.
46259 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
46260 St->getValue().getOpcode() == ISD::TRUNCATE &&
46261 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
46262 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
46263 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
46264 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
46265 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
46266 MVT::v16i8, St->getMemOperand());
46267 }
46268
46269 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
46270 if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
46271 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
46272 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
46273 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
46274 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
46275 return EmitTruncSStore(IsSigned, St->getChain(),
46276 dl, StoredVal.getOperand(0), St->getBasePtr(),
46277 VT, St->getMemOperand(), DAG);
46278 }
46279
46280 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
46281 if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
46282 auto IsExtractedElement = [](SDValue V) {
46283 if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
46284 V = V.getOperand(0);
46285 unsigned Opc = V.getOpcode();
46286 if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
46287 if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
46288 return V.getOperand(0);
46289 }
46290 return SDValue();
46291 };
46292 if (SDValue Extract = IsExtractedElement(StoredVal)) {
46293 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
46294 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
46295 SDValue Src = Trunc.getOperand(0);
46296 MVT DstVT = Trunc.getSimpleValueType();
46297 MVT SrcVT = Src.getSimpleValueType();
46298 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46299 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
46300 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
46301 if (NumTruncBits == VT.getSizeInBits() &&
46302 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
46303 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
46304 TruncVT, St->getMemOperand());
46305 }
46306 }
46307 }
46308 }
46309
46310 // Optimize trunc store (of multiple scalars) to shuffle and store.
46311 // First, pack all of the elements in one place. Next, store to memory
46312 // in fewer chunks.
46313 if (St->isTruncatingStore() && VT.isVector()) {
46314 // Check if we can detect an AVG pattern from the truncation. If yes,
46315 // replace the trunc store by a normal store with the result of X86ISD::AVG
46316 // instruction.
46317 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
46318 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
46319 Subtarget, dl))
46320 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
46321 St->getPointerInfo(), St->getOriginalAlign(),
46322 St->getMemOperand()->getFlags());
46323
46324 if (TLI.isTruncStoreLegal(VT, StVT)) {
46325 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
46326 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
46327 dl, Val, St->getBasePtr(),
46328 St->getMemoryVT(), St->getMemOperand(), DAG);
46329 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
46330 DAG, dl))
46331 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
46332 dl, Val, St->getBasePtr(),
46333 St->getMemoryVT(), St->getMemOperand(), DAG);
46334 }
46335
46336 return SDValue();
46337 }
46338
46339 // Cast ptr32 and ptr64 pointers to the default address space before a store.
46340 unsigned AddrSpace = St->getAddressSpace();
46341 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
46342 AddrSpace == X86AS::PTR32_UPTR) {
46343 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
46344 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
46345 SDValue Cast =
46346 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
46347 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
46348 St->getPointerInfo(), St->getOriginalAlign(),
46349 St->getMemOperand()->getFlags(), St->getAAInfo());
46350 }
46351 }
46352
46353 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
46354 // the FP state in cases where an emms may be missing.
46355 // A preferable solution to the general problem is to figure out the right
46356 // places to insert EMMS. This qualifies as a quick hack.
46357
46358 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
46359 if (VT.getSizeInBits() != 64)
46360 return SDValue();
46361
46362 const Function &F = DAG.getMachineFunction().getFunction();
46363 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
46364 bool F64IsLegal =
46365 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
46366 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
46367 isa<LoadSDNode>(St->getValue()) &&
46368 cast<LoadSDNode>(St->getValue())->isSimple() &&
46369 St->getChain().hasOneUse() && St->isSimple()) {
46370 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
46371
46372 if (!ISD::isNormalLoad(Ld))
46373 return SDValue();
46374
46375 // Avoid the transformation if there are multiple uses of the loaded value.
46376 if (!Ld->hasNUsesOfValue(1, 0))
46377 return SDValue();
46378
46379 SDLoc LdDL(Ld);
46380 SDLoc StDL(N);
46381 // Lower to a single movq load/store pair.
46382 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
46383 Ld->getBasePtr(), Ld->getMemOperand());
46384
46385 // Make sure new load is placed in same chain order.
46386 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
46387 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
46388 St->getMemOperand());
46389 }
46390
46391 // This is similar to the above case, but here we handle a scalar 64-bit
46392 // integer store that is extracted from a vector on a 32-bit target.
46393 // If we have SSE2, then we can treat it like a floating-point double
46394 // to get past legalization. The execution dependencies fixup pass will
46395 // choose the optimal machine instruction for the store if this really is
46396 // an integer or v2f32 rather than an f64.
46397 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
46398 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
46399 SDValue OldExtract = St->getOperand(1);
46400 SDValue ExtOp0 = OldExtract.getOperand(0);
46401 unsigned VecSize = ExtOp0.getValueSizeInBits();
46402 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
46403 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
46404 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
46405 BitCast, OldExtract.getOperand(1));
46406 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
46407 St->getPointerInfo(), St->getOriginalAlign(),
46408 St->getMemOperand()->getFlags());
46409 }
46410
46411 return SDValue();
46412}
46413
46414static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
46415 TargetLowering::DAGCombinerInfo &DCI,
46416 const X86Subtarget &Subtarget) {
46417 auto *St = cast<MemIntrinsicSDNode>(N);
46418
46419 SDValue StoredVal = N->getOperand(1);
46420 MVT VT = StoredVal.getSimpleValueType();
46421 EVT MemVT = St->getMemoryVT();
46422
46423 // Figure out which elements we demand.
46424 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
46425 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
46426
46427 APInt KnownUndef, KnownZero;
46428 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46429 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
46430 KnownZero, DCI)) {
46431 if (N->getOpcode() != ISD::DELETED_NODE)
46432 DCI.AddToWorklist(N);
46433 return SDValue(N, 0);
46434 }
46435
46436 return SDValue();
46437}
46438
46439/// Return 'true' if this vector operation is "horizontal"
46440/// and return the operands for the horizontal operation in LHS and RHS. A
46441/// horizontal operation performs the binary operation on successive elements
46442/// of its first operand, then on successive elements of its second operand,
46443/// returning the resulting values in a vector. For example, if
46444/// A = < float a0, float a1, float a2, float a3 >
46445/// and
46446/// B = < float b0, float b1, float b2, float b3 >
46447/// then the result of doing a horizontal operation on A and B is
46448/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
46449/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
46450/// A horizontal-op B, for some already available A and B, and if so then LHS is
46451/// set to A, RHS to B, and the routine returns 'true'.
46452static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
46453 SelectionDAG &DAG, const X86Subtarget &Subtarget,
46454 bool IsCommutative,
46455 SmallVectorImpl<int> &PostShuffleMask) {
46456 // If either operand is undef, bail out. The binop should be simplified.
46457 if (LHS.isUndef() || RHS.isUndef())
46458 return false;
46459
46460 // Look for the following pattern:
46461 // A = < float a0, float a1, float a2, float a3 >
46462 // B = < float b0, float b1, float b2, float b3 >
46463 // and
46464 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
46465 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
46466 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
46467 // which is A horizontal-op B.
46468
46469 MVT VT = LHS.getSimpleValueType();
46470 assert((VT.is128BitVector() || VT.is256BitVector()) &&
46471 "Unsupported vector type for horizontal add/sub");
46472 unsigned NumElts = VT.getVectorNumElements();
46473
46474 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
46475 SmallVectorImpl<int> &ShuffleMask) {
46476 bool UseSubVector = false;
46477 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46478 Op.getOperand(0).getValueType().is256BitVector() &&
46479 llvm::isNullConstant(Op.getOperand(1))) {
46480 Op = Op.getOperand(0);
46481 UseSubVector = true;
46482 }
46483 SmallVector<SDValue, 2> SrcOps;
46484 SmallVector<int, 16> SrcMask, ScaledMask;
46485 SDValue BC = peekThroughBitcasts(Op);
46486 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
46487 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
46488 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
46489 })) {
46490 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
46491 if (!UseSubVector && SrcOps.size() <= 2 &&
46492 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
46493 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
46494 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
46495 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
46496 }
46497 if (UseSubVector && SrcOps.size() == 1 &&
46498 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
46499 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
46500 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
46501 ShuffleMask.assign(Mask.begin(), Mask.end());
46502 }
46503 }
46504 };
46505
46506 // View LHS in the form
46507 // LHS = VECTOR_SHUFFLE A, B, LMask
46508 // If LHS is not a shuffle, then pretend it is the identity shuffle:
46509 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
46510 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
46511 SDValue A, B;
46512 SmallVector<int, 16> LMask;
46513 GetShuffle(LHS, A, B, LMask);
46514
46515 // Likewise, view RHS in the form
46516 // RHS = VECTOR_SHUFFLE C, D, RMask
46517 SDValue C, D;
46518 SmallVector<int, 16> RMask;
46519 GetShuffle(RHS, C, D, RMask);
46520
46521 // At least one of the operands should be a vector shuffle.
46522 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
46523 if (NumShuffles == 0)
46524 return false;
46525
46526 if (LMask.empty()) {
46527 A = LHS;
46528 for (unsigned i = 0; i != NumElts; ++i)
46529 LMask.push_back(i);
46530 }
46531
46532 if (RMask.empty()) {
46533 C = RHS;
46534 for (unsigned i = 0; i != NumElts; ++i)
46535 RMask.push_back(i);
46536 }
46537
46538 // If we have a unary mask, ensure the other op is set to null.
46539 if (isUndefOrInRange(LMask, 0, NumElts))
46540 B = SDValue();
46541 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
46542 A = SDValue();
46543
46544 if (isUndefOrInRange(RMask, 0, NumElts))
46545 D = SDValue();
46546 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
46547 C = SDValue();
46548
46549 // If A and B occur in reverse order in RHS, then canonicalize by commuting
46550 // RHS operands and shuffle mask.
46551 if (A != C) {
46552 std::swap(C, D);
46553 ShuffleVectorSDNode::commuteMask(RMask);
46554 }
46555 // Check that the shuffles are both shuffling the same vectors.
46556 if (!(A == C && B == D))
46557 return false;
46558
46559 PostShuffleMask.clear();
46560 PostShuffleMask.append(NumElts, SM_SentinelUndef);
46561
46562 // LHS and RHS are now:
46563 // LHS = shuffle A, B, LMask
46564 // RHS = shuffle A, B, RMask
46565 // Check that the masks correspond to performing a horizontal operation.
46566 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
46567 // so we just repeat the inner loop if this is a 256-bit op.
46568 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
46569 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
46570 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
46571 assert((NumEltsPer128BitChunk % 2 == 0) &&
46572 "Vector type should have an even number of elements in each lane");
46573 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
46574 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
46575 // Ignore undefined components.
46576 int LIdx = LMask[i + j], RIdx = RMask[i + j];
46577 if (LIdx < 0 || RIdx < 0 ||
46578 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
46579 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
46580 continue;
46581
46582 // Check that successive odd/even elements are being operated on. If not,
46583 // this is not a horizontal operation.
46584 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
46585 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
46586 return false;
46587
46588 // Compute the post-shuffle mask index based on where the element
46589 // is stored in the HOP result, and where it needs to be moved to.
46590 int Base = LIdx & ~1u;
46591 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
46592 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
46593
46594 // The low half of the 128-bit result must choose from A.
46595 // The high half of the 128-bit result must choose from B,
46596 // unless B is undef. In that case, we are always choosing from A.
46597 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
46598 Index += NumEltsPer64BitChunk;
46599 PostShuffleMask[i + j] = Index;
46600 }
46601 }
46602
46603 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
46604 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
46605
46606 bool IsIdentityPostShuffle =
46607 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
46608 if (IsIdentityPostShuffle)
46609 PostShuffleMask.clear();
46610
46611 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
46612 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
46613 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
46614 return false;
46615
46616 // If the source nodes are already used in HorizOps then always accept this.
46617 // Shuffle folding should merge these back together.
46618 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
46619 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46620 });
46621 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
46622 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
46623 });
46624 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
46625
46626 // Assume a SingleSource HOP if we only shuffle one input and don't need to
46627 // shuffle the result.
46628 if (!ForceHorizOp &&
46629 !shouldUseHorizontalOp(NewLHS == NewRHS &&
46630 (NumShuffles < 2 || !IsIdentityPostShuffle),
46631 DAG, Subtarget))
46632 return false;
46633
46634 LHS = DAG.getBitcast(VT, NewLHS);
46635 RHS = DAG.getBitcast(VT, NewRHS);
46636 return true;
46637}
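
The "A horizontal-op B" layout described in the doc comment above can be modeled with a standalone sketch (illustrative only; 4 x float and op = add are assumed):

  #include <array>
  #include <cassert>

  // result = < a0+a1, a2+a3, b0+b1, b2+b3 >
  static std::array<float, 4> hadd(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B) {
    return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
  }

  int main() {
    std::array<float, 4> A{1, 2, 3, 4}, B{10, 20, 30, 40};
    assert(hadd(A, B) == (std::array<float, 4>{3, 7, 30, 70}));
  }
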
46638
46639// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
46640static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
46641 const X86Subtarget &Subtarget) {
46642 EVT VT = N->getValueType(0);
46643 unsigned Opcode = N->getOpcode();
46644 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
46645 SmallVector<int, 8> PostShuffleMask;
46646
46647 switch (Opcode) {
46648 case ISD::FADD:
46649 case ISD::FSUB:
46650 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
46651 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
46652 SDValue LHS = N->getOperand(0);
46653 SDValue RHS = N->getOperand(1);
46654 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
46655 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46656 PostShuffleMask)) {
46657 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
46658 if (!PostShuffleMask.empty())
46659 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46660 DAG.getUNDEF(VT), PostShuffleMask);
46661 return HorizBinOp;
46662 }
46663 }
46664 break;
46665 case ISD::ADD:
46666 case ISD::SUB:
46667 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
46668 VT == MVT::v16i16 || VT == MVT::v8i32)) {
46669 SDValue LHS = N->getOperand(0);
46670 SDValue RHS = N->getOperand(1);
46671 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
46672 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
46673 PostShuffleMask)) {
46674 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
46675 ArrayRef<SDValue> Ops) {
46676 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
46677 };
46678 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
46679 {LHS, RHS}, HOpBuilder);
46680 if (!PostShuffleMask.empty())
46681 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
46682 DAG.getUNDEF(VT), PostShuffleMask);
46683 return HorizBinOp;
46684 }
46685 }
46686 break;
46687 }
46688
46689 return SDValue();
46690}
46691
46692/// Do target-specific dag combines on floating-point adds/subs.
46693static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
46694 const X86Subtarget &Subtarget) {
46695 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
46696 return HOp;
46697 return SDValue();
46698}
46699
46700/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
46701/// the codegen.
46702/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
46703/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
46704/// anything that is guaranteed to be transformed by DAGCombiner.
46705static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
46706 const X86Subtarget &Subtarget,
46707 const SDLoc &DL) {
46708 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
46709 SDValue Src = N->getOperand(0);
46710 unsigned SrcOpcode = Src.getOpcode();
46711 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46712
46713 EVT VT = N->getValueType(0);
46714 EVT SrcVT = Src.getValueType();
46715
46716 auto IsFreeTruncation = [VT](SDValue Op) {
46717 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
46718
46719 // See if this has been extended from a smaller/equal size to
46720 // the truncation size, allowing a truncation to combine with the extend.
46721 unsigned Opcode = Op.getOpcode();
46722 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
46723 Opcode == ISD::ZERO_EXTEND) &&
46724 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
46725 return true;
46726
46727 // See if this is a single use constant which can be constant folded.
46728 // NOTE: We don't peek through bitcasts here because there is currently
46729 // no support for constant folding truncate+bitcast+vector_of_constants. So
46730 // we'll just end up with a truncate on both operands which will
46731 // get turned back into (truncate (binop)) causing an infinite loop.
46732 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
46733 };
46734
46735 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
46736 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
46737 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
46738 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
46739 };
46740
46741 // Don't combine if the operation has other uses.
46742 if (!Src.hasOneUse())
46743 return SDValue();
46744
46745 // Only support vector truncation for now.
46746 // TODO: i64 scalar math would benefit as well.
46747 if (!VT.isVector())
46748 return SDValue();
46749
46750 // In most cases it's only worth pre-truncating if we're only facing the cost
46751 // of one truncation.
46752 // i.e. if one of the inputs will constant fold or the input is repeated.
46753 switch (SrcOpcode) {
46754 case ISD::MUL:
46755 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
46756 // better to truncate if we have the chance.
46757 if (SrcVT.getScalarType() == MVT::i64 &&
46758 TLI.isOperationLegal(SrcOpcode, VT) &&
46759 !TLI.isOperationLegal(SrcOpcode, SrcVT))
46760 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
46761 LLVM_FALLTHROUGH;
46762 case ISD::AND:
46763 case ISD::XOR:
46764 case ISD::OR:
46765 case ISD::ADD:
46766 case ISD::SUB: {
46767 SDValue Op0 = Src.getOperand(0);
46768 SDValue Op1 = Src.getOperand(1);
46769 if (TLI.isOperationLegal(SrcOpcode, VT) &&
46770 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
46771 return TruncateArithmetic(Op0, Op1);
46772 break;
46773 }
46774 }
46775
46776 return SDValue();
46777}
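
The algebraic fact behind TRUNC(BINOP(X, Y)) -> BINOP(TRUNC(X), TRUNC(Y)) for wrapping integer ops can be checked with a standalone sketch (illustrative values only):

  #include <cassert>
  #include <cstdint>

  // Truncating after the op equals performing the op on already-truncated inputs,
  // because both sides compute the result modulo 2^16.
  int main() {
    const uint32_t X = 0x12345678u, Y = 0x9ABCDEF0u;
    const uint16_t TX = static_cast<uint16_t>(X), TY = static_cast<uint16_t>(Y);
    assert(static_cast<uint16_t>(X + Y) == static_cast<uint16_t>(TX + TY));
    assert(static_cast<uint16_t>(X * Y) ==
           static_cast<uint16_t>(static_cast<uint32_t>(TX) * TY));
    assert(static_cast<uint16_t>(X & Y) == static_cast<uint16_t>(TX & TY));
  }
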
46778
46779/// Truncate using ISD::AND mask and X86ISD::PACKUS.
46780/// e.g. trunc <8 x i32> X to <8 x i16> -->
46781/// MaskX = X & 0xffff (clear high bits to prevent saturation)
46782/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
46783static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
46784 const X86Subtarget &Subtarget,
46785 SelectionDAG &DAG) {
46786 SDValue In = N->getOperand(0);
46787 EVT InVT = In.getValueType();
46788 EVT OutVT = N->getValueType(0);
46789
46790 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
46791 OutVT.getScalarSizeInBits());
46792 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
46793 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
46794}
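
A minimal scalar sketch of the PACKUS trick above (plain C++, no SSE intrinsics; the helper name packus_lane is made up for illustration): masking the input to the low 16 bits first means the unsigned-saturating pack can never clamp, so it behaves exactly like a truncation.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Models one lane of an unsigned-saturating 32 -> 16 pack (PACKUSDW).
static uint16_t packus_lane(uint32_t x) {
  return static_cast<uint16_t>(std::min<uint32_t>(x, 0xffffu));
}

int main() {
  uint32_t x = 0xdeadbeefu;
  uint32_t masked = x & 0xffffu;                 // the ISD::AND step
  assert(packus_lane(masked) == static_cast<uint16_t>(x));
  return 0;
}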
46795
46796/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
46797static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
46798 const X86Subtarget &Subtarget,
46799 SelectionDAG &DAG) {
46800 SDValue In = N->getOperand(0);
46801 EVT InVT = In.getValueType();
46802 EVT OutVT = N->getValueType(0);
46803 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
46804 DAG.getValueType(OutVT));
46805 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
46806}
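
Similarly, a scalar sketch of the PACKSS variant (plain C++, C++17 for std::clamp; packss_lane is a made-up helper): sign-extending the low 16 bits in-register first keeps the value inside the signed 16-bit range, so the signed-saturating pack never clamps and acts as a truncation.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Models one lane of a signed-saturating 32 -> 16 pack (PACKSSDW).
static int16_t packss_lane(int32_t x) {
  return static_cast<int16_t>(std::clamp(x, -32768, 32767));
}

int main() {
  int32_t x = 0x12348765;                        // arbitrary input
  int32_t inreg = static_cast<int16_t>(x);       // SIGN_EXTEND_INREG to i16
  assert(packss_lane(inreg) == static_cast<int16_t>(x));
  return 0;
}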
46807
46808/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
46809/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
46810/// legalization the truncation will be translated into a BUILD_VECTOR with each
46811/// element that is extracted from a vector and then truncated, and it is
46812/// difficult to do this optimization based on them.
46813static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
46814 const X86Subtarget &Subtarget) {
46815 EVT OutVT = N->getValueType(0);
46816 if (!OutVT.isVector())
46817 return SDValue();
46818
46819 SDValue In = N->getOperand(0);
46820 if (!In.getValueType().isSimple())
46821 return SDValue();
46822
46823 EVT InVT = In.getValueType();
46824 unsigned NumElems = OutVT.getVectorNumElements();
46825
46826 // AVX512 provides fast truncate ops.
46827 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46828 return SDValue();
46829
46830 EVT OutSVT = OutVT.getVectorElementType();
46831 EVT InSVT = InVT.getVectorElementType();
46832 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
46833 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
46834 NumElems >= 8))
46835 return SDValue();
46836
46837 // SSSE3's pshufb results in fewer instructions in the cases below.
46838 if (Subtarget.hasSSSE3() && NumElems == 8 && InSVT != MVT::i64)
46839 return SDValue();
46840
46841 SDLoc DL(N);
46842 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
46843 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
46844 // truncate 2 x v4i32 to v8i16.
46845 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
46846 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
46847 if (InSVT == MVT::i32)
46848 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
46849
46850 return SDValue();
46851}
46852
46853/// This function transforms a vector truncation of 'extended sign-bits' or
46854/// 'extended zero-bits' values
46855/// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS/PACKUS operations.
46856static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
46857 SelectionDAG &DAG,
46858 const X86Subtarget &Subtarget) {
46859 // Requires SSE2.
46860 if (!Subtarget.hasSSE2())
46861 return SDValue();
46862
46863 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
46864 return SDValue();
46865
46866 SDValue In = N->getOperand(0);
46867 if (!In.getValueType().isSimple())
46868 return SDValue();
46869
46870 MVT VT = N->getValueType(0).getSimpleVT();
46871 MVT SVT = VT.getScalarType();
46872
46873 MVT InVT = In.getValueType().getSimpleVT();
46874 MVT InSVT = InVT.getScalarType();
46875
46876 // Check we have a truncation suited for PACKSS/PACKUS.
46877 if (!isPowerOf2_32(VT.getVectorNumElements()))
46878 return SDValue();
46879 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
46880 return SDValue();
46881 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
46882 return SDValue();
46883
46884 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
46885 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
46886 return SDValue();
46887
46888 // AVX512 has fast truncate, but if the input is already going to be split,
46889 // there's no harm in trying pack.
46890 if (Subtarget.hasAVX512() &&
46891 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
46892 InVT.is512BitVector())) {
46893 // PACK should still be worth it for 128-bit vectors if the sources were
46894 // originally concatenated from subvectors.
46895 SmallVector<SDValue> ConcatOps;
46896 if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
46897 return SDValue();
46898 }
46899
46900 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
46901 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
46902
46903 // Use PACKUS if the input has zero-bits that extend all the way to the
46904 // packed/truncated value. e.g. masks, zext_in_reg, etc.
46905 KnownBits Known = DAG.computeKnownBits(In);
46906 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
46907 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
46908 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
46909
46910 // Use PACKSS if the input has sign-bits that extend all the way to the
46911 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
46912 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
46913
46914 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
46915 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
46916 // on and combines/simplifications can't then use it.
46917 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
46918 return SDValue();
46919
46920 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
46921 if (NumSignBits > MinSignBits)
46922 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
46923
46924 // If we have a srl that only generates signbits that we will discard in
46925 // the truncation then we can use PACKSS by converting the srl to a sra.
46926 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
46927 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
46928 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
46929 In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
46930 if (*ShAmt == MinSignBits) {
46931 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
46932 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
46933 Subtarget);
46934 }
46935 }
46936
46937 return SDValue();
46938}
46939
46940// Try to form a MULHU or MULHS node by looking for
46941// (trunc (srl (mul ext, ext), 16))
46942// TODO: This is X86 specific because we want to be able to handle wide types
46943// before type legalization. But we can only do it if the vector will be
46944// legalized via widening/splitting. Type legalization can't handle promotion
46945// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
46946// combiner.
46947static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
46948 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46949 // First instruction should be a right shift of a multiply.
46950 if (Src.getOpcode() != ISD::SRL ||
46951 Src.getOperand(0).getOpcode() != ISD::MUL)
46952 return SDValue();
46953
46954 if (!Subtarget.hasSSE2())
46955 return SDValue();
46956
46957 // Only handle vXi16 types that are at least 128-bits unless they will be
46958 // widened.
46959 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
46960 return SDValue();
46961
46962 // Input type should be at least vXi32.
46963 EVT InVT = Src.getValueType();
46964 if (InVT.getVectorElementType().getSizeInBits() < 32)
46965 return SDValue();
46966
46967 // Need a shift by 16.
46968 APInt ShiftAmt;
46969 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
46970 ShiftAmt != 16)
46971 return SDValue();
46972
46973 SDValue LHS = Src.getOperand(0).getOperand(0);
46974 SDValue RHS = Src.getOperand(0).getOperand(1);
46975
46976 unsigned ExtOpc = LHS.getOpcode();
46977 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
46978 RHS.getOpcode() != ExtOpc)
46979 return SDValue();
46980
46981 // Peek through the extends.
46982 LHS = LHS.getOperand(0);
46983 RHS = RHS.getOperand(0);
46984
46985 // Ensure the input types match.
46986 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
46987 return SDValue();
46988
46989 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
46990 return DAG.getNode(Opc, DL, VT, LHS, RHS);
46991}
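
A standalone sketch of the arithmetic behind this match (plain C++, no intrinsics; mulhu16 is a made-up helper): (trunc (srl (mul (zext a), (zext b)), 16)) is exactly the high half of a 16x16->32 multiply, i.e. what PMULHUW computes per lane.

#include <cassert>
#include <cstdint>

static uint16_t mulhu16(uint16_t a, uint16_t b) {
  uint32_t wide = static_cast<uint32_t>(a) * static_cast<uint32_t>(b); // mul of zexts
  return static_cast<uint16_t>(wide >> 16);                            // srl 16 + trunc
}

int main() {
  assert(mulhu16(0xffff, 0xffff) == 0xfffe);   // (65535 * 65535) >> 16
  assert(mulhu16(0x1234, 0x5678) == 0x0626);   // spot check
  return 0;
}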
46992
46993// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
46994// from one vector with signed bytes from another vector, adds together
46995// adjacent pairs of 16-bit products, and saturates the result before
46996// truncating to 16 bits.
46997//
46998// Which looks something like this:
46999// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
47000// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
47001static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
47002 const X86Subtarget &Subtarget,
47003 const SDLoc &DL) {
47004 if (!VT.isVector() || !Subtarget.hasSSSE3())
47005 return SDValue();
47006
47007 unsigned NumElems = VT.getVectorNumElements();
47008 EVT ScalarVT = VT.getVectorElementType();
47009 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
47010 return SDValue();
47011
47012 SDValue SSatVal = detectSSatPattern(In, VT);
47013 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
47014 return SDValue();
47015
47016 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
47017 // of multiplies from even/odd elements.
47018 SDValue N0 = SSatVal.getOperand(0);
47019 SDValue N1 = SSatVal.getOperand(1);
47020
47021 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
47022 return SDValue();
47023
47024 SDValue N00 = N0.getOperand(0);
47025 SDValue N01 = N0.getOperand(1);
47026 SDValue N10 = N1.getOperand(0);
47027 SDValue N11 = N1.getOperand(1);
47028
47029 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
47030 // Canonicalize zero_extend to LHS.
47031 if (N01.getOpcode() == ISD::ZERO_EXTEND)
47032 std::swap(N00, N01);
47033 if (N11.getOpcode() == ISD::ZERO_EXTEND)
47034 std::swap(N10, N11);
47035
47036 // Ensure we have a zero_extend and a sign_extend.
47037 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
47038 N01.getOpcode() != ISD::SIGN_EXTEND ||
47039 N10.getOpcode() != ISD::ZERO_EXTEND ||
47040 N11.getOpcode() != ISD::SIGN_EXTEND)
47041 return SDValue();
47042
47043 // Peek through the extends.
47044 N00 = N00.getOperand(0);
47045 N01 = N01.getOperand(0);
47046 N10 = N10.getOperand(0);
47047 N11 = N11.getOperand(0);
47048
47049 // Ensure the extend is from vXi8.
47050 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
47051 N01.getValueType().getVectorElementType() != MVT::i8 ||
47052 N10.getValueType().getVectorElementType() != MVT::i8 ||
47053 N11.getValueType().getVectorElementType() != MVT::i8)
47054 return SDValue();
47055
47056 // All inputs should be build_vectors.
47057 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
47058 N01.getOpcode() != ISD::BUILD_VECTOR ||
47059 N10.getOpcode() != ISD::BUILD_VECTOR ||
47060 N11.getOpcode() != ISD::BUILD_VECTOR)
47061 return SDValue();
47062
47063 // N00/N10 are zero extended. N01/N11 are sign extended.
47064
47065 // For each element, we need to ensure we have an odd element from one vector
47066 // multiplied by the odd element of another vector and the even element from
47067 // one of the same vectors being multiplied by the even element from the
47068 // other vector. So we need to make sure that for each element i, this
47069 // operation is performed:
47070 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
47071 SDValue ZExtIn, SExtIn;
47072 for (unsigned i = 0; i != NumElems; ++i) {
47073 SDValue N00Elt = N00.getOperand(i);
47074 SDValue N01Elt = N01.getOperand(i);
47075 SDValue N10Elt = N10.getOperand(i);
47076 SDValue N11Elt = N11.getOperand(i);
47077 // TODO: Be more tolerant to undefs.
47078 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47079 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47080 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47081 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
47082 return SDValue();
47083 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
47084 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
47085 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
47086 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
47087 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
47088 return SDValue();
47089 unsigned IdxN00 = ConstN00Elt->getZExtValue();
47090 unsigned IdxN01 = ConstN01Elt->getZExtValue();
47091 unsigned IdxN10 = ConstN10Elt->getZExtValue();
47092 unsigned IdxN11 = ConstN11Elt->getZExtValue();
47093 // Add is commutative so indices can be reordered.
47094 if (IdxN00 > IdxN10) {
47095 std::swap(IdxN00, IdxN10);
47096 std::swap(IdxN01, IdxN11);
47097 }
47098 // N0 indices must be the even element. N1 indices must be the next odd element.
47099 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
47100 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
47101 return SDValue();
47102 SDValue N00In = N00Elt.getOperand(0);
47103 SDValue N01In = N01Elt.getOperand(0);
47104 SDValue N10In = N10Elt.getOperand(0);
47105 SDValue N11In = N11Elt.getOperand(0);
47106 // The first time we find an input, capture it.
47107 if (!ZExtIn) {
47108 ZExtIn = N00In;
47109 SExtIn = N01In;
47110 }
47111 if (ZExtIn != N00In || SExtIn != N01In ||
47112 ZExtIn != N10In || SExtIn != N11In)
47113 return SDValue();
47114 }
47115
47116 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47117 ArrayRef<SDValue> Ops) {
47118 // Shrink by adding truncate nodes and let DAGCombine fold with the
47119 // sources.
47120 EVT InVT = Ops[0].getValueType();
47121 assert(InVT.getScalarType() == MVT::i8 &&
47122 "Unexpected scalar element type");
47123 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
47124 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
47125 InVT.getVectorNumElements() / 2);
47126 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
47127 };
47128 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
47129 PMADDBuilder);
47130}
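
A scalar sketch of the per-lane arithmetic this matcher is looking for (plain C++, C++17; pmaddubsw_lane is a made-up helper): one unsigned byte times one signed byte for the even and the odd position, added and signed-saturated to 16 bits, which is the PMADDUBSW behaviour.

#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t pmaddubsw_lane(uint8_t a0, uint8_t a1, int8_t b0, int8_t b1) {
  int32_t sum = static_cast<int32_t>(a0) * b0 + static_cast<int32_t>(a1) * b1;
  return static_cast<int16_t>(std::clamp(sum, -32768, 32767));   // ssat to i16
}

int main() {
  // 255*127 + 255*127 = 64770 saturates to 32767.
  assert(pmaddubsw_lane(255, 255, 127, 127) == 32767);
  // 10*(-3) + 4*7 = -2, no saturation needed.
  assert(pmaddubsw_lane(10, 4, -3, 7) == -2);
  return 0;
}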
47131
47132static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
47133 const X86Subtarget &Subtarget) {
47134 EVT VT = N->getValueType(0);
47135 SDValue Src = N->getOperand(0);
47136 SDLoc DL(N);
47137
47138 // Attempt to pre-truncate inputs to arithmetic ops instead.
47139 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
47140 return V;
47141
47142 // Try to detect AVG pattern first.
47143 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
47144 return Avg;
47145
47146 // Try to detect PMADD
47147 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
47148 return PMAdd;
47149
47150 // Try to combine truncation with signed/unsigned saturation.
47151 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
47152 return Val;
47153
47154 // Try to combine PMULHUW/PMULHW for vXi16.
47155 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
47156 return V;
47157
47158 // The bitcast source is a direct mmx result.
47159 // Detect bitcasts between i32 to x86mmx
47160 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
47161 SDValue BCSrc = Src.getOperand(0);
47162 if (BCSrc.getValueType() == MVT::x86mmx)
47163 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
47164 }
47165
47166 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
47167 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
47168 return V;
47169
47170 return combineVectorTruncation(N, DAG, Subtarget);
47171}
47172
47173static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
47174 TargetLowering::DAGCombinerInfo &DCI) {
47175 EVT VT = N->getValueType(0);
47176 SDValue In = N->getOperand(0);
47177 SDLoc DL(N);
47178
47179 if (auto SSatVal = detectSSatPattern(In, VT))
47180 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
47181 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
47182 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
47183
47184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47185 APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
47186 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47187 return SDValue(N, 0);
47188
47189 return SDValue();
47190}
47191
47192/// Returns the negated value if the node \p N flips sign of FP value.
47193///
47194/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
47195/// or FSUB(0, x)
47196/// AVX512F does not have FXOR, so FNEG is lowered as
47197/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
47198/// In this case we go through all bitcasts.
47199/// This also recognizes a splat of a negated value and returns the splat of that
47200/// value.
47201static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
47202 if (N->getOpcode() == ISD::FNEG)
47203 return N->getOperand(0);
47204
47205 // Don't recurse exponentially.
47206 if (Depth > SelectionDAG::MaxRecursionDepth)
47207 return SDValue();
47208
47209 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
47210
47211 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
47212 EVT VT = Op->getValueType(0);
47213
47214 // Make sure the element size doesn't change.
47215 if (VT.getScalarSizeInBits() != ScalarSize)
47216 return SDValue();
47217
47218 unsigned Opc = Op.getOpcode();
47219 switch (Opc) {
47220 case ISD::VECTOR_SHUFFLE: {
47221 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
47222 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
47223 if (!Op.getOperand(1).isUndef())
47224 return SDValue();
47225 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
47226 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
47227 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
47228 cast<ShuffleVectorSDNode>(Op)->getMask());
47229 break;
47230 }
47231 case ISD::INSERT_VECTOR_ELT: {
47232 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
47233 // -V, INDEX).
47234 SDValue InsVector = Op.getOperand(0);
47235 SDValue InsVal = Op.getOperand(1);
47236 if (!InsVector.isUndef())
47237 return SDValue();
47238 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
47239 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
47240 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
47241 NegInsVal, Op.getOperand(2));
47242 break;
47243 }
47244 case ISD::FSUB:
47245 case ISD::XOR:
47246 case X86ISD::FXOR: {
47247 SDValue Op1 = Op.getOperand(1);
47248 SDValue Op0 = Op.getOperand(0);
47249
47250 // For XOR and FXOR, we want to check if constant
47251 // bits of Op1 are sign bit masks. For FSUB, we
47252 // have to check if constant bits of Op0 are sign
47253 // bit masks and hence we swap the operands.
47254 if (Opc == ISD::FSUB)
47255 std::swap(Op0, Op1);
47256
47257 APInt UndefElts;
47258 SmallVector<APInt, 16> EltBits;
47259 // Extract constant bits and see if they are all
47260 // sign bit masks. Ignore the undef elements.
47261 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
47262 /* AllowWholeUndefs */ true,
47263 /* AllowPartialUndefs */ false)) {
47264 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
47265 if (!UndefElts[I] && !EltBits[I].isSignMask())
47266 return SDValue();
47267
47268 return peekThroughBitcasts(Op0);
47269 }
47270 }
47271 }
47272
47273 return SDValue();
47274}
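
A standalone sketch of the XOR-with-sign-mask form recognised above (plain C++, assuming a C++20 toolchain for std::bit_cast): flipping only the sign bit of the IEEE-754 encoding is the same as negating the value, which is why an XOR against 0x80000000 per f32 lane counts as an FNEG here.

#include <bit>       // std::bit_cast (C++20)
#include <cassert>
#include <cstdint>

int main() {
  float x = 1.5f;
  uint32_t bits = std::bit_cast<uint32_t>(x);
  float negated = std::bit_cast<float>(bits ^ 0x80000000u);  // xor sign-bit mask
  assert(negated == -x);
  return 0;
}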
47275
47276static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
47277 bool NegRes) {
47278 if (NegMul) {
47279 switch (Opcode) {
47280 default: llvm_unreachable("Unexpected opcode");
47281 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
47282 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
47283 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
47284 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
47285 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
47286 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
47287 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
47288 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
47289 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
47290 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
47291 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
47292 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
47293 }
47294 }
47295
47296 if (NegAcc) {
47297 switch (Opcode) {
47298 default: llvm_unreachable("Unexpected opcode");
47299 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
47300 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
47301 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
47302 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
47303 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
47304 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
47305 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
47306 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
47307 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
47308 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
47309 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
47310 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
47311 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
47312 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
47313 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
47314 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
47315 }
47316 }
47317
47318 if (NegRes) {
47319 switch (Opcode) {
47320 // For accuracy reasons, we never combine fneg and fma under strict FP.
47321 default: llvm_unreachable("Unexpected opcode");
47322 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
47323 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
47324 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
47325 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
47326 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
47327 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
47328 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
47329 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
47330 }
47331 }
47332
47333 return Opcode;
47334}
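
A minimal sketch of the algebra behind this opcode remapping (plain C++; exact for the small integer-valued doubles used, and, as the surrounding code checks, only valid in general under no-signed-zeros): negating the multiplicand, the addend, or the whole result of an FMA just moves signs around.

#include <cassert>

int main() {
  double a = 2.0, b = 3.0, c = 4.0;
  assert(-(a * b + c) == (-a) * b + (-c));   // -(FMADD)  == FNMSUB form
  assert(-(a * b - c) == (-a) * b + c);      // -(FMSUB)  == FNMADD form
  assert(a * b - c == -((-a) * b + c));      // FMSUB     == -(FNMADD)
  return 0;
}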
47335
47336/// Do target-specific dag combines on floating point negations.
47337static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
47338 TargetLowering::DAGCombinerInfo &DCI,
47339 const X86Subtarget &Subtarget) {
47340 EVT OrigVT = N->getValueType(0);
47341 SDValue Arg = isFNEG(DAG, N);
47342 if (!Arg)
47343 return SDValue();
47344
47345 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47346 EVT VT = Arg.getValueType();
47347 EVT SVT = VT.getScalarType();
47348 SDLoc DL(N);
47349
47350 // Let legalize expand this if it isn't a legal type yet.
47351 if (!TLI.isTypeLegal(VT))
47352 return SDValue();
47353
47354 // If we're negating a FMUL node on a target with FMA, then we can avoid the
47355 // use of a constant by performing (-0 - A*B) instead.
47356 // FIXME: Check rounding control flags as well once it becomes available.
47357 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
47358 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
47359 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
47360 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
47361 Arg.getOperand(1), Zero);
47362 return DAG.getBitcast(OrigVT, NewNode);
47363 }
47364
47365 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
47366 bool LegalOperations = !DCI.isBeforeLegalizeOps();
47367 if (SDValue NegArg =
47368 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
47369 return DAG.getBitcast(OrigVT, NegArg);
47370
47371 return SDValue();
47372}
47373
47374SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
47375 bool LegalOperations,
47376 bool ForCodeSize,
47377 NegatibleCost &Cost,
47378 unsigned Depth) const {
47379 // fneg patterns are removable even if they have multiple uses.
47380 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
47381 Cost = NegatibleCost::Cheaper;
47382 return DAG.getBitcast(Op.getValueType(), Arg);
47383 }
47384
47385 EVT VT = Op.getValueType();
47386 EVT SVT = VT.getScalarType();
47387 unsigned Opc = Op.getOpcode();
47388 SDNodeFlags Flags = Op.getNode()->getFlags();
47389 switch (Opc) {
47390 case ISD::FMA:
47391 case X86ISD::FMSUB:
47392 case X86ISD::FNMADD:
47393 case X86ISD::FNMSUB:
47394 case X86ISD::FMADD_RND:
47395 case X86ISD::FMSUB_RND:
47396 case X86ISD::FNMADD_RND:
47397 case X86ISD::FNMSUB_RND: {
47398 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
47399 !(SVT == MVT::f32 || SVT == MVT::f64) ||
47400 !isOperationLegal(ISD::FMA, VT))
47401 break;
47402
47403 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
47404 // if it may have signed zeros.
47405 if (!Flags.hasNoSignedZeros())
47406 break;
47407
47408 // This is always negatible for free but we might be able to remove some
47409 // extra operand negations as well.
47410 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
47411 for (int i = 0; i != 3; ++i)
47412 NewOps[i] = getCheaperNegatedExpression(
47413 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
47414
47415 bool NegA = !!NewOps[0];
47416 bool NegB = !!NewOps[1];
47417 bool NegC = !!NewOps[2];
47418 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
47419
47420 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
47421 : NegatibleCost::Neutral;
47422
47423 // Fill in the non-negated ops with the original values.
47424 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
47425 if (!NewOps[i])
47426 NewOps[i] = Op.getOperand(i);
47427 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
47428 }
47429 case X86ISD::FRCP:
47430 if (SDValue NegOp0 =
47431 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
47432 ForCodeSize, Cost, Depth + 1))
47433 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
47434 break;
47435 }
47436
47437 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
47438 ForCodeSize, Cost, Depth);
47439}
47440
47441static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
47442 const X86Subtarget &Subtarget) {
47443 MVT VT = N->getSimpleValueType(0);
47444 // If we have integer vector types available, use the integer opcodes.
47445 if (!VT.isVector() || !Subtarget.hasSSE2())
47446 return SDValue();
47447
47448 SDLoc dl(N);
47449
47450 unsigned IntBits = VT.getScalarSizeInBits();
47451 MVT IntSVT = MVT::getIntegerVT(IntBits);
47452 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
47453
47454 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
47455 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
47456 unsigned IntOpcode;
47457 switch (N->getOpcode()) {
47458 default: llvm_unreachable("Unexpected FP logic op");
47459 case X86ISD::FOR: IntOpcode = ISD::OR; break;
47460 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
47461 case X86ISD::FAND: IntOpcode = ISD::AND; break;
47462 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
47463 }
47464 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
47465 return DAG.getBitcast(VT, IntOp);
47466}
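
A standalone sketch of why the bitcast-to-integer lowering above is sound (plain C++, assuming C++20 for std::bit_cast): an FP-domain AND/OR/XOR is just the corresponding integer bitwise op applied to the raw encodings. The FABS-style sign-bit clear below is one concrete instance.

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  float x = -3.25f;
  uint32_t xb = std::bit_cast<uint32_t>(x);
  float absx = std::bit_cast<float>(xb & 0x7fffffffu);  // integer AND clears the sign
  assert(absx == 3.25f);
  return 0;
}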
47467
47468
47469/// Fold xor(setcc(cond, val), 1) --> setcc(inverted(cond), val)
47470static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
47471 if (N->getOpcode() != ISD::XOR)
47472 return SDValue();
47473
47474 SDValue LHS = N->getOperand(0);
47475 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
47476 return SDValue();
47477
47478 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
47479 X86::CondCode(LHS->getConstantOperandVal(0)));
47480 SDLoc DL(N);
47481 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
47482}
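
A tiny standalone illustration of the fold (plain C++): XOR-ing a 0/1 compare result with 1 is the same as testing the opposite condition, so the xor can be dropped and the condition code inverted.

#include <cassert>

int main() {
  int a = 3, b = 7;
  assert(((a < b) ^ 1) == (a >= b));   // invert the condition, drop the xor
  assert(((a == b) ^ 1) == (a != b));
  return 0;
}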
47483
47484static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
47485 TargetLowering::DAGCombinerInfo &DCI,
47486 const X86Subtarget &Subtarget) {
47487 SDValue N0 = N->getOperand(0);
47488 SDValue N1 = N->getOperand(1);
47489 EVT VT = N->getValueType(0);
47490
47491 // If this is SSE1 only convert to FXOR to avoid scalarization.
47492 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
47493 return DAG.getBitcast(MVT::v4i32,
47494 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
47495 DAG.getBitcast(MVT::v4f32, N0),
47496 DAG.getBitcast(MVT::v4f32, N1)));
47497 }
47498
47499 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
47500 return Cmp;
47501
47502 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
47503 return R;
47504
47505 if (DCI.isBeforeLegalizeOps())
47506 return SDValue();
47507
47508 if (SDValue SetCC = foldXor1SetCC(N, DAG))
47509 return SetCC;
47510
47511 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
47512 return RV;
47513
47514 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
47515 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47516 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
47517 N0.getOperand(0).getValueType().isVector() &&
47518 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
47519 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
47520 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
47521 N0.getOperand(0).getValueType()));
47522 }
47523
47524 // Handle AVX512 mask widening.
47525 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
47526 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
47527 VT.getVectorElementType() == MVT::i1 &&
47528 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
47529 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
47530 return DAG.getNode(
47531 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
47532 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
47533 N0.getOperand(2));
47534 }
47535
47536 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
47537 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
47538 // TODO: Under what circumstances could this be performed in DAGCombine?
47539 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
47540 N0.getOperand(0).getOpcode() == N->getOpcode()) {
47541 SDValue TruncExtSrc = N0.getOperand(0);
47542 auto *N1C = dyn_cast<ConstantSDNode>(N1);
47543 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
47544 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
47545 SDLoc DL(N);
47546 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
47547 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
47548 return DAG.getNode(ISD::XOR, DL, VT, LHS,
47549 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
47550 }
47551 }
47552
47553 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
47554 return FPLogic;
47555
47556 return combineFneg(N, DAG, DCI, Subtarget);
47557}
47558
47559static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
47560 TargetLowering::DAGCombinerInfo &DCI,
47561 const X86Subtarget &Subtarget) {
47562 EVT VT = N->getValueType(0);
47563 unsigned NumBits = VT.getSizeInBits();
47564
47565 // TODO - Constant Folding.
47566
47567 // Simplify the inputs.
47568 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47569 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
47570 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47571 return SDValue(N, 0);
47572
47573 return SDValue();
47574}
47575
47576static bool isNullFPScalarOrVectorConst(SDValue V) {
47577 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
47578}
47579
47580/// If a value is a scalar FP zero or a vector FP zero (potentially including
47581/// undefined elements), return a zero constant that may be used to fold away
47582/// that value. In the case of a vector, the returned constant will not contain
47583/// undefined elements even if the input parameter does. This makes it suitable
47584/// to be used as a replacement operand with operations (eg, bitwise-and) where
47585/// an undef should not propagate.
47586static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
47587 const X86Subtarget &Subtarget) {
47588 if (!isNullFPScalarOrVectorConst(V))
47589 return SDValue();
47590
47591 if (V.getValueType().isVector())
47592 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
47593
47594 return V;
47595}
47596
47597static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
47598 const X86Subtarget &Subtarget) {
47599 SDValue N0 = N->getOperand(0);
47600 SDValue N1 = N->getOperand(1);
47601 EVT VT = N->getValueType(0);
47602 SDLoc DL(N);
47603
47604 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
47605 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
47606 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
47607 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
47608 return SDValue();
47609
47610 auto isAllOnesConstantFP = [](SDValue V) {
47611 if (V.getSimpleValueType().isVector())
47612 return ISD::isBuildVectorAllOnes(V.getNode());
47613 auto *C = dyn_cast<ConstantFPSDNode>(V);
47614 return C && C->getConstantFPValue()->isAllOnesValue();
47615 };
47616
47617 // fand (fxor X, -1), Y --> fandn X, Y
47618 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
47619 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
47620
47621 // fand X, (fxor Y, -1) --> fandn Y, X
47622 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
47623 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
47624
47625 return SDValue();
47626}
47627
47628/// Do target-specific dag combines on X86ISD::FAND nodes.
47629static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
47630 const X86Subtarget &Subtarget) {
47631 // FAND(0.0, x) -> 0.0
47632 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
47633 return V;
47634
47635 // FAND(x, 0.0) -> 0.0
47636 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47637 return V;
47638
47639 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
47640 return V;
47641
47642 return lowerX86FPLogicOp(N, DAG, Subtarget);
47643}
47644
47645/// Do target-specific dag combines on X86ISD::FANDN nodes.
47646static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
47647 const X86Subtarget &Subtarget) {
47648 // FANDN(0.0, x) -> x
47649 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47650 return N->getOperand(1);
47651
47652 // FANDN(x, 0.0) -> 0.0
47653 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
47654 return V;
47655
47656 return lowerX86FPLogicOp(N, DAG, Subtarget);
47657}
47658
47659/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
47660static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
47661 TargetLowering::DAGCombinerInfo &DCI,
47662 const X86Subtarget &Subtarget) {
47663 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
47664
47665 // F[X]OR(0.0, x) -> x
47666 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
47667 return N->getOperand(1);
47668
47669 // F[X]OR(x, 0.0) -> x
47670 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
47671 return N->getOperand(0);
47672
47673 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
47674 return NewVal;
47675
47676 return lowerX86FPLogicOp(N, DAG, Subtarget);
47677}
47678
47679/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
47680static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
47681 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
47682
47683 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
47684 if (!DAG.getTarget().Options.NoNaNsFPMath ||
47685 !DAG.getTarget().Options.NoSignedZerosFPMath)
47686 return SDValue();
47687
47688 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
47689 // into FMINC and FMAXC, which are Commutative operations.
47690 unsigned NewOp = 0;
47691 switch (N->getOpcode()) {
47692 default: llvm_unreachable("unknown opcode");
47693 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
47694 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
47695 }
47696
47697 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
47698 N->getOperand(0), N->getOperand(1));
47699}
47700
47701static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
47702 const X86Subtarget &Subtarget) {
47703 if (Subtarget.useSoftFloat())
47704 return SDValue();
47705
47706 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47707
47708 EVT VT = N->getValueType(0);
47709 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
47710 (Subtarget.hasSSE2() && VT == MVT::f64) ||
47711 (VT.isVector() && TLI.isTypeLegal(VT))))
47712 return SDValue();
47713
47714 SDValue Op0 = N->getOperand(0);
47715 SDValue Op1 = N->getOperand(1);
47716 SDLoc DL(N);
47717 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
47718
47719 // If we don't have to respect NaN inputs, this is a direct translation to x86
47720 // min/max instructions.
47721 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
47722 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47723
47724 // If one of the operands is known non-NaN, use the native min/max instructions
47725 // with the non-NaN input as second operand.
47726 if (DAG.isKnownNeverNaN(Op1))
47727 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
47728 if (DAG.isKnownNeverNaN(Op0))
47729 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
47730
47731 // If we have to respect NaN inputs, this takes at least 3 instructions.
47732 // Favor a library call when operating on a scalar and minimizing code size.
47733 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
47734 return SDValue();
47735
47736 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
47737 VT);
47738
47739 // There are 4 possibilities involving NaN inputs, and these are the required
47740 // outputs:
47741 // Op1
47742 // Num NaN
47743 // ----------------
47744 // Num | Max | Op0 |
47745 // Op0 ----------------
47746 // NaN | Op1 | NaN |
47747 // ----------------
47748 //
47749 // The SSE FP max/min instructions were not designed for this case, but rather
47750 // to implement:
47751 // Min = Op1 < Op0 ? Op1 : Op0
47752 // Max = Op1 > Op0 ? Op1 : Op0
47753 //
47754 // So they always return Op0 if either input is a NaN. However, we can still
47755 // use those instructions for fmaxnum by selecting away a NaN input.
47756
47757 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
47758 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
47759 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
47760
47761 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
47762 // are NaN, the NaN value of Op1 is the result.
47763 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
47764}
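
A scalar sketch of the selection sequence produced above (plain C++; sse_max and fmaxnum_lowered are made-up helpers modelling, respectively, the SSE rule "return the second source operand whenever either input is NaN" and the max-plus-select lowering).

#include <cassert>
#include <cmath>

static double sse_max(double a, double b) {        // models X86ISD::FMAX(a, b)
  return (std::isnan(a) || std::isnan(b)) ? b : (a > b ? a : b);
}

static double fmaxnum_lowered(double op0, double op1) {
  double max = sse_max(op1, op0);                  // NaN in either input -> op0
  return std::isnan(op0) ? op1 : max;              // select away a NaN op0
}

int main() {
  double nan = std::nan("");
  assert(fmaxnum_lowered(1.0, 2.0) == 2.0);        // Num/Num -> max
  assert(fmaxnum_lowered(nan, 2.0) == 2.0);        // NaN op0 -> op1
  assert(fmaxnum_lowered(1.0, nan) == 1.0);        // NaN op1 -> op0
  return 0;
}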
47765
47766static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
47767 TargetLowering::DAGCombinerInfo &DCI) {
47768 EVT VT = N->getValueType(0);
47769 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47770
47771 APInt KnownUndef, KnownZero;
47772 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
47773 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
47774 KnownZero, DCI))
47775 return SDValue(N, 0);
47776
47777 // Convert a full vector load into vzload when not all bits are needed.
47778 SDValue In = N->getOperand(0);
47779 MVT InVT = In.getSimpleValueType();
47780 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47781 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47782 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47783 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
47784 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47785 MVT MemVT = MVT::getIntegerVT(NumBits);
47786 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47787 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47788 SDLoc dl(N);
47789 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
47790 DAG.getBitcast(InVT, VZLoad));
47791 DCI.CombineTo(N, Convert);
47792 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47793 DCI.recursivelyDeleteUnusedNodes(LN);
47794 return SDValue(N, 0);
47795 }
47796 }
47797
47798 return SDValue();
47799}
47800
47801static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
47802 TargetLowering::DAGCombinerInfo &DCI) {
47803 bool IsStrict = N->isTargetStrictFPOpcode();
47804 EVT VT = N->getValueType(0);
47805
47806 // Convert a full vector load into vzload when not all bits are needed.
47807 SDValue In = N->getOperand(IsStrict ? 1 : 0);
47808 MVT InVT = In.getSimpleValueType();
47809 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
47810 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
47811 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
47812 LoadSDNode *LN = cast<LoadSDNode>(In);
47813 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
47814 MVT MemVT = MVT::getFloatingPointVT(NumBits);
47815 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
47816 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
47817 SDLoc dl(N);
47818 if (IsStrict) {
47819 SDValue Convert =
47820 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
47821 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
47822 DCI.CombineTo(N, Convert, Convert.getValue(1));
47823 } else {
47824 SDValue Convert =
47825 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
47826 DCI.CombineTo(N, Convert);
47827 }
47828 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47829 DCI.recursivelyDeleteUnusedNodes(LN);
47830 return SDValue(N, 0);
47831 }
47832 }
47833
47834 return SDValue();
47835}
47836
47837/// Do target-specific dag combines on X86ISD::ANDNP nodes.
47838static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
47839 TargetLowering::DAGCombinerInfo &DCI,
47840 const X86Subtarget &Subtarget) {
47841 MVT VT = N->getSimpleValueType(0);
47842
47843 // ANDNP(0, x) -> x
47844 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
47845 return N->getOperand(1);
47846
47847 // ANDNP(x, 0) -> 0
47848 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
47849 return DAG.getConstant(0, SDLoc(N), VT);
47850
47851 // Turn ANDNP back to AND if input is inverted.
47852 if (SDValue Not = IsNOT(N->getOperand(0), DAG))
47853 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
47854 N->getOperand(1));
47855
47856 // Attempt to recursively combine a bitmask ANDNP with shuffles.
47857 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
47858 SDValue Op(N, 0);
47859 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47860 return Res;
47861 }
47862
47863 return SDValue();
47864}
47865
47866static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
47867 TargetLowering::DAGCombinerInfo &DCI) {
47868 SDValue N1 = N->getOperand(1);
47869
47870 // BT ignores high bits in the bit index operand.
47871 unsigned BitWidth = N1.getValueSizeInBits();
47872 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
47873 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
47874 if (N->getOpcode() != ISD::DELETED_NODE)
47875 DCI.AddToWorklist(N);
47876 return SDValue(N, 0);
47877 }
47878
47879 return SDValue();
47880}
47881
47882static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
47883 TargetLowering::DAGCombinerInfo &DCI) {
47884 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
47885 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
47886
47887 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
47888 APInt KnownUndef, KnownZero;
47889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47890 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
47891 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
47892 DCI)) {
47893 if (N->getOpcode() != ISD::DELETED_NODE)
47894 DCI.AddToWorklist(N);
47895 return SDValue(N, 0);
47896 }
47897
47898 // Convert a full vector load into vzload when not all bits are needed.
47899 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
47900 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
47901 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
47902 SDLoc dl(N);
47903 if (IsStrict) {
47904 SDValue Convert = DAG.getNode(
47905 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
47906 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
47907 DCI.CombineTo(N, Convert, Convert.getValue(1));
47908 } else {
47909 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
47910 DAG.getBitcast(MVT::v8i16, VZLoad));
47911 DCI.CombineTo(N, Convert);
47912 }
47913
47914 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
47915 DCI.recursivelyDeleteUnusedNodes(LN);
47916 return SDValue(N, 0);
47917 }
47918 }
47919 }
47920
47921 return SDValue();
47922}
47923
47924// Try to combine sext_in_reg of a cmov of constants by extending the constants.
47925static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
47926 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47927
47928 EVT DstVT = N->getValueType(0);
47929
47930 SDValue N0 = N->getOperand(0);
47931 SDValue N1 = N->getOperand(1);
47932 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47933
47934 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
47935 return SDValue();
47936
47937 // Look through single use any_extends / truncs.
47938 SDValue IntermediateBitwidthOp;
47939 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
47940 N0.hasOneUse()) {
47941 IntermediateBitwidthOp = N0;
47942 N0 = N0.getOperand(0);
47943 }
47944
47945 // See if we have a single use cmov.
47946 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
47947 return SDValue();
47948
47949 SDValue CMovOp0 = N0.getOperand(0);
47950 SDValue CMovOp1 = N0.getOperand(1);
47951
47952 // Make sure both operands are constants.
47953 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
47954 !isa<ConstantSDNode>(CMovOp1.getNode()))
47955 return SDValue();
47956
47957 SDLoc DL(N);
47958
47959 // If we looked through an any_extend/trunc above, apply the same op to the constants.
47960 if (IntermediateBitwidthOp) {
47961 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
47962 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
47963 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
47964 }
47965
47966 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
47967 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
47968
47969 EVT CMovVT = DstVT;
47970 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
47971 if (DstVT == MVT::i16) {
47972 CMovVT = MVT::i32;
47973 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
47974 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
47975 }
47976
47977 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
47978 N0.getOperand(2), N0.getOperand(3));
47979
47980 if (CMovVT != DstVT)
47981 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
47982
47983 return CMov;
47984}
47985
47986static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
47987 const X86Subtarget &Subtarget) {
47988 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
47989
47990 if (SDValue V = combineSextInRegCmov(N, DAG))
47991 return V;
47992
47993 EVT VT = N->getValueType(0);
47994 SDValue N0 = N->getOperand(0);
47995 SDValue N1 = N->getOperand(1);
47996 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
47997 SDLoc dl(N);
47998
47999 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
48000 // both SSE and AVX2, since there is no sign-extended shift right
48001 // operation on a vector with 64-bit elements.
48002 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
48003 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
48004 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
48005 N0.getOpcode() == ISD::SIGN_EXTEND)) {
48006 SDValue N00 = N0.getOperand(0);
48007
48008 // EXTLOAD has a better solution on AVX2;
48009 // it may be replaced with an X86ISD::VSEXT node.
48010 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
48011 if (!ISD::isNormalLoad(N00.getNode()))
48012 return SDValue();
48013
48014 // Attempt to promote any comparison mask ops before the
48015 // SIGN_EXTEND_INREG gets in the way.
48016 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
48017 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
48018
48019 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
48020 SDValue Tmp =
48021 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
48022 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
48023 }
48024 }
48025 return SDValue();
48026}
48027
48028/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
48029/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
48030/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
48031/// opportunities to combine math ops, use an LEA, or use a complex addressing
48032/// mode. This can eliminate extend, add, and shift instructions.
48033static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
48034 const X86Subtarget &Subtarget) {
48035 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
48036 Ext->getOpcode() != ISD::ZERO_EXTEND)
48037 return SDValue();
48038
48039 // TODO: This should be valid for other integer types.
48040 EVT VT = Ext->getValueType(0);
48041 if (VT != MVT::i64)
48042 return SDValue();
48043
48044 SDValue Add = Ext->getOperand(0);
48045 if (Add.getOpcode() != ISD::ADD)
48046 return SDValue();
48047
48048 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
48049 bool NSW = Add->getFlags().hasNoSignedWrap();
48050 bool NUW = Add->getFlags().hasNoUnsignedWrap();
48051
48052 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
48053 // into the 'zext'
48054 if ((Sext && !NSW) || (!Sext && !NUW))
48055 return SDValue();
48056
48057 // Having a constant operand to the 'add' ensures that we are not increasing
48058 // the instruction count because the constant is extended for free below.
48059 // A constant operand can also become the displacement field of an LEA.
48060 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
48061 if (!AddOp1)
48062 return SDValue();
48063
48064 // Don't make the 'add' bigger if there's no hope of combining it with some
48065 // other 'add' or 'shl' instruction.
48066 // TODO: It may be profitable to generate simpler LEA instructions in place
48067 // of single 'add' instructions, but the cost model for selecting an LEA
48068 // currently has a high threshold.
48069 bool HasLEAPotential = false;
48070 for (auto *User : Ext->uses()) {
48071 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
48072 HasLEAPotential = true;
48073 break;
48074 }
48075 }
48076 if (!HasLEAPotential)
48077 return SDValue();
48078
48079 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
48080 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
48081 SDValue AddOp0 = Add.getOperand(0);
48082 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
48083 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
48084
48085 // The wider add is guaranteed to not wrap because both operands are
48086 // sign-extended.
48087 SDNodeFlags Flags;
48088 Flags.setNoSignedWrap(NSW);
48089 Flags.setNoUnsignedWrap(NUW);
48090 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
48091}
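
A standalone sketch of why the extend can be hoisted (plain C++): when the narrow add cannot wrap (the 'nsw'/'nuw' cases checked above), extending after the add equals adding the extended operands.

#include <cassert>
#include <cstdint>

int main() {
  int32_t x = 100000;
  int32_t c = 42;                                            // the constant operand
  int64_t ext_after = static_cast<int64_t>(x + c);           // sext(add nsw(x, C))
  int64_t ext_before = static_cast<int64_t>(x) + static_cast<int64_t>(c);
  assert(ext_after == ext_before);
  return 0;
}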
48092
48093// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
48094// operands and the result of CMOV is not used anywhere else - promote CMOV
48095// itself instead of promoting its result. This could be beneficial, because:
48096// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
48097// (or more) pseudo-CMOVs only when they go one-after-another and
48098// getting rid of result extension code after CMOV will help that.
48099// 2) Promotion of constant CMOV arguments is free, hence the
48100// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
48101 // 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit CMOV is 3 bytes, so this
48102 // promotion is also good in terms of code size.
48103 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
48104 // promotion).
48105static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
48106 SDValue CMovN = Extend->getOperand(0);
48107 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
48108 return SDValue();
48109
48110 EVT TargetVT = Extend->getValueType(0);
48111 unsigned ExtendOpcode = Extend->getOpcode();
48112 SDLoc DL(Extend);
48113
48114 EVT VT = CMovN.getValueType();
48115 SDValue CMovOp0 = CMovN.getOperand(0);
48116 SDValue CMovOp1 = CMovN.getOperand(1);
48117
48118 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
48119 !isa<ConstantSDNode>(CMovOp1.getNode()))
48120 return SDValue();
48121
48122 // Only extend to i32 or i64.
48123 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
48124 return SDValue();
48125
48126 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
48127 // are free.
48128 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
48129 return SDValue();
48130
48131 // If this is a zero extend to i64, we should only extend to i32 and use a free
48132 // zero extend to finish.
48133 EVT ExtendVT = TargetVT;
48134 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
48135 ExtendVT = MVT::i32;
48136
48137 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
48138 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
48139
48140 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
48141 CMovN.getOperand(2), CMovN.getOperand(3));
48142
48143 // Finish extending if needed.
48144 if (ExtendVT != TargetVT)
48145 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
48146
48147 return Res;
48148}
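
A minimal illustration of why promoting the CMOV itself is safe (plain C++): extending a select of two constants equals selecting between the pre-extended constants, so the extend that followed the CMOV can be deleted.

#include <cassert>
#include <cstdint>

int main() {
  bool cond = true;
  int16_t a = -5, b = 7;
  int32_t ext_after = static_cast<int32_t>(cond ? a : b);                  // ext(cmov)
  int32_t ext_before = cond ? static_cast<int32_t>(a) : static_cast<int32_t>(b);
  assert(ext_after == ext_before);
  return 0;
}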
48149
48150// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
48151// This is more or less the reverse of combineBitcastvxi1.
48152static SDValue
48153combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
48154 TargetLowering::DAGCombinerInfo &DCI,
48155 const X86Subtarget &Subtarget) {
48156 unsigned Opcode = N->getOpcode();
48157 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
48158 Opcode != ISD::ANY_EXTEND)
48159 return SDValue();
48160 if (!DCI.isBeforeLegalizeOps())
48161 return SDValue();
48162 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
48163 return SDValue();
48164
48165 SDValue N0 = N->getOperand(0);
48166 EVT VT = N->getValueType(0);
48167 EVT SVT = VT.getScalarType();
48168 EVT InSVT = N0.getValueType().getScalarType();
48169 unsigned EltSizeInBits = SVT.getSizeInBits();
48170
48171 // Input type must be extending a bool vector (bit-casted from a scalar
48172 // integer) to legal integer types.
48173 if (!VT.isVector())
48174 return SDValue();
48175 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
48176 return SDValue();
48177 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
48178 return SDValue();
48179
48180 SDValue N00 = N0.getOperand(0);
48181 EVT SclVT = N0.getOperand(0).getValueType();
48182 if (!SclVT.isScalarInteger())
48183 return SDValue();
48184
48185 SDLoc DL(N);
48186 SDValue Vec;
48187 SmallVector<int, 32> ShuffleMask;
48188 unsigned NumElts = VT.getVectorNumElements();
48189 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
48190
48191 // Broadcast the scalar integer to the vector elements.
48192 if (NumElts > EltSizeInBits) {
48193 // If the scalar integer is greater than the vector element size, then we
48194 // must split it down into sub-sections for broadcasting. For example:
48195 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
48196 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
48197 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
48198 unsigned Scale = NumElts / EltSizeInBits;
48199 EVT BroadcastVT =
48200 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
48201 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48202 Vec = DAG.getBitcast(VT, Vec);
48203
48204 for (unsigned i = 0; i != Scale; ++i)
48205 ShuffleMask.append(EltSizeInBits, i);
48206 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48207 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
48208 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
48209 // If we have register broadcast instructions, use the scalar size as the
48210 // element type for the shuffle. Then cast to the wider element type. The
48211 // widened bits won't be used, and this might allow the use of a broadcast
48212 // load.
48213 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
48214 unsigned Scale = EltSizeInBits / NumElts;
48215 EVT BroadcastVT =
48216 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
48217 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48218 ShuffleMask.append(NumElts * Scale, 0);
48219 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
48220 Vec = DAG.getBitcast(VT, Vec);
48221 } else {
48222 // For smaller scalar integers, we can simply any-extend it to the vector
48223 // element size (we don't care about the upper bits) and broadcast it to all
48224 // elements.
48225 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
48226 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
48227 ShuffleMask.append(NumElts, 0);
48228 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48229 }
48230
48231 // Now, mask the relevant bit in each element.
48232 SmallVector<SDValue, 32> Bits;
48233 for (unsigned i = 0; i != NumElts; ++i) {
48234 int BitIdx = (i % EltSizeInBits);
48235 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
48236 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
48237 }
48238 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
48239 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
48240
48241 // Compare against the bitmask and extend the result.
48242 EVT CCVT = VT.changeVectorElementType(MVT::i1);
48243 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
48244 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
48245
48246 // For SEXT this is now done; otherwise shift the result down for
48247 // zero-extension.
48248 if (Opcode == ISD::SIGN_EXTEND)
48249 return Vec;
48250 return DAG.getNode(ISD::SRL, DL, VT, Vec,
48251 DAG.getConstant(EltSizeInBits - 1, DL, VT));
48252}
48253
48254// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
48255// result type.
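// Illustrative sketch (not from the original comments), assuming AVX512 and
// matching widths:
//   (v8i32 sext (v8i1 setcc (v8i32 A), (v8i32 B), cc))
// can be re-emitted as (v8i32 setcc A, B, cc), whose lanes are already
// all-zeros/all-ones at the result width; the zext case masks back to 0/1.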
48256static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
48257 const X86Subtarget &Subtarget) {
48258 SDValue N0 = N->getOperand(0);
48259 EVT VT = N->getValueType(0);
48260 SDLoc dl(N);
48261
48262 // Only do this combine with AVX512 for vector extends.
48263 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
48264 return SDValue();
48265
48266 // Only combine legal element types.
48267 EVT SVT = VT.getVectorElementType();
48268 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
48269 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
48270 return SDValue();
48271
48272 // We can only do this if the vector size is 256 bits or less.
48273 unsigned Size = VT.getSizeInBits();
48274 if (Size > 256 && Subtarget.useAVX512Regs())
48275 return SDValue();
48276
48277 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
48278 // those are the only integer compares we have.
48279 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48280 if (ISD::isUnsignedIntSetCC(CC))
48281 return SDValue();
48282
48283 // Only do this combine if the extension will be fully consumed by the setcc.
48284 EVT N00VT = N0.getOperand(0).getValueType();
48285 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
48286 if (Size != MatchingVecType.getSizeInBits())
48287 return SDValue();
48288
48289 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
48290
48291 if (N->getOpcode() == ISD::ZERO_EXTEND)
48292 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
48293
48294 return Res;
48295}
48296
48297static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
48298 TargetLowering::DAGCombinerInfo &DCI,
48299 const X86Subtarget &Subtarget) {
48300 SDValue N0 = N->getOperand(0);
48301 EVT VT = N->getValueType(0);
48302 SDLoc DL(N);
48303
48304 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48305 if (!DCI.isBeforeLegalizeOps() &&
48306 N0.getOpcode() == X86ISD::SETCC_CARRY) {
48307 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
48308 N0->getOperand(1));
48309 bool ReplaceOtherUses = !N0.hasOneUse();
48310 DCI.CombineTo(N, Setcc);
48311 // Replace other uses with a truncate of the widened setcc_carry.
48312 if (ReplaceOtherUses) {
48313 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48314 N0.getValueType(), Setcc);
48315 DCI.CombineTo(N0.getNode(), Trunc);
48316 }
48317
48318 return SDValue(N, 0);
48319 }
48320
48321 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48322 return NewCMov;
48323
48324 if (!DCI.isBeforeLegalizeOps())
48325 return SDValue();
48326
48327 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48328 return V;
48329
48330 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48331 return V;
48332
48333 if (VT.isVector()) {
48334 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48335 return R;
48336
48337 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
48338 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
48339 }
48340
48341 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48342 return NewAdd;
48343
48344 return SDValue();
48345}
48346
48347static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
48348 TargetLowering::DAGCombinerInfo &DCI,
48349 const X86Subtarget &Subtarget) {
48350 SDLoc dl(N);
48351 EVT VT = N->getValueType(0);
48352 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
48353
48354 // Let legalize expand this if it isn't a legal type yet.
48355 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48356 if (!TLI.isTypeLegal(VT))
48357 return SDValue();
48358
48359 SDValue A = N->getOperand(IsStrict ? 1 : 0);
48360 SDValue B = N->getOperand(IsStrict ? 2 : 1);
48361 SDValue C = N->getOperand(IsStrict ? 3 : 2);
48362
48363 // If the operation allows fast-math and the target does not support FMA,
48364 // split this into mul+add to avoid libcall(s).
48365 SDNodeFlags Flags = N->getFlags();
48366 if (!IsStrict && Flags.hasAllowReassociation() &&
48367 TLI.isOperationExpand(ISD::FMA, VT)) {
48368 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
48369 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
48370 }
48371
48372 EVT ScalarVT = VT.getScalarType();
48373 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
48374 return SDValue();
48375
48376 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
48377 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48378 bool LegalOperations = !DCI.isBeforeLegalizeOps();
48379 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
48380 CodeSize)) {
48381 V = NegV;
48382 return true;
48383 }
48384 // Look through extract_vector_elts. If it comes from an FNEG, create a
48385 // new extract from the FNEG input.
48386 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
48387 isNullConstant(V.getOperand(1))) {
48388 SDValue Vec = V.getOperand(0);
48389 if (SDValue NegV = TLI.getCheaperNegatedExpression(
48390 Vec, DAG, LegalOperations, CodeSize)) {
48391 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
48392 NegV, V.getOperand(1));
48393 return true;
48394 }
48395 }
48396
48397 return false;
48398 };
48399
48400 // Do not convert the passthru input of scalar intrinsics.
48401 // FIXME: We could allow negations of the lower element only.
48402 bool NegA = invertIfNegative(A);
48403 bool NegB = invertIfNegative(B);
48404 bool NegC = invertIfNegative(C);
48405
48406 if (!NegA && !NegB && !NegC)
48407 return SDValue();
48408
48409 unsigned NewOpcode =
48410 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
48411
48412 // Propagate fast-math-flags to new FMA node.
48413 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
48414 if (IsStrict) {
48415 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
48416 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
48417 {N->getOperand(0), A, B, C});
48418 } else {
48419 if (N->getNumOperands() == 4)
48420 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
48421 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
48422 }
48423}
48424
48425// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
48426// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
48427static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
48428 TargetLowering::DAGCombinerInfo &DCI) {
48429 SDLoc dl(N);
48430 EVT VT = N->getValueType(0);
48431 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48432 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48433 bool LegalOperations = !DCI.isBeforeLegalizeOps();
48434
48435 SDValue N2 = N->getOperand(2);
48436
48437 SDValue NegN2 =
48438 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
48439 if (!NegN2)
48440 return SDValue();
48441 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
48442
48443 if (N->getNumOperands() == 4)
48444 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48445 NegN2, N->getOperand(3));
48446 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
48447 NegN2);
48448}
48449
48450static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
48451 TargetLowering::DAGCombinerInfo &DCI,
48452 const X86Subtarget &Subtarget) {
48453 SDLoc dl(N);
48454 SDValue N0 = N->getOperand(0);
48455 EVT VT = N->getValueType(0);
48456
48457 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48458 // FIXME: Is this needed? We don't seem to have any tests for it.
48459 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
48460 N0.getOpcode() == X86ISD::SETCC_CARRY) {
48461 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
48462 N0->getOperand(1));
48463 bool ReplaceOtherUses = !N0.hasOneUse();
48464 DCI.CombineTo(N, Setcc);
48465 // Replace other uses with a truncate of the widened setcc_carry.
48466 if (ReplaceOtherUses) {
48467 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
48468 N0.getValueType(), Setcc);
48469 DCI.CombineTo(N0.getNode(), Trunc);
48470 }
48471
48472 return SDValue(N, 0);
48473 }
48474
48475 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
48476 return NewCMov;
48477
48478 if (DCI.isBeforeLegalizeOps())
48479 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
48480 return V;
48481
48482 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
48483 return V;
48484
48485 if (VT.isVector())
48486 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
48487 return R;
48488
48489 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
48490 return NewAdd;
48491
48492 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
48493 return R;
48494
48495 // TODO: Combine with any target/faux shuffle.
48496 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
48497 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
48498 SDValue N00 = N0.getOperand(0);
48499 SDValue N01 = N0.getOperand(1);
48500 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
48501 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
48502 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
48503 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
48504 return concatSubVectors(N00, N01, DAG, dl);
48505 }
48506 }
48507
48508 return SDValue();
48509}
48510
48511/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
48512/// recognizable memcmp expansion.
48513static bool isOrXorXorTree(SDValue X, bool Root = true) {
48514 if (X.getOpcode() == ISD::OR)
48515 return isOrXorXorTree(X.getOperand(0), false) &&
48516 isOrXorXorTree(X.getOperand(1), false);
48517 if (Root)
48518 return false;
48519 return X.getOpcode() == ISD::XOR;
48520}
48521
48522/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
48523/// expansion.
48524template<typename F>
48525static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
48526 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
48527 SDValue Op0 = X.getOperand(0);
48528 SDValue Op1 = X.getOperand(1);
48529 if (X.getOpcode() == ISD::OR) {
48530 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48531 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
48532 if (VecVT != CmpVT)
48533 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
48534 if (HasPT)
48535 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
48536 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
48537 } else if (X.getOpcode() == ISD::XOR) {
48538 SDValue A = SToV(Op0);
48539 SDValue B = SToV(Op1);
48540 if (VecVT != CmpVT)
48541 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
48542 if (HasPT)
48543 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
48544 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
48545 }
48546 llvm_unreachable("Impossible");
48547}
48548
48549/// Try to map a 128-bit or larger integer comparison to vector instructions
48550/// before type legalization splits it up into chunks.
48551static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
48552 const X86Subtarget &Subtarget) {
48553 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
48554 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
48555
48556 // We're looking for an oversized integer equality comparison.
48557 SDValue X = SetCC->getOperand(0);
48558 SDValue Y = SetCC->getOperand(1);
48559 EVT OpVT = X.getValueType();
48560 unsigned OpSize = OpVT.getSizeInBits();
48561 if (!OpVT.isScalarInteger() || OpSize < 128)
48562 return SDValue();
48563
48564 // Ignore a comparison with zero because that gets special treatment in
48565 // EmitTest(). But make an exception for the special case of a pair of
48566 // logically-combined vector-sized operands compared to zero. This pattern may
48567 // be generated by the memcmp expansion pass with oversized integer compares
48568 // (see PR33325).
48569 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
48570 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
48571 return SDValue();
48572
48573 // Don't perform this combine if constructing the vector will be expensive.
48574 auto IsVectorBitCastCheap = [](SDValue X) {
48575 X = peekThroughBitcasts(X);
48576 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
48577 X.getOpcode() == ISD::LOAD;
48578 };
48579 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
48580 !IsOrXorXorTreeCCZero)
48581 return SDValue();
48582
48583 EVT VT = SetCC->getValueType(0);
48584 SDLoc DL(SetCC);
48585
48586 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
48587 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
48588 // Otherwise use PCMPEQ (plus AND) and mask testing.
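  // Illustrative sketch (not from the original comments): a plain 128-bit
  // equality on SSE4.1 becomes roughly
  //   (xor (bitcast v16i8 X), (bitcast v16i8 Y)) -> ptest -> sete/setne,
  // while the pre-SSE4.1 path below uses pcmpeqb + pmovmskb compared
  // against 0xFFFF.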
48589 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
48590 (OpSize == 256 && Subtarget.hasAVX()) ||
48591 (OpSize == 512 && Subtarget.useAVX512Regs())) {
48592 bool HasPT = Subtarget.hasSSE41();
48593
48594 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
48595 // vector registers are essentially free. (Technically, widening registers
48596 // prevents load folding, but the tradeoff is worth it.)
48597 bool PreferKOT = Subtarget.preferMaskRegisters();
48598 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
48599
48600 EVT VecVT = MVT::v16i8;
48601 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
48602 if (OpSize == 256) {
48603 VecVT = MVT::v32i8;
48604 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
48605 }
48606 EVT CastVT = VecVT;
48607 bool NeedsAVX512FCast = false;
48608 if (OpSize == 512 || NeedZExt) {
48609 if (Subtarget.hasBWI()) {
48610 VecVT = MVT::v64i8;
48611 CmpVT = MVT::v64i1;
48612 if (OpSize == 512)
48613 CastVT = VecVT;
48614 } else {
48615 VecVT = MVT::v16i32;
48616 CmpVT = MVT::v16i1;
48617 CastVT = OpSize == 512 ? VecVT :
48618 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
48619 NeedsAVX512FCast = true;
48620 }
48621 }
48622
48623 auto ScalarToVector = [&](SDValue X) -> SDValue {
48624 bool TmpZext = false;
48625 EVT TmpCastVT = CastVT;
48626 if (X.getOpcode() == ISD::ZERO_EXTEND) {
48627 SDValue OrigX = X.getOperand(0);
48628 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
48629 if (OrigSize < OpSize) {
48630 if (OrigSize == 128) {
48631 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
48632 X = OrigX;
48633 TmpZext = true;
48634 } else if (OrigSize == 256) {
48635 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
48636 X = OrigX;
48637 TmpZext = true;
48638 }
48639 }
48640 }
48641 X = DAG.getBitcast(TmpCastVT, X);
48642 if (!NeedZExt && !TmpZext)
48643 return X;
48644 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
48645 DAG.getConstant(0, DL, VecVT), X,
48646 DAG.getVectorIdxConstant(0, DL));
48647 };
48648
48649 SDValue Cmp;
48650 if (IsOrXorXorTreeCCZero) {
48651 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
48652 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
48653 // Use 2 vector equality compares and 'and' the results before doing a
48654 // MOVMSK.
48655 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
48656 } else {
48657 SDValue VecX = ScalarToVector(X);
48658 SDValue VecY = ScalarToVector(Y);
48659 if (VecVT != CmpVT) {
48660 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
48661 } else if (HasPT) {
48662 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
48663 } else {
48664 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
48665 }
48666 }
48667 // AVX512 should emit a setcc that will lower to kortest.
48668 if (VecVT != CmpVT) {
48669 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
48670 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
48671 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
48672 DAG.getConstant(0, DL, KRegVT), CC);
48673 }
48674 if (HasPT) {
48675 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
48676 Cmp);
48677 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
48678 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
48679 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
48680 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
48681 }
48682 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
48683 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
48684 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
48685 assert(Cmp.getValueType() == MVT::v16i8 &&
48686        "Non 128-bit vector on pre-SSE41 target");
48687 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
48688 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
48689 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
48690 }
48691
48692 return SDValue();
48693}
48694
48695static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
48696 TargetLowering::DAGCombinerInfo &DCI,
48697 const X86Subtarget &Subtarget) {
48698 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
48699 const SDValue LHS = N->getOperand(0);
48700 const SDValue RHS = N->getOperand(1);
48701 EVT VT = N->getValueType(0);
48702 EVT OpVT = LHS.getValueType();
48703 SDLoc DL(N);
48704
48705 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
48706 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
48707 return V;
48708
48709 if (VT == MVT::i1 && isNullConstant(RHS)) {
48710 SDValue X86CC;
48711 if (SDValue V =
48712 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
48713 return DAG.getNode(ISD::TRUNCATE, DL, VT,
48714 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
48715 }
48716
48717 if (OpVT.isScalarInteger()) {
48718 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
48719 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
48720 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
48721 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
48722 if (N0.getOperand(0) == N1)
48723 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48724 N0.getOperand(1));
48725 if (N0.getOperand(1) == N1)
48726 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
48727 N0.getOperand(0));
48728 }
48729 return SDValue();
48730 };
48731 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
48732 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48733 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
48734 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48735
48736 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
48737 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
48738 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
48739 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
48740 if (N0.getOperand(0) == N1)
48741 return DAG.getNode(ISD::AND, DL, OpVT, N1,
48742 DAG.getNOT(DL, N0.getOperand(1), OpVT));
48743 if (N0.getOperand(1) == N1)
48744 return DAG.getNode(ISD::AND, DL, OpVT, N1,
48745 DAG.getNOT(DL, N0.getOperand(0), OpVT));
48746 }
48747 return SDValue();
48748 };
48749 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
48750 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48751 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
48752 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
48753
48754 // cmpeq(trunc(x),0) --> cmpeq(x,0)
48755 // cmpne(trunc(x),0) --> cmpne(x,0)
48756 // iff x upper bits are zero.
48757 // TODO: Add support for RHS to be truncate as well?
48758 if (LHS.getOpcode() == ISD::TRUNCATE &&
48759 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
48760 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
48761 EVT SrcVT = LHS.getOperand(0).getValueType();
48762 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
48763 OpVT.getScalarSizeInBits());
48764 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48765 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
48766 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
48767 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
48768 DAG.getConstant(0, DL, SrcVT), CC);
48769 }
48770 }
48771 }
48772
48773 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
48774 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
48775 // Using temporaries to avoid messing up operand ordering for later
48776 // transformations if this doesn't work.
48777 SDValue Op0 = LHS;
48778 SDValue Op1 = RHS;
48779 ISD::CondCode TmpCC = CC;
48780 // Put build_vector on the right.
48781 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
48782 std::swap(Op0, Op1);
48783 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
48784 }
48785
48786 bool IsSEXT0 =
48787 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
48788 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
48789 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
48790
48791 if (IsSEXT0 && IsVZero1) {
48792 assert(VT == Op0.getOperand(0).getValueType() &&
48793        "Unexpected operand type");
48794 if (TmpCC == ISD::SETGT)
48795 return DAG.getConstant(0, DL, VT);
48796 if (TmpCC == ISD::SETLE)
48797 return DAG.getConstant(1, DL, VT);
48798 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
48799 return DAG.getNOT(DL, Op0.getOperand(0), VT);
48800
48801 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
48802        "Unexpected condition code!");
48803 return Op0.getOperand(0);
48804 }
48805 }
48806
48807 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
48808 // pre-promote its result type since vXi1 vectors don't get promoted
48809 // during type legalization.
48810 // NOTE: The element count check is to ignore operand types that need to
48811 // go through type promotion to a 128-bit vector.
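  // Illustrative sketch (not from the original comments):
  //   (v16i1 setcc (v16i8 A), (v16i8 B), cc)
  // is re-issued as a v16i8 setcc followed by a truncate to v16i1, so the
  // byte compare is selected directly instead of promoting the vXi1 result.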
48812 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
48813 VT.getVectorElementType() == MVT::i1 &&
48814 (OpVT.getVectorElementType() == MVT::i8 ||
48815 OpVT.getVectorElementType() == MVT::i16)) {
48816 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
48817 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
48818 }
48819
48820 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
48821 // to avoid scalarization via legalization because v4i32 is not a legal type.
48822 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
48823 LHS.getValueType() == MVT::v4f32)
48824 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
48825
48826 return SDValue();
48827}
48828
48829static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
48830 TargetLowering::DAGCombinerInfo &DCI,
48831 const X86Subtarget &Subtarget) {
48832 SDValue Src = N->getOperand(0);
48833 MVT SrcVT = Src.getSimpleValueType();
48834 MVT VT = N->getSimpleValueType(0);
48835 unsigned NumBits = VT.getScalarSizeInBits();
48836 unsigned NumElts = SrcVT.getVectorNumElements();
48837
48838 // Perform constant folding.
48839 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
48840 assert(VT == MVT::i32 && "Unexpected result type");
48841 APInt Imm(32, 0);
48842 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
48843 if (!Src.getOperand(Idx).isUndef() &&
48844 Src.getConstantOperandAPInt(Idx).isNegative())
48845 Imm.setBit(Idx);
48846 }
48847 return DAG.getConstant(Imm, SDLoc(N), VT);
48848 }
48849
48850 // Look through int->fp bitcasts that don't change the element width.
48851 unsigned EltWidth = SrcVT.getScalarSizeInBits();
48852 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
48853 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
48854 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
48855
48856 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
48857 // with scalar comparisons.
48858 if (SDValue NotSrc = IsNOT(Src, DAG)) {
48859 SDLoc DL(N);
48860 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48861 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
48862 return DAG.getNode(ISD::XOR, DL, VT,
48863 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
48864 DAG.getConstant(NotMask, DL, VT));
48865 }
48866
48867 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
48868 // results with scalar comparisons.
48869 if (Src.getOpcode() == X86ISD::PCMPGT &&
48870 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
48871 SDLoc DL(N);
48872 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
48873 return DAG.getNode(ISD::XOR, DL, VT,
48874 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
48875 DAG.getConstant(NotMask, DL, VT));
48876 }
48877
48878 // Simplify the inputs.
48879 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48880 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
48881 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
48882 return SDValue(N, 0);
48883
48884 return SDValue();
48885}
48886
48887static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
48888 TargetLowering::DAGCombinerInfo &DCI) {
48889 // With vector masks we only demand the upper bit of the mask.
48890 SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
48891 if (Mask.getScalarValueSizeInBits() != 1) {
48892 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48893 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48894 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48895 if (N->getOpcode() != ISD::DELETED_NODE)
48896 DCI.AddToWorklist(N);
48897 return SDValue(N, 0);
48898 }
48899 }
48900
48901 return SDValue();
48902}
48903
48904static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
48905 SDValue Index, SDValue Base, SDValue Scale,
48906 SelectionDAG &DAG) {
48907 SDLoc DL(GorS);
48908
48909 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
48910 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
48911 Gather->getMask(), Base, Index, Scale } ;
48912 return DAG.getMaskedGather(Gather->getVTList(),
48913 Gather->getMemoryVT(), DL, Ops,
48914 Gather->getMemOperand(),
48915 Gather->getIndexType(),
48916 Gather->getExtensionType());
48917 }
48918 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
48919 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
48920 Scatter->getMask(), Base, Index, Scale };
48921 return DAG.getMaskedScatter(Scatter->getVTList(),
48922 Scatter->getMemoryVT(), DL,
48923 Ops, Scatter->getMemOperand(),
48924 Scatter->getIndexType(),
48925 Scatter->isTruncatingStore());
48926}
48927
48928static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
48929 TargetLowering::DAGCombinerInfo &DCI) {
48930 SDLoc DL(N);
48931 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
48932 SDValue Index = GorS->getIndex();
48933 SDValue Base = GorS->getBasePtr();
48934 SDValue Scale = GorS->getScale();
48935
48936 if (DCI.isBeforeLegalize()) {
48937 unsigned IndexWidth = Index.getScalarValueSizeInBits();
48938
48939 // Shrink constant indices if they are larger than 32-bits.
48940 // Only do this before legalize types since v2i64 could become v2i32.
48941 // FIXME: We could check that the type is legal if we're after legalize
48942 // types, but then we would need to construct test cases where that happens.
48943 // FIXME: We could support more than just constant vectors, but we need to
48944 // be careful with costing. A truncate that can be optimized out would be fine.
48945 // Otherwise we might only want to create a truncate if it avoids a split.
48946 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
48947 if (BV->isConstant() && IndexWidth > 32 &&
48948 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48949 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48950 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48951 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48952 }
48953 }
48954
48955 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
48956 // there are sufficient sign bits. Only do this before legalize types to
48957 // avoid creating illegal types in truncate.
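    // Illustrative sketch (not from the original comments): a v4i64 index
    // built as (sign_extend v4i32 I) is truncated back to v4i32 here;
    // ComputeNumSignBits shows the top 32 bits are all sign-bit copies, so
    // the truncate is lossless.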
48958 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
48959 Index.getOpcode() == ISD::ZERO_EXTEND) &&
48960 IndexWidth > 32 &&
48961 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
48962 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
48963 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
48964 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
48965 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48966 }
48967 }
48968
48969 if (DCI.isBeforeLegalizeOps()) {
48970 unsigned IndexWidth = Index.getScalarValueSizeInBits();
48971
48972 // Make sure the index is either i32 or i64
48973 if (IndexWidth != 32 && IndexWidth != 64) {
48974 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
48975 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
48976 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
48977 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
48978 }
48979 }
48980
48981 // With vector masks we only demand the upper bit of the mask.
48982 SDValue Mask = GorS->getMask();
48983 if (Mask.getScalarValueSizeInBits() != 1) {
48984 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48985 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
48986 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
48987 if (N->getOpcode() != ISD::DELETED_NODE)
48988 DCI.AddToWorklist(N);
48989 return SDValue(N, 0);
48990 }
48991 }
48992
48993 return SDValue();
48994}
48995
48996// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
48997static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
48998 const X86Subtarget &Subtarget) {
48999 SDLoc DL(N);
49000 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
49001 SDValue EFLAGS = N->getOperand(1);
49002
49003 // Try to simplify the EFLAGS and condition code operands.
49004 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
49005 return getSETCC(CC, Flags, DL, DAG);
49006
49007 return SDValue();
49008}
49009
49010/// Optimize branch condition evaluation.
49011static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
49012 const X86Subtarget &Subtarget) {
49013 SDLoc DL(N);
49014 SDValue EFLAGS = N->getOperand(3);
49015 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
49016
49017 // Try to simplify the EFLAGS and condition code operands.
49018 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
49019 // RAUW them under us.
49020 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
49021 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
49022 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
49023 N->getOperand(1), Cond, Flags);
49024 }
49025
49026 return SDValue();
49027}
49028
49029// TODO: Could we move this to DAGCombine?
49030static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
49031 SelectionDAG &DAG) {
49032 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
49033 // to optimize away the operation when it's from a constant.
49034 //
49035 // The general transformation is:
49036 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
49037 // AND(VECTOR_CMP(x,y), constant2)
49038 // constant2 = UNARYOP(constant)
49039
49040 // Early exit if this isn't a vector operation, the operand of the
49041 // unary operation isn't a bitwise AND, or if the sizes of the operations
49042 // aren't the same.
49043 EVT VT = N->getValueType(0);
49044 bool IsStrict = N->isStrictFPOpcode();
49045 unsigned NumEltBits = VT.getScalarSizeInBits();
49046 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49047 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
49048 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
49049 VT.getSizeInBits() != Op0.getValueSizeInBits())
49050 return SDValue();
49051
49052 // Now check that the other operand of the AND is a constant. We could
49053 // make the transformation for non-constant splats as well, but it's unclear
49054 // that would be a benefit as it would not eliminate any operations, just
49055 // perform one more step in scalar code before moving to the vector unit.
49056 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
49057 // Bail out if the vector isn't a constant.
49058 if (!BV->isConstant())
49059 return SDValue();
49060
49061 // Everything checks out. Build up the new and improved node.
49062 SDLoc DL(N);
49063 EVT IntVT = BV->getValueType(0);
49064 // Create a new constant of the appropriate type for the transformed
49065 // DAG.
49066 SDValue SourceConst;
49067 if (IsStrict)
49068 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
49069 {N->getOperand(0), SDValue(BV, 0)});
49070 else
49071 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
49072 // The AND node needs bitcasts to/from an integer vector type around it.
49073 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
49074 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
49075 MaskConst);
49076 SDValue Res = DAG.getBitcast(VT, NewAnd);
49077 if (IsStrict)
49078 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
49079 return Res;
49080 }
49081
49082 return SDValue();
49083}
49084
49085/// If we are converting a value to floating-point, try to replace scalar
49086/// truncate of an extracted vector element with a bitcast. This tries to keep
49087/// the sequence on XMM registers rather than moving between vector and GPRs.
49088static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
49089 // TODO: This is currently only used by combineSIntToFP, but it is generalized
49090 // to allow being called by any similar cast opcode.
49091 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
49092 SDValue Trunc = N->getOperand(0);
49093 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
49094 return SDValue();
49095
49096 SDValue ExtElt = Trunc.getOperand(0);
49097 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49098 !isNullConstant(ExtElt.getOperand(1)))
49099 return SDValue();
49100
49101 EVT TruncVT = Trunc.getValueType();
49102 EVT SrcVT = ExtElt.getValueType();
49103 unsigned DestWidth = TruncVT.getSizeInBits();
49104 unsigned SrcWidth = SrcVT.getSizeInBits();
49105 if (SrcWidth % DestWidth != 0)
49106 return SDValue();
49107
49108 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
49109 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
49110 unsigned VecWidth = SrcVecVT.getSizeInBits();
49111 unsigned NumElts = VecWidth / DestWidth;
49112 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
49113 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
49114 SDLoc DL(N);
49115 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
49116 BitcastVec, ExtElt.getOperand(1));
49117 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
49118}
49119
49120static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
49121 const X86Subtarget &Subtarget) {
49122 bool IsStrict = N->isStrictFPOpcode();
49123 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49124 EVT VT = N->getValueType(0);
49125 EVT InVT = Op0.getValueType();
49126
49127 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
49128 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
49129 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
49130 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
49131 SDLoc dl(N);
49132 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
49133 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
49134
49135 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
49136 if (IsStrict)
49137 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49138 {N->getOperand(0), P});
49139 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49140 }
49141
49142 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
49143 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
49144 // the optimization here.
49145 if (DAG.SignBitIsZero(Op0)) {
49146 if (IsStrict)
49147 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
49148 {N->getOperand(0), Op0});
49149 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
49150 }
49151
49152 return SDValue();
49153}
49154
49155static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
49156 TargetLowering::DAGCombinerInfo &DCI,
49157 const X86Subtarget &Subtarget) {
49158 // First try to optimize away the conversion entirely when it's
49159 // conditionally from a constant. Vectors only.
49160 bool IsStrict = N->isStrictFPOpcode();
49161 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
49162 return Res;
49163
49164 // Now move on to more general possibilities.
49165 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49166 EVT VT = N->getValueType(0);
49167 EVT InVT = Op0.getValueType();
49168
49169 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
49170 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
49171 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
49172 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
49173 SDLoc dl(N);
49174 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
49175 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
49176 if (IsStrict)
49177 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49178 {N->getOperand(0), P});
49179 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49180 }
49181
49182 // Without AVX512DQ we only support i64 to float scalar conversion. For both
49183 // vectors and scalars, see if we know that the upper bits are all the sign
49184 // bit, in which case we can truncate the input to i32 and convert from that.
49185 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
49186 unsigned BitWidth = InVT.getScalarSizeInBits();
49187 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
49188 if (NumSignBits >= (BitWidth - 31)) {
49189 EVT TruncVT = MVT::i32;
49190 if (InVT.isVector())
49191 TruncVT = InVT.changeVectorElementType(TruncVT);
49192 SDLoc dl(N);
49193 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
49194 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
49195 if (IsStrict)
49196 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49197 {N->getOperand(0), Trunc});
49198 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
49199 }
49200 // If we're after legalize and the type is v2i32 we need to shuffle and
49201 // use CVTSI2P.
49202 assert(InVT == MVT::v2i64 && "Unexpected VT!");
49203 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
49204 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
49205 { 0, 2, -1, -1 });
49206 if (IsStrict)
49207 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
49208 {N->getOperand(0), Shuf});
49209 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
49210 }
49211 }
49212
49213 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
49214 // a 32-bit target where SSE doesn't support i64->FP operations.
49215 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
49216 Op0.getOpcode() == ISD::LOAD) {
49217 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
49218
49219 // This transformation is not supported if the result type is f16 or f128.
49220 if (VT == MVT::f16 || VT == MVT::f128)
49221 return SDValue();
49222
49223 // If we have AVX512DQ we can use packed conversion instructions unless
49224 // the VT is f80.
49225 if (Subtarget.hasDQI() && VT != MVT::f80)
49226 return SDValue();
49227
49228 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
49229 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
49230 std::pair<SDValue, SDValue> Tmp =
49231 Subtarget.getTargetLowering()->BuildFILD(
49232 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
49233 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
49234 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
49235 return Tmp.first;
49236 }
49237 }
49238
49239 if (IsStrict)
49240 return SDValue();
49241
49242 if (SDValue V = combineToFPTruncExtElt(N, DAG))
49243 return V;
49244
49245 return SDValue();
49246}
49247
49248static bool needCarryOrOverflowFlag(SDValue Flags) {
49249 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
49250
49251 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
49252 UI != UE; ++UI) {
49253 SDNode *User = *UI;
49254
49255 X86::CondCode CC;
49256 switch (User->getOpcode()) {
49257 default:
49258 // Be conservative.
49259 return true;
49260 case X86ISD::SETCC:
49261 case X86ISD::SETCC_CARRY:
49262 CC = (X86::CondCode)User->getConstantOperandVal(0);
49263 break;
49264 case X86ISD::BRCOND:
49265 CC = (X86::CondCode)User->getConstantOperandVal(2);
49266 break;
49267 case X86ISD::CMOV:
49268 CC = (X86::CondCode)User->getConstantOperandVal(2);
49269 break;
49270 }
49271
49272 switch (CC) {
49273 default: break;
49274 case X86::COND_A: case X86::COND_AE:
49275 case X86::COND_B: case X86::COND_BE:
49276 case X86::COND_O: case X86::COND_NO:
49277 case X86::COND_G: case X86::COND_GE:
49278 case X86::COND_L: case X86::COND_LE:
49279 return true;
49280 }
49281 }
49282
49283 return false;
49284}
49285
49286static bool onlyZeroFlagUsed(SDValue Flags) {
49287 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
49288
49289 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
49290 UI != UE; ++UI) {
49291 SDNode *User = *UI;
49292
49293 unsigned CCOpNo;
49294 switch (User->getOpcode()) {
49295 default:
49296 // Be conservative.
49297 return false;
49298 case X86ISD::SETCC: CCOpNo = 0; break;
49299 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
49300 case X86ISD::BRCOND: CCOpNo = 2; break;
49301 case X86ISD::CMOV: CCOpNo = 2; break;
49302 }
49303
49304 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
49305 if (CC != X86::COND_E && CC != X86::COND_NE)
49306 return false;
49307 }
49308
49309 return true;
49310}
49311
49312static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
49313 // Only handle test patterns.
49314 if (!isNullConstant(N->getOperand(1)))
49315 return SDValue();
49316
49317 // If we have a CMP of a truncated binop, see if we can make a smaller binop
49318 // and use its flags directly.
49319 // TODO: Maybe we should try promoting compares that only use the zero flag
49320 // first if we can prove the upper bits with computeKnownBits?
49321 SDLoc dl(N);
49322 SDValue Op = N->getOperand(0);
49323 EVT VT = Op.getValueType();
49324
49325 // If we have a constant logical shift that's only used in a comparison
49326 // against zero, turn it into an equivalent AND. This allows turning it into
49327 // a TEST instruction later.
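  // Illustrative sketch (not from the original comments): with only ZF
  // consumed, (cmp (srl i32 X, 8), 0) is equivalent to
  // (cmp (and i32 X, 0xFFFFFF00), 0), which isel can select as a TEST.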
49328 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
49329 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
49330 onlyZeroFlagUsed(SDValue(N, 0))) {
49331 unsigned BitWidth = VT.getSizeInBits();
49332 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
49333 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
49334 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
49335 APInt Mask = Op.getOpcode() == ISD::SRL
49336 ? APInt::getHighBitsSet(BitWidth, MaskBits)
49337 : APInt::getLowBitsSet(BitWidth, MaskBits);
49338 if (Mask.isSignedIntN(32)) {
49339 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
49340 DAG.getConstant(Mask, dl, VT));
49341 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49342 DAG.getConstant(0, dl, VT));
49343 }
49344 }
49345 }
49346
49347 // Look for a truncate.
49348 if (Op.getOpcode() != ISD::TRUNCATE)
49349 return SDValue();
49350
49351 SDValue Trunc = Op;
49352 Op = Op.getOperand(0);
49353
49354 // See if we can compare with zero against the truncation source,
49355 // which should help using the Z flag from many ops. Only do this for
49356 // i32 truncated op to prevent partial-reg compares of promoted ops.
49357 EVT OpVT = Op.getValueType();
49358 APInt UpperBits =
49359 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
49360 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
49361 onlyZeroFlagUsed(SDValue(N, 0))) {
49362 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49363 DAG.getConstant(0, dl, OpVT));
49364 }
49365
49366 // After this the truncate and arithmetic op must have a single use.
49367 if (!Trunc.hasOneUse() || !Op.hasOneUse())
49368 return SDValue();
49369
49370 unsigned NewOpc;
49371 switch (Op.getOpcode()) {
49372 default: return SDValue();
49373 case ISD::AND:
49374 // Skip AND with a constant. We have special handling for AND with an immediate
49375 // during isel to generate test instructions.
49376 if (isa<ConstantSDNode>(Op.getOperand(1)))
49377 return SDValue();
49378 NewOpc = X86ISD::AND;
49379 break;
49380 case ISD::OR: NewOpc = X86ISD::OR; break;
49381 case ISD::XOR: NewOpc = X86ISD::XOR; break;
49382 case ISD::ADD:
49383 // If the carry or overflow flag is used, we can't truncate.
49384 if (needCarryOrOverflowFlag(SDValue(N, 0)))
49385 return SDValue();
49386 NewOpc = X86ISD::ADD;
49387 break;
49388 case ISD::SUB:
49389 // If the carry or overflow flag is used, we can't truncate.
49390 if (needCarryOrOverflowFlag(SDValue(N, 0)))
49391 return SDValue();
49392 NewOpc = X86ISD::SUB;
49393 break;
49394 }
49395
49396 // We found an op we can narrow. Truncate its inputs.
49397 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
49398 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
49399
49400 // Use a X86 specific opcode to avoid DAG combine messing with it.
49401 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49402 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
49403
49404 // For AND, keep a CMP so that we can match the test pattern.
49405 if (NewOpc == X86ISD::AND)
49406 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
49407 DAG.getConstant(0, dl, VT));
49408
49409 // Return the flags.
49410 return Op.getValue(1);
49411}
49412
49413static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
49414 TargetLowering::DAGCombinerInfo &DCI) {
49415 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
49416        "Expected X86ISD::ADD or X86ISD::SUB");
49417
49418 SDLoc DL(N);
49419 SDValue LHS = N->getOperand(0);
49420 SDValue RHS = N->getOperand(1);
49421 MVT VT = LHS.getSimpleValueType();
49422 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
49423
49424 // If we don't use the flag result, simplify back to a generic ADD/SUB.
49425 if (!N->hasAnyUseOfValue(1)) {
49426 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
49427 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
49428 }
49429
49430 // Fold any similar generic ADD/SUB opcodes to reuse this node.
49431 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
49432 SDValue Ops[] = {N0, N1};
49433 SDVTList VTs = DAG.getVTList(N->getValueType(0));
49434 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
49435 SDValue Op(N, 0);
49436 if (Negate)
49437 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
49438 DCI.CombineTo(GenericAddSub, Op);
49439 }
49440 };
49441 MatchGeneric(LHS, RHS, false);
49442 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
49443
49444 return SDValue();
49445}
49446
49447static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
49448 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49449 MVT VT = N->getSimpleValueType(0);
49450 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49451 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
49452 N->getOperand(0), N->getOperand(1),
49453 Flags);
49454 }
49455
49456 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
49457 // iff the flag result is dead.
49458 SDValue Op0 = N->getOperand(0);
49459 SDValue Op1 = N->getOperand(1);
49460 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
49461 !N->hasAnyUseOfValue(1))
49462 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
49463 Op0.getOperand(1), N->getOperand(2));
49464
49465 return SDValue();
49466}
49467
49468// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
49469static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
49470 TargetLowering::DAGCombinerInfo &DCI) {
49471 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
49472 // the result is either zero or one (depending on the input carry bit).
49473 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
49474 if (X86::isZeroNode(N->getOperand(0)) &&
49475 X86::isZeroNode(N->getOperand(1)) &&
49476 // We don't have a good way to replace an EFLAGS use, so only do this when
49477 // it is dead right now.
49478 SDValue(N, 1).use_empty()) {
49479 SDLoc DL(N);
49480 EVT VT = N->getValueType(0);
49481 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
49482 SDValue Res1 =
49483 DAG.getNode(ISD::AND, DL, VT,
49484 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49485 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49486 N->getOperand(2)),
49487 DAG.getConstant(1, DL, VT));
49488 return DCI.CombineTo(N, Res1, CarryOut);
49489 }
49490
49491 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
49492 MVT VT = N->getSimpleValueType(0);
49493 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49494 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
49495 N->getOperand(0), N->getOperand(1),
49496 Flags);
49497 }
49498
49499 return SDValue();
49500}
49501
49502/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49503/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49504/// with CMP+{ADC, SBB}.
49505static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49506 bool IsSub = N->getOpcode() == ISD::SUB;
49507 SDValue X = N->getOperand(0);
49508 SDValue Y = N->getOperand(1);
49509
49510 // If this is an add, canonicalize a zext operand to the RHS.
49511 // TODO: Incomplete? What if both sides are zexts?
49512 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
49513 Y.getOpcode() != ISD::ZERO_EXTEND)
49514 std::swap(X, Y);
49515
49516 // Look through a one-use zext.
49517 bool PeekedThroughZext = false;
49518 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
49519 Y = Y.getOperand(0);
49520 PeekedThroughZext = true;
49521 }
49522
49523 // If this is an add, canonicalize a setcc operand to the RHS.
49524 // TODO: Incomplete? What if both sides are setcc?
49525 // TODO: Should we allow peeking through a zext of the other operand?
49526 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
49527 Y.getOpcode() != X86ISD::SETCC)
49528 std::swap(X, Y);
49529
49530 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
49531 return SDValue();
49532
49533 SDLoc DL(N);
49534 EVT VT = N->getValueType(0);
49535 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
49536
49537 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49538 // the general case below.
49539 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49540 if (ConstantX) {
49541 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
49542 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
49543 // This is a complicated way to get -1 or 0 from the carry flag:
49544 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49545 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49546 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49547 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49548 Y.getOperand(1));
49549 }
49550
49551 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
49552 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
49553 SDValue EFLAGS = Y->getOperand(1);
49554 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49555 EFLAGS.getValueType().isInteger() &&
49556 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49557 // Swap the operands of a SUB, and we have the same pattern as above.
49558 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49559 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
49560 SDValue NewSub = DAG.getNode(
49561 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49562 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49563 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49564 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49565 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49566 NewEFLAGS);
49567 }
49568 }
49569 }
49570
49571 if (CC == X86::COND_B) {
49572 // X + SETB Z --> adc X, 0
49573 // X - SETB Z --> sbb X, 0
49574 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49575 DAG.getVTList(VT, MVT::i32), X,
49576 DAG.getConstant(0, DL, VT), Y.getOperand(1));
49577 }
49578
49579 if (CC == X86::COND_A) {
49580 SDValue EFLAGS = Y.getOperand(1);
49581 // Try to convert COND_A into COND_B in an attempt to facilitate
49582 // materializing "setb reg".
49583 //
49584 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
49585 // cannot take an immediate as its first operand.
49586 //
49587 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49588 EFLAGS.getValueType().isInteger() &&
49589 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49590 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
49591 EFLAGS.getNode()->getVTList(),
49592 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49593 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49594 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49595 DAG.getVTList(VT, MVT::i32), X,
49596 DAG.getConstant(0, DL, VT), NewEFLAGS);
49597 }
49598 }
49599
49600 if (CC == X86::COND_AE) {
49601 // X + SETAE --> sbb X, -1
49602 // X - SETAE --> adc X, -1
49603 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49604 DAG.getVTList(VT, MVT::i32), X,
49605 DAG.getConstant(-1, DL, VT), Y.getOperand(1));
49606 }
49607
49608 if (CC == X86::COND_BE) {
49609 // X + SETBE --> sbb X, -1
49610 // X - SETBE --> adc X, -1
49611 SDValue EFLAGS = Y.getOperand(1);
49612 // Try to convert COND_BE into COND_AE in an attempt to facilitate
49613 // materializing "setae reg".
49614 //
49615 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
49616 // cannot take an immediate as its first operand.
49617 //
49618 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49619 EFLAGS.getValueType().isInteger() &&
49620 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49621 SDValue NewSub = DAG.getNode(
49622 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49623 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49624 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49625 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49626 DAG.getVTList(VT, MVT::i32), X,
49627 DAG.getConstant(-1, DL, VT), NewEFLAGS);
49628 }
49629 }
49630
49631 if (CC != X86::COND_E && CC != X86::COND_NE)
49632 return SDValue();
49633
49634 SDValue Cmp = Y.getOperand(1);
49635 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
49636 !X86::isZeroNode(Cmp.getOperand(1)) ||
49637 !Cmp.getOperand(0).getValueType().isInteger())
49638 return SDValue();
49639
49640 SDValue Z = Cmp.getOperand(0);
49641 EVT ZVT = Z.getValueType();
49642
49643 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49644 // the general case below.
49645 if (ConstantX) {
49646 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49647 // fake operands:
49648 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49649 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49650 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
49651 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
49652 SDValue Zero = DAG.getConstant(0, DL, ZVT);
49653 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49654 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49655 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49656 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49657 SDValue(Neg.getNode(), 1));
49658 }
49659
49660 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49661 // with fake operands:
49662 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49663 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49664 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
49665 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
49666 SDValue One = DAG.getConstant(1, DL, ZVT);
49667 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49668 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49669 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49670 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49671 Cmp1.getValue(1));
49672 }
49673 }
49674
49675 // (cmp Z, 1) sets the carry flag if Z is 0.
49676 SDValue One = DAG.getConstant(1, DL, ZVT);
49677 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49678 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49679
49680 // Add the flags type for ADC/SBB nodes.
49681 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49682
49683 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49684 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49685 if (CC == X86::COND_NE)
49686 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49687 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49688
49689 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
49690 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
49691 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49692 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49693}
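
For illustration only (this sketch is not part of X86ISelLowering.cpp): the COND_E/COND_NE folds above rest on a simple carry identity. Assuming the usual x86 flag semantics, where (cmp Z, 1) sets CF exactly when Z == 0, sbb computes X - Op - CF and adc computes X + Op + CF, the "add" forms of the fold can be checked directly; the "sub" forms follow by the same reasoning with the opcodes swapped.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Xs[] = {0u, 1u, 7u, 0xFFFFFFFFu};
  const uint32_t Zs[] = {0u, 1u, 42u};
  for (uint32_t X : Xs) {
    for (uint32_t Z : Zs) {
      uint32_t CF = (Z < 1u) ? 1u : 0u;      // carry out of (cmp Z, 1): set iff Z == 0
      uint32_t Sbb = X - uint32_t(-1) - CF;  // sbb X, -1  ==  X + 1 - CF
      uint32_t Adc = X + 0u + CF;            // adc X, 0   ==  X + CF
      assert(Sbb == X + (Z != 0 ? 1u : 0u)); // X + (Z != 0) --> sbb X, -1, (cmp Z, 1)
      assert(Adc == X + (Z == 0 ? 1u : 0u)); // X + (Z == 0) --> adc X, 0, (cmp Z, 1)
    }
  }
  return 0;
}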
49694
49695static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
49696 const SDLoc &DL, EVT VT,
49697 const X86Subtarget &Subtarget) {
49698 // Example of pattern we try to detect:
49699 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
49700 //(add (build_vector (extract_elt t, 0),
49701 // (extract_elt t, 2),
49702 // (extract_elt t, 4),
49703 // (extract_elt t, 6)),
49704 // (build_vector (extract_elt t, 1),
49705 // (extract_elt t, 3),
49706 // (extract_elt t, 5),
49707 // (extract_elt t, 7)))
49708
49709 if (!Subtarget.hasSSE2())
49710 return SDValue();
49711
49712 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
49713 Op1.getOpcode() != ISD::BUILD_VECTOR)
49714 return SDValue();
49715
49716 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49717 VT.getVectorNumElements() < 4 ||
49718 !isPowerOf2_32(VT.getVectorNumElements()))
49719 return SDValue();
49720
49721 // Check if one of Op0,Op1 is of the form:
49722 // (build_vector (extract_elt Mul, 0),
49723 // (extract_elt Mul, 2),
49724 // (extract_elt Mul, 4),
49725 // ...
49726 // the other is of the form:
49727 // (build_vector (extract_elt Mul, 1),
49728 // (extract_elt Mul, 3),
49729 // (extract_elt Mul, 5),
49730 // ...
49731 // and identify Mul.
49732 SDValue Mul;
49733 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
49734 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
49735 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
49736 // TODO: Be more tolerant to undefs.
49737 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49738 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49739 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49740 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49741 return SDValue();
49742 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
49743 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
49744 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
49745 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
49746 if (!Const0L || !Const1L || !Const0H || !Const1H)
49747 return SDValue();
49748 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
49749 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
49750 // Commutativity of mul allows factors of a product to reorder.
49751 if (Idx0L > Idx1L)
49752 std::swap(Idx0L, Idx1L);
49753 if (Idx0H > Idx1H)
49754 std::swap(Idx0H, Idx1H);
49755 // Commutativity of add allows pairs of factors to reorder.
49756 if (Idx0L > Idx0H) {
49757 std::swap(Idx0L, Idx0H);
49758 std::swap(Idx1L, Idx1H);
49759 }
49760 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
49761 Idx1H != 2 * i + 3)
49762 return SDValue();
49763 if (!Mul) {
49764 // First time an extract_elt's source vector is visited. It must be a MUL
49765 // with 2X the number of vector elements of the BUILD_VECTOR.
49766 // Both extracts must be from the same MUL.
49767 Mul = Op0L->getOperand(0);
49768 if (Mul->getOpcode() != ISD::MUL ||
49769 Mul.getValueType().getVectorNumElements() != 2 * e)
49770 return SDValue();
49771 }
49772 // Check that the extract is from the same MUL previously seen.
49773 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
49774 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
49775 return SDValue();
49776 }
49777
49778 // Check if the Mul source can be safely shrunk.
49779 ShrinkMode Mode;
49780 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
49781 Mode == ShrinkMode::MULU16)
49782 return SDValue();
49783
49784 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49785 VT.getVectorNumElements() * 2);
49786 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
49787 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
49788
49789 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49790 ArrayRef<SDValue> Ops) {
49791 EVT InVT = Ops[0].getValueType();
49792 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
49793 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49794 InVT.getVectorNumElements() / 2);
49795 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49796 };
49797 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
49798}
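
For reference, a scalar model (an editorial sketch, not LLVM code) of what the X86ISD::VPMADDWD node built by PMADDBuilder computes per 128-bit lane: adjacent signed 16-bit products are formed with 32-bit precision and summed pairwise, which is exactly the (extract 2*i) + (extract 2*i+1) pattern matched above.

#include <cassert>
#include <cstdint>

int main() {
  // One 128-bit lane: 8 x i16 inputs produce 4 x i32 outputs.
  const int16_t A[8] = {1, -2, 3, 4, -5, 6, 7, 8};
  const int16_t B[8] = {9, 10, -11, 12, 13, 14, 15, -16};
  int32_t R[4];
  for (int i = 0; i != 4; ++i)
    R[i] = int32_t(A[2 * i]) * int32_t(B[2 * i]) +
           int32_t(A[2 * i + 1]) * int32_t(B[2 * i + 1]);
  assert(R[0] == 1 * 9 + (-2) * 10);  // == -11
  assert(R[1] == 3 * -11 + 4 * 12);   // == 15
  return 0;
}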
49799
49800// Attempt to turn this pattern into PMADDWD.
49801// (add (mul (sext (build_vector)), (sext (build_vector))),
49802// (mul (sext (build_vector)), (sext (build_vector)))
49803static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
49804 const SDLoc &DL, EVT VT,
49805 const X86Subtarget &Subtarget) {
49806 if (!Subtarget.hasSSE2())
49807 return SDValue();
49808
49809 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
49810 return SDValue();
49811
49812 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
49813 VT.getVectorNumElements() < 4 ||
49814 !isPowerOf2_32(VT.getVectorNumElements()))
49815 return SDValue();
49816
49817 SDValue N00 = N0.getOperand(0);
49818 SDValue N01 = N0.getOperand(1);
49819 SDValue N10 = N1.getOperand(0);
49820 SDValue N11 = N1.getOperand(1);
49821
49822 // All inputs need to be sign extends.
49823 // TODO: Support ZERO_EXTEND from known positive?
49824 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
49825 N01.getOpcode() != ISD::SIGN_EXTEND ||
49826 N10.getOpcode() != ISD::SIGN_EXTEND ||
49827 N11.getOpcode() != ISD::SIGN_EXTEND)
49828 return SDValue();
49829
49830 // Peek through the extends.
49831 N00 = N00.getOperand(0);
49832 N01 = N01.getOperand(0);
49833 N10 = N10.getOperand(0);
49834 N11 = N11.getOperand(0);
49835
49836 // Must be extending from vXi16.
49837 EVT InVT = N00.getValueType();
49838 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
49839 N10.getValueType() != InVT || N11.getValueType() != InVT)
49840 return SDValue();
49841
49842 // All inputs should be build_vectors.
49843 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
49844 N01.getOpcode() != ISD::BUILD_VECTOR ||
49845 N10.getOpcode() != ISD::BUILD_VECTOR ||
49846 N11.getOpcode() != ISD::BUILD_VECTOR)
49847 return SDValue();
49848
49849 // For each element, we need an odd element from one vector multiplied by
49850 // the odd element of the other vector, and the even element from one of
49851 // the same vectors multiplied by the even element from the other vector.
49852 // That is, for each element i, we need to make sure this operation is
49853 // being performed:
49854 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
49855 SDValue In0, In1;
49856 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
49857 SDValue N00Elt = N00.getOperand(i);
49858 SDValue N01Elt = N01.getOperand(i);
49859 SDValue N10Elt = N10.getOperand(i);
49860 SDValue N11Elt = N11.getOperand(i);
49861 // TODO: Be more tolerant to undefs.
49862 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49863 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49864 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49865 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
49866 return SDValue();
49867 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
49868 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
49869 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
49870 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
49871 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
49872 return SDValue();
49873 unsigned IdxN00 = ConstN00Elt->getZExtValue();
49874 unsigned IdxN01 = ConstN01Elt->getZExtValue();
49875 unsigned IdxN10 = ConstN10Elt->getZExtValue();
49876 unsigned IdxN11 = ConstN11Elt->getZExtValue();
49877 // Add is commutative so indices can be reordered.
49878 if (IdxN00 > IdxN10) {
49879 std::swap(IdxN00, IdxN10);
49880 std::swap(IdxN01, IdxN11);
49881 }
49882 // N0 indices must be the even elements. N1 indices must be the next odd elements.
49883 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
49884 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
49885 return SDValue();
49886 SDValue N00In = N00Elt.getOperand(0);
49887 SDValue N01In = N01Elt.getOperand(0);
49888 SDValue N10In = N10Elt.getOperand(0);
49889 SDValue N11In = N11Elt.getOperand(0);
49890
49891 // First time we find an input capture it.
49892 if (!In0) {
49893 In0 = N00In;
49894 In1 = N01In;
49895
49896 // The input vectors must be at least as wide as the output.
49897 // If they are larger than the output, we extract a subvector below.
49898 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
49899 In1.getValueSizeInBits() < VT.getSizeInBits())
49900 return SDValue();
49901 }
49902 // Mul is commutative so the input vectors can be in any order.
49903 // Canonicalize to make the compares easier.
49904 if (In0 != N00In)
49905 std::swap(N00In, N01In);
49906 if (In0 != N10In)
49907 std::swap(N10In, N11In);
49908 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
49909 return SDValue();
49910 }
49911
49912 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49913 ArrayRef<SDValue> Ops) {
49914 EVT OpVT = Ops[0].getValueType();
49915 assert(OpVT.getScalarType() == MVT::i16 &&
49916 "Unexpected scalar element type");
49917 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
49918 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
49919 OpVT.getVectorNumElements() / 2);
49920 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
49921 };
49922
49923 // If the output is narrower than an input, extract the low part of the input
49924 // vector.
49925 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
49926 VT.getVectorNumElements() * 2);
49927 if (OutVT16.bitsLT(In0.getValueType())) {
49928 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
49929 DAG.getIntPtrConstant(0, DL));
49930 }
49931 if (OutVT16.bitsLT(In1.getValueType())) {
49932 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
49933 DAG.getIntPtrConstant(0, DL));
49934 }
49935 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
49936 PMADDBuilder);
49937}
49938
49939/// CMOV of constants requires materializing constant operands in registers.
49940/// Try to fold those constants into an 'add' instruction to reduce instruction
49941 /// count. We do this with CMOV rather than the generic 'select' because there are
49942/// earlier folds that may be used to turn select-of-constants into logic hacks.
49943static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
49944 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
49945 // better because we eliminate 1-2 instructions. This transform is still
49946 // an improvement without zero operands because we trade 2 move constants and
49947 // 1 add for 2 adds (LEA) as long as the constants can be represented as
49948 // immediate asm operands (fit in 32-bits).
49949 auto isSuitableCmov = [](SDValue V) {
49950 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
49951 return false;
49952 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
49953 !isa<ConstantSDNode>(V.getOperand(1)))
49954 return false;
49955 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
49956 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
49957 V.getConstantOperandAPInt(1).isSignedIntN(32));
49958 };
49959
49960 // Match an appropriate CMOV as the first operand of the add.
49961 SDValue Cmov = N->getOperand(0);
49962 SDValue OtherOp = N->getOperand(1);
49963 if (!isSuitableCmov(Cmov))
49964 std::swap(Cmov, OtherOp);
49965 if (!isSuitableCmov(Cmov))
49966 return SDValue();
49967
49968 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
49969 EVT VT = N->getValueType(0);
49970 SDLoc DL(N);
49971 SDValue FalseOp = Cmov.getOperand(0);
49972 SDValue TrueOp = Cmov.getOperand(1);
49973 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
49974 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
49975 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
49976 Cmov.getOperand(3));
49977}
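
A small sketch of the identity pushAddIntoCmovOfConsts relies on (editorial; a plain ternary stands in for X86ISD::CMOV): adding OtherOp to a select of two constants is the same as selecting between the two pre-added constants, so the add can be pushed into both arms.

#include <cassert>

int main() {
  auto sameResult = [](bool Cond, int C1, int C2, int OtherOp) {
    int Before = (Cond ? C1 : C2) + OtherOp;             // add (cmov C1, C2), OtherOp
    int After = Cond ? (OtherOp + C1) : (OtherOp + C2);  // cmov (add OtherOp, C1), (add OtherOp, C2)
    return Before == After;
  };
  assert(sameResult(true, 0, 42, 7));
  assert(sameResult(false, -1, 5, 100));
  return 0;
}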
49978
49979static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
49980 TargetLowering::DAGCombinerInfo &DCI,
49981 const X86Subtarget &Subtarget) {
49982 EVT VT = N->getValueType(0);
49983 SDValue Op0 = N->getOperand(0);
49984 SDValue Op1 = N->getOperand(1);
49985
49986 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
49987 return Select;
49988
49989 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49990 return MAdd;
49991 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
49992 return MAdd;
49993
49994 // Try to synthesize horizontal adds from adds of shuffles.
49995 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
49996 return V;
49997
49998 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
49999 // (sub Y, (sext (vXi1 X))).
50000 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
50001 // generic DAG combine without a legal type check, but adding this there
50002 // caused regressions.
50003 if (VT.isVector()) {
50004 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50005 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
50006 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50007 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
50008 SDLoc DL(N);
50009 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
50010 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
50011 }
50012
50013 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
50014 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50015 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
50016 SDLoc DL(N);
50017 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
50018 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
50019 }
50020 }
50021
50022 return combineAddOrSubToADCOrSBB(N, DAG);
50023}
50024
50025static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
50026 TargetLowering::DAGCombinerInfo &DCI,
50027 const X86Subtarget &Subtarget) {
50028 SDValue Op0 = N->getOperand(0);
50029 SDValue Op1 = N->getOperand(1);
50030
50031 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
50032 auto IsNonOpaqueConstant = [&](SDValue Op) {
50033 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
50034 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
50035 return !Cst->isOpaque();
50036 return true;
50037 }
50038 return false;
50039 };
50040
50041 // X86 can't encode an immediate LHS of a sub. See if we can push the
50042 // negation into a preceding instruction. If the RHS of the sub is a XOR with
50043 // one use and a constant, invert the immediate, saving one register.
50044 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
50045 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
50046 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
50047 SDLoc DL(N);
50048 EVT VT = Op0.getValueType();
50049 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
50050 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
50051 SDValue NewAdd =
50052 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
50053 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
50054 }
50055
50056 // Try to synthesize horizontal subs from subs of shuffles.
50057 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
50058 return V;
50059
50060 return combineAddOrSubToADCOrSBB(N, DAG);
50061}
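
For illustration (not part of the source): the sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1) rewrite above is a two's-complement identity, since -(X ^ C2) == ~(X ^ C2) + 1 == (X ^ ~C2) + 1. A minimal check with wrapping unsigned arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Xs[] = {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu};
  const uint32_t C1 = 1000u, C2 = 0x0F0F0F0Fu;
  for (uint32_t X : Xs)
    // sub(C1, xor(X, C2)) == add(xor(X, ~C2), C1 + 1)
    assert(C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1u));
  return 0;
}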
50062
50063static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
50064 const X86Subtarget &Subtarget) {
50065 MVT VT = N->getSimpleValueType(0);
50066 SDLoc DL(N);
50067
50068 if (N->getOperand(0) == N->getOperand(1)) {
50069 if (N->getOpcode() == X86ISD::PCMPEQ)
50070 return DAG.getConstant(-1, DL, VT);
50071 if (N->getOpcode() == X86ISD::PCMPGT)
50072 return DAG.getConstant(0, DL, VT);
50073 }
50074
50075 return SDValue();
50076}
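
Trivially, a vector compared against itself is a constant, which is all combineVectorCompare uses; a scalar restatement (editorial):

#include <cassert>

int main() {
  int X = -7;
  assert(X == X);    // PCMPEQ(a, a): every lane is all-ones
  assert(!(X > X));  // PCMPGT(a, a): every lane is zero
  return 0;
}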
50077
50078/// Helper that combines an array of subvector ops as if they were the operands
50079/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
50080/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
50081static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
50082 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
50083 TargetLowering::DAGCombinerInfo &DCI,
50084 const X86Subtarget &Subtarget) {
50085 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
50086 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50087
50088 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
50089 return DAG.getUNDEF(VT);
50090
50091 if (llvm::all_of(Ops, [](SDValue Op) {
50092 return ISD::isBuildVectorAllZeros(Op.getNode());
50093 }))
50094 return getZeroVector(VT, Subtarget, DAG, DL);
50095
50096 SDValue Op0 = Ops[0];
50097 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
50098
50099 // Repeated subvectors.
50100 if (IsSplat &&
50101 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
50102 // If this broadcast is inserted into both halves, use a larger broadcast.
50103 if (Op0.getOpcode() == X86ISD::VBROADCAST)
50104 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
50105
50106 // If this scalar/subvector broadcast_load is inserted into both halves, use
50107 // a larger broadcast_load. Update other uses to use an extracted subvector.
50108 if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50109 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
50110 auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
50111 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50112 SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
50113 SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
50114 MemIntr->getMemoryVT(),
50115 MemIntr->getMemOperand());
50116 DAG.ReplaceAllUsesOfValueWith(
50117 Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50118 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50119 return BcastLd;
50120 }
50121
50122 // If this is a simple subvector load repeated across multiple lanes, then
50123 // broadcast the load. Update other uses to use an extracted subvector.
50124 if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
50125 if (Ld->isSimple() && !Ld->isNonTemporal() &&
50126 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
50127 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50128 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
50129 SDValue BcastLd =
50130 DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
50131 Ld->getMemoryVT(), Ld->getMemOperand());
50132 DAG.ReplaceAllUsesOfValueWith(
50133 Op0,
50134 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50135 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
50136 return BcastLd;
50137 }
50138 }
50139
50140 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
50141 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
50142 (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
50143 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
50144 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
50145 Op0.getOperand(0),
50146 DAG.getIntPtrConstant(0, DL)));
50147
50148 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
50149 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50150 (Subtarget.hasAVX2() ||
50151 (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
50152 Op0.getOperand(0).getValueType() == VT.getScalarType())
50153 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
50154
50155 // concat_vectors(extract_subvector(broadcast(x)),
50156 // extract_subvector(broadcast(x))) -> broadcast(x)
50157 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50158 Op0.getOperand(0).getValueType() == VT) {
50159 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
50160 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
50161 return Op0.getOperand(0);
50162 }
50163 }
50164
50165 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
50166 // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
50167 // TODO: This should go in combineX86ShufflesRecursively eventually.
50168 if (VT.is256BitVector() && Ops.size() == 2) {
50169 SDValue Src0 = peekThroughBitcasts(Ops[0]);
50170 SDValue Src1 = peekThroughBitcasts(Ops[1]);
50171 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50172 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
50173 EVT SrcVT0 = Src0.getOperand(0).getValueType();
50174 EVT SrcVT1 = Src1.getOperand(0).getValueType();
50175 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
50176 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
50177 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
50178 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
50179 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
50180 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
50181 DAG.getBitcast(VT, Src0.getOperand(0)),
50182 DAG.getBitcast(VT, Src1.getOperand(0)),
50183 DAG.getTargetConstant(0x31, DL, MVT::i8));
50184 }
50185 }
50186 }
50187
50188 // Repeated opcode.
50189 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
50190 // but it currently struggles with different vector widths.
50191 if (llvm::all_of(Ops, [Op0](SDValue Op) {
50192 return Op.getOpcode() == Op0.getOpcode();
50193 })) {
50194 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
50195 SmallVector<SDValue> Subs;
50196 for (SDValue SubOp : SubOps)
50197 Subs.push_back(SubOp.getOperand(I));
50198 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
50199 };
50200
50201 unsigned NumOps = Ops.size();
50202 switch (Op0.getOpcode()) {
50203 case X86ISD::SHUFP: {
50204 // Add SHUFPD support if/when necessary.
50205 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
50206 llvm::all_of(Ops, [Op0](SDValue Op) {
50207 return Op.getOperand(2) == Op0.getOperand(2);
50208 })) {
50209 return DAG.getNode(Op0.getOpcode(), DL, VT,
50210 ConcatSubOperand(VT, Ops, 0),
50211 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
50212 }
50213 break;
50214 }
50215 case X86ISD::PSHUFHW:
50216 case X86ISD::PSHUFLW:
50217 case X86ISD::PSHUFD:
50218 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
50219 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50220 return DAG.getNode(Op0.getOpcode(), DL, VT,
50221 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50222 }
50223 LLVM_FALLTHROUGH;
50224 case X86ISD::VPERMILPI:
50225 // TODO - add support for vXf64/vXi64 shuffles.
50226 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
50227 Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50228 SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
50229 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
50230 Op0.getOperand(1));
50231 return DAG.getBitcast(VT, Res);
50232 }
50233 break;
50234 case X86ISD::VPERMV3:
50235 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
50236 MVT OpVT = Op0.getSimpleValueType();
50237 int NumSrcElts = OpVT.getVectorNumElements();
50238 SmallVector<int, 64> ConcatMask;
50239 for (unsigned i = 0; i != NumOps; ++i) {
50240 SmallVector<int, 64> SubMask;
50241 SmallVector<SDValue, 2> SubOps;
50242 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
50243 SubMask))
50244 break;
50245 for (int M : SubMask) {
50246 if (0 <= M) {
50247 M += M < NumSrcElts ? 0 : NumSrcElts;
50248 M += i * NumSrcElts;
50249 }
50250 ConcatMask.push_back(M);
50251 }
50252 }
50253 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
50254 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
50255 Ops[1].getOperand(0), DAG, DL);
50256 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
50257 Ops[1].getOperand(2), DAG, DL);
50258 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
50259 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
50260 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
50261 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
50262 }
50263 }
50264 break;
50265 case X86ISD::VSHLI:
50266 case X86ISD::VSRLI:
50267 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
50268 // TODO: Move this to LowerScalarImmediateShift?
50269 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
50270 llvm::all_of(Ops, [](SDValue Op) {
50271 return Op.getConstantOperandAPInt(1) == 32;
50272 })) {
50273 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
50274 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
50275 if (Op0.getOpcode() == X86ISD::VSHLI) {
50276 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
50277 {8, 0, 8, 2, 8, 4, 8, 6});
50278 } else {
50279 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
50280 {1, 8, 3, 8, 5, 8, 7, 8});
50281 }
50282 return DAG.getBitcast(VT, Res);
50283 }
50284 LLVM_FALLTHROUGH;
50285 case X86ISD::VSRAI:
50286 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
50287 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
50288 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
50289 llvm::all_of(Ops, [Op0](SDValue Op) {
50290 return Op0.getOperand(1) == Op.getOperand(1);
50291 })) {
50292 return DAG.getNode(Op0.getOpcode(), DL, VT,
50293 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50294 }
50295 break;
50296 case X86ISD::VPERMI:
50297 case X86ISD::VROTLI:
50298 case X86ISD::VROTRI:
50299 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
50300 llvm::all_of(Ops, [Op0](SDValue Op) {
50301 return Op0.getOperand(1) == Op.getOperand(1);
50302 })) {
50303 return DAG.getNode(Op0.getOpcode(), DL, VT,
50304 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50305 }
50306 break;
50307 case ISD::AND:
50308 case ISD::OR:
50309 case ISD::XOR:
50310 case X86ISD::ANDNP:
50311 // TODO: Add 256-bit support.
50312 if (!IsSplat && VT.is512BitVector()) {
50313 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
50314 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
50315 NumOps * SrcVT.getVectorNumElements());
50316 return DAG.getNode(Op0.getOpcode(), DL, VT,
50317 ConcatSubOperand(SrcVT, Ops, 0),
50318 ConcatSubOperand(SrcVT, Ops, 1));
50319 }
50320 break;
50321 case X86ISD::HADD:
50322 case X86ISD::HSUB:
50323 case X86ISD::FHADD:
50324 case X86ISD::FHSUB:
50325 case X86ISD::PACKSS:
50326 case X86ISD::PACKUS:
50327 if (!IsSplat && VT.is256BitVector() &&
50328 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
50329 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
50330 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
50331 NumOps * SrcVT.getVectorNumElements());
50332 return DAG.getNode(Op0.getOpcode(), DL, VT,
50333 ConcatSubOperand(SrcVT, Ops, 0),
50334 ConcatSubOperand(SrcVT, Ops, 1));
50335 }
50336 break;
50337 case X86ISD::PALIGNR:
50338 if (!IsSplat &&
50339 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
50340 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
50341 llvm::all_of(Ops, [Op0](SDValue Op) {
50342 return Op0.getOperand(2) == Op.getOperand(2);
50343 })) {
50344 return DAG.getNode(Op0.getOpcode(), DL, VT,
50345 ConcatSubOperand(VT, Ops, 0),
50346 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
50347 }
50348 break;
50349 }
50350 }
50351
50352 // Fold subvector loads into one.
50353 // If needed, look through bitcasts to get to the load.
50354 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
50355 bool Fast;
50356 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
50357 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50358 *FirstLd->getMemOperand(), &Fast) &&
50359 Fast) {
50360 if (SDValue Ld =
50361 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
50362 return Ld;
50363 }
50364 }
50365
50366 return SDValue();
50367}
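
One case above rewrites a v4i64 VSHLI/VSRLI by 32 as a v8i32 shuffle with zero. A scalar sketch of why that holds (editorial; it assumes the usual little-endian 32-bit lane order within each 64-bit element):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t E = 0x1122334455667788ULL;
  uint32_t Lo = uint32_t(E);        // low 32-bit lane
  uint32_t Hi = uint32_t(E >> 32);  // high 32-bit lane
  // Shift left by 32: lanes {Lo, Hi} become {0, Lo}, i.e. the {8, 0, ...} mask.
  assert((E << 32) == (uint64_t(Lo) << 32));
  // Logical shift right by 32: lanes {Lo, Hi} become {Hi, 0}, i.e. the {1, 8, ...} mask.
  assert((E >> 32) == uint64_t(Hi));
  return 0;
}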
50368
50369static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
50370 TargetLowering::DAGCombinerInfo &DCI,
50371 const X86Subtarget &Subtarget) {
50372 EVT VT = N->getValueType(0);
50373 EVT SrcVT = N->getOperand(0).getValueType();
50374 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50375
50376 // Don't do anything for i1 vectors.
50377 if (VT.getVectorElementType() == MVT::i1)
50378 return SDValue();
50379
50380 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
50381 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
50382 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
50383 DCI, Subtarget))
50384 return R;
50385 }
50386
50387 return SDValue();
50388}
50389
50390static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
50391 TargetLowering::DAGCombinerInfo &DCI,
50392 const X86Subtarget &Subtarget) {
50393 if (DCI.isBeforeLegalizeOps())
50394 return SDValue();
50395
50396 MVT OpVT = N->getSimpleValueType(0);
50397
50398 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
50399
50400 SDLoc dl(N);
50401 SDValue Vec = N->getOperand(0);
50402 SDValue SubVec = N->getOperand(1);
50403
50404 uint64_t IdxVal = N->getConstantOperandVal(2);
50405 MVT SubVecVT = SubVec.getSimpleValueType();
50406
50407 if (Vec.isUndef() && SubVec.isUndef())
50408 return DAG.getUNDEF(OpVT);
50409
50410 // Inserting undefs/zeros into zeros/undefs is a zero vector.
50411 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
50412 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
50413 return getZeroVector(OpVT, Subtarget, DAG, dl);
50414
50415 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
50416 // If we're inserting into a zero vector and then into a larger zero vector,
50417 // just insert into the larger zero vector directly.
50418 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
50419 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
50420 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
50421 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50422 getZeroVector(OpVT, Subtarget, DAG, dl),
50423 SubVec.getOperand(1),
50424 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
50425 }
50426
50427 // If we're inserting into a zero vector and our input was extracted from an
50428 // insert into a zero vector of the same type, and the extraction was at
50429 // least as large as the original insertion, just insert the original
50430 // subvector into a zero vector.
50431 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
50432 isNullConstant(SubVec.getOperand(1)) &&
50433 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
50434 SDValue Ins = SubVec.getOperand(0);
50435 if (isNullConstant(Ins.getOperand(2)) &&
50436 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
50437 Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
50438 SubVecVT.getFixedSizeInBits())
50439 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50440 getZeroVector(OpVT, Subtarget, DAG, dl),
50441 Ins.getOperand(1), N->getOperand(2));
50442 }
50443 }
50444
50445 // Stop here if this is an i1 vector.
50446 if (IsI1Vector)
50447 return SDValue();
50448
50449 // If this is an insert of an extract, combine to a shuffle. Don't do this
50450 // if the insert or extract can be represented with a subregister operation.
50451 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50452 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
50453 (IdxVal != 0 ||
50454 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
50455 int ExtIdxVal = SubVec.getConstantOperandVal(1);
50456 if (ExtIdxVal != 0) {
50457 int VecNumElts = OpVT.getVectorNumElements();
50458 int SubVecNumElts = SubVecVT.getVectorNumElements();
50459 SmallVector<int, 64> Mask(VecNumElts);
50460 // First create an identity shuffle mask.
50461 for (int i = 0; i != VecNumElts; ++i)
50462 Mask[i] = i;
50463 // Now insert the extracted portion.
50464 for (int i = 0; i != SubVecNumElts; ++i)
50465 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
50466
50467 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
50468 }
50469 }
50470
50471 // Match concat_vector style patterns.
50472 SmallVector<SDValue, 2> SubVectorOps;
50473 if (collectConcatOps(N, SubVectorOps)) {
50474 if (SDValue Fold =
50475 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
50476 return Fold;
50477
50478 // If we're inserting all zeros into the upper half, change this to
50479 // a concat with zero. We will match this to a move
50480 // with implicit upper bit zeroing during isel.
50481 // We do this here because we don't want combineConcatVectorOps to
50482 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
50483 if (SubVectorOps.size() == 2 &&
50484 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
50485 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
50486 getZeroVector(OpVT, Subtarget, DAG, dl),
50487 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
50488 }
50489
50490 // If this is a broadcast insert into an upper undef, use a larger broadcast.
50491 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
50492 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
50493
50494 // If this is a broadcast load inserted into an upper undef, use a larger
50495 // broadcast load.
50496 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
50497 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
50498 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
50499 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
50500 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
50501 SDValue BcastLd =
50502 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
50503 MemIntr->getMemoryVT(),
50504 MemIntr->getMemOperand());
50505 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50506 return BcastLd;
50507 }
50508
50509 // If we're splatting the lower half subvector of a full vector load into the
50510 // upper half, attempt to create a subvector broadcast.
50511 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
50512 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
50513 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
50514 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
50515 if (VecLd && SubLd &&
50516 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
50517 SubVec.getValueSizeInBits() / 8, 0))
50518 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
50519 SubLd, 0, DAG);
50520 }
50521
50522 return SDValue();
50523}
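
To make the insert-of-extract shuffle above concrete, here is a sketch that builds the mask the same way for one example choice of sizes (editorial; VecNumElts = 8, SubVecNumElts = 4, IdxVal = ExtIdxVal = 4 are illustrative values, and indices >= VecNumElts select from the second shuffle source, as in DAG.getVectorShuffle):

#include <cassert>
#include <vector>

int main() {
  const int VecNumElts = 8, SubVecNumElts = 4;
  const int IdxVal = 4, ExtIdxVal = 4;
  std::vector<int> Mask(VecNumElts);
  // First create an identity shuffle mask over Vec.
  for (int i = 0; i != VecNumElts; ++i)
    Mask[i] = i;
  // Now insert the extracted portion, addressing the second source.
  for (int i = 0; i != SubVecNumElts; ++i)
    Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
  const int Expected[8] = {0, 1, 2, 3, 12, 13, 14, 15};
  for (int i = 0; i != VecNumElts; ++i)
    assert(Mask[i] == Expected[i]);
  return 0;
}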
50524
50525/// If we are extracting a subvector of a vector select and the select condition
50526/// is composed of concatenated vectors, try to narrow the select width. This
50527/// is a common pattern for AVX1 integer code because 256-bit selects may be
50528/// legal, but there is almost no integer math/logic available for 256-bit.
50529/// This function should only be called with legal types (otherwise, the calls
50530/// to get simple value types will assert).
50531static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
50532 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
50533 SmallVector<SDValue, 4> CatOps;
50534 if (Sel.getOpcode() != ISD::VSELECT ||
50535 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
50536 return SDValue();
50537
50538 // Note: We assume simple value types because this should only be called with
50539 // legal operations/types.
50540 // TODO: This can be extended to handle extraction to 256-bits.
50541 MVT VT = Ext->getSimpleValueType(0);
50542 if (!VT.is128BitVector())
50543 return SDValue();
50544
50545 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
50546 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
50547 return SDValue();
50548
50549 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
50550 MVT SelVT = Sel.getSimpleValueType();
50551 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
50552 "Unexpected vector type with legal operations");
50553
50554 unsigned SelElts = SelVT.getVectorNumElements();
50555 unsigned CastedElts = WideVT.getVectorNumElements();
50556 unsigned ExtIdx = Ext->getConstantOperandVal(1);
50557 if (SelElts % CastedElts == 0) {
50558 // The select has the same or more (narrower) elements than the extract
50559 // operand. The extraction index gets scaled by that factor.
50560 ExtIdx *= (SelElts / CastedElts);
50561 } else if (CastedElts % SelElts == 0) {
50562 // The select has less (wider) elements than the extract operand. Make sure
50563 // that the extraction index can be divided evenly.
50564 unsigned IndexDivisor = CastedElts / SelElts;
50565 if (ExtIdx % IndexDivisor != 0)
50566 return SDValue();
50567 ExtIdx /= IndexDivisor;
50568 } else {
50569 llvm_unreachable("Element count of simple vector types are not divisible?");
50570 }
50571
50572 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
50573 unsigned NarrowElts = SelElts / NarrowingFactor;
50574 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
50575 SDLoc DL(Ext);
50576 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
50577 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
50578 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
50579 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
50580 return DAG.getBitcast(VT, NarrowSel);
50581}
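
A sketch of the extraction-index rescaling performed above (editorial; the concrete types are only an example): if the extract operand was bitcast to v4i64 but the select condition is v8i32, extracting the upper 128 bits (ExtIdx == 2 in v4i64 elements) corresponds to element 4 of the condition.

#include <cassert>

int main() {
  unsigned SelElts = 8;     // select operates on v8i32
  unsigned CastedElts = 4;  // extract operand was viewed as v4i64
  unsigned ExtIdx = 2;      // upper 128 bits, in v4i64 elements
  if (SelElts % CastedElts == 0)
    ExtIdx *= SelElts / CastedElts;  // narrower select elements: scale up
  else if (CastedElts % SelElts == 0)
    ExtIdx /= CastedElts / SelElts;  // wider select elements: scale down (must divide evenly)
  assert(ExtIdx == 4);
  return 0;
}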
50582
50583static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
50584 TargetLowering::DAGCombinerInfo &DCI,
50585 const X86Subtarget &Subtarget) {
50586 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
50587 // eventually get combined/lowered into ANDNP) with a concatenated operand,
50588 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
50589 // We let generic combining take over from there to simplify the
50590 // insert/extract and 'not'.
50591 // This pattern emerges during AVX1 legalization. We handle it before lowering
50592 // to avoid complications like splitting constant vector loads.
50593
50594 // Capture the original wide type in the likely case that we need to bitcast
50595 // back to this type.
50596 if (!N->getValueType(0).isSimple())
50597 return SDValue();
50598
50599 MVT VT = N->getSimpleValueType(0);
50600 SDValue InVec = N->getOperand(0);
50601 unsigned IdxVal = N->getConstantOperandVal(1);
50602 SDValue InVecBC = peekThroughBitcasts(InVec);
50603 EVT InVecVT = InVec.getValueType();
50604 unsigned SizeInBits = VT.getSizeInBits();
50605 unsigned InSizeInBits = InVecVT.getSizeInBits();
50606 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50607
50608 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
50609 TLI.isTypeLegal(InVecVT) &&
50610 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
50611 auto isConcatenatedNot = [](SDValue V) {
50612 V = peekThroughBitcasts(V);
50613 if (!isBitwiseNot(V))
50614 return false;
50615 SDValue NotOp = V->getOperand(0);
50616 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
50617 };
50618 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
50619 isConcatenatedNot(InVecBC.getOperand(1))) {
50620 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
50621 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
50622 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
50623 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
50624 }
50625 }
50626
50627 if (DCI.isBeforeLegalizeOps())
50628 return SDValue();
50629
50630 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
50631 return V;
50632
50633 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
50634 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50635
50636 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
50637 if (VT.getScalarType() == MVT::i1)
50638 return DAG.getConstant(1, SDLoc(N), VT);
50639 return getOnesVector(VT, DAG, SDLoc(N));
50640 }
50641
50642 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
50643 return DAG.getBuildVector(
50644 VT, SDLoc(N),
50645 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
50646
50647 // If we are extracting from an insert into a zero vector, replace with a
50648 // smaller insert into zero if we don't access less than the original
50649 // subvector. Don't do this for i1 vectors.
50650 if (VT.getVectorElementType() != MVT::i1 &&
50651 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
50652 InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
50653 ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
50654 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
50655 SDLoc DL(N);
50656 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
50657 getZeroVector(VT, Subtarget, DAG, DL),
50658 InVec.getOperand(1), InVec.getOperand(2));
50659 }
50660
50661 // If we're extracting an upper subvector from a broadcast, we should just
50662 // extract the lowest subvector instead, which should allow
50663 // SimplifyDemandedVectorElts to do more simplifications.
50664 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
50665 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50666 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
50667 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50668
50669 // If we're extracting a broadcasted subvector, just use the lowest subvector.
50670 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50671 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
50672 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
50673
50674 // Attempt to extract from the source of a shuffle vector.
50675 if ((InSizeInBits % SizeInBits) == 0 &&
50676 (IdxVal % VT.getVectorNumElements()) == 0) {
50677 SmallVector<int, 32> ShuffleMask;
50678 SmallVector<int, 32> ScaledMask;
50679 SmallVector<SDValue, 2> ShuffleInputs;
50680 unsigned NumSubVecs = InSizeInBits / SizeInBits;
50681 // Decode the shuffle mask and scale it so it's shuffling subvectors.
50682 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
50683 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
50684 unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
50685 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
50686 return DAG.getUNDEF(VT);
50687 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
50688 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
50689 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
50690 if (Src.getValueSizeInBits() == InSizeInBits) {
50691 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
50692 unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
50693 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
50694 SDLoc(N), SizeInBits);
50695 }
50696 }
50697 }
50698
50699 // If we're extracting the lowest subvector and we're the only user,
50700 // we may be able to perform this with a smaller vector width.
50701 unsigned InOpcode = InVec.getOpcode();
50702 if (IdxVal == 0 && InVec.hasOneUse()) {
50703 if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
50704 // v2f64 CVTDQ2PD(v4i32).
50705 if (InOpcode == ISD::SINT_TO_FP &&
50706 InVec.getOperand(0).getValueType() == MVT::v4i32) {
50707 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
50708 }
50709 // v2f64 CVTUDQ2PD(v4i32).
50710 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
50711 InVec.getOperand(0).getValueType() == MVT::v4i32) {
50712 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
50713 }
50714 // v2f64 CVTPS2PD(v4f32).
50715 if (InOpcode == ISD::FP_EXTEND &&
50716 InVec.getOperand(0).getValueType() == MVT::v4f32) {
50717 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
50718 }
50719 }
50720 if ((InOpcode == ISD::ANY_EXTEND ||
50721 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50722 InOpcode == ISD::ZERO_EXTEND ||
50723 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
50724 InOpcode == ISD::SIGN_EXTEND ||
50725 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50726 (SizeInBits == 128 || SizeInBits == 256) &&
50727 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
50728 SDLoc DL(N);
50729 SDValue Ext = InVec.getOperand(0);
50730 if (Ext.getValueSizeInBits() > SizeInBits)
50731 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
50732 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
50733 return DAG.getNode(ExtOp, DL, VT, Ext);
50734 }
50735 if (InOpcode == ISD::VSELECT &&
50736 InVec.getOperand(0).getValueType().is256BitVector() &&
50737 InVec.getOperand(1).getValueType().is256BitVector() &&
50738 InVec.getOperand(2).getValueType().is256BitVector()) {
50739 SDLoc DL(N);
50740 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
50741 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
50742 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
50743 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
50744 }
50745 if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
50746 (VT.is128BitVector() || VT.is256BitVector())) {
50747 SDLoc DL(N);
50748 SDValue InVecSrc = InVec.getOperand(0);
50749 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
50750 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
50751 return DAG.getNode(InOpcode, DL, VT, Ext);
50752 }
50753 }
50754
50755 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
50756 // as this is very likely to fold into a shuffle/truncation.
50757 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
50758 InVecVT.getScalarSizeInBits() == 64 &&
50759 InVec.getConstantOperandAPInt(1) == 32) {
50760 SDLoc DL(N);
50761 SDValue Ext =
50762 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
50763 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
50764 }
50765
50766 return SDValue();
50767}
50768
50769static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
50770 EVT VT = N->getValueType(0);
50771 SDValue Src = N->getOperand(0);
50772 SDLoc DL(N);
50773
50774 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
50775 // This occurs frequently in our masked scalar intrinsic code and our
50776 // floating point select lowering with AVX512.
50777 // TODO: SimplifyDemandedBits instead?
50778 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
50779 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50780 if (C->getAPIntValue().isOneValue())
50781 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
50782 Src.getOperand(0));
50783
50784 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
50785 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50786 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
50787 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
50788 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
50789 if (C->isNullValue())
50790 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
50791 Src.getOperand(1));
50792
50793 // Reduce v2i64 to v4i32 if we don't need the upper bits.
50794 // TODO: Move to DAGCombine/SimplifyDemandedBits?
50795 if (VT == MVT::v2i64 || VT == MVT::v2f64) {
50796 auto IsAnyExt64 = [](SDValue Op) {
50797 if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
50798 return SDValue();
50799 if (Op.getOpcode() == ISD::ANY_EXTEND &&
50800 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
50801 return Op.getOperand(0);
50802 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
50803 if (Ld->getExtensionType() == ISD::EXTLOAD &&
50804 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
50805 return Op;
50806 return SDValue();
50807 };
50808 if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
50809 return DAG.getBitcast(
50810 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
50811 DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
50812 }
50813
50814 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
50815 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
50816 Src.getOperand(0).getValueType() == MVT::x86mmx)
50817 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
50818
50819 // See if we're broadcasting the scalar value, in which case just reuse that.
50820 // Ensure the same SDValue from the SDNode use is being used.
50821 if (VT.getScalarType() == Src.getValueType())
50822 for (SDNode *User : Src->uses())
50823 if (User->getOpcode() == X86ISD::VBROADCAST &&
50824 Src == User->getOperand(0)) {
50825 unsigned SizeInBits = VT.getFixedSizeInBits();
50826 unsigned BroadcastSizeInBits =
50827 User->getValueSizeInBits(0).getFixedSize();
50828 if (BroadcastSizeInBits == SizeInBits)
50829 return SDValue(User, 0);
50830 if (BroadcastSizeInBits > SizeInBits)
50831 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
50832 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
50833 // coverage.
50834 }
50835
50836 return SDValue();
50837}
50838
50839// Simplify PMULDQ and PMULUDQ operations.
50840static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
50841 TargetLowering::DAGCombinerInfo &DCI,
50842 const X86Subtarget &Subtarget) {
50843 SDValue LHS = N->getOperand(0);
50844 SDValue RHS = N->getOperand(1);
50845
50846 // Canonicalize constant to RHS.
50847 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
50848 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
50849 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
50850
50851 // Multiply by zero.
50852 // Don't return RHS as it may contain UNDEFs.
50853 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
50854 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
50855
50856 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
50857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50858 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
50859 return SDValue(N, 0);
50860
50861 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
50862 // convert it to any_extend_invec, due to the LegalOperations check, do the
50863 // conversion directly to a vector shuffle manually. This exposes combine
50864 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
50865 // combineX86ShufflesRecursively on SSE4.1 targets.
50866 // FIXME: This is basically a hack around several other issues related to
50867 // ANY_EXTEND_VECTOR_INREG.
50868 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
50869 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50870 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50871 LHS.getOperand(0).getValueType() == MVT::v4i32) {
50872 SDLoc dl(N);
50873 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
50874 LHS.getOperand(0), { 0, -1, 1, -1 });
50875 LHS = DAG.getBitcast(MVT::v2i64, LHS);
50876 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50877 }
50878 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
50879 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
50880 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
50881 RHS.getOperand(0).getValueType() == MVT::v4i32) {
50882 SDLoc dl(N);
50883 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
50884 RHS.getOperand(0), { 0, -1, 1, -1 });
50885 RHS = DAG.getBitcast(MVT::v2i64, RHS);
50886 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
50887 }
50888
50889 return SDValue();
50890}
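
The SimplifyDemandedBits call above leans on the fact that PMULDQ/PMULUDQ read only the low 32 bits of each 64-bit element. A scalar sketch of the unsigned (PMULUDQ) case (editorial):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 0xAAAAAAAA00001234ULL;
  uint64_t B = 0x5555555500000010ULL;
  // PMULUDQ multiplies the low 32 bits of each element into a 64-bit product,
  // so the upper halves of the inputs never affect the result.
  uint64_t Prod = uint64_t(uint32_t(A)) * uint64_t(uint32_t(B));
  uint64_t ProdJunkUpper = uint64_t(uint32_t(A | 0xFFFFFFFF00000000ULL)) *
                           uint64_t(uint32_t(B | 0xFFFFFFFF00000000ULL));
  assert(Prod == 0x1234ULL * 0x10ULL);
  assert(Prod == ProdJunkUpper);
  return 0;
}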
50891
50892static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
50893 TargetLowering::DAGCombinerInfo &DCI,
50894 const X86Subtarget &Subtarget) {
50895 EVT VT = N->getValueType(0);
50896 SDValue In = N->getOperand(0);
50897 unsigned Opcode = N->getOpcode();
50898 unsigned InOpcode = In.getOpcode();
50899 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50900
50901 // Try to merge vector loads and extend_inreg to an extload.
50902 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
50903 In.hasOneUse()) {
50904 auto *Ld = cast<LoadSDNode>(In);
50905 if (Ld->isSimple()) {
50906 MVT SVT = In.getSimpleValueType().getVectorElementType();
50907 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
50908 ? ISD::SEXTLOAD
50909 : ISD::ZEXTLOAD;
50910 EVT MemVT = VT.changeVectorElementType(SVT);
50911 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
50912 SDValue Load =
50913 DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
50914 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50915 Ld->getMemOperand()->getFlags());
50916 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
50917 return Load;
50918 }
50919 }
50920 }
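// E.g. sext_invec(v16i8 (load p)) -> v4i32 becomes a single sextload of v4i8
// from p, provided the target reports that extending load as legal.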
50921
50922 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
50923 if (Opcode == InOpcode)
50924 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
50925
50926 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
50927 // -> EXTEND_VECTOR_INREG(X).
50928 // TODO: Handle non-zero subvector indices.
50929 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
50930 In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
50931 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
50932 In.getValueSizeInBits())
50933 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
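// E.g. with X : v8i16, zext_invec(v4i32 extract_subvector(zext(X) : v8i32, 0))
// -> v2i64 is just zext_invec(X) -> v2i64: both forms extend X's low elements,
// and the size check requires the extracted piece to be exactly as wide as X.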
50934
50935 // Attempt to combine as a shuffle.
50936 // TODO: General ZERO_EXTEND_VECTOR_INREG support.
50937 if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
50938 (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
50939 SDValue Op(N, 0);
50940 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
50941 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50942 return Res;
50943 }
50944
50945 return SDValue();
50946}
50947
50948static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
50949 TargetLowering::DAGCombinerInfo &DCI) {
50950 EVT VT = N->getValueType(0);
50951
50952 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
50953 return DAG.getConstant(0, SDLoc(N), VT);
50954
50955 APInt KnownUndef, KnownZero;
50956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50957 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
50958 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
50959 KnownZero, DCI))
50960 return SDValue(N, 0);
50961
50962 return SDValue();
50963}
50964
50965// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
50966// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
50967// extra instructions between the conversion due to going to scalar and back.
50968static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
50969 const X86Subtarget &Subtarget) {
50970 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
50971 return SDValue();
50972
50973 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
50974 return SDValue();
50975
50976 if (N->getValueType(0) != MVT::f32 ||
50977 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
50978 return SDValue();
50979
50980 SDLoc dl(N);
50981 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
50982 N->getOperand(0).getOperand(0));
50983 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
50984 DAG.getTargetConstant(4, dl, MVT::i32));
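// The immediate 4 sets bit 2 of the CVTPS2PH control byte, i.e. "round using
// the current MXCSR rounding mode" rather than a fixed rounding override.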
50985 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
50986 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
50987 DAG.getIntPtrConstant(0, dl));
50988}
50989
50990static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
50991 const X86Subtarget &Subtarget) {
50992 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
50993 return SDValue();
50994
50995 bool IsStrict = N->isStrictFPOpcode();
50996 EVT VT = N->getValueType(0);
50997 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
50998 EVT SrcVT = Src.getValueType();
50999
51000 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
51001 return SDValue();
51002
51003 if (VT.getVectorElementType() != MVT::f32 &&
51004 VT.getVectorElementType() != MVT::f64)
51005 return SDValue();
51006
51007 unsigned NumElts = VT.getVectorNumElements();
51008 if (NumElts == 1 || !isPowerOf2_32(NumElts))
51009 return SDValue();
51010
51011 SDLoc dl(N);
51012
51013 // Convert the input to vXi16.
51014 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
51015 Src = DAG.getBitcast(IntVT, Src);
51016
51017 // Widen to at least 8 input elements.
51018 if (NumElts < 8) {
51019 unsigned NumConcats = 8 / NumElts;
51020 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
51021 : DAG.getConstant(0, dl, IntVT);
51022 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
51023 Ops[0] = Src;
51024 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
51025 }
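// NumConcats is 2 for a v4i16 source (padded with an undef half) and 4 for a
// v2i16 source (padded with zero vectors), so CVTPH2PS always sees a full
// v8i16 input.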
51026
51027 // Destination is vXf32 with at least 4 elements.
51028 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
51029 std::max(4U, NumElts));
51030 SDValue Cvt, Chain;
51031 if (IsStrict) {
51032 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
51033 {N->getOperand(0), Src});
51034 Chain = Cvt.getValue(1);
51035 } else {
51036 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
51037 }
51038
51039 if (NumElts < 4) {
51040 assert(NumElts == 2 && "Unexpected size");
51041 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
51042 DAG.getIntPtrConstant(0, dl));
51043 }
51044
51045 if (IsStrict) {
51046 // Extend to the original VT if necessary.
51047 if (Cvt.getValueType() != VT) {
51048 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
51049 {Chain, Cvt});
51050 Chain = Cvt.getValue(1);
51051 }
51052 return DAG.getMergeValues({Cvt, Chain}, dl);
51053 }
51054
51055 // Extend to the original VT if necessary.
51056 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
51057}
51058
51059// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
51060// from. Limit this to cases where the loads have the same input chain and the
51061// output chains are unused. This avoids any memory ordering issues.
51062static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
51063 TargetLowering::DAGCombinerInfo &DCI) {
51064 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
51065 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
51066 "Unknown broadcast load type");
51067
51068 // Only do this if the chain result is unused.
51069 if (N->hasAnyUseOfValue(1))
51070 return SDValue();
51071
51072 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
51073
51074 SDValue Ptr = MemIntrin->getBasePtr();
51075 SDValue Chain = MemIntrin->getChain();
51076 EVT VT = N->getSimpleValueType(0);
51077 EVT MemVT = MemIntrin->getMemoryVT();
51078
51079 // Look at other users of our base pointer and try to find a wider broadcast.
51080 // The input chain and the size of the memory VT must match.
51081 for (SDNode *User : Ptr->uses())
51082 if (User != N && User->getOpcode() == N->getOpcode() &&
51083 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51084 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51085 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51086 MemVT.getSizeInBits() &&
51087 !User->hasAnyUseOfValue(1) &&
51088 User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
51089 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51090 VT.getSizeInBits());
51091 Extract = DAG.getBitcast(VT, Extract);
51092 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51093 }
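// CombineTo replaces both results of N: value 0 with the subvector extracted
// from the wider broadcast and the chain with the wider load's chain, which is
// trivially safe since neither node had any chain uses.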
51094
51095 return SDValue();
51096}
51097
51098static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
51099 const X86Subtarget &Subtarget) {
51100 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
51101 return SDValue();
51102
51103 EVT VT = N->getValueType(0);
51104 SDValue Src = N->getOperand(0);
51105 EVT SrcVT = Src.getValueType();
51106
51107 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
51108 SrcVT.getVectorElementType() != MVT::f32)
51109 return SDValue();
51110
51111 unsigned NumElts = VT.getVectorNumElements();
51112 if (NumElts == 1 || !isPowerOf2_32(NumElts))
51113 return SDValue();
51114
51115 SDLoc dl(N);
51116
51117 // Widen to at least 4 input elements.
51118 if (NumElts < 4)
51119 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
51120 DAG.getConstantFP(0.0, dl, SrcVT));
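// A v2f32 source is padded with +0.0 up to v4f32; the extra lanes just become
// zero half-floats that the final extract throws away.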
51121
51122 // Destination is v8i16 with at least 8 elements.
51123 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51124 std::max(8U, NumElts));
51125 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
51126 DAG.getTargetConstant(4, dl, MVT::i32));
51127
51128 // Extract down to real number of elements.
51129 if (NumElts < 8) {
51130 EVT IntVT = VT.changeVectorElementTypeToInteger();
51131 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
51132 DAG.getIntPtrConstant(0, dl));
51133 }
51134
51135 return DAG.getBitcast(VT, Cvt);
51136}
51137
51138static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
51139 SDValue Src = N->getOperand(0);
51140
51141 // Turn MOVDQ2Q+simple_load into an mmx load.
51142 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
51143 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
51144
51145 if (LN->isSimple()) {
51146 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
51147 LN->getBasePtr(),
51148 LN->getPointerInfo(),
51149 LN->getOriginalAlign(),
51150 LN->getMemOperand()->getFlags());
51151 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
51152 return NewLd;
51153 }
51154 }
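// The MMX load reads only the 8 low bytes MOVDQ2Q would have kept, so the
// 128-bit load and the register move both disappear.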
51155
51156 return SDValue();
51157}
51158
51159static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
51160 TargetLowering::DAGCombinerInfo &DCI) {
51161 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
51162 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51163 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
51164 APInt::getAllOnesValue(NumBits), DCI))
51165 return SDValue(N, 0);
51166
51167 return SDValue();
51168}
51169
51170SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
51171 DAGCombinerInfo &DCI) const {
51172 SelectionDAG &DAG = DCI.DAG;
51173 switch (N->getOpcode()) {
51174 default: break;
51175 case ISD::SCALAR_TO_VECTOR:
51176 return combineScalarToVector(N, DAG);
51177 case ISD::EXTRACT_VECTOR_ELT:
51178 case X86ISD::PEXTRW:
51179 case X86ISD::PEXTRB:
51180 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
51181 case ISD::CONCAT_VECTORS:
51182 return combineConcatVectors(N, DAG, DCI, Subtarget);
51183 case ISD::INSERT_SUBVECTOR:
51184 return combineInsertSubvector(N, DAG, DCI, Subtarget);
51185 case ISD::EXTRACT_SUBVECTOR:
51186 return combineExtractSubvector(N, DAG, DCI, Subtarget);
51187 case ISD::VSELECT:
51188 case ISD::SELECT:
51189 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
51190 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
51191 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
51192 case X86ISD::CMP: return combineCMP(N, DAG);
51193 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
51194 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
51195 case X86ISD::ADD:
51196 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
51197 case X86ISD::SBB: return combineSBB(N, DAG);
51198 case X86ISD::ADC: return combineADC(N, DAG, DCI);
51199 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
51200 case ISD::SHL: return combineShiftLeft(N, DAG);
51201 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
51202 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
51203 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
51204 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
51205 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
51206 case X86ISD::BEXTR:
51207 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
51208 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
51209 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
51210 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
51211 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
51212 case X86ISD::VEXTRACT_STORE:
51213 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
51214 case ISD::SINT_TO_FP:
51215 case ISD::STRICT_SINT_TO_FP:
51216 return combineSIntToFP(N, DAG, DCI, Subtarget);
51217 case ISD::UINT_TO_FP:
51218 case ISD::STRICT_UINT_TO_FP:
51219 return combineUIntToFP(N, DAG, Subtarget);
51220 case ISD::FADD:
51221 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
51222 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
51223 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
51224 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
51225 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
51226 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
51227 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
51228 case X86ISD::FXOR:
51229 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
51230 case X86ISD::FMIN:
51231 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
51232 case ISD::FMINNUM:
51233 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
51234 case X86ISD::CVTSI2P:
51235 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
51236 case X86ISD::CVTP2SI:
51237 case X86ISD::CVTP2UI:
51238 case X86ISD::STRICT_CVTTP2SI:
51239 case X86ISD::CVTTP2SI:
51240 case X86ISD::STRICT_CVTTP2UI:
51241 case X86ISD::CVTTP2UI:
51242 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
51243 case X86ISD::STRICT_CVTPH2PS:
51244 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
51245 case X86ISD::BT: return combineBT(N, DAG, DCI);
51246 case ISD::ANY_EXTEND:
51247 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
51248 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
51249 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
51250 case ISD::ANY_EXTEND_VECTOR_INREG:
51251 case ISD::SIGN_EXTEND_VECTOR_INREG:
51252 case ISD::ZERO_EXTEND_VECTOR_INREG:
51253 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
51254 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
51255 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
51256 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
51257 case X86ISD::PACKSS:
51258 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
51259 case X86ISD::HADD:
51260 case X86ISD::HSUB:
51261 case X86ISD::FHADD:
51262 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
51263 case X86ISD::VSHL:
51264 case X86ISD::VSRA:
51265 case X86ISD::VSRL:
51266 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
51267 case X86ISD::VSHLI:
51268 case X86ISD::VSRAI:
51269 case X86ISD::VSRLI:
51270 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
51271 case ISD::INSERT_VECTOR_ELT:
51272 case X86ISD::PINSRB:
51273 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
51274 case X86ISD::SHUFP: // Handle all target specific shuffles
51275 case X86ISD::INSERTPS:
51276 case X86ISD::EXTRQI:
51277 case X86ISD::INSERTQI:
51278 case X86ISD::VALIGN:
51279 case X86ISD::PALIGNR:
51280 case X86ISD::VSHLDQ:
51281 case X86ISD::VSRLDQ:
51282 case X86ISD::BLENDI:
51283 case X86ISD::UNPCKH:
51284 case X86ISD::UNPCKL:
51285 case X86ISD::MOVHLPS:
51286 case X86ISD::MOVLHPS:
51287 case X86ISD::PSHUFB:
51288 case X86ISD::PSHUFD:
51289 case X86ISD::PSHUFHW:
51290 case X86ISD::PSHUFLW:
51291 case X86ISD::MOVSHDUP:
51292 case X86ISD::MOVSLDUP:
51293 case X86ISD::MOVDDUP:
51294 case X86ISD::MOVSS:
51295 case X86ISD::MOVSD:
51296 case X86ISD::VBROADCAST:
51297 case X86ISD::VPPERM:
51298 case X86ISD::VPERMI:
51299 case X86ISD::VPERMV:
51300 case X86ISD::VPERMV3:
51301 case X86ISD::VPERMIL2:
51302 case X86ISD::VPERMILPI:
51303 case X86ISD::VPERMILPV:
51304 case X86ISD::VPERM2X128:
51305 case X86ISD::SHUF128:
51306 case X86ISD::VZEXT_MOVL:
51307 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
51308 case X86ISD::FMADD_RND:
51309 case X86ISD::FMSUB:
51310 case X86ISD::STRICT_FMSUB:
51311 case X86ISD::FMSUB_RND:
51312 case X86ISD::FNMADD:
51313 case X86ISD::STRICT_FNMADD:
51314 case X86ISD::FNMADD_RND:
51315 case X86ISD::FNMSUB:
51316 case X86ISD::STRICT_FNMSUB:
51317 case X86ISD::FNMSUB_RND:
51318 case ISD::FMA:
51319 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
51320 case X86ISD::FMADDSUB_RND:
51321 case X86ISD::FMSUBADD_RND:
51322 case X86ISD::FMADDSUB:
51323 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
51324 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
51325 case X86ISD::MGATHER:
51326 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
51327 case ISD::MGATHER:
51328 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
51329 case X86ISD::PCMPEQ:
51330 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
51331 case X86ISD::PMULDQ:
51332 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
51333 case X86ISD::KSHIFTL:
51334 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
51335 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
51336 case ISD::STRICT_FP_EXTEND:
51337 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
51338 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
51339 case X86ISD::VBROADCAST_LOAD:
51340 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
51341 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
51342 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
51343 }
51344
51345 return SDValue();
51346}
51347
51348bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
51349 if (!isTypeLegal(VT))
51350 return false;
51351
51352 // There are no vXi8 shifts.
51353 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
51354 return false;
51355
51356 // TODO: Almost no 8-bit ops are desirable because they have no actual
51357 // size/speed advantages vs. 32-bit ops, but they do have a major
51358 // potential disadvantage by causing partial register stalls.
51359 //
51360 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
51361 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
51362 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
51363 // check for a constant operand to the multiply.
51364 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
51365 return false;
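// For example, an i8 multiply by 3 is better done as a 32-bit multiply, which
// the MUL combines can turn into a single LEA without partial-register traffic.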
51366
51367 // i16 instruction encodings are longer and some i16 instructions are slow,
51368 // so those are not desirable.
51369 if (VT == MVT::i16) {
51370 switch (Opc) {
51371 default:
51372 break;
51373 case ISD::LOAD:
51374 case ISD::SIGN_EXTEND:
51375 case ISD::ZERO_EXTEND:
51376 case ISD::ANY_EXTEND:
51377 case ISD::SHL:
51378 case ISD::SRA:
51379 case ISD::SRL:
51380 case ISD::SUB:
51381 case ISD::ADD:
51382 case ISD::MUL:
51383 case ISD::AND:
51384 case ISD::OR:
51385 case ISD::XOR:
51386 return false;
51387 }
51388 }
51389
51390 // Any legal type not explicitly accounted for above here is desirable.
51391 return true;
51392}
51393
51394SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
51395 SDValue Value, SDValue Addr,
51396 SelectionDAG &DAG) const {
51397 const Module *M = DAG.getMachineFunction().getMMI().getModule();
51398 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
51399 if (IsCFProtectionSupported) {
51400 // In case control-flow branch protection is enabled, we need to add
51401 // notrack prefix to the indirect branch.
51402 // In order to do that we create NT_BRIND SDNode.
51403 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
51404 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
51405 }
51406
51407 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
51408}
51409
51410bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
51411 EVT VT = Op.getValueType();
51412 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
51413 isa<ConstantSDNode>(Op.getOperand(1));
51414
51415 // i16 is legal, but undesirable since i16 instruction encodings are longer
51416 // and some i16 instructions are slow.
51417 // 8-bit multiply-by-constant can usually be expanded to something cheaper
51418 // using LEA and/or other ALU ops.
51419 if (VT != MVT::i16 && !Is8BitMulByConstant)
51420 return false;
51421
51422 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
51423 if (!Op.hasOneUse())
51424 return false;
51425 SDNode *User = *Op->use_begin();
51426 if (!ISD::isNormalStore(User))
51427 return false;
51428 auto *Ld = cast<LoadSDNode>(Load);
51429 auto *St = cast<StoreSDNode>(User);
51430 return Ld->getBasePtr() == St->getBasePtr();
51431 };
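// IsFoldableRMW matches (store (op (load p), x), p): promoting the op to i32
// would stop isel from folding this into one read-modify-write instruction,
// so such cases are left at their original width.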
51432
51433 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
51434 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
51435 return false;
51436 if (!Op.hasOneUse())
51437 return false;
51438 SDNode *User = *Op->use_begin();
51439 if (User->getOpcode() != ISD::ATOMIC_STORE)
51440 return false;
51441 auto *Ld = cast<AtomicSDNode>(Load);
51442 auto *St = cast<AtomicSDNode>(User);
51443 return Ld->getBasePtr() == St->getBasePtr();
51444 };
51445
51446 bool Commute = false;
51447 switch (Op.getOpcode()) {
51448 default: return false;
51449 case ISD::SIGN_EXTEND:
51450 case ISD::ZERO_EXTEND:
51451 case ISD::ANY_EXTEND:
51452 break;
51453 case ISD::SHL:
51454 case ISD::SRA:
51455 case ISD::SRL: {
51456 SDValue N0 = Op.getOperand(0);
51457 // Look out for (store (shl (load), x)).
51458 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
51459 return false;
51460 break;
51461 }
51462 case ISD::ADD:
51463 case ISD::MUL:
51464 case ISD::AND:
51465 case ISD::OR:
51466 case ISD::XOR:
51467 Commute = true;
51468 LLVM_FALLTHROUGH;
51469 case ISD::SUB: {
51470 SDValue N0 = Op.getOperand(0);
51471 SDValue N1 = Op.getOperand(1);
51472 // Avoid disabling potential load folding opportunities.
51473 if (MayFoldLoad(N1) &&
51474 (!Commute || !isa<ConstantSDNode>(N0) ||
51475 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
51476 return false;
51477 if (MayFoldLoad(N0) &&
51478 ((Commute && !isa<ConstantSDNode>(N1)) ||
51479 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
51480 return false;
51481 if (IsFoldableAtomicRMW(N0, Op) ||
51482 (Commute && IsFoldableAtomicRMW(N1, Op)))
51483 return false;
51484 }
51485 }
51486
51487 PVT = MVT::i32;
51488 return true;
51489}
51490
51491//===----------------------------------------------------------------------===//
51492// X86 Inline Assembly Support
51493//===----------------------------------------------------------------------===//
51494
51495// Helper to match a string separated by whitespace.
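// E.g. matchAsm(" bswap $0", {"bswap", "$0"}) succeeds, while "bswapl $0" does
// not match {"bswap", "$0"}: each piece must end on a whitespace boundary.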
51496static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
51497 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
51498
51499 for (StringRef Piece : Pieces) {
51500 if (!S.startswith(Piece)) // Check if the piece matches.
51501 return false;
51502
51503 S = S.substr(Piece.size());
51504 StringRef::size_type Pos = S.find_first_not_of(" \t");
51505 if (Pos == 0) // We matched a prefix.
51506 return false;
51507
51508 S = S.substr(Pos);
51509 }
51510
51511 return S.empty();
51512}
51513
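// Returns true only when the asm's clobber list consists of the implicit flag
// registers (~{cc}, ~{flags}, ~{fpsr}, optionally ~{dirflag}), i.e. the inline
// asm has no side effects that a plain llvm.bswap could not reproduce.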
51514static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
51515
51516 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
51517 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
51518 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
51519 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
51520
51521 if (AsmPieces.size() == 3)
51522 return true;
51523 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
51524 return true;
51525 }
51526 }
51527 return false;
51528}
51529
51530bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
51531 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
51532
51533 const std::string &AsmStr = IA->getAsmString();
51534
51535 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
51536 if (!Ty || Ty->getBitWidth() % 16 != 0)
51537 return false;
51538
51539 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
51540 SmallVector<StringRef, 4> AsmPieces;
51541 SplitString(AsmStr, AsmPieces, ";\n");
51542
51543 switch (AsmPieces.size()) {
51544 default: return false;
51545 case 1:
51546 // FIXME: this should verify that we are targeting a 486 or better. If not,
51547 // we will turn this bswap into something that will be lowered to logical
51548 // ops instead of emitting the bswap asm. For now, we don't support 486 or
51549 // lower so don't worry about this.
51550 // bswap $0
51551 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
51552 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
51553 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
51554 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
51555 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
51556 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
51557 // No need to check constraints, nothing other than the equivalent of
51558 // "=r,0" would be valid here.
51559 return IntrinsicLowering::LowerToByteSwap(CI);
51560 }
51561
51562 // rorw $$8, ${0:w} --> llvm.bswap.i16
51563 if (CI->getType()->isIntegerTy(16) &&
51564 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51565 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
51566 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
51567 AsmPieces.clear();
51568 StringRef ConstraintsStr = IA->getConstraintString();
51569 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51570 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51571 if (clobbersFlagRegisters(AsmPieces))
51572 return IntrinsicLowering::LowerToByteSwap(CI);
51573 }
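// A 16-bit rotate by 8 swaps the two bytes of the value, which is exactly
// llvm.bswap.i16, so the replacement is safe when only the flags are clobbered.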
51574 break;
51575 case 3:
51576 if (CI->getType()->isIntegerTy(32) &&
51577 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
51578 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
51579 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
51580 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
51581 AsmPieces.clear();
51582 StringRef ConstraintsStr = IA->getConstraintString();
51583 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
51584 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
51585 if (clobbersFlagRegisters(AsmPieces))
51586 return IntrinsicLowering::LowerToByteSwap(CI);
51587 }
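// The rorw/rorl/rorw sequence byte-swaps the low half, exchanges the two
// halves, then byte-swaps the new low half, which reverses all four bytes of
// the i32, i.e. llvm.bswap.i32.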
51588
51589 if (CI->getType()->isIntegerTy(64)) {
51590 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
51591 if (Constraints.size() >= 2 &&
51592 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
51593 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
51594 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
51595 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
51596 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
51597 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
51598 return IntrinsicLowering::LowerToByteSwap(CI);
51599 }
51600 }
51601 break;
51602 }
51603 return false;
51604}
51605
51606static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
51607 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
51608 .Case("{@cca}", X86::COND_A)
51609 .Case("{@ccae}", X86::COND_AE)
51610 .Case("{@ccb}", X86::COND_B)
51611 .Case("{@ccbe}", X86::COND_BE)
51612 .Case("{@ccc}", X86::COND_B)
51613 .Case("{@cce}", X86::COND_E)
51614 .Case("{@ccz}", X86::COND_E)
51615 .Case("{@ccg}", X86::COND_G)
51616 .Case("{@ccge}", X86::COND_GE)
51617 .Case("{@ccl}", X86::COND_L)
51618 .Case("{@ccle}", X86::COND_LE)
51619 .Case("{@ccna}", X86::COND_BE)
51620 .Case("{@ccnae}", X86::COND_B)
51621 .Case("{@ccnb}", X86::COND_AE)
51622 .Case("{@ccnbe}", X86::COND_A)
51623 .Case("{@ccnc}", X86::COND_AE)
51624 .Case("{@ccne}", X86::COND_NE)
51625 .Case("{@ccnz}", X86::COND_NE)
51626 .Case("{@ccng}", X86::COND_LE)
51627 .Case("{@ccnge}", X86::COND_L)
51628 .Case("{@ccnl}", X86::COND_GE)
51629 .Case("{@ccnle}", X86::COND_G)
51630 .Case("{@ccno}", X86::COND_NO)
51631 .Case("{@ccnp}", X86::COND_NP)
51632 .Case("{@ccns}", X86::COND_NS)
51633 .Case("{@cco}", X86::COND_O)
51634 .Case("{@ccp}", X86::COND_P)
51635 .Case("{@ccs}", X86::COND_S)
51636 .Default(X86::COND_INVALID);
51637 return Cond;
51638}
51639
51640/// Given a constraint letter, return the type of constraint for this target.
51641X86TargetLowering::ConstraintType
51642X86TargetLowering::getConstraintType(StringRef Constraint) const {
51643 if (Constraint.size() == 1) {
51644 switch (Constraint[0]) {
51645 case 'R':
51646 case 'q':
51647 case 'Q':
51648 case 'f':
51649 case 't':
51650 case 'u':
51651 case 'y':
51652 case 'x':
51653 case 'v':
51654 case 'l':
51655 case 'k': // AVX512 masking registers.
51656 return C_RegisterClass;
51657 case 'a':
51658 case 'b':
51659 case 'c':
51660 case 'd':
51661 case 'S':
51662 case 'D':
51663 case 'A':
51664 return C_Register;
51665 case 'I':
51666 case 'J':
51667 case 'K':
51668 case 'N':
51669 case 'G':
51670 case 'L':
51671 case 'M':
51672 return C_Immediate;
51673 case 'C':
51674 case 'e':
51675 case 'Z':
51676 return C_Other;
51677 default:
51678 break;
51679 }
51680 }
51681 else if (Constraint.size() == 2) {
51682 switch (Constraint[0]) {
51683 default:
51684 break;
51685 case 'Y':
51686 switch (Constraint[1]) {
51687 default:
51688 break;
51689 case 'z':
51690 return C_Register;
51691 case 'i':
51692 case 'm':
51693 case 'k':
51694 case 't':
51695 case '2':
51696 return C_RegisterClass;
51697 }
51698 }
51699 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
51700 return C_Other;
51701 return TargetLowering::getConstraintType(Constraint);
51702}
51703
51704/// Examine constraint type and operand type and determine a weight value.
51705/// This object must already have been set up with the operand type
51706/// and the current alternative constraint selected.
51707TargetLowering::ConstraintWeight
51708 X86TargetLowering::getSingleConstraintMatchWeight(
51709 AsmOperandInfo &info, const char *constraint) const {
51710 ConstraintWeight weight = CW_Invalid;
51711 Value *CallOperandVal = info.CallOperandVal;
51712 // If we don't have a value, we can't do a match,
51713 // but allow it at the lowest weight.
51714 if (!CallOperandVal)
51715 return CW_Default;
51716 Type *type = CallOperandVal->getType();
51717 // Look at the constraint type.
51718 switch (*constraint) {
51719 default:
51720 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
51721 LLVM_FALLTHROUGH;
51722 case 'R':
51723 case 'q':
51724 case 'Q':
51725 case 'a':
51726 case 'b':
51727 case 'c':
51728 case 'd':
51729 case 'S':
51730 case 'D':
51731 case 'A':
51732 if (CallOperandVal->getType()->isIntegerTy())
51733 weight = CW_SpecificReg;
51734 break;
51735 case 'f':
51736 case 't':
51737 case 'u':
51738 if (type->isFloatingPointTy())
51739 weight = CW_SpecificReg;
51740 break;
51741 case 'y':
51742 if (type->isX86_MMXTy() && Subtarget.hasMMX())
51743 weight = CW_SpecificReg;
51744 break;
51745 case 'Y':
51746 if (StringRef(constraint).size() != 2)
51747 break;
51748 switch (constraint[1]) {
51749 default:
51750 return CW_Invalid;
51751 // XMM0
51752 case 'z':
51753 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51754 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
51755 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
51756 return CW_SpecificReg;
51757 return CW_Invalid;
51758 // Conditional OpMask regs (AVX512)
51759 case 'k':
51760 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51761 return CW_Register;
51762 return CW_Invalid;
51763 // Any MMX reg
51764 case 'm':
51765 if (type->isX86_MMXTy() && Subtarget.hasMMX())
51766 return weight;
51767 return CW_Invalid;
51768 // Any SSE reg when ISA >= SSE2, same as 'x'
51769 case 'i':
51770 case 't':
51771 case '2':
51772 if (!Subtarget.hasSSE2())
51773 return CW_Invalid;
51774 break;
51775 }
51776 break;
51777 case 'v':
51778 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
51779 weight = CW_Register;
51780 LLVM_FALLTHROUGH;
51781 case 'x':
51782 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
51783 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
51784 weight = CW_Register;
51785 break;
51786 case 'k':
51787 // Enable conditional vector operations using %k<#> registers.
51788 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
51789 weight = CW_Register;
51790 break;
51791 case 'I':
51792 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
51793 if (C->getZExtValue() <= 31)
51794 weight = CW_Constant;
51795 }
51796 break;
51797 case 'J':
51798 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51799 if (C->getZExtValue() <= 63)
51800 weight = CW_Constant;
51801 }
51802 break;
51803 case 'K':
51804 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51805 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
51806 weight = CW_Constant;
51807 }
51808 break;
51809 case 'L':
51810 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51811 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
51812 weight = CW_Constant;
51813 }
51814 break;
51815 case 'M':
51816 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51817 if (C->getZExtValue() <= 3)
51818 weight = CW_Constant;
51819 }
51820 break;
51821 case 'N':
51822 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51823 if (C->getZExtValue() <= 0xff)
51824 weight = CW_Constant;
51825 }
51826 break;
51827 case 'G':
51828 case 'C':
51829 if (isa<ConstantFP>(CallOperandVal)) {
51830 weight = CW_Constant;
51831 }
51832 break;
51833 case 'e':
51834 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51835 if ((C->getSExtValue() >= -0x80000000LL) &&
51836 (C->getSExtValue() <= 0x7fffffffLL))
51837 weight = CW_Constant;
51838 }
51839 break;
51840 case 'Z':
51841 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
51842 if (C->getZExtValue() <= 0xffffffff)
51843 weight = CW_Constant;
51844 }
51845 break;
51846 }
51847 return weight;
51848}
51849
51850/// Try to replace an X constraint, which matches anything, with another that
51851/// has more specific requirements based on the type of the corresponding
51852/// operand.
51853const char *X86TargetLowering::
51854LowerXConstraint(EVT ConstraintVT) const {
51855 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
51856 // 'f' like normal targets.
51857 if (ConstraintVT.isFloatingPoint()) {
51858 if (Subtarget.hasSSE1())
51859 return "x";
51860 }
51861
51862 return TargetLowering::LowerXConstraint(ConstraintVT);
51863}
51864
51865// Lower @cc targets via setcc.
51866SDValue X86TargetLowering::LowerAsmOutputForConstraint(
51867 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
51868 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
51869 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
51870 if (Cond == X86::COND_INVALID)
51871 return SDValue();
51872 // Check that return type is valid.
51873 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
51874 OpInfo.ConstraintVT.getSizeInBits() < 8)
51875 report_fatal_error("Flag output operand is of invalid type");
51876
51877 // Get EFLAGS register. Only update chain when copyfrom is glued.
51878 if (Flag.getNode()) {
51879 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
51880 Chain = Flag.getValue(1);
51881 } else
51882 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
51883 // Extract CC code.
51884 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
51885 // Extend to 32-bits
51886 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
51887
51888 return Result;
51889}
51890
51891/// Lower the specified operand into the Ops vector.
51892/// If it is invalid, don't add anything to Ops.
51893void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
51894 std::string &Constraint,
51895 std::vector<SDValue>&Ops,
51896 SelectionDAG &DAG) const {
51897 SDValue Result;
51898
51899 // Only support length 1 constraints for now.
51900 if (Constraint.length() > 1) return;
51901
51902 char ConstraintLetter = Constraint[0];
51903 switch (ConstraintLetter) {
51904 default: break;
51905 case 'I':
51906 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51907 if (C->getZExtValue() <= 31) {
51908 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51909 Op.getValueType());
51910 break;
51911 }
51912 }
51913 return;
51914 case 'J':
51915 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51916 if (C->getZExtValue() <= 63) {
51917 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51918 Op.getValueType());
51919 break;
51920 }
51921 }
51922 return;
51923 case 'K':
51924 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51925 if (isInt<8>(C->getSExtValue())) {
51926 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51927 Op.getValueType());
51928 break;
51929 }
51930 }
51931 return;
51932 case 'L':
51933 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51934 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
51935 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
51936 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
51937 Op.getValueType());
51938 break;
51939 }
51940 }
51941 return;
51942 case 'M':
51943 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51944 if (C->getZExtValue() <= 3) {
51945 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51946 Op.getValueType());
51947 break;
51948 }
51949 }
51950 return;
51951 case 'N':
51952 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51953 if (C->getZExtValue() <= 255) {
51954 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51955 Op.getValueType());
51956 break;
51957 }
51958 }
51959 return;
51960 case 'O':
51961 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51962 if (C->getZExtValue() <= 127) {
51963 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51964 Op.getValueType());
51965 break;
51966 }
51967 }
51968 return;
51969 case 'e': {
51970 // 32-bit signed value
51971 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51972 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51973 C->getSExtValue())) {
51974 // Widen to 64 bits here to get it sign extended.
51975 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
51976 break;
51977 }
51978 // FIXME gcc accepts some relocatable values here too, but only in certain
51979 // memory models; it's complicated.
51980 }
51981 return;
51982 }
51983 case 'Z': {
51984 // 32-bit unsigned value
51985 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
51986 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
51987 C->getZExtValue())) {
51988 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
51989 Op.getValueType());
51990 break;
51991 }
51992 }
51993 // FIXME gcc accepts some relocatable values here too, but only in certain
51994 // memory models; it's complicated.
51995 return;
51996 }
51997 case 'i': {
51998 // Literal immediates are always ok.
51999 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
52000 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
52001 BooleanContent BCont = getBooleanContents(MVT::i64);
52002 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
52003 : ISD::SIGN_EXTEND;
52004 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
52005 : CST->getSExtValue();
52006 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
52007 break;
52008 }
52009
52010 // In any sort of PIC mode addresses need to be computed at runtime by
52011 // adding in a register or some sort of table lookup. These can't
52012 // be used as immediates.
52013 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
52014 return;
52015
52016 // If we are in non-pic codegen mode, we allow the address of a global (with
52017 // an optional displacement) to be used with 'i'.
52018 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
52019 // If we require an extra load to get this address, as in PIC mode, we
52020 // can't accept it.
52021 if (isGlobalStubReference(
52022 Subtarget.classifyGlobalReference(GA->getGlobal())))
52023 return;
52024 break;
52025 }
52026 }
52027
52028 if (Result.getNode()) {
52029 Ops.push_back(Result);
52030 return;
52031 }
52032 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
52033}
52034
52035/// Check if \p RC is a general purpose register class.
52036/// I.e., GR* or one of their variant.
52037static bool isGRClass(const TargetRegisterClass &RC) {
52038 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
52039 RC.hasSuperClassEq(&X86::GR16RegClass) ||
52040 RC.hasSuperClassEq(&X86::GR32RegClass) ||
52041 RC.hasSuperClassEq(&X86::GR64RegClass) ||
52042 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
52043}
52044
52045/// Check if \p RC is a vector register class.
52046/// I.e., FR* / VR* or one of their variant.
52047static bool isFRClass(const TargetRegisterClass &RC) {
52048 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
52049 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
52050 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
52051 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
52052 RC.hasSuperClassEq(&X86::VR512RegClass);
52053}
52054
52055/// Check if \p RC is a mask register class.
52056/// I.e., VK* or one of their variant.
52057static bool isVKClass(const TargetRegisterClass &RC) {
52058 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
52059 RC.hasSuperClassEq(&X86::VK2RegClass) ||
52060 RC.hasSuperClassEq(&X86::VK4RegClass) ||
52061 RC.hasSuperClassEq(&X86::VK8RegClass) ||
52062 RC.hasSuperClassEq(&X86::VK16RegClass) ||
52063 RC.hasSuperClassEq(&X86::VK32RegClass) ||
52064 RC.hasSuperClassEq(&X86::VK64RegClass);
52065}
52066
52067std::pair<unsigned, const TargetRegisterClass *>
52068X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
52069 StringRef Constraint,
52070 MVT VT) const {
52071 // First, see if this is a constraint that directly corresponds to an LLVM
52072 // register class.
52073 if (Constraint.size() == 1) {
52074 // GCC Constraint Letters
52075 switch (Constraint[0]) {
52076 default: break;
52077 // 'A' means [ER]AX + [ER]DX.
52078 case 'A':
52079 if (Subtarget.is64Bit())
52080 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
52081 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
52082 "Expecting 64, 32 or 16 bit subtarget");
52083 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52084
52085 // TODO: Slight differences here in allocation order and leaving
52086 // RIP in the class. Do they matter any more here than they do
52087 // in the normal allocation?
52088 case 'k':
52089 if (Subtarget.hasAVX512()) {
52090 if (VT == MVT::i1)
52091 return std::make_pair(0U, &X86::VK1RegClass);
52092 if (VT == MVT::i8)
52093 return std::make_pair(0U, &X86::VK8RegClass);
52094 if (VT == MVT::i16)
52095 return std::make_pair(0U, &X86::VK16RegClass);
52096 }
52097 if (Subtarget.hasBWI()) {
52098 if (VT == MVT::i32)
52099 return std::make_pair(0U, &X86::VK32RegClass);
52100 if (VT == MVT::i64)
52101 return std::make_pair(0U, &X86::VK64RegClass);
52102 }
52103 break;
52104 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
52105 if (Subtarget.is64Bit()) {
52106 if (VT == MVT::i8 || VT == MVT::i1)
52107 return std::make_pair(0U, &X86::GR8RegClass);
52108 if (VT == MVT::i16)
52109 return std::make_pair(0U, &X86::GR16RegClass);
52110 if (VT == MVT::i32 || VT == MVT::f32)
52111 return std::make_pair(0U, &X86::GR32RegClass);
52112 if (VT != MVT::f80 && !VT.isVector())
52113 return std::make_pair(0U, &X86::GR64RegClass);
52114 break;
52115 }
52116 LLVM_FALLTHROUGH;
52117 // 32-bit fallthrough
52118 case 'Q': // Q_REGS
52119 if (VT == MVT::i8 || VT == MVT::i1)
52120 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
52121 if (VT == MVT::i16)
52122 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
52123 if (VT == MVT::i32 || VT == MVT::f32 ||
52124 (!VT.isVector() && !Subtarget.is64Bit()))
52125 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
52126 if (VT != MVT::f80 && !VT.isVector())
52127 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
52128 break;
52129 case 'r': // GENERAL_REGS
52130 case 'l': // INDEX_REGS
52131 if (VT == MVT::i8 || VT == MVT::i1)
52132 return std::make_pair(0U, &X86::GR8RegClass);
52133 if (VT == MVT::i16)
52134 return std::make_pair(0U, &X86::GR16RegClass);
52135 if (VT == MVT::i32 || VT == MVT::f32 ||
52136 (!VT.isVector() && !Subtarget.is64Bit()))
52137 return std::make_pair(0U, &X86::GR32RegClass);
52138 if (VT != MVT::f80 && !VT.isVector())
52139 return std::make_pair(0U, &X86::GR64RegClass);
52140 break;
52141 case 'R': // LEGACY_REGS
52142 if (VT == MVT::i8 || VT == MVT::i1)
52143 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
52144 if (VT == MVT::i16)
52145 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
52146 if (VT == MVT::i32 || VT == MVT::f32 ||
52147 (!VT.isVector() && !Subtarget.is64Bit()))
52148 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
52149 if (VT != MVT::f80 && !VT.isVector())
52150 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
52151 break;
52152 case 'f': // FP Stack registers.
52153 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
52154 // value to the correct fpstack register class.
52155 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
52156 return std::make_pair(0U, &X86::RFP32RegClass);
52157 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
52158 return std::make_pair(0U, &X86::RFP64RegClass);
52159 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
52160 return std::make_pair(0U, &X86::RFP80RegClass);
52161 break;
52162 case 'y': // MMX_REGS if MMX allowed.
52163 if (!Subtarget.hasMMX()) break;
52164 return std::make_pair(0U, &X86::VR64RegClass);
52165 case 'v':
52166 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
52167 if (!Subtarget.hasSSE1()) break;
52168 bool VConstraint = (Constraint[0] == 'v');
52169
52170 switch (VT.SimpleTy) {
52171 default: break;
52172 // Scalar SSE types.
52173 case MVT::f32:
52174 case MVT::i32:
52175 if (VConstraint && Subtarget.hasVLX())
52176 return std::make_pair(0U, &X86::FR32XRegClass);
52177 return std::make_pair(0U, &X86::FR32RegClass);
52178 case MVT::f64:
52179 case MVT::i64:
52180 if (VConstraint && Subtarget.hasVLX())
52181 return std::make_pair(0U, &X86::FR64XRegClass);
52182 return std::make_pair(0U, &X86::FR64RegClass);
52183 case MVT::i128:
52184 if (Subtarget.is64Bit()) {
52185 if (VConstraint && Subtarget.hasVLX())
52186 return std::make_pair(0U, &X86::VR128XRegClass);
52187 return std::make_pair(0U, &X86::VR128RegClass);
52188 }
52189 break;
52190 // Vector types and fp128.
52191 case MVT::f128:
52192 case MVT::v16i8:
52193 case MVT::v8i16:
52194 case MVT::v4i32:
52195 case MVT::v2i64:
52196 case MVT::v4f32:
52197 case MVT::v2f64:
52198 if (VConstraint && Subtarget.hasVLX())
52199 return std::make_pair(0U, &X86::VR128XRegClass);
52200 return std::make_pair(0U, &X86::VR128RegClass);
52201 // AVX types.
52202 case MVT::v32i8:
52203 case MVT::v16i16:
52204 case MVT::v8i32:
52205 case MVT::v4i64:
52206 case MVT::v8f32:
52207 case MVT::v4f64:
52208 if (VConstraint && Subtarget.hasVLX())
52209 return std::make_pair(0U, &X86::VR256XRegClass);
52210 if (Subtarget.hasAVX())
52211 return std::make_pair(0U, &X86::VR256RegClass);
52212 break;
52213 case MVT::v64i8:
52214 case MVT::v32i16:
52215 case MVT::v8f64:
52216 case MVT::v16f32:
52217 case MVT::v16i32:
52218 case MVT::v8i64:
52219 if (!Subtarget.hasAVX512()) break;
52220 if (VConstraint)
52221 return std::make_pair(0U, &X86::VR512RegClass);
52222 return std::make_pair(0U, &X86::VR512_0_15RegClass);
52223 }
52224 break;
52225 }
52226 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
52227 switch (Constraint[1]) {
52228 default:
52229 break;
52230 case 'i':
52231 case 't':
52232 case '2':
52233 return getRegForInlineAsmConstraint(TRI, "x", VT);
52234 case 'm':
52235 if (!Subtarget.hasMMX()) break;
52236 return std::make_pair(0U, &X86::VR64RegClass);
52237 case 'z':
52238 if (!Subtarget.hasSSE1()) break;
52239 switch (VT.SimpleTy) {
52240 default: break;
52241 // Scalar SSE types.
52242 case MVT::f32:
52243 case MVT::i32:
52244 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
52245 case MVT::f64:
52246 case MVT::i64:
52247 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
52248 case MVT::f128:
52249 case MVT::v16i8:
52250 case MVT::v8i16:
52251 case MVT::v4i32:
52252 case MVT::v2i64:
52253 case MVT::v4f32:
52254 case MVT::v2f64:
52255 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
52256 // AVX types.
52257 case MVT::v32i8:
52258 case MVT::v16i16:
52259 case MVT::v8i32:
52260 case MVT::v4i64:
52261 case MVT::v8f32:
52262 case MVT::v4f64:
52263 if (Subtarget.hasAVX())
52264 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
52265 break;
52266 case MVT::v64i8:
52267 case MVT::v32i16:
52268 case MVT::v8f64:
52269 case MVT::v16f32:
52270 case MVT::v16i32:
52271 case MVT::v8i64:
52272 if (Subtarget.hasAVX512())
52273 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
52274 break;
52275 }
52276 break;
52277 case 'k':
52278 // This register class doesn't allocate k0 for masked vector operation.
52279 if (Subtarget.hasAVX512()) {
52280 if (VT == MVT::i1)
52281 return std::make_pair(0U, &X86::VK1WMRegClass);
52282 if (VT == MVT::i8)
52283 return std::make_pair(0U, &X86::VK8WMRegClass);
52284 if (VT == MVT::i16)
52285 return std::make_pair(0U, &X86::VK16WMRegClass);
52286 }
52287 if (Subtarget.hasBWI()) {
52288 if (VT == MVT::i32)
52289 return std::make_pair(0U, &X86::VK32WMRegClass);
52290 if (VT == MVT::i64)
52291 return std::make_pair(0U, &X86::VK64WMRegClass);
52292 }
52293 break;
52294 }
52295 }
52296
52297 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
52298 return std::make_pair(0U, &X86::GR32RegClass);
52299
52300 // Use the default implementation in TargetLowering to convert the register
52301 // constraint into a member of a register class.
52302 std::pair<Register, const TargetRegisterClass*> Res;
52303 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
52304
52305 // Not found as a standard register?
52306 if (!Res.second) {
52307 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
52308 // to/from f80.
52309 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
52310 // Map st(0) -> st(7) -> ST0
52311 if (Constraint.size() == 7 && Constraint[0] == '{' &&
52312 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
52313 Constraint[3] == '(' &&
52314 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
52315 Constraint[5] == ')' && Constraint[6] == '}') {
52316 // st(7) is not allocatable and thus not a member of RFP80. Return
52317 // singleton class in cases where we have a reference to it.
52318 if (Constraint[4] == '7')
52319 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
52320 return std::make_pair(X86::FP0 + Constraint[4] - '0',
52321 &X86::RFP80RegClass);
52322 }
52323
52324 // GCC allows "st(0)" to be called just plain "st".
52325 if (StringRef("{st}").equals_insensitive(Constraint))
52326 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
52327 }
52328
52329 // flags -> EFLAGS
52330 if (StringRef("{flags}").equals_insensitive(Constraint))
52331 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
52332
52333 // dirflag -> DF
52334 // Only allow for clobber.
52335 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
52336 VT == MVT::Other)
52337 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
52338
52339 // fpsr -> FPSW
52340 if (StringRef("{fpsr}").equals_insensitive(Constraint))
52341 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
52342
52343 return Res;
52344 }
52345
52346 // Make sure it isn't a register that requires 64-bit mode.
52347 if (!Subtarget.is64Bit() &&
52348 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
52349 TRI->getEncodingValue(Res.first) >= 8) {
52350 // Register requires REX prefix, but we're in 32-bit mode.
52351 return std::make_pair(0, nullptr);
52352 }
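// e.g. {r8d} or {xmm8}: their encodings require a REX/VEX prefix that 32-bit
// mode cannot emit, so the constraint is rejected outright.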
52353
52354 // Make sure it isn't a register that requires AVX512.
52355 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
52356 TRI->getEncodingValue(Res.first) & 0x10) {
52357 // Register requires EVEX prefix.
52358 return std::make_pair(0, nullptr);
52359 }
52360
52361 // Otherwise, check to see if this is a register class of the wrong value
52362 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
52363 // turn into {ax},{dx}.
52364 // MVT::Other is used to specify clobber names.
52365 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
52366 return Res; // Correct type already, nothing to do.
52367
52368 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
52369 // return "eax". This should even work for things like getting 64bit integer
52370 // registers when given an f64 type.
52371 const TargetRegisterClass *Class = Res.second;
52372 // The generic code will match the first register class that contains the
52373 // given register. Thus, based on the ordering of the tablegened file,
52374 // the "plain" GR classes might not come first.
52375 // Therefore, use a helper method.
52376 if (isGRClass(*Class)) {
52377 unsigned Size = VT.getSizeInBits();
52378 if (Size == 1) Size = 8;
52379 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
52380 if (DestReg > 0) {
52381 bool is64Bit = Subtarget.is64Bit();
52382 const TargetRegisterClass *RC =
52383 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
52384 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
52385 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
52386 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
52387 : nullptr;
52388 if (Size == 64 && !is64Bit) {
52389 // Model GCC's behavior here and select a fixed pair of 32-bit
52390 // registers.
52391 switch (DestReg) {
52392 case X86::RAX:
52393 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52394 case X86::RDX:
52395 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
52396 case X86::RCX:
52397 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
52398 case X86::RBX:
52399 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
52400 case X86::RSI:
52401 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
52402 case X86::RDI:
52403 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
52404 case X86::RBP:
52405 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
52406 default:
52407 return std::make_pair(0, nullptr);
52408 }
52409 }
52410 if (RC && RC->contains(DestReg))
52411 return std::make_pair(DestReg, RC);
52412 return Res;
52413 }
52414 // No register found/type mismatch.
52415 return std::make_pair(0, nullptr);
52416 } else if (isFRClass(*Class)) {
52417 // Handle references to XMM physical registers that got mapped into the
52418 // wrong class. This can happen with constraints like {xmm0} where the
52419 // target independent register mapper will just pick the first match it can
52420 // find, ignoring the required type.
52421
52422 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
52423 if (VT == MVT::f32 || VT == MVT::i32)
52424 Res.second = &X86::FR32XRegClass;
52425 else if (VT == MVT::f64 || VT == MVT::i64)
52426 Res.second = &X86::FR64XRegClass;
52427 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
52428 Res.second = &X86::VR128XRegClass;
52429 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
52430 Res.second = &X86::VR256XRegClass;
52431 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
52432 Res.second = &X86::VR512RegClass;
52433 else {
52434 // Type mismatch and not a clobber: Return an error;
52435 Res.first = 0;
52436 Res.second = nullptr;
52437 }
52438 } else if (isVKClass(*Class)) {
52439 if (VT == MVT::i1)
52440 Res.second = &X86::VK1RegClass;
52441 else if (VT == MVT::i8)
52442 Res.second = &X86::VK8RegClass;
52443 else if (VT == MVT::i16)
52444 Res.second = &X86::VK16RegClass;
52445 else if (VT == MVT::i32)
52446 Res.second = &X86::VK32RegClass;
52447 else if (VT == MVT::i64)
52448 Res.second = &X86::VK64RegClass;
52449 else {
52450 // Type mismatch and not a clobber: Return an error;
52451 Res.first = 0;
52452 Res.second = nullptr;
52453 }
52454 }
52455
52456 return Res;
52457}
52458
52459InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
52460 const AddrMode &AM,
52461 Type *Ty,
52462 unsigned AS) const {
52463 // Scaling factors are not free at all.
52464 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
52465 // will take 2 allocations in the out-of-order engine instead of 1
52466 // for the plain addressing mode, i.e., inst (reg1).
52467 // E.g.,
52468 // vaddps (%rsi,%rdx), %ymm0, %ymm1
52469 // Requires two allocations (one for the load, one for the computation)
52470 // whereas:
52471 // vaddps (%rsi), %ymm0, %ymm1
52472 // Requires just 1 allocation, freeing that allocation for other operations
52473 // and leaving fewer micro-operations to execute.
52474 //
52475 // For some X86 architectures, this is even worse because for instance for
52476 // stores, the complex addressing mode forces the instruction to use the
52477 // "load" ports instead of the dedicated "store" port.
52478 // E.g., on Haswell:
52479 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
52480 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
52481 if (isLegalAddressingMode(DL, AM, Ty, AS))
52482 // Scale represents reg2 * scale, so account for a cost of 1
52483 // as soon as a second register is used.
52484 return AM.Scale != 0;
52485 return -1;
52486}
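For reference, the cost convention implemented above can be restated as a standalone sketch (hypothetical helper, with the legality check and the AddrMode reduced to plain parameters): a negative cost means the addressing mode is illegal for the type, 0 means a free plain (reg) access, and 1 charges the extra allocation for a (reg1, reg2, scale) form.

  // Standalone sketch of the rule above; AddrModeSketch stands in for
  // TargetLowering::AddrMode and Legal for the isLegalAddressingMode() check.
  struct AddrModeSketch { long BaseOffs = 0; bool HasBaseReg = false; long Scale = 0; };

  int scalingFactorCostSketch(const AddrModeSketch &AM, bool Legal) {
    if (!Legal)
      return -1;          // addressing mode not representable: reject
    return AM.Scale != 0; // scaled index => one extra allocation, else free
  }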
52487
52488bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
52489 // Integer division on x86 is expensive. However, when aggressively optimizing
52490 // for code size, we prefer to use a div instruction, as it is usually smaller
52491 // than the alternative sequence.
52492 // The exception to this is vector division. Since x86 doesn't have vector
52493 // integer division, leaving the division as-is is a loss even in terms of
52494 // size, because it will have to be scalarized, while the alternative code
52495 // sequence can be performed in vector form.
52496 bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
52497 return OptSize && !VT.isVector();
52498}
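A hypothetical source-level illustration of the effect (assuming Clang's minsize function attribute, which maps to Attribute::MinSize): with the attribute, the scalar division by a constant below may be kept as a single div instruction, while without it the backend typically expands it into a multiply/shift sequence; vector divisions are expanded either way because x86 has no vector integer divide.

  // Hypothetical example; the exact codegen depends on target and options.
  __attribute__((minsize)) unsigned div_small(unsigned x) { return x / 10; }
  unsigned div_fast(unsigned x) { return x / 10; }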
52499
52500void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
52501 if (!Subtarget.is64Bit())
52502 return;
52503
52504 // Update IsSplitCSR in X86MachineFunctionInfo.
52505 X86MachineFunctionInfo *AFI =
52506 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
52507 AFI->setIsSplitCSR(true);
52508}
52509
52510void X86TargetLowering::insertCopiesSplitCSR(
52511 MachineBasicBlock *Entry,
52512 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
52513 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
52514 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
52515 if (!IStart)
52516 return;
52517
52518 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
52519 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
52520 MachineBasicBlock::iterator MBBI = Entry->begin();
52521 for (const MCPhysReg *I = IStart; *I; ++I) {
52522 const TargetRegisterClass *RC = nullptr;
52523 if (X86::GR64RegClass.contains(*I))
52524 RC = &X86::GR64RegClass;
52525 else
52526 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
52527
52528 Register NewVR = MRI->createVirtualRegister(RC);
52529 // Create copy from CSR to a virtual register.
52530 // FIXME: this currently does not emit CFI pseudo-instructions; it works
52531 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
52532 // nounwind. If we want to generalize this later, we may need to emit
52533 // CFI pseudo-instructions.
52534 assert(
52535 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
52536 "Function should be nounwind in insertCopiesSplitCSR!");
52537 Entry->addLiveIn(*I);
52538 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
52539 .addReg(*I);
52540
52541 // Insert the copy-back instructions right before the terminator.
52542 for (auto *Exit : Exits)
52543 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
52544 TII->get(TargetOpcode::COPY), *I)
52545 .addReg(NewVR);
52546 }
52547}
52548
52549bool X86TargetLowering::supportSwiftError() const {
52550 return Subtarget.is64Bit();
52551}
52552
52553/// Returns true if stack probing through a function call is requested.
52554bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
52555 return !getStackProbeSymbolName(MF).empty();
52556}
52557
52558/// Returns true if stack probing through inline assembly is requested.
52559bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
52560
52561 // No inline stack probes for Windows; it has its own mechanism.
52562 if (Subtarget.isOSWindows() ||
52563 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52564 return false;
52565
52566 // If the function specifically requests inline stack probes, emit them.
52567 if (MF.getFunction().hasFnAttribute("probe-stack"))
52568 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
52569 "inline-asm";
52570
52571 return false;
52572}
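Restated as a standalone sketch (hypothetical helper with the target and attribute queries reduced to parameters), the decision above is: never inline-probe on Windows or when "no-stack-arg-probe" is present, and otherwise only when the "probe-stack" attribute explicitly asks for "inline-asm".

  #include <string>

  bool hasInlineStackProbeSketch(bool IsWindows, bool HasNoStackArgProbe,
                                 bool HasProbeStackAttr,
                                 const std::string &ProbeStackValue) {
    if (IsWindows || HasNoStackArgProbe)
      return false;                           // Windows uses its own mechanism
    if (HasProbeStackAttr)
      return ProbeStackValue == "inline-asm"; // explicit request only
    return false;
  }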
52573
52574/// Returns the name of the symbol used to emit stack probes or the empty
52575/// string if not applicable.
52576StringRef
52577X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
52578 // Inline stack probes disable the stack probe call.
52579 if (hasInlineStackProbe(MF))
52580 return "";
52581
52582 // If the function specifically requests stack probes, emit them.
52583 if (MF.getFunction().hasFnAttribute("probe-stack"))
52584 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
52585
52586 // Generally, if we aren't on Windows, the platform ABI does not include
52587 // support for stack probes, so don't emit them.
52588 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
52589 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
52590 return "";
52591
52592 // We need a stack probe to conform to the Windows ABI. Choose the right
52593 // symbol.
52594 if (Subtarget.is64Bit())
52595 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
52596 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
52597}
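The symbol choice at the end can be summarized as follows (a tabulation of the code paths above; an empty string means no probe call is emitted):

  // 64-bit MinGW/Cygwin : ___chkstk_ms
  // 64-bit MSVC-style   : __chkstk
  // 32-bit MinGW/Cygwin : _alloca
  // 32-bit MSVC-style   : _chkstk
  // non-Windows, Mach-O, "no-stack-arg-probe", or inline probes : "" (none)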
52598
52599unsigned
52600X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
52601 // The default stack probe size is 4096 if the function has no
52602 // "stack-probe-size" attribute.
52603 unsigned StackProbeSize = 4096;
52604 const Function &Fn = MF.getFunction();
52605 if (Fn.hasFnAttribute("stack-probe-size"))
52606 Fn.getFnAttribute("stack-probe-size")
52607 .getValueAsString()
52608 .getAsInteger(0, StackProbeSize);
52609 return StackProbeSize;
52610}
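As a worked example (hypothetical attribute values, not from the report), the hook resolves the probe size as follows; note that the return value of getAsInteger is ignored, so a value that fails to parse silently keeps the default rather than being diagnosed.

  // Hypothetical attribute values and the resulting probe size:
  //   (no attribute)                     -> 4096
  //   "stack-probe-size"="8192"          -> 8192
  //   "stack-probe-size"="not-a-number"  -> 4096 (parse failure keeps the default)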
52611
52612Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
52613 if (ML->isInnermost() &&
52614 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
52615 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
52616 return TargetLowering::getPrefLoopAlignment();
52617}
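The option value is interpreted as the log2 of the requested alignment in bytes; a couple of worked instances of the shift above (illustrative values only):

  static_assert((1ULL << 4) == 16, "option value 4 requests Align(16)");
  static_assert((1ULL << 6) == 64, "option value 6 requests Align(64)");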