File: | src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/CodeGen/BasicTTIImpl.h |
Warning: | line 1065, column 11 Called C++ object pointer is null |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | /// \file | ||||||
9 | /// This file implements a TargetTransformInfo analysis pass specific to the | ||||||
10 | /// X86 target machine. It uses the target's detailed information to provide | ||||||
11 | /// more precise answers to certain TTI queries, while letting the target | ||||||
12 | /// independent and default TTI implementations handle the rest. | ||||||
13 | /// | ||||||
14 | //===----------------------------------------------------------------------===// | ||||||
15 | /// About Cost Model numbers used below it's necessary to say the following: | ||||||
16 | /// the numbers correspond to some "generic" X86 CPU instead of usage of | ||||||
17 | /// concrete CPU model. Usually the numbers correspond to CPU where the feature | ||||||
18 | /// appeared at the first time. For example, if we do Subtarget.hasSSE42() in | ||||||
19 | /// the lookups below the cost is based on Nehalem as that was the first CPU | ||||||
20 | /// to support that feature level and thus has most likely the worst case cost. | ||||||
21 | /// Some examples of other technologies/CPUs: | ||||||
22 | /// SSE 3 - Pentium4 / Athlon64 | ||||||
23 | /// SSE 4.1 - Penryn | ||||||
24 | /// SSE 4.2 - Nehalem | ||||||
25 | /// AVX - Sandy Bridge | ||||||
26 | /// AVX2 - Haswell | ||||||
27 | /// AVX-512 - Xeon Phi / Skylake | ||||||
28 | /// And some examples of instruction target dependent costs (latency) | ||||||
29 | /// divss sqrtss rsqrtss | ||||||
30 | /// AMD K7 11-16 19 3 | ||||||
31 | /// Piledriver 9-24 13-15 5 | ||||||
32 | /// Jaguar 14 16 2 | ||||||
33 | /// Pentium II,III 18 30 2 | ||||||
34 | /// Nehalem 7-14 7-18 3 | ||||||
35 | /// Haswell 10-13 11 5 | ||||||
36 | /// TODO: Develop and implement the target dependent cost model and | ||||||
37 | /// specialize cost numbers for different Cost Model Targets such as throughput, | ||||||
38 | /// code size, latency and uop count. | ||||||
39 | //===----------------------------------------------------------------------===// | ||||||
40 | |||||||
41 | #include "X86TargetTransformInfo.h" | ||||||
42 | #include "llvm/Analysis/TargetTransformInfo.h" | ||||||
43 | #include "llvm/CodeGen/BasicTTIImpl.h" | ||||||
44 | #include "llvm/CodeGen/CostTable.h" | ||||||
45 | #include "llvm/CodeGen/TargetLowering.h" | ||||||
46 | #include "llvm/IR/IntrinsicInst.h" | ||||||
47 | #include "llvm/Support/Debug.h" | ||||||
48 | |||||||
49 | using namespace llvm; | ||||||
50 | |||||||
// Debug category name for this file.
#define DEBUG_TYPE "x86tti"
52 | |||||||
53 | //===----------------------------------------------------------------------===// | ||||||
54 | // | ||||||
55 | // X86 cost model. | ||||||
56 | // | ||||||
57 | //===----------------------------------------------------------------------===// | ||||||
58 | |||||||
59 | TargetTransformInfo::PopcntSupportKind | ||||||
60 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) { | ||||||
61 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2")((void)0); | ||||||
62 | // TODO: Currently the __builtin_popcount() implementation using SSE3 | ||||||
63 | // instructions is inefficient. Once the problem is fixed, we should | ||||||
64 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). | ||||||
65 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; | ||||||
66 | } | ||||||
67 | |||||||
68 | llvm::Optional<unsigned> X86TTIImpl::getCacheSize( | ||||||
69 | TargetTransformInfo::CacheLevel Level) const { | ||||||
70 | switch (Level) { | ||||||
71 | case TargetTransformInfo::CacheLevel::L1D: | ||||||
72 | // - Penryn | ||||||
73 | // - Nehalem | ||||||
74 | // - Westmere | ||||||
75 | // - Sandy Bridge | ||||||
76 | // - Ivy Bridge | ||||||
77 | // - Haswell | ||||||
78 | // - Broadwell | ||||||
79 | // - Skylake | ||||||
80 | // - Kabylake | ||||||
81 | return 32 * 1024; // 32 KByte | ||||||
82 | case TargetTransformInfo::CacheLevel::L2D: | ||||||
83 | // - Penryn | ||||||
84 | // - Nehalem | ||||||
85 | // - Westmere | ||||||
86 | // - Sandy Bridge | ||||||
87 | // - Ivy Bridge | ||||||
88 | // - Haswell | ||||||
89 | // - Broadwell | ||||||
90 | // - Skylake | ||||||
91 | // - Kabylake | ||||||
92 | return 256 * 1024; // 256 KByte | ||||||
93 | } | ||||||
94 | |||||||
95 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")__builtin_unreachable(); | ||||||
96 | } | ||||||
97 | |||||||
98 | llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity( | ||||||
99 | TargetTransformInfo::CacheLevel Level) const { | ||||||
100 | // - Penryn | ||||||
101 | // - Nehalem | ||||||
102 | // - Westmere | ||||||
103 | // - Sandy Bridge | ||||||
104 | // - Ivy Bridge | ||||||
105 | // - Haswell | ||||||
106 | // - Broadwell | ||||||
107 | // - Skylake | ||||||
108 | // - Kabylake | ||||||
109 | switch (Level) { | ||||||
110 | case TargetTransformInfo::CacheLevel::L1D: | ||||||
111 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | ||||||
112 | case TargetTransformInfo::CacheLevel::L2D: | ||||||
113 | return 8; | ||||||
114 | } | ||||||
115 | |||||||
116 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")__builtin_unreachable(); | ||||||
117 | } | ||||||
118 | |||||||
119 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { | ||||||
120 | bool Vector = (ClassID == 1); | ||||||
121 | if (Vector && !ST->hasSSE1()) | ||||||
122 | return 0; | ||||||
123 | |||||||
124 | if (ST->is64Bit()) { | ||||||
125 | if (Vector && ST->hasAVX512()) | ||||||
126 | return 32; | ||||||
127 | return 16; | ||||||
128 | } | ||||||
129 | return 8; | ||||||
130 | } | ||||||
131 | |||||||
132 | TypeSize | ||||||
133 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { | ||||||
134 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); | ||||||
135 | switch (K) { | ||||||
136 | case TargetTransformInfo::RGK_Scalar: | ||||||
137 | return TypeSize::getFixed(ST->is64Bit() ? 64 : 32); | ||||||
138 | case TargetTransformInfo::RGK_FixedWidthVector: | ||||||
139 | if (ST->hasAVX512() && PreferVectorWidth >= 512) | ||||||
140 | return TypeSize::getFixed(512); | ||||||
141 | if (ST->hasAVX() && PreferVectorWidth >= 256) | ||||||
142 | return TypeSize::getFixed(256); | ||||||
143 | if (ST->hasSSE1() && PreferVectorWidth >= 128) | ||||||
144 | return TypeSize::getFixed(128); | ||||||
145 | return TypeSize::getFixed(0); | ||||||
146 | case TargetTransformInfo::RGK_ScalableVector: | ||||||
147 | return TypeSize::getScalable(0); | ||||||
148 | } | ||||||
149 | |||||||
150 | llvm_unreachable("Unsupported register kind")__builtin_unreachable(); | ||||||
151 | } | ||||||
152 | |||||||
153 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { | ||||||
154 | return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) | ||||||
155 | .getFixedSize(); | ||||||
156 | } | ||||||
157 | |||||||
158 | unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { | ||||||
159 | // If the loop will not be vectorized, don't interleave the loop. | ||||||
160 | // Let regular unroll to unroll the loop, which saves the overflow | ||||||
161 | // check and memory check cost. | ||||||
162 | if (VF == 1) | ||||||
163 | return 1; | ||||||
164 | |||||||
165 | if (ST->isAtom()) | ||||||
166 | return 1; | ||||||
167 | |||||||
168 | // Sandybridge and Haswell have multiple execution ports and pipelined | ||||||
169 | // vector units. | ||||||
170 | if (ST->hasAVX()) | ||||||
171 | return 4; | ||||||
172 | |||||||
173 | return 2; | ||||||
174 | } | ||||||
175 | |||||||
176 | InstructionCost X86TTIImpl::getArithmeticInstrCost( | ||||||
177 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | ||||||
178 | TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, | ||||||
179 | TTI::OperandValueProperties Opd1PropInfo, | ||||||
180 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, | ||||||
181 | const Instruction *CxtI) { | ||||||
182 | // TODO: Handle more cost kinds. | ||||||
183 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
184 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, | ||||||
185 | Op2Info, Opd1PropInfo, | ||||||
186 | Opd2PropInfo, Args, CxtI); | ||||||
187 | |||||||
188 | // vXi8 multiplications are always promoted to vXi16. | ||||||
189 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && | ||||||
190 | Ty->getScalarSizeInBits() == 8) { | ||||||
191 | Type *WideVecTy = | ||||||
192 | VectorType::getExtendedElementVectorType(cast<VectorType>(Ty)); | ||||||
193 | return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty, | ||||||
194 | TargetTransformInfo::CastContextHint::None, | ||||||
195 | CostKind) + | ||||||
196 | getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, | ||||||
197 | TargetTransformInfo::CastContextHint::None, | ||||||
198 | CostKind) + | ||||||
199 | getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info, | ||||||
200 | Opd1PropInfo, Opd2PropInfo); | ||||||
201 | } | ||||||
202 | |||||||
203 | // Legalize the type. | ||||||
204 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | ||||||
205 | |||||||
206 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
207 | assert(ISD && "Invalid opcode")((void)0); | ||||||
208 | |||||||
209 | static const CostTblEntry GLMCostTable[] = { | ||||||
210 | { ISD::FDIV, MVT::f32, 18 }, // divss | ||||||
211 | { ISD::FDIV, MVT::v4f32, 35 }, // divps | ||||||
212 | { ISD::FDIV, MVT::f64, 33 }, // divsd | ||||||
213 | { ISD::FDIV, MVT::v2f64, 65 }, // divpd | ||||||
214 | }; | ||||||
215 | |||||||
216 | if (ST->useGLMDivSqrtCosts()) | ||||||
217 | if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, | ||||||
218 | LT.second)) | ||||||
219 | return LT.first * Entry->Cost; | ||||||
220 | |||||||
221 | static const CostTblEntry SLMCostTable[] = { | ||||||
222 | { ISD::MUL, MVT::v4i32, 11 }, // pmulld | ||||||
223 | { ISD::MUL, MVT::v8i16, 2 }, // pmullw | ||||||
224 | { ISD::FMUL, MVT::f64, 2 }, // mulsd | ||||||
225 | { ISD::FMUL, MVT::v2f64, 4 }, // mulpd | ||||||
226 | { ISD::FMUL, MVT::v4f32, 2 }, // mulps | ||||||
227 | { ISD::FDIV, MVT::f32, 17 }, // divss | ||||||
228 | { ISD::FDIV, MVT::v4f32, 39 }, // divps | ||||||
229 | { ISD::FDIV, MVT::f64, 32 }, // divsd | ||||||
230 | { ISD::FDIV, MVT::v2f64, 69 }, // divpd | ||||||
231 | { ISD::FADD, MVT::v2f64, 2 }, // addpd | ||||||
232 | { ISD::FSUB, MVT::v2f64, 2 }, // subpd | ||||||
233 | // v2i64/v4i64 mul is custom lowered as a series of long: | ||||||
234 | // multiplies(3), shifts(3) and adds(2) | ||||||
235 | // slm muldq version throughput is 2 and addq throughput 4 | ||||||
236 | // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + | ||||||
237 | // 2X4 (addq throughput) = 17 | ||||||
238 | { ISD::MUL, MVT::v2i64, 17 }, | ||||||
239 | // slm addq\subq throughput is 4 | ||||||
240 | { ISD::ADD, MVT::v2i64, 4 }, | ||||||
241 | { ISD::SUB, MVT::v2i64, 4 }, | ||||||
242 | }; | ||||||
243 | |||||||
244 | if (ST->isSLM()) { | ||||||
245 | if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { | ||||||
246 | // Check if the operands can be shrinked into a smaller datatype. | ||||||
247 | bool Op1Signed = false; | ||||||
248 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); | ||||||
249 | bool Op2Signed = false; | ||||||
250 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); | ||||||
251 | |||||||
252 | bool SignedMode = Op1Signed || Op2Signed; | ||||||
253 | unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); | ||||||
254 | |||||||
255 | if (OpMinSize <= 7) | ||||||
256 | return LT.first * 3; // pmullw/sext | ||||||
257 | if (!SignedMode && OpMinSize <= 8) | ||||||
258 | return LT.first * 3; // pmullw/zext | ||||||
259 | if (OpMinSize <= 15) | ||||||
260 | return LT.first * 5; // pmullw/pmulhw/pshuf | ||||||
261 | if (!SignedMode && OpMinSize <= 16) | ||||||
262 | return LT.first * 5; // pmullw/pmulhw/pshuf | ||||||
263 | } | ||||||
264 | |||||||
265 | if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, | ||||||
266 | LT.second)) { | ||||||
267 | return LT.first * Entry->Cost; | ||||||
268 | } | ||||||
269 | } | ||||||
270 | |||||||
271 | if ((ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || | ||||||
272 | ISD == ISD::UREM) && | ||||||
273 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
274 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
275 | Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { | ||||||
276 | if (ISD == ISD::SDIV || ISD == ISD::SREM) { | ||||||
277 | // On X86, vector signed division by constants power-of-two are | ||||||
278 | // normally expanded to the sequence SRA + SRL + ADD + SRA. | ||||||
279 | // The OperandValue properties may not be the same as that of the previous | ||||||
280 | // operation; conservatively assume OP_None. | ||||||
281 | InstructionCost Cost = | ||||||
282 | 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info, | ||||||
283 | Op2Info, TargetTransformInfo::OP_None, | ||||||
284 | TargetTransformInfo::OP_None); | ||||||
285 | Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, | ||||||
286 | Op2Info, | ||||||
287 | TargetTransformInfo::OP_None, | ||||||
288 | TargetTransformInfo::OP_None); | ||||||
289 | Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info, | ||||||
290 | Op2Info, | ||||||
291 | TargetTransformInfo::OP_None, | ||||||
292 | TargetTransformInfo::OP_None); | ||||||
293 | |||||||
294 | if (ISD == ISD::SREM) { | ||||||
295 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) | ||||||
296 | Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info, | ||||||
297 | Op2Info); | ||||||
298 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info, | ||||||
299 | Op2Info); | ||||||
300 | } | ||||||
301 | |||||||
302 | return Cost; | ||||||
303 | } | ||||||
304 | |||||||
305 | // Vector unsigned division/remainder will be simplified to shifts/masks. | ||||||
306 | if (ISD == ISD::UDIV) | ||||||
307 | return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, | ||||||
308 | Op1Info, Op2Info, | ||||||
309 | TargetTransformInfo::OP_None, | ||||||
310 | TargetTransformInfo::OP_None); | ||||||
311 | |||||||
312 | else // UREM | ||||||
313 | return getArithmeticInstrCost(Instruction::And, Ty, CostKind, | ||||||
314 | Op1Info, Op2Info, | ||||||
315 | TargetTransformInfo::OP_None, | ||||||
316 | TargetTransformInfo::OP_None); | ||||||
317 | } | ||||||
318 | |||||||
319 | static const CostTblEntry AVX512BWUniformConstCostTable[] = { | ||||||
320 | { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand. | ||||||
321 | { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand. | ||||||
322 | { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb. | ||||||
323 | }; | ||||||
324 | |||||||
325 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | ||||||
326 | ST->hasBWI()) { | ||||||
327 | if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD, | ||||||
328 | LT.second)) | ||||||
329 | return LT.first * Entry->Cost; | ||||||
330 | } | ||||||
331 | |||||||
332 | static const CostTblEntry AVX512UniformConstCostTable[] = { | ||||||
333 | { ISD::SRA, MVT::v2i64, 1 }, | ||||||
334 | { ISD::SRA, MVT::v4i64, 1 }, | ||||||
335 | { ISD::SRA, MVT::v8i64, 1 }, | ||||||
336 | |||||||
337 | { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand. | ||||||
338 | { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand. | ||||||
339 | { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb. | ||||||
340 | |||||||
341 | { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence | ||||||
342 | { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence | ||||||
343 | { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence | ||||||
344 | { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence | ||||||
345 | }; | ||||||
346 | |||||||
347 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | ||||||
348 | ST->hasAVX512()) { | ||||||
349 | if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD, | ||||||
350 | LT.second)) | ||||||
351 | return LT.first * Entry->Cost; | ||||||
352 | } | ||||||
353 | |||||||
354 | static const CostTblEntry AVX2UniformConstCostTable[] = { | ||||||
355 | { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand. | ||||||
356 | { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand. | ||||||
357 | { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. | ||||||
358 | |||||||
359 | { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. | ||||||
360 | |||||||
361 | { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence | ||||||
362 | { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence | ||||||
363 | { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence | ||||||
364 | { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence | ||||||
365 | }; | ||||||
366 | |||||||
367 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | ||||||
368 | ST->hasAVX2()) { | ||||||
369 | if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, | ||||||
370 | LT.second)) | ||||||
371 | return LT.first * Entry->Cost; | ||||||
372 | } | ||||||
373 | |||||||
374 | static const CostTblEntry SSE2UniformConstCostTable[] = { | ||||||
375 | { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. | ||||||
376 | { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. | ||||||
377 | { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. | ||||||
378 | |||||||
379 | { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. | ||||||
380 | { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. | ||||||
381 | { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. | ||||||
382 | |||||||
383 | { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split. | ||||||
384 | { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split. | ||||||
385 | { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence | ||||||
386 | { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence | ||||||
387 | { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split. | ||||||
388 | { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split. | ||||||
389 | { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence | ||||||
390 | { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence | ||||||
391 | }; | ||||||
392 | |||||||
393 | // XOP has faster vXi8 shifts. | ||||||
394 | if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && | ||||||
395 | ST->hasSSE2() && !ST->hasXOP()) { | ||||||
396 | if (const auto *Entry = | ||||||
397 | CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) | ||||||
398 | return LT.first * Entry->Cost; | ||||||
399 | } | ||||||
400 | |||||||
401 | static const CostTblEntry AVX512BWConstCostTable[] = { | ||||||
402 | { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
403 | { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
404 | { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
405 | { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
406 | { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence | ||||||
407 | { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence | ||||||
408 | { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence | ||||||
409 | { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence | ||||||
410 | }; | ||||||
411 | |||||||
412 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
413 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
414 | ST->hasBWI()) { | ||||||
415 | if (const auto *Entry = | ||||||
416 | CostTableLookup(AVX512BWConstCostTable, ISD, LT.second)) | ||||||
417 | return LT.first * Entry->Cost; | ||||||
418 | } | ||||||
419 | |||||||
420 | static const CostTblEntry AVX512ConstCostTable[] = { | ||||||
421 | { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence | ||||||
422 | { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence | ||||||
423 | { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence | ||||||
424 | { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence | ||||||
425 | { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence | ||||||
426 | { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence | ||||||
427 | { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence | ||||||
428 | { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence | ||||||
429 | { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence | ||||||
430 | { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence | ||||||
431 | { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence | ||||||
432 | { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence | ||||||
433 | }; | ||||||
434 | |||||||
435 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
436 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
437 | ST->hasAVX512()) { | ||||||
438 | if (const auto *Entry = | ||||||
439 | CostTableLookup(AVX512ConstCostTable, ISD, LT.second)) | ||||||
440 | return LT.first * Entry->Cost; | ||||||
441 | } | ||||||
442 | |||||||
443 | static const CostTblEntry AVX2ConstCostTable[] = { | ||||||
444 | { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
445 | { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
446 | { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
447 | { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
448 | { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence | ||||||
449 | { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence | ||||||
450 | { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence | ||||||
451 | { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence | ||||||
452 | { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence | ||||||
453 | { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence | ||||||
454 | { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence | ||||||
455 | { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence | ||||||
456 | }; | ||||||
457 | |||||||
458 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
459 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
460 | ST->hasAVX2()) { | ||||||
461 | if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second)) | ||||||
462 | return LT.first * Entry->Cost; | ||||||
463 | } | ||||||
464 | |||||||
465 | static const CostTblEntry SSE2ConstCostTable[] = { | ||||||
466 | { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split. | ||||||
467 | { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split. | ||||||
468 | { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
469 | { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
470 | { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split. | ||||||
471 | { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split. | ||||||
472 | { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence | ||||||
473 | { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence | ||||||
474 | { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. | ||||||
475 | { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split. | ||||||
476 | { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence | ||||||
477 | { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence | ||||||
478 | { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. | ||||||
479 | { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split. | ||||||
480 | { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence | ||||||
481 | { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence | ||||||
482 | { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. | ||||||
483 | { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split. | ||||||
484 | { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence | ||||||
485 | { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence | ||||||
486 | { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. | ||||||
487 | { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split. | ||||||
488 | { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence | ||||||
489 | { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence | ||||||
490 | }; | ||||||
491 | |||||||
492 | if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
493 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && | ||||||
494 | ST->hasSSE2()) { | ||||||
495 | // pmuldq sequence. | ||||||
496 | if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) | ||||||
497 | return LT.first * 32; | ||||||
498 | if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX()) | ||||||
499 | return LT.first * 38; | ||||||
500 | if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) | ||||||
501 | return LT.first * 15; | ||||||
502 | if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41()) | ||||||
503 | return LT.first * 20; | ||||||
504 | |||||||
505 | if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second)) | ||||||
506 | return LT.first * Entry->Cost; | ||||||
507 | } | ||||||
508 | |||||||
509 | static const CostTblEntry AVX512BWShiftCostTable[] = { | ||||||
510 | { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence. | ||||||
511 | { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence. | ||||||
512 | { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence. | ||||||
513 | { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence. | ||||||
514 | { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence. | ||||||
515 | { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence. | ||||||
516 | { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence. | ||||||
517 | { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence. | ||||||
518 | { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence. | ||||||
519 | |||||||
520 | { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw | ||||||
521 | { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw | ||||||
522 | { ISD::SRA, MVT::v8i16, 1 }, // vpsravw | ||||||
523 | { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw | ||||||
524 | { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw | ||||||
525 | { ISD::SRA, MVT::v16i16, 1 }, // vpsravw | ||||||
526 | { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw | ||||||
527 | { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw | ||||||
528 | { ISD::SRA, MVT::v32i16, 1 }, // vpsravw | ||||||
529 | }; | ||||||
530 | |||||||
531 | if (ST->hasBWI()) | ||||||
532 | if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second)) | ||||||
533 | return LT.first * Entry->Cost; | ||||||
534 | |||||||
535 | static const CostTblEntry AVX2UniformCostTable[] = { | ||||||
536 | // Uniform splats are cheaper for the following instructions. | ||||||
537 | { ISD::SHL, MVT::v16i16, 1 }, // psllw. | ||||||
538 | { ISD::SRL, MVT::v16i16, 1 }, // psrlw. | ||||||
539 | { ISD::SRA, MVT::v16i16, 1 }, // psraw. | ||||||
540 | { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw. | ||||||
541 | { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw. | ||||||
542 | { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw. | ||||||
543 | |||||||
544 | { ISD::SHL, MVT::v8i32, 1 }, // pslld | ||||||
545 | { ISD::SRL, MVT::v8i32, 1 }, // psrld | ||||||
546 | { ISD::SRA, MVT::v8i32, 1 }, // psrad | ||||||
547 | { ISD::SHL, MVT::v4i64, 1 }, // psllq | ||||||
548 | { ISD::SRL, MVT::v4i64, 1 }, // psrlq | ||||||
549 | }; | ||||||
550 | |||||||
551 | if (ST->hasAVX2() && | ||||||
552 | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || | ||||||
553 | (Op2Info == TargetTransformInfo::OK_UniformValue))) { | ||||||
554 | if (const auto *Entry = | ||||||
555 | CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) | ||||||
556 | return LT.first * Entry->Cost; | ||||||
557 | } | ||||||
558 | |||||||
559 | static const CostTblEntry SSE2UniformCostTable[] = { | ||||||
560 | // Uniform splats are cheaper for the following instructions. | ||||||
561 | { ISD::SHL, MVT::v8i16, 1 }, // psllw. | ||||||
562 | { ISD::SHL, MVT::v4i32, 1 }, // pslld | ||||||
563 | { ISD::SHL, MVT::v2i64, 1 }, // psllq. | ||||||
564 | |||||||
565 | { ISD::SRL, MVT::v8i16, 1 }, // psrlw. | ||||||
566 | { ISD::SRL, MVT::v4i32, 1 }, // psrld. | ||||||
567 | { ISD::SRL, MVT::v2i64, 1 }, // psrlq. | ||||||
568 | |||||||
569 | { ISD::SRA, MVT::v8i16, 1 }, // psraw. | ||||||
570 | { ISD::SRA, MVT::v4i32, 1 }, // psrad. | ||||||
571 | }; | ||||||
572 | |||||||
573 | if (ST->hasSSE2() && | ||||||
574 | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || | ||||||
575 | (Op2Info == TargetTransformInfo::OK_UniformValue))) { | ||||||
576 | if (const auto *Entry = | ||||||
577 | CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) | ||||||
578 | return LT.first * Entry->Cost; | ||||||
579 | } | ||||||
580 | |||||||
581 | static const CostTblEntry AVX512DQCostTable[] = { | ||||||
582 | { ISD::MUL, MVT::v2i64, 2 }, // pmullq | ||||||
583 | { ISD::MUL, MVT::v4i64, 2 }, // pmullq | ||||||
584 | { ISD::MUL, MVT::v8i64, 2 } // pmullq | ||||||
585 | }; | ||||||
586 | |||||||
587 | // Look for AVX512DQ lowering tricks for custom cases. | ||||||
588 | if (ST->hasDQI()) | ||||||
589 | if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) | ||||||
590 | return LT.first * Entry->Cost; | ||||||
591 | |||||||
592 | static const CostTblEntry AVX512BWCostTable[] = { | ||||||
593 | { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. | ||||||
594 | { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. | ||||||
595 | { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. | ||||||
596 | }; | ||||||
597 | |||||||
598 | // Look for AVX512BW lowering tricks for custom cases. | ||||||
599 | if (ST->hasBWI()) | ||||||
600 | if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) | ||||||
601 | return LT.first * Entry->Cost; | ||||||
602 | |||||||
603 | static const CostTblEntry AVX512CostTable[] = { | ||||||
604 | { ISD::SHL, MVT::v4i32, 1 }, | ||||||
605 | { ISD::SRL, MVT::v4i32, 1 }, | ||||||
606 | { ISD::SRA, MVT::v4i32, 1 }, | ||||||
607 | { ISD::SHL, MVT::v8i32, 1 }, | ||||||
608 | { ISD::SRL, MVT::v8i32, 1 }, | ||||||
609 | { ISD::SRA, MVT::v8i32, 1 }, | ||||||
610 | { ISD::SHL, MVT::v16i32, 1 }, | ||||||
611 | { ISD::SRL, MVT::v16i32, 1 }, | ||||||
612 | { ISD::SRA, MVT::v16i32, 1 }, | ||||||
613 | |||||||
614 | { ISD::SHL, MVT::v2i64, 1 }, | ||||||
615 | { ISD::SRL, MVT::v2i64, 1 }, | ||||||
616 | { ISD::SHL, MVT::v4i64, 1 }, | ||||||
617 | { ISD::SRL, MVT::v4i64, 1 }, | ||||||
618 | { ISD::SHL, MVT::v8i64, 1 }, | ||||||
619 | { ISD::SRL, MVT::v8i64, 1 }, | ||||||
620 | |||||||
621 | { ISD::SRA, MVT::v2i64, 1 }, | ||||||
622 | { ISD::SRA, MVT::v4i64, 1 }, | ||||||
623 | { ISD::SRA, MVT::v8i64, 1 }, | ||||||
624 | |||||||
625 | { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org) | ||||||
626 | { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org) | ||||||
627 | { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org) | ||||||
628 | { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add | ||||||
629 | |||||||
630 | { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | ||||||
631 | { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | ||||||
632 | { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | ||||||
633 | { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ | ||||||
634 | { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/ | ||||||
635 | { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/ | ||||||
636 | { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/ | ||||||
637 | { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/ | ||||||
638 | |||||||
639 | { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | ||||||
640 | { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | ||||||
641 | { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | ||||||
642 | { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/ | ||||||
643 | { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/ | ||||||
644 | { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/ | ||||||
645 | { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/ | ||||||
646 | { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/ | ||||||
647 | }; | ||||||
648 | |||||||
649 | if (ST->hasAVX512()) | ||||||
650 | if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) | ||||||
651 | return LT.first * Entry->Cost; | ||||||
652 | |||||||
653 | static const CostTblEntry AVX2ShiftCostTable[] = { | ||||||
654 | // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare to | ||||||
655 | // customize them to detect the cases where the shift amount is a scalar one. | ||||||
656 | { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org) | ||||||
657 | { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org) | ||||||
658 | { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org) | ||||||
659 | { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org) | ||||||
660 | { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org) | ||||||
661 | { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org) | ||||||
662 | { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org) | ||||||
663 | { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org) | ||||||
664 | { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org) | ||||||
665 | { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org) | ||||||
666 | }; | ||||||
667 | |||||||
668 | if (ST->hasAVX512()) { | ||||||
669 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && | ||||||
670 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
671 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) | ||||||
672 | // On AVX512, a packed v32i16 shift left by a constant build_vector | ||||||
673 | // is lowered into a vector multiply (vpmullw). | ||||||
674 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | ||||||
675 | Op1Info, Op2Info, | ||||||
676 | TargetTransformInfo::OP_None, | ||||||
677 | TargetTransformInfo::OP_None); | ||||||
678 | } | ||||||
679 | |||||||
680 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). | ||||||
681 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { | ||||||
682 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && | ||||||
683 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
684 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) | ||||||
685 | // On AVX2, a packed v16i16 shift left by a constant build_vector | ||||||
686 | // is lowered into a vector multiply (vpmullw). | ||||||
687 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | ||||||
688 | Op1Info, Op2Info, | ||||||
689 | TargetTransformInfo::OP_None, | ||||||
690 | TargetTransformInfo::OP_None); | ||||||
691 | |||||||
692 | if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) | ||||||
693 | return LT.first * Entry->Cost; | ||||||
694 | } | ||||||
695 | |||||||
696 | static const CostTblEntry XOPShiftCostTable[] = { | ||||||
697 | // 128bit shifts take 1cy, but right shifts require negation beforehand. | ||||||
698 | { ISD::SHL, MVT::v16i8, 1 }, | ||||||
699 | { ISD::SRL, MVT::v16i8, 2 }, | ||||||
700 | { ISD::SRA, MVT::v16i8, 2 }, | ||||||
701 | { ISD::SHL, MVT::v8i16, 1 }, | ||||||
702 | { ISD::SRL, MVT::v8i16, 2 }, | ||||||
703 | { ISD::SRA, MVT::v8i16, 2 }, | ||||||
704 | { ISD::SHL, MVT::v4i32, 1 }, | ||||||
705 | { ISD::SRL, MVT::v4i32, 2 }, | ||||||
706 | { ISD::SRA, MVT::v4i32, 2 }, | ||||||
707 | { ISD::SHL, MVT::v2i64, 1 }, | ||||||
708 | { ISD::SRL, MVT::v2i64, 2 }, | ||||||
709 | { ISD::SRA, MVT::v2i64, 2 }, | ||||||
710 | // 256bit shifts require splitting if AVX2 didn't catch them above. | ||||||
711 | { ISD::SHL, MVT::v32i8, 2+2 }, | ||||||
712 | { ISD::SRL, MVT::v32i8, 4+2 }, | ||||||
713 | { ISD::SRA, MVT::v32i8, 4+2 }, | ||||||
714 | { ISD::SHL, MVT::v16i16, 2+2 }, | ||||||
715 | { ISD::SRL, MVT::v16i16, 4+2 }, | ||||||
716 | { ISD::SRA, MVT::v16i16, 4+2 }, | ||||||
717 | { ISD::SHL, MVT::v8i32, 2+2 }, | ||||||
718 | { ISD::SRL, MVT::v8i32, 4+2 }, | ||||||
719 | { ISD::SRA, MVT::v8i32, 4+2 }, | ||||||
720 | { ISD::SHL, MVT::v4i64, 2+2 }, | ||||||
721 | { ISD::SRL, MVT::v4i64, 4+2 }, | ||||||
722 | { ISD::SRA, MVT::v4i64, 4+2 }, | ||||||
723 | }; | ||||||
724 | |||||||
725 | // Look for XOP lowering tricks. | ||||||
726 | if (ST->hasXOP()) { | ||||||
727 | // If the right shift is constant then we'll fold the negation so | ||||||
728 | // it's as cheap as a left shift. | ||||||
729 | int ShiftISD = ISD; | ||||||
730 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && | ||||||
731 | (Op2Info == TargetTransformInfo::OK_UniformConstantValue || | ||||||
732 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) | ||||||
733 | ShiftISD = ISD::SHL; | ||||||
734 | if (const auto *Entry = | ||||||
735 | CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) | ||||||
736 | return LT.first * Entry->Cost; | ||||||
737 | } | ||||||
738 | |||||||
739 | static const CostTblEntry SSE2UniformShiftCostTable[] = { | ||||||
740 | // Uniform splats are cheaper for the following instructions. | ||||||
741 | { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. | ||||||
742 | { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. | ||||||
743 | { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. | ||||||
744 | |||||||
745 | { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. | ||||||
746 | { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. | ||||||
747 | { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. | ||||||
748 | |||||||
749 | { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. | ||||||
750 | { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. | ||||||
751 | { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. | ||||||
752 | { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. | ||||||
753 | }; | ||||||
754 | |||||||
755 | if (ST->hasSSE2() && | ||||||
756 | ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || | ||||||
757 | (Op2Info == TargetTransformInfo::OK_UniformValue))) { | ||||||
758 | |||||||
759 | // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. | ||||||
760 | if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) | ||||||
761 | return LT.first * 4; // 2*psrad + shuffle. | ||||||
762 | |||||||
763 | if (const auto *Entry = | ||||||
764 | CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) | ||||||
765 | return LT.first * Entry->Cost; | ||||||
766 | } | ||||||
767 | |||||||
768 | if (ISD == ISD::SHL && | ||||||
769 | Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { | ||||||
770 | MVT VT = LT.second; | ||||||
771 | // Vector shift left by non uniform constant can be lowered | ||||||
772 | // into vector multiply. | ||||||
773 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || | ||||||
774 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) | ||||||
775 | ISD = ISD::MUL; | ||||||
776 | } | ||||||
777 | |||||||
778 | static const CostTblEntry AVX2CostTable[] = { | ||||||
779 | { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence. | ||||||
780 | { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence. | ||||||
781 | { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence. | ||||||
782 | { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence. | ||||||
783 | { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence. | ||||||
784 | { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence. | ||||||
785 | |||||||
786 | { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence. | ||||||
787 | { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence. | ||||||
788 | { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence. | ||||||
789 | { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence. | ||||||
790 | { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence. | ||||||
791 | { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence. | ||||||
792 | |||||||
793 | { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence. | ||||||
794 | { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence. | ||||||
795 | { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence. | ||||||
796 | { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence. | ||||||
797 | { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence. | ||||||
798 | { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence. | ||||||
799 | { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence. | ||||||
800 | { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence. | ||||||
801 | |||||||
802 | { ISD::SUB, MVT::v32i8, 1 }, // psubb | ||||||
803 | { ISD::ADD, MVT::v32i8, 1 }, // paddb | ||||||
804 | { ISD::SUB, MVT::v16i16, 1 }, // psubw | ||||||
805 | { ISD::ADD, MVT::v16i16, 1 }, // paddw | ||||||
806 | { ISD::SUB, MVT::v8i32, 1 }, // psubd | ||||||
807 | { ISD::ADD, MVT::v8i32, 1 }, // paddd | ||||||
808 | { ISD::SUB, MVT::v4i64, 1 }, // psubq | ||||||
809 | { ISD::ADD, MVT::v4i64, 1 }, // paddq | ||||||
810 | |||||||
811 | { ISD::MUL, MVT::v16i16, 1 }, // pmullw | ||||||
812 | { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org) | ||||||
813 | { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add | ||||||
814 | |||||||
815 | { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
816 | { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | ||||||
817 | { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
818 | { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | ||||||
819 | { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
820 | { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | ||||||
821 | { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
822 | { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
823 | { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/ | ||||||
824 | { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/ | ||||||
825 | |||||||
826 | { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/ | ||||||
827 | { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ | ||||||
828 | { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ | ||||||
829 | { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/ | ||||||
830 | { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ | ||||||
831 | { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ | ||||||
832 | }; | ||||||
833 | |||||||
834 | // Look for AVX2 lowering tricks for custom cases. | ||||||
835 | if (ST->hasAVX2()) | ||||||
836 | if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) | ||||||
837 | return LT.first * Entry->Cost; | ||||||
838 | |||||||
839 | static const CostTblEntry AVX1CostTable[] = { | ||||||
840 | // We don't have to scalarize unsupported ops. We can issue two half-sized | ||||||
841 | // operations and we only need to extract the upper YMM half. | ||||||
842 | // Two ops + 1 extract + 1 insert = 4. | ||||||
843 | { ISD::MUL, MVT::v16i16, 4 }, | ||||||
844 | { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/ | ||||||
845 | { ISD::MUL, MVT::v4i64, 12 }, | ||||||
846 | |||||||
847 | { ISD::SUB, MVT::v32i8, 4 }, | ||||||
848 | { ISD::ADD, MVT::v32i8, 4 }, | ||||||
849 | { ISD::SUB, MVT::v16i16, 4 }, | ||||||
850 | { ISD::ADD, MVT::v16i16, 4 }, | ||||||
851 | { ISD::SUB, MVT::v8i32, 4 }, | ||||||
852 | { ISD::ADD, MVT::v8i32, 4 }, | ||||||
853 | { ISD::SUB, MVT::v4i64, 4 }, | ||||||
854 | { ISD::ADD, MVT::v4i64, 4 }, | ||||||
855 | |||||||
856 | { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split. | ||||||
857 | { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence. | ||||||
858 | { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split. | ||||||
859 | { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld | ||||||
860 | { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split | ||||||
861 | { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend. | ||||||
862 | { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split. | ||||||
863 | |||||||
864 | { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split. | ||||||
865 | { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split. | ||||||
866 | { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend. | ||||||
867 | { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split. | ||||||
868 | { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend. | ||||||
869 | { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split. | ||||||
870 | |||||||
871 | { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split. | ||||||
872 | { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split. | ||||||
873 | { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend. | ||||||
874 | { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split. | ||||||
875 | { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend. | ||||||
876 | { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split. | ||||||
877 | |||||||
878 | { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/ | ||||||
879 | { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/ | ||||||
880 | |||||||
881 | { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/ | ||||||
882 | { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/ | ||||||
883 | { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/ | ||||||
884 | |||||||
885 | { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/ | ||||||
886 | { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ | ||||||
887 | { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ | ||||||
888 | { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/ | ||||||
889 | { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/ | ||||||
890 | { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/ | ||||||
891 | }; | ||||||
892 | |||||||
893 | if (ST->hasAVX()) | ||||||
894 | if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) | ||||||
895 | return LT.first * Entry->Cost; | ||||||
896 | |||||||
897 | static const CostTblEntry SSE42CostTable[] = { | ||||||
898 | { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
899 | { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
900 | { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
901 | { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
902 | |||||||
903 | { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
904 | { ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/ | ||||||
905 | { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
906 | { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
907 | |||||||
908 | { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
909 | { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
910 | { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/ | ||||||
911 | { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/ | ||||||
912 | |||||||
913 | { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/ | ||||||
914 | { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/ | ||||||
915 | { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/ | ||||||
916 | { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/ | ||||||
917 | |||||||
918 | { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add | ||||||
919 | }; | ||||||
920 | |||||||
921 | if (ST->hasSSE42()) | ||||||
922 | if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) | ||||||
923 | return LT.first * Entry->Cost; | ||||||
924 | |||||||
925 | static const CostTblEntry SSE41CostTable[] = { | ||||||
926 | { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence. | ||||||
927 | { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence. | ||||||
928 | { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld | ||||||
929 | |||||||
930 | { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence. | ||||||
931 | { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence. | ||||||
932 | { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. | ||||||
933 | |||||||
934 | { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence. | ||||||
935 | { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence. | ||||||
936 | |||||||
937 | { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org) | ||||||
938 | }; | ||||||
939 | |||||||
940 | if (ST->hasSSE41()) | ||||||
941 | if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) | ||||||
942 | return LT.first * Entry->Cost; | ||||||
943 | |||||||
944 | static const CostTblEntry SSE2CostTable[] = { | ||||||
945 | // We don't correctly identify costs of casts because they are marked as | ||||||
946 | // custom. | ||||||
947 | { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence. | ||||||
948 | { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence. | ||||||
949 | { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq. | ||||||
950 | { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. | ||||||
951 | |||||||
952 | { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence. | ||||||
953 | { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence. | ||||||
954 | { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend. | ||||||
955 | { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. | ||||||
956 | |||||||
957 | { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence. | ||||||
958 | { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence. | ||||||
959 | { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. | ||||||
960 | { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence. | ||||||
961 | |||||||
962 | { ISD::MUL, MVT::v8i16, 1 }, // pmullw | ||||||
963 | { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle | ||||||
964 | { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add | ||||||
965 | |||||||
966 | { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ | ||||||
967 | { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ | ||||||
968 | { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ | ||||||
969 | { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ | ||||||
970 | |||||||
971 | { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/ | ||||||
972 | { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/ | ||||||
973 | { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/ | ||||||
974 | { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/ | ||||||
975 | |||||||
976 | { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/ | ||||||
977 | { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/ | ||||||
978 | |||||||
979 | { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/ | ||||||
980 | { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/ | ||||||
981 | }; | ||||||
982 | |||||||
983 | if (ST->hasSSE2()) | ||||||
984 | if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) | ||||||
985 | return LT.first * Entry->Cost; | ||||||
986 | |||||||
987 | static const CostTblEntry SSE1CostTable[] = { | ||||||
988 | { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/ | ||||||
989 | { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/ | ||||||
990 | |||||||
991 | { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/ | ||||||
992 | { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/ | ||||||
993 | |||||||
994 | { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/ | ||||||
995 | { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/ | ||||||
996 | |||||||
997 | { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/ | ||||||
998 | { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/ | ||||||
999 | }; | ||||||
1000 | |||||||
1001 | if (ST->hasSSE1()) | ||||||
1002 | if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) | ||||||
1003 | return LT.first * Entry->Cost; | ||||||
1004 | |||||||
1005 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | ||||||
1006 | { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/ | ||||||
1007 | { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/ | ||||||
1008 | }; | ||||||
1009 | |||||||
1010 | if (ST->is64Bit()) | ||||||
1011 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second)) | ||||||
1012 | return LT.first * Entry->Cost; | ||||||
1013 | |||||||
1014 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | ||||||
1015 | { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1016 | { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1017 | { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1018 | |||||||
1019 | { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1020 | { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1021 | { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/ | ||||||
1022 | }; | ||||||
1023 | |||||||
1024 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second)) | ||||||
1025 | return LT.first * Entry->Cost; | ||||||
1026 | |||||||
1027 | // It is not a good idea to vectorize division. We have to scalarize it and | ||||||
1028 | // in the process we will often end up having to spill regular | ||||||
1029 | // registers. The overhead of division is going to dominate most kernels | ||||||
1030 | // anyways so try hard to prevent vectorization of division - it is | ||||||
1031 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able | ||||||
1032 | // to hide "20 cycles" for each lane. | ||||||
1033 | if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM || | ||||||
1034 | ISD == ISD::UDIV || ISD == ISD::UREM)) { | ||||||
1035 | InstructionCost ScalarCost = getArithmeticInstrCost( | ||||||
1036 | Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info, | ||||||
1037 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | ||||||
1038 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; | ||||||
1039 | } | ||||||
1040 | |||||||
1041 | // Fallback to the default implementation. | ||||||
1042 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info); | ||||||
1043 | } | ||||||
1044 | |||||||
1045 | InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | ||||||
1046 | VectorType *BaseTp, | ||||||
1047 | ArrayRef<int> Mask, int Index, | ||||||
1048 | VectorType *SubTp) { | ||||||
1049 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. | ||||||
1050 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. | ||||||
1051 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp); | ||||||
1052 | |||||||
1053 | Kind = improveShuffleKindFromMask(Kind, Mask); | ||||||
1054 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. | ||||||
1055 | if (Kind == TTI::SK_Transpose) | ||||||
1056 | Kind = TTI::SK_PermuteTwoSrc; | ||||||
1057 | |||||||
1058 | // For Broadcasts we are splatting the first element from the first input | ||||||
1059 | // register, so only need to reference that input and all the output | ||||||
1060 | // registers are the same. | ||||||
1061 | if (Kind == TTI::SK_Broadcast) | ||||||
1062 | LT.first = 1; | ||||||
1063 | |||||||
1064 | // Subvector extractions are free if they start at the beginning of a | ||||||
1065 | // vector and cheap if the subvectors are aligned. | ||||||
1066 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { | ||||||
1067 | int NumElts = LT.second.getVectorNumElements(); | ||||||
1068 | if ((Index % NumElts) == 0) | ||||||
1069 | return 0; | ||||||
1070 | std::pair<InstructionCost, MVT> SubLT = | ||||||
1071 | TLI->getTypeLegalizationCost(DL, SubTp); | ||||||
1072 | if (SubLT.second.isVector()) { | ||||||
1073 | int NumSubElts = SubLT.second.getVectorNumElements(); | ||||||
1074 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | ||||||
1075 | return SubLT.first; | ||||||
1076 | // Handle some cases for widening legalization. For now we only handle | ||||||
1077 | // cases where the original subvector was naturally aligned and evenly | ||||||
1078 | // fit in its legalized subvector type. | ||||||
1079 | // FIXME: Remove some of the alignment restrictions. | ||||||
1080 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit | ||||||
1081 | // vectors. | ||||||
1082 | int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); | ||||||
1083 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && | ||||||
1084 | (NumSubElts % OrigSubElts) == 0 && | ||||||
1085 | LT.second.getVectorElementType() == | ||||||
1086 | SubLT.second.getVectorElementType() && | ||||||
1087 | LT.second.getVectorElementType().getSizeInBits() == | ||||||
1088 | BaseTp->getElementType()->getPrimitiveSizeInBits()) { | ||||||
1089 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&((void)0) | ||||||
1090 | "Unexpected number of elements!")((void)0); | ||||||
1091 | auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), | ||||||
1092 | LT.second.getVectorNumElements()); | ||||||
1093 | auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), | ||||||
1094 | SubLT.second.getVectorNumElements()); | ||||||
1095 | int ExtractIndex = alignDown((Index % NumElts), NumSubElts); | ||||||
1096 | InstructionCost ExtractCost = getShuffleCost( | ||||||
1097 | TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy); | ||||||
1098 | |||||||
1099 | // If the original size is 32-bits or more, we can use pshufd. Otherwise | ||||||
1100 | // if we have SSSE3 we can use pshufb. | ||||||
1101 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) | ||||||
1102 | return ExtractCost + 1; // pshufd or pshufb | ||||||
1103 | |||||||
1104 | assert(SubTp->getPrimitiveSizeInBits() == 16 &&((void)0) | ||||||
1105 | "Unexpected vector size")((void)0); | ||||||
1106 | |||||||
1107 | return ExtractCost + 2; // worst case pshufhw + pshufd | ||||||
1108 | } | ||||||
1109 | } | ||||||
1110 | } | ||||||
1111 | |||||||
1112 | // Subvector insertions are cheap if the subvectors are aligned. | ||||||
1113 | // Note that in general, the insertion starting at the beginning of a vector | ||||||
1114 | // isn't free, because we need to preserve the rest of the wide vector. | ||||||
1115 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { | ||||||
1116 | int NumElts = LT.second.getVectorNumElements(); | ||||||
1117 | std::pair<InstructionCost, MVT> SubLT = | ||||||
1118 | TLI->getTypeLegalizationCost(DL, SubTp); | ||||||
1119 | if (SubLT.second.isVector()) { | ||||||
1120 | int NumSubElts = SubLT.second.getVectorNumElements(); | ||||||
1121 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | ||||||
1122 | return SubLT.first; | ||||||
1123 | } | ||||||
1124 | } | ||||||
1125 | |||||||
1126 | // Handle some common (illegal) sub-vector types as they are often very cheap | ||||||
1127 | // to shuffle even on targets without PSHUFB. | ||||||
1128 | EVT VT = TLI->getValueType(DL, BaseTp); | ||||||
1129 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && | ||||||
1130 | !ST->hasSSSE3()) { | ||||||
1131 | static const CostTblEntry SSE2SubVectorShuffleTbl[] = { | ||||||
1132 | {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw | ||||||
1133 | {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw | ||||||
1134 | {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw | ||||||
1135 | {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw | ||||||
1136 | {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck | ||||||
1137 | |||||||
1138 | {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw | ||||||
1139 | {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw | ||||||
1140 | {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus | ||||||
1141 | {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck | ||||||
1142 | |||||||
1143 | {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw | ||||||
1144 | {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw | ||||||
1145 | {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw | ||||||
1146 | {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw | ||||||
1147 | {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck | ||||||
1148 | |||||||
1149 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw | ||||||
1150 | {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw | ||||||
1151 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw | ||||||
1152 | {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw | ||||||
1153 | {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck | ||||||
1154 | }; | ||||||
1155 | |||||||
1156 | if (ST->hasSSE2()) | ||||||
1157 | if (const auto *Entry = | ||||||
1158 | CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) | ||||||
1159 | return Entry->Cost; | ||||||
1160 | } | ||||||
1161 | |||||||
1162 | // We are going to permute multiple sources and the result will be in multiple | ||||||
1163 | // destinations. We provide an accurate cost only for splits where the element | ||||||
1164 | // type remains the same. | ||||||
1165 | if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { | ||||||
1166 | MVT LegalVT = LT.second; | ||||||
1167 | if (LegalVT.isVector() && | ||||||
1168 | LegalVT.getVectorElementType().getSizeInBits() == | ||||||
1169 | BaseTp->getElementType()->getPrimitiveSizeInBits() && | ||||||
1170 | LegalVT.getVectorNumElements() < | ||||||
1171 | cast<FixedVectorType>(BaseTp)->getNumElements()) { | ||||||
1172 | |||||||
1173 | unsigned VecTySize = DL.getTypeStoreSize(BaseTp); | ||||||
1174 | unsigned LegalVTSize = LegalVT.getStoreSize(); | ||||||
1175 | // Number of source vectors after legalization: | ||||||
1176 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; | ||||||
1177 | // Number of destination vectors after legalization: | ||||||
1178 | InstructionCost NumOfDests = LT.first; | ||||||
1179 | |||||||
1180 | auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), | ||||||
1181 | LegalVT.getVectorNumElements()); | ||||||
1182 | |||||||
1183 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; | ||||||
1184 | return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, | ||||||
1185 | None, 0, nullptr); | ||||||
1186 | } | ||||||
1187 | |||||||
1188 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); | ||||||
1189 | } | ||||||
1190 | |||||||
1191 | // For 2-input shuffles, we must account for splitting the 2 inputs into many. | ||||||
1192 | if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { | ||||||
1193 | // We assume that source and destination have the same vector type. | ||||||
1194 | InstructionCost NumOfDests = LT.first; | ||||||
1195 | InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; | ||||||
1196 | LT.first = NumOfDests * NumOfShufflesPerDest; | ||||||
1197 | } | ||||||
1198 | |||||||
1199 | static const CostTblEntry AVX512VBMIShuffleTbl[] = { | ||||||
1200 | {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb | ||||||
1201 | {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb | ||||||
1202 | |||||||
1203 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb | ||||||
1204 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb | ||||||
1205 | |||||||
1206 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b | ||||||
1207 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b | ||||||
1208 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b | ||||||
1209 | }; | ||||||
1210 | |||||||
1211 | if (ST->hasVBMI()) | ||||||
1212 | if (const auto *Entry = | ||||||
1213 | CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) | ||||||
1214 | return LT.first * Entry->Cost; | ||||||
1215 | |||||||
1216 | static const CostTblEntry AVX512BWShuffleTbl[] = { | ||||||
1217 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw | ||||||
1218 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb | ||||||
1219 | |||||||
1220 | {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw | ||||||
1221 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw | ||||||
1222 | {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 | ||||||
1223 | |||||||
1224 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw | ||||||
1225 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw | ||||||
1226 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 | ||||||
1227 | |||||||
1228 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w | ||||||
1229 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w | ||||||
1230 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w | ||||||
1231 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 | ||||||
1232 | |||||||
1233 | {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw | ||||||
1234 | {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb | ||||||
1235 | }; | ||||||
1236 | |||||||
1237 | if (ST->hasBWI()) | ||||||
1238 | if (const auto *Entry = | ||||||
1239 | CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) | ||||||
1240 | return LT.first * Entry->Cost; | ||||||
1241 | |||||||
1242 | static const CostTblEntry AVX512ShuffleTbl[] = { | ||||||
1243 | {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd | ||||||
1244 | {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps | ||||||
1245 | {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq | ||||||
1246 | {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd | ||||||
1247 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw | ||||||
1248 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb | ||||||
1249 | |||||||
1250 | {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd | ||||||
1251 | {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps | ||||||
1252 | {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq | ||||||
1253 | {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd | ||||||
1254 | {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca | ||||||
1255 | {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca | ||||||
1256 | |||||||
1257 | {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd | ||||||
1258 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | ||||||
1259 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd | ||||||
1260 | {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps | ||||||
1261 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | ||||||
1262 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps | ||||||
1263 | {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq | ||||||
1264 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | ||||||
1265 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq | ||||||
1266 | {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd | ||||||
1267 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | ||||||
1268 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd | ||||||
1269 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | ||||||
1270 | |||||||
1271 | {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd | ||||||
1272 | {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps | ||||||
1273 | {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q | ||||||
1274 | {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d | ||||||
1275 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd | ||||||
1276 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps | ||||||
1277 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q | ||||||
1278 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d | ||||||
1279 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd | ||||||
1280 | {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps | ||||||
1281 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q | ||||||
1282 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d | ||||||
1283 | |||||||
1284 | // FIXME: This just applies the type legalization cost rules above | ||||||
1285 | // assuming these completely split. | ||||||
1286 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, | ||||||
1287 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, | ||||||
1288 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, | ||||||
1289 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, | ||||||
1290 | |||||||
1291 | {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq | ||||||
1292 | {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq | ||||||
1293 | {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd | ||||||
1294 | {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps | ||||||
1295 | {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq | ||||||
1296 | {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd | ||||||
1297 | }; | ||||||
1298 | |||||||
1299 | if (ST->hasAVX512()) | ||||||
1300 | if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) | ||||||
1301 | return LT.first * Entry->Cost; | ||||||
1302 | |||||||
1303 | static const CostTblEntry AVX2ShuffleTbl[] = { | ||||||
1304 | {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd | ||||||
1305 | {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps | ||||||
1306 | {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq | ||||||
1307 | {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd | ||||||
1308 | {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw | ||||||
1309 | {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb | ||||||
1310 | |||||||
1311 | {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd | ||||||
1312 | {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps | ||||||
1313 | {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq | ||||||
1314 | {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd | ||||||
1315 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb | ||||||
1316 | {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb | ||||||
1317 | |||||||
1318 | {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb | ||||||
1319 | {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb | ||||||
1320 | |||||||
1321 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | ||||||
1322 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | ||||||
1323 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | ||||||
1324 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | ||||||
1325 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb | ||||||
1326 | // + vpblendvb | ||||||
1327 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb | ||||||
1328 | // + vpblendvb | ||||||
1329 | |||||||
1330 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd | ||||||
1331 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps | ||||||
1332 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd | ||||||
1333 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd | ||||||
1334 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb | ||||||
1335 | // + vpblendvb | ||||||
1336 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb | ||||||
1337 | // + vpblendvb | ||||||
1338 | }; | ||||||
1339 | |||||||
1340 | if (ST->hasAVX2()) | ||||||
1341 | if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) | ||||||
1342 | return LT.first * Entry->Cost; | ||||||
1343 | |||||||
1344 | static const CostTblEntry XOPShuffleTbl[] = { | ||||||
1345 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd | ||||||
1346 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps | ||||||
1347 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd | ||||||
1348 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps | ||||||
1349 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm | ||||||
1350 | // + vinsertf128 | ||||||
1351 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm | ||||||
1352 | // + vinsertf128 | ||||||
1353 | |||||||
1354 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm | ||||||
1355 | // + vinsertf128 | ||||||
1356 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm | ||||||
1357 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm | ||||||
1358 | // + vinsertf128 | ||||||
1359 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm | ||||||
1360 | }; | ||||||
1361 | |||||||
1362 | if (ST->hasXOP()) | ||||||
1363 | if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) | ||||||
1364 | return LT.first * Entry->Cost; | ||||||
1365 | |||||||
1366 | static const CostTblEntry AVX1ShuffleTbl[] = { | ||||||
1367 | {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | ||||||
1368 | {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps | ||||||
1369 | {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | ||||||
1370 | {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps | ||||||
1371 | {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 | ||||||
1372 | {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 | ||||||
1373 | |||||||
1374 | {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | ||||||
1375 | {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps | ||||||
1376 | {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | ||||||
1377 | {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps | ||||||
1378 | {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb | ||||||
1379 | // + vinsertf128 | ||||||
1380 | {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb | ||||||
1381 | // + vinsertf128 | ||||||
1382 | |||||||
1383 | {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd | ||||||
1384 | {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd | ||||||
1385 | {TTI::SK_Select, MVT::v8i32, 1}, // vblendps | ||||||
1386 | {TTI::SK_Select, MVT::v8f32, 1}, // vblendps | ||||||
1387 | {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor | ||||||
1388 | {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor | ||||||
1389 | |||||||
1390 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd | ||||||
1391 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd | ||||||
1392 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | ||||||
1393 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | ||||||
1394 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb | ||||||
1395 | // + 2*por + vinsertf128 | ||||||
1396 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb | ||||||
1397 | // + 2*por + vinsertf128 | ||||||
1398 | |||||||
1399 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd | ||||||
1400 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd | ||||||
1401 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | ||||||
1402 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | ||||||
1403 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb | ||||||
1404 | // + 4*por + vinsertf128 | ||||||
1405 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb | ||||||
1406 | // + 4*por + vinsertf128 | ||||||
1407 | }; | ||||||
1408 | |||||||
1409 | if (ST->hasAVX()) | ||||||
1410 | if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) | ||||||
1411 | return LT.first * Entry->Cost; | ||||||
1412 | |||||||
1413 | static const CostTblEntry SSE41ShuffleTbl[] = { | ||||||
1414 | {TTI::SK_Select, MVT::v2i64, 1}, // pblendw | ||||||
1415 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | ||||||
1416 | {TTI::SK_Select, MVT::v4i32, 1}, // pblendw | ||||||
1417 | {TTI::SK_Select, MVT::v4f32, 1}, // blendps | ||||||
1418 | {TTI::SK_Select, MVT::v8i16, 1}, // pblendw | ||||||
1419 | {TTI::SK_Select, MVT::v16i8, 1} // pblendvb | ||||||
1420 | }; | ||||||
1421 | |||||||
1422 | if (ST->hasSSE41()) | ||||||
1423 | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) | ||||||
1424 | return LT.first * Entry->Cost; | ||||||
1425 | |||||||
1426 | static const CostTblEntry SSSE3ShuffleTbl[] = { | ||||||
1427 | {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb | ||||||
1428 | {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb | ||||||
1429 | |||||||
1430 | {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb | ||||||
1431 | {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb | ||||||
1432 | |||||||
1433 | {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por | ||||||
1434 | {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por | ||||||
1435 | |||||||
1436 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb | ||||||
1437 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | ||||||
1438 | |||||||
1439 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por | ||||||
1440 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por | ||||||
1441 | }; | ||||||
1442 | |||||||
1443 | if (ST->hasSSSE3()) | ||||||
1444 | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) | ||||||
1445 | return LT.first * Entry->Cost; | ||||||
1446 | |||||||
1447 | static const CostTblEntry SSE2ShuffleTbl[] = { | ||||||
1448 | {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd | ||||||
1449 | {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd | ||||||
1450 | {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd | ||||||
1451 | {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd | ||||||
1452 | {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd | ||||||
1453 | |||||||
1454 | {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd | ||||||
1455 | {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd | ||||||
1456 | {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd | ||||||
1457 | {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd | ||||||
1458 | {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw | ||||||
1459 | // + 2*pshufd + 2*unpck + packus | ||||||
1460 | |||||||
1461 | {TTI::SK_Select, MVT::v2i64, 1}, // movsd | ||||||
1462 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | ||||||
1463 | {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps | ||||||
1464 | {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por | ||||||
1465 | {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por | ||||||
1466 | |||||||
1467 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd | ||||||
1468 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd | ||||||
1469 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd | ||||||
1470 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw | ||||||
1471 | // + pshufd/unpck | ||||||
1472 | { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw | ||||||
1473 | // + 2*pshufd + 2*unpck + 2*packus | ||||||
1474 | |||||||
1475 | { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd | ||||||
1476 | { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd | ||||||
1477 | { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} | ||||||
1478 | { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute | ||||||
1479 | { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute | ||||||
1480 | }; | ||||||
1481 | |||||||
1482 | if (ST->hasSSE2()) | ||||||
1483 | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) | ||||||
1484 | return LT.first * Entry->Cost; | ||||||
1485 | |||||||
1486 | static const CostTblEntry SSE1ShuffleTbl[] = { | ||||||
1487 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps | ||||||
1488 | { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps | ||||||
1489 | { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps | ||||||
1490 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps | ||||||
1491 | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps | ||||||
1492 | }; | ||||||
1493 | |||||||
1494 | if (ST->hasSSE1()) | ||||||
1495 | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) | ||||||
1496 | return LT.first * Entry->Cost; | ||||||
1497 | |||||||
1498 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); | ||||||
1499 | } | ||||||
1500 | |||||||
1501 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | ||||||
1502 | Type *Src, | ||||||
1503 | TTI::CastContextHint CCH, | ||||||
1504 | TTI::TargetCostKind CostKind, | ||||||
1505 | const Instruction *I) { | ||||||
1506 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
1507 | assert(ISD && "Invalid opcode")((void)0); | ||||||
1508 | |||||||
1509 | // TODO: Allow non-throughput costs that aren't binary. | ||||||
1510 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | ||||||
1511 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
1512 | return Cost == 0 ? 0 : 1; | ||||||
1513 | return Cost; | ||||||
1514 | }; | ||||||
1515 | |||||||
1516 | // The cost tables include both specific, custom (non-legal) src/dst type | ||||||
1517 | // conversions and generic, legalized types. We test for customs first, before | ||||||
1518 | // falling back to legalization. | ||||||
1519 | // FIXME: Need a better design of the cost table to handle non-simple types of | ||||||
1520 | // potential massive combinations (elem_num x src_type x dst_type). | ||||||
1521 | static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { | ||||||
1522 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | ||||||
1523 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | ||||||
1524 | |||||||
1525 | // Mask sign extend has an instruction. | ||||||
1526 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | ||||||
1527 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | ||||||
1528 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | ||||||
1529 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | ||||||
1530 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | ||||||
1531 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | ||||||
1532 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | ||||||
1533 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||||
1534 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | ||||||
1535 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, | ||||||
1536 | { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, | ||||||
1537 | |||||||
1538 | // Mask zero extend is a sext + shift. | ||||||
1539 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | ||||||
1540 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | ||||||
1541 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | ||||||
1542 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | ||||||
1543 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | ||||||
1544 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | ||||||
1545 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | ||||||
1546 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | ||||||
1547 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | ||||||
1548 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, | ||||||
1549 | { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, | ||||||
1550 | |||||||
1551 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, | ||||||
1552 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm | ||||||
1553 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm | ||||||
1554 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm | ||||||
1555 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb | ||||||
1556 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm | ||||||
1557 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm | ||||||
1558 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb | ||||||
1559 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm | ||||||
1560 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm | ||||||
1561 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb | ||||||
1562 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm | ||||||
1563 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm | ||||||
1564 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm | ||||||
1565 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, | ||||||
1566 | { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, | ||||||
1567 | }; | ||||||
1568 | |||||||
1569 | static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { | ||||||
1570 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | ||||||
1571 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | ||||||
1572 | |||||||
1573 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | ||||||
1574 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | ||||||
1575 | |||||||
1576 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, | ||||||
1577 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, | ||||||
1578 | |||||||
1579 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, | ||||||
1580 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, | ||||||
1581 | }; | ||||||
1582 | |||||||
1583 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and | ||||||
1584 | // 256-bit wide vectors. | ||||||
1585 | |||||||
1586 | static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { | ||||||
1587 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, | ||||||
1588 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, | ||||||
1589 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, | ||||||
1590 | |||||||
1591 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd | ||||||
1592 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd | ||||||
1593 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd | ||||||
1594 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd | ||||||
1595 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1596 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1597 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1598 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd | ||||||
1599 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd | ||||||
1600 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd | ||||||
1601 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd | ||||||
1602 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd | ||||||
1603 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq | ||||||
1604 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq | ||||||
1605 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq | ||||||
1606 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb | ||||||
1607 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb | ||||||
1608 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb | ||||||
1609 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw | ||||||
1610 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb | ||||||
1611 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb | ||||||
1612 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb | ||||||
1613 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw | ||||||
1614 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd | ||||||
1615 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd | ||||||
1616 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb | ||||||
1617 | |||||||
1618 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32 | ||||||
1619 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 }, | ||||||
1620 | |||||||
1621 | // Sign extend is zmm vpternlogd+vptruncdb. | ||||||
1622 | // Zero extend is zmm broadcast load+vptruncdw. | ||||||
1623 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 }, | ||||||
1624 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 }, | ||||||
1625 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 }, | ||||||
1626 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 }, | ||||||
1627 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 }, | ||||||
1628 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 }, | ||||||
1629 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 }, | ||||||
1630 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 }, | ||||||
1631 | |||||||
1632 | // Sign extend is zmm vpternlogd+vptruncdw. | ||||||
1633 | // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. | ||||||
1634 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 }, | ||||||
1635 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, | ||||||
1636 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 }, | ||||||
1637 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, | ||||||
1638 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 }, | ||||||
1639 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, | ||||||
1640 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 }, | ||||||
1641 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | ||||||
1642 | |||||||
1643 | { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd | ||||||
1644 | { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld | ||||||
1645 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd | ||||||
1646 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld | ||||||
1647 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd | ||||||
1648 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld | ||||||
1649 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq | ||||||
1650 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq | ||||||
1651 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq | ||||||
1652 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq | ||||||
1653 | |||||||
1654 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd | ||||||
1655 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld | ||||||
1656 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq | ||||||
1657 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq | ||||||
1658 | |||||||
1659 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | ||||||
1660 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | ||||||
1661 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | ||||||
1662 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | ||||||
1663 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | ||||||
1664 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | ||||||
1665 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | ||||||
1666 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | ||||||
1667 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | ||||||
1668 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | ||||||
1669 | |||||||
1670 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | ||||||
1671 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | ||||||
1672 | |||||||
1673 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | ||||||
1674 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | ||||||
1675 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, | ||||||
1676 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, | ||||||
1677 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | ||||||
1678 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, | ||||||
1679 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | ||||||
1680 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | ||||||
1681 | |||||||
1682 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | ||||||
1683 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | ||||||
1684 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, | ||||||
1685 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, | ||||||
1686 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | ||||||
1687 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, | ||||||
1688 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | ||||||
1689 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | ||||||
1690 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, | ||||||
1691 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, | ||||||
1692 | |||||||
1693 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, | ||||||
1694 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 }, | ||||||
1695 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 }, | ||||||
1696 | { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 }, | ||||||
1697 | { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 }, | ||||||
1698 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 }, | ||||||
1699 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 }, | ||||||
1700 | { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 }, | ||||||
1701 | { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 }, | ||||||
1702 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 }, | ||||||
1703 | { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 }, | ||||||
1704 | |||||||
1705 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, | ||||||
1706 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 }, | ||||||
1707 | { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 }, | ||||||
1708 | { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, | ||||||
1709 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 }, | ||||||
1710 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 }, | ||||||
1711 | }; | ||||||
1712 | |||||||
// Conversion costs tried when ST->hasBWI() holds. Unlike the tables guarded
// by ST->useAVX512Regs(), the lookups further down in this function consult
// this one regardless of that flag. Rows read as
// { ISD opcode, destination type, source type, cost }, the same
// (ISD, Dst, Src) order the lookups pass to ConvertCostTableLookup (the entry
// struct is declared elsewhere; confirm the field order there).
// The table is searched first with the simple value types and, failing that,
// again with the legalized types, where the entry cost is multiplied by
// std::max(LTSrc.first, LTDest.first).
1713 | static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] = { | ||||||
1714 | // Mask sign extend has an instruction. | ||||||
1715 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | ||||||
1716 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | ||||||
1717 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | ||||||
1718 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | ||||||
1719 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | ||||||
1720 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | ||||||
1721 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | ||||||
1722 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||||
1723 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | ||||||
1724 | |||||||
1725 | // Mask zero extend is a sext + shift. | ||||||
1726 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | ||||||
1727 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | ||||||
1728 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | ||||||
1729 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | ||||||
1730 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | ||||||
1731 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | ||||||
1732 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | ||||||
1733 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | ||||||
1734 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | ||||||
1735 | |||||||
1736 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, | ||||||
1737 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb | ||||||
1738 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw | ||||||
1739 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb | ||||||
1740 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw | ||||||
1741 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb | ||||||
1742 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw | ||||||
1743 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb | ||||||
1744 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw | ||||||
1745 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb | ||||||
1746 | }; | ||||||
1747 | |||||||
// Conversion costs tried when ST->hasDQI() holds, after the
// AVX512BWVLConversionTbl lookup and before the AVX512VLConversionTbl lookup
// in the code further down. Every row here converts between v2i64/v4i64 and a
// floating-point vector type, and every row has cost 1.
// Rows read as { ISD opcode, destination type, source type, cost }, matching
// the (ISD, Dst, Src) argument order given to ConvertCostTableLookup (entry
// struct declared elsewhere; confirm the field order there).
1748 | static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { | ||||||
1749 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | ||||||
1750 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | ||||||
1751 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | ||||||
1752 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | ||||||
1753 | |||||||
1754 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | ||||||
1755 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | ||||||
1756 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | ||||||
1757 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | ||||||
1758 | |||||||
1759 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 }, | ||||||
1760 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, | ||||||
1761 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, | ||||||
1762 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, | ||||||
1763 | |||||||
1764 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 }, | ||||||
1765 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, | ||||||
1766 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, | ||||||
1767 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, | ||||||
1768 | }; | ||||||
1769 | |||||||
// Conversion costs tried when ST->hasAVX512() holds, after the
// AVX512BWVLConversionTbl and AVX512DQVLConversionTbl lookups have missed (or
// were not attempted because hasBWI()/hasDQI() is false).
// Several rows repeat an (opcode, dst, src) triple that those earlier tables
// also list, with a higher cost here (e.g. TRUNCATE v2i1 <- v2i8 is 3 here
// versus 2 in the BWVL table). Because the earlier tables are consulted
// first, the values here only apply when the earlier lookup did not run or
// did not match.
// Rows read as { ISD opcode, destination type, source type, cost }, matching
// the (ISD, Dst, Src) argument order given to ConvertCostTableLookup (entry
// struct declared elsewhere; confirm the field order there).
1770 | static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = { | ||||||
1771 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd | ||||||
1772 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd | ||||||
1773 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd | ||||||
1774 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8 | ||||||
1775 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1776 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1777 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq | ||||||
1778 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16 | ||||||
1779 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd | ||||||
1780 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd | ||||||
1781 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd | ||||||
1782 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq | ||||||
1783 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq | ||||||
1784 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd | ||||||
1785 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb | ||||||
1786 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw | ||||||
1787 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovdb | ||||||
1788 | |||||||
1789 | // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb | ||||||
1790 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb | ||||||
1791 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 }, | ||||||
1792 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 }, | ||||||
1793 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 }, | ||||||
1794 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 }, | ||||||
1795 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 }, | ||||||
1796 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 }, | ||||||
1797 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 }, | ||||||
1798 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 }, | ||||||
1799 | |||||||
1800 | // sign extend is vpcmpeq+maskedmove+vpmovdw | ||||||
1801 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw | ||||||
1802 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, | ||||||
1803 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 }, | ||||||
1804 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, | ||||||
1805 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 }, | ||||||
1806 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, | ||||||
1807 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 }, | ||||||
1808 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 }, | ||||||
1809 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 }, | ||||||
1810 | |||||||
1811 | { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd | ||||||
1812 | { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld | ||||||
1813 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd | ||||||
1814 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld | ||||||
1815 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd | ||||||
1816 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld | ||||||
1817 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq | ||||||
1818 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq | ||||||
1819 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq | ||||||
1820 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq | ||||||
1821 | |||||||
1822 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, | ||||||
1823 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, | ||||||
1824 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, | ||||||
1825 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, | ||||||
1826 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | ||||||
1827 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | ||||||
1828 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, | ||||||
1829 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, | ||||||
1830 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | ||||||
1831 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | ||||||
1832 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | ||||||
1833 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | ||||||
1834 | |||||||
1835 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | ||||||
1836 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, | ||||||
1837 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | ||||||
1838 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, | ||||||
1839 | |||||||
1840 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 }, | ||||||
1841 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, | ||||||
1842 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | ||||||
1843 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, | ||||||
1844 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | ||||||
1845 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, | ||||||
1846 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | ||||||
1847 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | ||||||
1848 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | ||||||
1849 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | ||||||
1850 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, | ||||||
1851 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | ||||||
1852 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, | ||||||
1853 | |||||||
1854 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, | ||||||
1855 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, | ||||||
1856 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 }, | ||||||
1857 | |||||||
1858 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, | ||||||
1859 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, | ||||||
1860 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | ||||||
1861 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 }, | ||||||
1862 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, | ||||||
1863 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, | ||||||
1864 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, | ||||||
1865 | }; | ||||||
1866 | |||||||
// Conversion costs tried when ST->hasAVX2() holds. In the lookup chain further
// down it comes after all of the AVX-512 tables and before AVXConversionTbl,
// so a hit here shadows any AVX/SSE entry for the same (opcode, dst, src).
// Rows read as { ISD opcode, destination type, source type, cost }, matching
// the (ISD, Dst, Src) argument order given to ConvertCostTableLookup (entry
// struct declared elsewhere; confirm the field order there).
// When the table is searched with legalized types, the entry cost is
// multiplied by std::max(LTSrc.first, LTDest.first).
1867 | static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { | ||||||
1868 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | ||||||
1869 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | ||||||
1870 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | ||||||
1871 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | ||||||
1872 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||||
1873 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | ||||||
1874 | |||||||
1875 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, | ||||||
1876 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, | ||||||
1877 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, | ||||||
1878 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, | ||||||
1879 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | ||||||
1880 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | ||||||
1881 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, | ||||||
1882 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, | ||||||
1883 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | ||||||
1884 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | ||||||
1885 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, | ||||||
1886 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, | ||||||
1887 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | ||||||
1888 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | ||||||
1889 | |||||||
1890 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | ||||||
1891 | |||||||
1892 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 }, | ||||||
1893 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 }, | ||||||
1894 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 }, | ||||||
1895 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 }, | ||||||
1896 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 }, | ||||||
1897 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 }, | ||||||
1898 | { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 }, | ||||||
1899 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 }, | ||||||
1900 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, | ||||||
1901 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, | ||||||
1902 | |||||||
1903 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, | ||||||
1904 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, | ||||||
1905 | |||||||
1906 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 }, | ||||||
1907 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 }, | ||||||
1908 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 }, | ||||||
1909 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 }, | ||||||
1910 | |||||||
1911 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 }, | ||||||
1912 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 }, | ||||||
1913 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 }, | ||||||
1914 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, | ||||||
1915 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | ||||||
1916 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 }, | ||||||
1917 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 }, | ||||||
1918 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 }, | ||||||
1919 | |||||||
1920 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, | ||||||
1921 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, | ||||||
1922 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, | ||||||
1923 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, | ||||||
1924 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | ||||||
1925 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | ||||||
1926 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 }, | ||||||
1927 | |||||||
1928 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, | ||||||
1929 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, | ||||||
1930 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, | ||||||
1931 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, | ||||||
1932 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, | ||||||
1933 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, | ||||||
1934 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 }, | ||||||
1935 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | ||||||
1936 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | ||||||
1937 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, | ||||||
1938 | }; | ||||||
1939 | |||||||
// Conversion costs tried when ST->hasAVX() holds. In the lookup chain further
// down it is reached only after the AVX-512 and AVX2 lookups have missed, and
// it is tried before SSE41ConversionTbl.
// Rows read as { ISD opcode, destination type, source type, cost }, matching
// the (ISD, Dst, Src) argument order given to ConvertCostTableLookup (entry
// struct declared elsewhere; confirm the field order there).
// When the table is searched with legalized types, the entry cost is
// multiplied by std::max(LTSrc.first, LTDest.first).
1940 | static const TypeConversionCostTblEntry AVXConversionTbl[] = { | ||||||
1941 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, | ||||||
1942 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, | ||||||
1943 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, | ||||||
1944 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, | ||||||
1945 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | ||||||
1946 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | ||||||
1947 | |||||||
1948 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, | ||||||
1949 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, | ||||||
1950 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, | ||||||
1951 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, | ||||||
1952 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, | ||||||
1953 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, | ||||||
1954 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, | ||||||
1955 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, | ||||||
1956 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, | ||||||
1957 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, | ||||||
1958 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, | ||||||
1959 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, | ||||||
1960 | |||||||
1961 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 }, | ||||||
1962 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 }, | ||||||
1963 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 }, | ||||||
1964 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 }, | ||||||
1965 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 }, | ||||||
1966 | |||||||
1967 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb | ||||||
1968 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 }, | ||||||
1969 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | ||||||
1970 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 }, | ||||||
1971 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw | ||||||
1972 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, | ||||||
1973 | |||||||
1974 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, | ||||||
1975 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, | ||||||
1976 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, | ||||||
1977 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, | ||||||
1978 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, | ||||||
1979 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | ||||||
1980 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, | ||||||
1981 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | ||||||
1982 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | ||||||
1983 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, | ||||||
1984 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 }, | ||||||
1985 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 }, | ||||||
1986 | |||||||
1987 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, | ||||||
1988 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, | ||||||
1989 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, | ||||||
1990 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, | ||||||
1991 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, | ||||||
1992 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | ||||||
1993 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, | ||||||
1994 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 }, | ||||||
1995 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 }, | ||||||
1996 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | ||||||
1997 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, | ||||||
1998 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, | ||||||
1999 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 }, | ||||||
2000 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 }, | ||||||
2001 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 }, | ||||||
2002 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | ||||||
2003 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 }, | ||||||
2004 | |||||||
2005 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, | ||||||
2006 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 }, | ||||||
2007 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 }, | ||||||
2008 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 }, | ||||||
2009 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 }, | ||||||
2010 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 }, | ||||||
2011 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 }, | ||||||
2012 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 }, | ||||||
2013 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 }, | ||||||
2014 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 }, | ||||||
2015 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 }, | ||||||
2016 | |||||||
2017 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 }, | ||||||
2018 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 }, | ||||||
2019 | { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 }, | ||||||
2020 | { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 }, | ||||||
2021 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 }, | ||||||
2022 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 }, | ||||||
2023 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 }, | ||||||
2024 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 }, | ||||||
2025 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, | ||||||
2026 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | ||||||
2027 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 }, | ||||||
2028 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 }, | ||||||
2029 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 }, | ||||||
2030 | |||||||
2031 | { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, | ||||||
2032 | { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, | ||||||
2033 | }; | ||||||
2034 | |||||||
// Conversion costs tried when ST->hasSSE41() holds. In the lookup chain
// further down it is reached after the AVX-512, AVX2 and AVX lookups have
// missed, and it is tried before SSE2ConversionTbl, so a row here overrides
// the SSE2 cost for the same (opcode, dst, src) triple.
// Rows read as { ISD opcode, destination type, source type, cost }, matching
// the (ISD, Dst, Src) argument order given to ConvertCostTableLookup (entry
// struct declared elsewhere; confirm the field order there).
// When the table is searched with legalized types, the entry cost is
// multiplied by std::max(LTSrc.first, LTDest.first).
2035 | static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { | ||||||
2036 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, | ||||||
2037 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, | ||||||
2038 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, | ||||||
2039 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, | ||||||
2040 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | ||||||
2041 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | ||||||
2042 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, | ||||||
2043 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, | ||||||
2044 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | ||||||
2045 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | ||||||
2046 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | ||||||
2047 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | ||||||
2048 | |||||||
2049 | // These truncates end up widening elements. | ||||||
2050 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ | ||||||
2051 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ | ||||||
2052 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD | ||||||
2053 | |||||||
2054 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 }, | ||||||
2055 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 }, | ||||||
2056 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 }, | ||||||
2057 | |||||||
2058 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 }, | ||||||
2059 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 }, | ||||||
2060 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 }, | ||||||
2061 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 }, | ||||||
2062 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, | ||||||
2063 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | ||||||
2064 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, | ||||||
2065 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | ||||||
2066 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | ||||||
2067 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 }, | ||||||
2068 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | ||||||
2069 | |||||||
2070 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 }, | ||||||
2071 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 }, | ||||||
2072 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 }, | ||||||
2073 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, | ||||||
2074 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, | ||||||
2075 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | ||||||
2076 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, | ||||||
2077 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | ||||||
2078 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 }, | ||||||
2079 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | ||||||
2080 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 }, | ||||||
2081 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 }, | ||||||
2082 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 }, | ||||||
2083 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 }, | ||||||
2084 | |||||||
2085 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 }, | ||||||
2086 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 }, | ||||||
2087 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 }, | ||||||
2088 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 }, | ||||||
2089 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 }, | ||||||
2090 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 }, | ||||||
2091 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 }, | ||||||
2092 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 }, | ||||||
2093 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | ||||||
2094 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 }, | ||||||
2095 | |||||||
2096 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 }, | ||||||
2097 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | ||||||
2098 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 }, | ||||||
2099 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, | ||||||
2100 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 }, | ||||||
2101 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 }, | ||||||
2102 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 }, | ||||||
2103 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 }, | ||||||
2104 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 }, | ||||||
2105 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | ||||||
2106 | }; | ||||||
2107 | |||||||
// Conversion costs tried when ST->hasSSE2() holds. It is the last table in
// the lookup chain visible further down, so it is reached only after every
// AVX-512, AVX2, AVX and SSE4.1 lookup has missed.
// Rows read as { ISD opcode, destination type, source type, cost }, matching
// the (ISD, Dst, Src) argument order given to ConvertCostTableLookup (entry
// struct declared elsewhere; confirm the field order there).
// When the table is searched with legalized types, the entry cost is
// multiplied by std::max(LTSrc.first, LTDest.first).
2108 | static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { | ||||||
2109 | // These are somewhat magic numbers justified by comparing the | ||||||
2110 | // output of llvm-mca for our various supported scheduler models | ||||||
2111 | // and basing it off the worst case scenario. | ||||||
2112 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 }, | ||||||
2113 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 }, | ||||||
2114 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 }, | ||||||
2115 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 }, | ||||||
2116 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 }, | ||||||
2117 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | ||||||
2118 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 }, | ||||||
2119 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | ||||||
2120 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | ||||||
2121 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 }, | ||||||
2122 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 }, | ||||||
2123 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 }, | ||||||
2124 | |||||||
2125 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 }, | ||||||
2126 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 }, | ||||||
2127 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 }, | ||||||
2128 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 }, | ||||||
2129 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | ||||||
2130 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 }, | ||||||
2131 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 }, | ||||||
2132 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | ||||||
2133 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 }, | ||||||
2134 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 }, | ||||||
2135 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | ||||||
2136 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 }, | ||||||
2137 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 }, | ||||||
2138 | |||||||
2139 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 }, | ||||||
2140 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 }, | ||||||
2141 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 }, | ||||||
2142 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 }, | ||||||
2143 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 }, | ||||||
2144 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 }, | ||||||
2145 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 }, | ||||||
2146 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 }, | ||||||
2147 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 }, | ||||||
2148 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 }, | ||||||
2149 | |||||||
2150 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 }, | ||||||
2151 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | ||||||
2152 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 }, | ||||||
2153 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 }, | ||||||
2154 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 }, | ||||||
2155 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 }, | ||||||
2156 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 }, | ||||||
2157 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 }, | ||||||
2158 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, | ||||||
2159 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 }, | ||||||
2160 | |||||||
2161 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | ||||||
2162 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | ||||||
2163 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 }, | ||||||
2164 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 }, | ||||||
2165 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | ||||||
2166 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 }, | ||||||
2167 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 }, | ||||||
2168 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 }, | ||||||
2169 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | ||||||
2170 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 }, | ||||||
2171 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | ||||||
2172 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 }, | ||||||
2173 | |||||||
2174 | // These truncates are really widening elements. | ||||||
2175 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD | ||||||
2176 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ | ||||||
2177 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD | ||||||
2178 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD | ||||||
2179 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD | ||||||
2180 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW | ||||||
2181 | |||||||
2182 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB | ||||||
2183 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, | ||||||
2184 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB | ||||||
2185 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, | ||||||
2186 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, | ||||||
2187 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 }, | ||||||
2188 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | ||||||
2189 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 }, | ||||||
2190 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB | ||||||
2191 | { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW | ||||||
2192 | { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD | ||||||
2193 | }; | ||||||
2194 | |||||||
2195 | // Attempt to map directly to (simple) MVT types to let us match custom entries. | ||||||
2196 | EVT SrcTy = TLI->getValueType(DL, Src); | ||||||
2197 | EVT DstTy = TLI->getValueType(DL, Dst); | ||||||
2198 | |||||||
2199 | // The function getSimpleVT only handles simple value types. | ||||||
2200 | if (SrcTy.isSimple() && DstTy.isSimple()) { | ||||||
2201 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); | ||||||
2202 | MVT SimpleDstTy = DstTy.getSimpleVT(); | ||||||
2203 | |||||||
2204 | if (ST->useAVX512Regs()) { | ||||||
2205 | if (ST->hasBWI()) | ||||||
2206 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2207 | AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2208 | return AdjustCost(Entry->Cost); | ||||||
2209 | |||||||
2210 | if (ST->hasDQI()) | ||||||
2211 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2212 | AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2213 | return AdjustCost(Entry->Cost); | ||||||
2214 | |||||||
2215 | if (ST->hasAVX512()) | ||||||
2216 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2217 | AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2218 | return AdjustCost(Entry->Cost); | ||||||
2219 | } | ||||||
2220 | |||||||
2221 | if (ST->hasBWI()) | ||||||
2222 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2223 | AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2224 | return AdjustCost(Entry->Cost); | ||||||
2225 | |||||||
2226 | if (ST->hasDQI()) | ||||||
2227 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2228 | AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | ||||||
2229 | return AdjustCost(Entry->Cost); | ||||||
2230 | |||||||
2231 | if (ST->hasAVX512()) | ||||||
2232 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | ||||||
2233 | SimpleDstTy, SimpleSrcTy)) | ||||||
2234 | return AdjustCost(Entry->Cost); | ||||||
2235 | |||||||
2236 | if (ST->hasAVX2()) { | ||||||
2237 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | ||||||
2238 | SimpleDstTy, SimpleSrcTy)) | ||||||
2239 | return AdjustCost(Entry->Cost); | ||||||
2240 | } | ||||||
2241 | |||||||
2242 | if (ST->hasAVX()) { | ||||||
2243 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | ||||||
2244 | SimpleDstTy, SimpleSrcTy)) | ||||||
2245 | return AdjustCost(Entry->Cost); | ||||||
2246 | } | ||||||
2247 | |||||||
2248 | if (ST->hasSSE41()) { | ||||||
2249 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | ||||||
2250 | SimpleDstTy, SimpleSrcTy)) | ||||||
2251 | return AdjustCost(Entry->Cost); | ||||||
2252 | } | ||||||
2253 | |||||||
2254 | if (ST->hasSSE2()) { | ||||||
2255 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | ||||||
2256 | SimpleDstTy, SimpleSrcTy)) | ||||||
2257 | return AdjustCost(Entry->Cost); | ||||||
2258 | } | ||||||
2259 | } | ||||||
2260 | |||||||
2261 | // Fall back to legalized types. | ||||||
2262 | std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); | ||||||
2263 | std::pair<InstructionCost, MVT> LTDest = | ||||||
2264 | TLI->getTypeLegalizationCost(DL, Dst); | ||||||
2265 | |||||||
2266 | if (ST->useAVX512Regs()) { | ||||||
2267 | if (ST->hasBWI()) | ||||||
2268 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2269 | AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) | ||||||
2270 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2271 | |||||||
2272 | if (ST->hasDQI()) | ||||||
2273 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2274 | AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) | ||||||
2275 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2276 | |||||||
2277 | if (ST->hasAVX512()) | ||||||
2278 | if (const auto *Entry = ConvertCostTableLookup( | ||||||
2279 | AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) | ||||||
2280 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2281 | } | ||||||
2282 | |||||||
2283 | if (ST->hasBWI()) | ||||||
2284 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, | ||||||
2285 | LTDest.second, LTSrc.second)) | ||||||
2286 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2287 | |||||||
2288 | if (ST->hasDQI()) | ||||||
2289 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, | ||||||
2290 | LTDest.second, LTSrc.second)) | ||||||
2291 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2292 | |||||||
2293 | if (ST->hasAVX512()) | ||||||
2294 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | ||||||
2295 | LTDest.second, LTSrc.second)) | ||||||
2296 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2297 | |||||||
2298 | if (ST->hasAVX2()) | ||||||
2299 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | ||||||
2300 | LTDest.second, LTSrc.second)) | ||||||
2301 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2302 | |||||||
2303 | if (ST->hasAVX()) | ||||||
2304 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | ||||||
2305 | LTDest.second, LTSrc.second)) | ||||||
2306 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2307 | |||||||
2308 | if (ST->hasSSE41()) | ||||||
2309 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | ||||||
2310 | LTDest.second, LTSrc.second)) | ||||||
2311 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2312 | |||||||
2313 | if (ST->hasSSE2()) | ||||||
2314 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | ||||||
2315 | LTDest.second, LTSrc.second)) | ||||||
2316 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | ||||||
2317 | |||||||
2318 | // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for | ||||||
2319 | // sitofp. | ||||||
2320 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && | ||||||
2321 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { | ||||||
2322 | Type *ExtSrc = Src->getWithNewBitWidth(32); | ||||||
2323 | unsigned ExtOpc = | ||||||
2324 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; | ||||||
2325 | |||||||
2326 | // For scalar loads the extend would be free. | ||||||
2327 | InstructionCost ExtCost = 0; | ||||||
2328 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) | ||||||
2329 | ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); | ||||||
2330 | |||||||
2331 | return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, | ||||||
2332 | TTI::CastContextHint::None, CostKind); | ||||||
2333 | } | ||||||
2334 | |||||||
2335 | // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi | ||||||
2336 | // i32. | ||||||
2337 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && | ||||||
2338 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { | ||||||
2339 | Type *TruncDst = Dst->getWithNewBitWidth(32); | ||||||
2340 | return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + | ||||||
2341 | getCastInstrCost(Instruction::Trunc, Dst, TruncDst, | ||||||
2342 | TTI::CastContextHint::None, CostKind); | ||||||
2343 | } | ||||||
2344 | |||||||
2345 | return AdjustCost( | ||||||
2346 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | ||||||
2347 | } | ||||||
2348 | |||||||
2349 | InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, | ||||||
2350 | Type *CondTy, | ||||||
2351 | CmpInst::Predicate VecPred, | ||||||
2352 | TTI::TargetCostKind CostKind, | ||||||
2353 | const Instruction *I) { | ||||||
2354 | // TODO: Handle other cost kinds. | ||||||
2355 | if (CostKind
| ||||||
2356 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | ||||||
2357 | I); | ||||||
2358 | |||||||
2359 | // Legalize the type. | ||||||
2360 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | ||||||
2361 | |||||||
2362 | MVT MTy = LT.second; | ||||||
2363 | |||||||
2364 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
2365 | assert(ISD && "Invalid opcode")((void)0); | ||||||
2366 | |||||||
2367 | unsigned ExtraCost = 0; | ||||||
2368 | if (I
| ||||||
2369 | // Some vector comparison predicates cost extra instructions. | ||||||
2370 | if (MTy.isVector() && | ||||||
2371 | !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || | ||||||
2372 | (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || | ||||||
2373 | ST->hasBWI())) { | ||||||
2374 | switch (cast<CmpInst>(I)->getPredicate()) { | ||||||
2375 | case CmpInst::Predicate::ICMP_NE: | ||||||
2376 | // xor(cmpeq(x,y),-1) | ||||||
2377 | ExtraCost = 1; | ||||||
2378 | break; | ||||||
2379 | case CmpInst::Predicate::ICMP_SGE: | ||||||
2380 | case CmpInst::Predicate::ICMP_SLE: | ||||||
2381 | // xor(cmpgt(x,y),-1) | ||||||
2382 | ExtraCost = 1; | ||||||
2383 | break; | ||||||
2384 | case CmpInst::Predicate::ICMP_ULT: | ||||||
2385 | case CmpInst::Predicate::ICMP_UGT: | ||||||
2386 | // cmpgt(xor(x,signbit),xor(y,signbit)) | ||||||
2387 | // xor(cmpeq(pmaxu(x,y),x),-1) | ||||||
2388 | ExtraCost = 2; | ||||||
2389 | break; | ||||||
2390 | case CmpInst::Predicate::ICMP_ULE: | ||||||
2391 | case CmpInst::Predicate::ICMP_UGE: | ||||||
2392 | if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || | ||||||
2393 | (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { | ||||||
2394 | // cmpeq(psubus(x,y),0) | ||||||
2395 | // cmpeq(pminu(x,y),x) | ||||||
2396 | ExtraCost = 1; | ||||||
2397 | } else { | ||||||
2398 | // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) | ||||||
2399 | ExtraCost = 3; | ||||||
2400 | } | ||||||
2401 | break; | ||||||
2402 | default: | ||||||
2403 | break; | ||||||
2404 | } | ||||||
2405 | } | ||||||
2406 | } | ||||||
2407 | |||||||
2408 | static const CostTblEntry SLMCostTbl[] = { | ||||||
2409 | // slm pcmpeq/pcmpgt throughput is 2 | ||||||
2410 | { ISD::SETCC, MVT::v2i64, 2 }, | ||||||
2411 | }; | ||||||
2412 | |||||||
2413 | static const CostTblEntry AVX512BWCostTbl[] = { | ||||||
2414 | { ISD::SETCC, MVT::v32i16, 1 }, | ||||||
2415 | { ISD::SETCC, MVT::v64i8, 1 }, | ||||||
2416 | |||||||
2417 | { ISD::SELECT, MVT::v32i16, 1 }, | ||||||
2418 | { ISD::SELECT, MVT::v64i8, 1 }, | ||||||
2419 | }; | ||||||
2420 | |||||||
2421 | static const CostTblEntry AVX512CostTbl[] = { | ||||||
2422 | { ISD::SETCC, MVT::v8i64, 1 }, | ||||||
2423 | { ISD::SETCC, MVT::v16i32, 1 }, | ||||||
2424 | { ISD::SETCC, MVT::v8f64, 1 }, | ||||||
2425 | { ISD::SETCC, MVT::v16f32, 1 }, | ||||||
2426 | |||||||
2427 | { ISD::SELECT, MVT::v8i64, 1 }, | ||||||
2428 | { ISD::SELECT, MVT::v16i32, 1 }, | ||||||
2429 | { ISD::SELECT, MVT::v8f64, 1 }, | ||||||
2430 | { ISD::SELECT, MVT::v16f32, 1 }, | ||||||
2431 | |||||||
2432 | { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4 | ||||||
2433 | { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4 | ||||||
2434 | |||||||
2435 | { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3 | ||||||
2436 | { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3 | ||||||
2437 | }; | ||||||
2438 | |||||||
2439 | static const CostTblEntry AVX2CostTbl[] = { | ||||||
2440 | { ISD::SETCC, MVT::v4i64, 1 }, | ||||||
2441 | { ISD::SETCC, MVT::v8i32, 1 }, | ||||||
2442 | { ISD::SETCC, MVT::v16i16, 1 }, | ||||||
2443 | { ISD::SETCC, MVT::v32i8, 1 }, | ||||||
2444 | |||||||
2445 | { ISD::SELECT, MVT::v4i64, 1 }, // pblendvb | ||||||
2446 | { ISD::SELECT, MVT::v8i32, 1 }, // pblendvb | ||||||
2447 | { ISD::SELECT, MVT::v16i16, 1 }, // pblendvb | ||||||
2448 | { ISD::SELECT, MVT::v32i8, 1 }, // pblendvb | ||||||
2449 | }; | ||||||
2450 | |||||||
2451 | static const CostTblEntry AVX1CostTbl[] = { | ||||||
2452 | { ISD::SETCC, MVT::v4f64, 1 }, | ||||||
2453 | { ISD::SETCC, MVT::v8f32, 1 }, | ||||||
2454 | // AVX1 does not support 8-wide integer compare. | ||||||
2455 | { ISD::SETCC, MVT::v4i64, 4 }, | ||||||
2456 | { ISD::SETCC, MVT::v8i32, 4 }, | ||||||
2457 | { ISD::SETCC, MVT::v16i16, 4 }, | ||||||
2458 | { ISD::SETCC, MVT::v32i8, 4 }, | ||||||
2459 | |||||||
2460 | { ISD::SELECT, MVT::v4f64, 1 }, // vblendvpd | ||||||
2461 | { ISD::SELECT, MVT::v8f32, 1 }, // vblendvps | ||||||
2462 | { ISD::SELECT, MVT::v4i64, 1 }, // vblendvpd | ||||||
2463 | { ISD::SELECT, MVT::v8i32, 1 }, // vblendvps | ||||||
2464 | { ISD::SELECT, MVT::v16i16, 3 }, // vandps + vandnps + vorps | ||||||
2465 | { ISD::SELECT, MVT::v32i8, 3 }, // vandps + vandnps + vorps | ||||||
2466 | }; | ||||||
2467 | |||||||
2468 | static const CostTblEntry SSE42CostTbl[] = { | ||||||
2469 | { ISD::SETCC, MVT::v2f64, 1 }, | ||||||
2470 | { ISD::SETCC, MVT::v4f32, 1 }, | ||||||
2471 | { ISD::SETCC, MVT::v2i64, 1 }, | ||||||
2472 | }; | ||||||
2473 | |||||||
2474 | static const CostTblEntry SSE41CostTbl[] = { | ||||||
2475 | { ISD::SELECT, MVT::v2f64, 1 }, // blendvpd | ||||||
2476 | { ISD::SELECT, MVT::v4f32, 1 }, // blendvps | ||||||
2477 | { ISD::SELECT, MVT::v2i64, 1 }, // pblendvb | ||||||
2478 | { ISD::SELECT, MVT::v4i32, 1 }, // pblendvb | ||||||
2479 | { ISD::SELECT, MVT::v8i16, 1 }, // pblendvb | ||||||
2480 | { ISD::SELECT, MVT::v16i8, 1 }, // pblendvb | ||||||
2481 | }; | ||||||
2482 | |||||||
2483 | static const CostTblEntry SSE2CostTbl[] = { | ||||||
2484 | { ISD::SETCC, MVT::v2f64, 2 }, | ||||||
2485 | { ISD::SETCC, MVT::f64, 1 }, | ||||||
2486 | { ISD::SETCC, MVT::v2i64, 8 }, | ||||||
2487 | { ISD::SETCC, MVT::v4i32, 1 }, | ||||||
2488 | { ISD::SETCC, MVT::v8i16, 1 }, | ||||||
2489 | { ISD::SETCC, MVT::v16i8, 1 }, | ||||||
2490 | |||||||
2491 | { ISD::SELECT, MVT::v2f64, 3 }, // andpd + andnpd + orpd | ||||||
2492 | { ISD::SELECT, MVT::v2i64, 3 }, // pand + pandn + por | ||||||
2493 | { ISD::SELECT, MVT::v4i32, 3 }, // pand + pandn + por | ||||||
2494 | { ISD::SELECT, MVT::v8i16, 3 }, // pand + pandn + por | ||||||
2495 | { ISD::SELECT, MVT::v16i8, 3 }, // pand + pandn + por | ||||||
2496 | }; | ||||||
2497 | |||||||
2498 | static const CostTblEntry SSE1CostTbl[] = { | ||||||
2499 | { ISD::SETCC, MVT::v4f32, 2 }, | ||||||
2500 | { ISD::SETCC, MVT::f32, 1 }, | ||||||
2501 | |||||||
2502 | { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps | ||||||
2503 | }; | ||||||
2504 | |||||||
2505 | if (ST->isSLM()) | ||||||
2506 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | ||||||
2507 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2508 | |||||||
2509 | if (ST->hasBWI()) | ||||||
2510 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | ||||||
2511 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2512 | |||||||
2513 | if (ST->hasAVX512()) | ||||||
2514 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | ||||||
2515 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2516 | |||||||
2517 | if (ST->hasAVX2()) | ||||||
2518 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | ||||||
2519 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2520 | |||||||
2521 | if (ST->hasAVX()) | ||||||
2522 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | ||||||
2523 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2524 | |||||||
2525 | if (ST->hasSSE42()) | ||||||
2526 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | ||||||
2527 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2528 | |||||||
2529 | if (ST->hasSSE41()) | ||||||
2530 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | ||||||
2531 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2532 | |||||||
2533 | if (ST->hasSSE2()) | ||||||
2534 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | ||||||
2535 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2536 | |||||||
2537 | if (ST->hasSSE1()) | ||||||
2538 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | ||||||
2539 | return LT.first * (ExtraCost + Entry->Cost); | ||||||
2540 | |||||||
2541 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); | ||||||
2542 | } | ||||||
2543 | |||||||
2544 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } | ||||||
2545 | |||||||
2546 | InstructionCost | ||||||
2547 | X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||||
2548 | TTI::TargetCostKind CostKind) { | ||||||
2549 | |||||||
2550 | // Costs should match the codegen from: | ||||||
2551 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll | ||||||
2552 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll | ||||||
2553 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll | ||||||
2554 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll | ||||||
2555 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll | ||||||
2556 | |||||||
2557 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not | ||||||
2558 | // specialized in these tables yet. | ||||||
2559 | static const CostTblEntry AVX512CDCostTbl[] = { | ||||||
2560 | { ISD::CTLZ, MVT::v8i64, 1 }, | ||||||
2561 | { ISD::CTLZ, MVT::v16i32, 1 }, | ||||||
2562 | { ISD::CTLZ, MVT::v32i16, 8 }, | ||||||
2563 | { ISD::CTLZ, MVT::v64i8, 20 }, | ||||||
2564 | { ISD::CTLZ, MVT::v4i64, 1 }, | ||||||
2565 | { ISD::CTLZ, MVT::v8i32, 1 }, | ||||||
2566 | { ISD::CTLZ, MVT::v16i16, 4 }, | ||||||
2567 | { ISD::CTLZ, MVT::v32i8, 10 }, | ||||||
2568 | { ISD::CTLZ, MVT::v2i64, 1 }, | ||||||
2569 | { ISD::CTLZ, MVT::v4i32, 1 }, | ||||||
2570 | { ISD::CTLZ, MVT::v8i16, 4 }, | ||||||
2571 | { ISD::CTLZ, MVT::v16i8, 4 }, | ||||||
2572 | }; | ||||||
2573 | static const CostTblEntry AVX512BWCostTbl[] = { | ||||||
2574 | { ISD::ABS, MVT::v32i16, 1 }, | ||||||
2575 | { ISD::ABS, MVT::v64i8, 1 }, | ||||||
2576 | { ISD::BITREVERSE, MVT::v8i64, 5 }, | ||||||
2577 | { ISD::BITREVERSE, MVT::v16i32, 5 }, | ||||||
2578 | { ISD::BITREVERSE, MVT::v32i16, 5 }, | ||||||
2579 | { ISD::BITREVERSE, MVT::v64i8, 5 }, | ||||||
2580 | { ISD::BSWAP, MVT::v8i64, 1 }, | ||||||
2581 | { ISD::BSWAP, MVT::v16i32, 1 }, | ||||||
2582 | { ISD::BSWAP, MVT::v32i16, 1 }, | ||||||
2583 | { ISD::CTLZ, MVT::v8i64, 23 }, | ||||||
2584 | { ISD::CTLZ, MVT::v16i32, 22 }, | ||||||
2585 | { ISD::CTLZ, MVT::v32i16, 18 }, | ||||||
2586 | { ISD::CTLZ, MVT::v64i8, 17 }, | ||||||
2587 | { ISD::CTPOP, MVT::v8i64, 7 }, | ||||||
2588 | { ISD::CTPOP, MVT::v16i32, 11 }, | ||||||
2589 | { ISD::CTPOP, MVT::v32i16, 9 }, | ||||||
2590 | { ISD::CTPOP, MVT::v64i8, 6 }, | ||||||
2591 | { ISD::CTTZ, MVT::v8i64, 10 }, | ||||||
2592 | { ISD::CTTZ, MVT::v16i32, 14 }, | ||||||
2593 | { ISD::CTTZ, MVT::v32i16, 12 }, | ||||||
2594 | { ISD::CTTZ, MVT::v64i8, 9 }, | ||||||
2595 | { ISD::SADDSAT, MVT::v32i16, 1 }, | ||||||
2596 | { ISD::SADDSAT, MVT::v64i8, 1 }, | ||||||
2597 | { ISD::SMAX, MVT::v32i16, 1 }, | ||||||
2598 | { ISD::SMAX, MVT::v64i8, 1 }, | ||||||
2599 | { ISD::SMIN, MVT::v32i16, 1 }, | ||||||
2600 | { ISD::SMIN, MVT::v64i8, 1 }, | ||||||
2601 | { ISD::SSUBSAT, MVT::v32i16, 1 }, | ||||||
2602 | { ISD::SSUBSAT, MVT::v64i8, 1 }, | ||||||
2603 | { ISD::UADDSAT, MVT::v32i16, 1 }, | ||||||
2604 | { ISD::UADDSAT, MVT::v64i8, 1 }, | ||||||
2605 | { ISD::UMAX, MVT::v32i16, 1 }, | ||||||
2606 | { ISD::UMAX, MVT::v64i8, 1 }, | ||||||
2607 | { ISD::UMIN, MVT::v32i16, 1 }, | ||||||
2608 | { ISD::UMIN, MVT::v64i8, 1 }, | ||||||
2609 | { ISD::USUBSAT, MVT::v32i16, 1 }, | ||||||
2610 | { ISD::USUBSAT, MVT::v64i8, 1 }, | ||||||
2611 | }; | ||||||
2612 | static const CostTblEntry AVX512CostTbl[] = { | ||||||
2613 | { ISD::ABS, MVT::v8i64, 1 }, | ||||||
2614 | { ISD::ABS, MVT::v16i32, 1 }, | ||||||
2615 | { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split | ||||||
2616 | { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split | ||||||
2617 | { ISD::ABS, MVT::v4i64, 1 }, | ||||||
2618 | { ISD::ABS, MVT::v2i64, 1 }, | ||||||
2619 | { ISD::BITREVERSE, MVT::v8i64, 36 }, | ||||||
2620 | { ISD::BITREVERSE, MVT::v16i32, 24 }, | ||||||
2621 | { ISD::BITREVERSE, MVT::v32i16, 10 }, | ||||||
2622 | { ISD::BITREVERSE, MVT::v64i8, 10 }, | ||||||
2623 | { ISD::BSWAP, MVT::v8i64, 4 }, | ||||||
2624 | { ISD::BSWAP, MVT::v16i32, 4 }, | ||||||
2625 | { ISD::BSWAP, MVT::v32i16, 4 }, | ||||||
2626 | { ISD::CTLZ, MVT::v8i64, 29 }, | ||||||
2627 | { ISD::CTLZ, MVT::v16i32, 35 }, | ||||||
2628 | { ISD::CTLZ, MVT::v32i16, 28 }, | ||||||
2629 | { ISD::CTLZ, MVT::v64i8, 18 }, | ||||||
2630 | { ISD::CTPOP, MVT::v8i64, 16 }, | ||||||
2631 | { ISD::CTPOP, MVT::v16i32, 24 }, | ||||||
2632 | { ISD::CTPOP, MVT::v32i16, 18 }, | ||||||
2633 | { ISD::CTPOP, MVT::v64i8, 12 }, | ||||||
2634 | { ISD::CTTZ, MVT::v8i64, 20 }, | ||||||
2635 | { ISD::CTTZ, MVT::v16i32, 28 }, | ||||||
2636 | { ISD::CTTZ, MVT::v32i16, 24 }, | ||||||
2637 | { ISD::CTTZ, MVT::v64i8, 18 }, | ||||||
2638 | { ISD::SMAX, MVT::v8i64, 1 }, | ||||||
2639 | { ISD::SMAX, MVT::v16i32, 1 }, | ||||||
2640 | { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split | ||||||
2641 | { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split | ||||||
2642 | { ISD::SMAX, MVT::v4i64, 1 }, | ||||||
2643 | { ISD::SMAX, MVT::v2i64, 1 }, | ||||||
2644 | { ISD::SMIN, MVT::v8i64, 1 }, | ||||||
2645 | { ISD::SMIN, MVT::v16i32, 1 }, | ||||||
2646 | { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split | ||||||
2647 | { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split | ||||||
2648 | { ISD::SMIN, MVT::v4i64, 1 }, | ||||||
2649 | { ISD::SMIN, MVT::v2i64, 1 }, | ||||||
2650 | { ISD::UMAX, MVT::v8i64, 1 }, | ||||||
2651 | { ISD::UMAX, MVT::v16i32, 1 }, | ||||||
2652 | { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split | ||||||
2653 | { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split | ||||||
2654 | { ISD::UMAX, MVT::v4i64, 1 }, | ||||||
2655 | { ISD::UMAX, MVT::v2i64, 1 }, | ||||||
2656 | { ISD::UMIN, MVT::v8i64, 1 }, | ||||||
2657 | { ISD::UMIN, MVT::v16i32, 1 }, | ||||||
2658 | { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split | ||||||
2659 | { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split | ||||||
2660 | { ISD::UMIN, MVT::v4i64, 1 }, | ||||||
2661 | { ISD::UMIN, MVT::v2i64, 1 }, | ||||||
2662 | { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd | ||||||
2663 | { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq | ||||||
2664 | { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq | ||||||
2665 | { ISD::USUBSAT, MVT::v8i64, 2 }, // pmaxuq + psubq | ||||||
2666 | { ISD::UADDSAT, MVT::v16i32, 3 }, // not + pminud + paddd | ||||||
2667 | { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq | ||||||
2668 | { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq | ||||||
2669 | { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq | ||||||
2670 | { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split | ||||||
2671 | { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split | ||||||
2672 | { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split | ||||||
2673 | { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split | ||||||
2674 | { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split | ||||||
2675 | { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split | ||||||
2676 | { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split | ||||||
2677 | { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split | ||||||
2678 | { ISD::FMAXNUM, MVT::f32, 2 }, | ||||||
2679 | { ISD::FMAXNUM, MVT::v4f32, 2 }, | ||||||
2680 | { ISD::FMAXNUM, MVT::v8f32, 2 }, | ||||||
2681 | { ISD::FMAXNUM, MVT::v16f32, 2 }, | ||||||
2682 | { ISD::FMAXNUM, MVT::f64, 2 }, | ||||||
2683 | { ISD::FMAXNUM, MVT::v2f64, 2 }, | ||||||
2684 | { ISD::FMAXNUM, MVT::v4f64, 2 }, | ||||||
2685 | { ISD::FMAXNUM, MVT::v8f64, 2 }, | ||||||
2686 | }; | ||||||
2687 | static const CostTblEntry XOPCostTbl[] = { | ||||||
2688 | { ISD::BITREVERSE, MVT::v4i64, 4 }, | ||||||
2689 | { ISD::BITREVERSE, MVT::v8i32, 4 }, | ||||||
2690 | { ISD::BITREVERSE, MVT::v16i16, 4 }, | ||||||
2691 | { ISD::BITREVERSE, MVT::v32i8, 4 }, | ||||||
2692 | { ISD::BITREVERSE, MVT::v2i64, 1 }, | ||||||
2693 | { ISD::BITREVERSE, MVT::v4i32, 1 }, | ||||||
2694 | { ISD::BITREVERSE, MVT::v8i16, 1 }, | ||||||
2695 | { ISD::BITREVERSE, MVT::v16i8, 1 }, | ||||||
2696 | { ISD::BITREVERSE, MVT::i64, 3 }, | ||||||
2697 | { ISD::BITREVERSE, MVT::i32, 3 }, | ||||||
2698 | { ISD::BITREVERSE, MVT::i16, 3 }, | ||||||
2699 | { ISD::BITREVERSE, MVT::i8, 3 } | ||||||
2700 | }; | ||||||
2701 | static const CostTblEntry AVX2CostTbl[] = { | ||||||
2702 | { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X) | ||||||
2703 | { ISD::ABS, MVT::v8i32, 1 }, | ||||||
2704 | { ISD::ABS, MVT::v16i16, 1 }, | ||||||
2705 | { ISD::ABS, MVT::v32i8, 1 }, | ||||||
2706 | { ISD::BITREVERSE, MVT::v4i64, 5 }, | ||||||
2707 | { ISD::BITREVERSE, MVT::v8i32, 5 }, | ||||||
2708 | { ISD::BITREVERSE, MVT::v16i16, 5 }, | ||||||
2709 | { ISD::BITREVERSE, MVT::v32i8, 5 }, | ||||||
2710 | { ISD::BSWAP, MVT::v4i64, 1 }, | ||||||
2711 | { ISD::BSWAP, MVT::v8i32, 1 }, | ||||||
2712 | { ISD::BSWAP, MVT::v16i16, 1 }, | ||||||
2713 | { ISD::CTLZ, MVT::v4i64, 23 }, | ||||||
2714 | { ISD::CTLZ, MVT::v8i32, 18 }, | ||||||
2715 | { ISD::CTLZ, MVT::v16i16, 14 }, | ||||||
2716 | { ISD::CTLZ, MVT::v32i8, 9 }, | ||||||
2717 | { ISD::CTPOP, MVT::v4i64, 7 }, | ||||||
2718 | { ISD::CTPOP, MVT::v8i32, 11 }, | ||||||
2719 | { ISD::CTPOP, MVT::v16i16, 9 }, | ||||||
2720 | { ISD::CTPOP, MVT::v32i8, 6 }, | ||||||
2721 | { ISD::CTTZ, MVT::v4i64, 10 }, | ||||||
2722 | { ISD::CTTZ, MVT::v8i32, 14 }, | ||||||
2723 | { ISD::CTTZ, MVT::v16i16, 12 }, | ||||||
2724 | { ISD::CTTZ, MVT::v32i8, 9 }, | ||||||
2725 | { ISD::SADDSAT, MVT::v16i16, 1 }, | ||||||
2726 | { ISD::SADDSAT, MVT::v32i8, 1 }, | ||||||
2727 | { ISD::SMAX, MVT::v8i32, 1 }, | ||||||
2728 | { ISD::SMAX, MVT::v16i16, 1 }, | ||||||
2729 | { ISD::SMAX, MVT::v32i8, 1 }, | ||||||
2730 | { ISD::SMIN, MVT::v8i32, 1 }, | ||||||
2731 | { ISD::SMIN, MVT::v16i16, 1 }, | ||||||
2732 | { ISD::SMIN, MVT::v32i8, 1 }, | ||||||
2733 | { ISD::SSUBSAT, MVT::v16i16, 1 }, | ||||||
2734 | { ISD::SSUBSAT, MVT::v32i8, 1 }, | ||||||
2735 | { ISD::UADDSAT, MVT::v16i16, 1 }, | ||||||
2736 | { ISD::UADDSAT, MVT::v32i8, 1 }, | ||||||
2737 | { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd | ||||||
2738 | { ISD::UMAX, MVT::v8i32, 1 }, | ||||||
2739 | { ISD::UMAX, MVT::v16i16, 1 }, | ||||||
2740 | { ISD::UMAX, MVT::v32i8, 1 }, | ||||||
2741 | { ISD::UMIN, MVT::v8i32, 1 }, | ||||||
2742 | { ISD::UMIN, MVT::v16i16, 1 }, | ||||||
2743 | { ISD::UMIN, MVT::v32i8, 1 }, | ||||||
2744 | { ISD::USUBSAT, MVT::v16i16, 1 }, | ||||||
2745 | { ISD::USUBSAT, MVT::v32i8, 1 }, | ||||||
2746 | { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd | ||||||
2747 | { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS | ||||||
2748 | { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD | ||||||
2749 | { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ | ||||||
2750 | { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ | ||||||
2751 | { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ | ||||||
2752 | { ISD::FSQRT, MVT::f64, 14 }, // Haswell from http://www.agner.org/ | ||||||
2753 | { ISD::FSQRT, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/ | ||||||
2754 | { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ | ||||||
2755 | }; | ||||||
2756 | static const CostTblEntry AVX1CostTbl[] = { | ||||||
2757 | { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X) | ||||||
2758 | { ISD::ABS, MVT::v8i32, 3 }, | ||||||
2759 | { ISD::ABS, MVT::v16i16, 3 }, | ||||||
2760 | { ISD::ABS, MVT::v32i8, 3 }, | ||||||
2761 | { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert | ||||||
2762 | { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert | ||||||
2763 | { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert | ||||||
2764 | { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert | ||||||
2765 | { ISD::BSWAP, MVT::v4i64, 4 }, | ||||||
2766 | { ISD::BSWAP, MVT::v8i32, 4 }, | ||||||
2767 | { ISD::BSWAP, MVT::v16i16, 4 }, | ||||||
2768 | { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert | ||||||
2769 | { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert | ||||||
2770 | { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert | ||||||
2771 | { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | ||||||
2772 | { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert | ||||||
2773 | { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert | ||||||
2774 | { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert | ||||||
2775 | { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert | ||||||
2776 | { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert | ||||||
2777 | { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert | ||||||
2778 | { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert | ||||||
2779 | { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert | ||||||
2780 | { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2781 | { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2782 | { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2783 | { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2784 | { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2785 | { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2786 | { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2787 | { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2788 | { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2789 | { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2790 | { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2791 | { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2792 | { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert | ||||||
2793 | { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2794 | { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2795 | { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2796 | { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2797 | { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2798 | { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2799 | { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2800 | { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert | ||||||
2801 | { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert | ||||||
2802 | { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS | ||||||
2803 | { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS | ||||||
2804 | { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ? | ||||||
2805 | { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD | ||||||
2806 | { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD | ||||||
2807 | { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ? | ||||||
2808 | { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ | ||||||
2809 | { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ | ||||||
2810 | { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ | ||||||
2811 | { ISD::FSQRT, MVT::f64, 21 }, // SNB from http://www.agner.org/ | ||||||
2812 | { ISD::FSQRT, MVT::v2f64, 21 }, // SNB from http://www.agner.org/ | ||||||
2813 | { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ | ||||||
2814 | }; | ||||||
2815 | static const CostTblEntry GLMCostTbl[] = { | ||||||
2816 | { ISD::FSQRT, MVT::f32, 19 }, // sqrtss | ||||||
2817 | { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps | ||||||
2818 | { ISD::FSQRT, MVT::f64, 34 }, // sqrtsd | ||||||
2819 | { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd | ||||||
2820 | }; | ||||||
2821 | static const CostTblEntry SLMCostTbl[] = { | ||||||
2822 | { ISD::FSQRT, MVT::f32, 20 }, // sqrtss | ||||||
2823 | { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps | ||||||
2824 | { ISD::FSQRT, MVT::f64, 35 }, // sqrtsd | ||||||
2825 | { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd | ||||||
2826 | }; | ||||||
2827 | static const CostTblEntry SSE42CostTbl[] = { | ||||||
2828 | { ISD::USUBSAT, MVT::v4i32, 2 }, // pmaxud + psubd | ||||||
2829 | { ISD::UADDSAT, MVT::v4i32, 3 }, // not + pminud + paddd | ||||||
2830 | { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ | ||||||
2831 | { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ | ||||||
2832 | }; | ||||||
2833 | static const CostTblEntry SSE41CostTbl[] = { | ||||||
2834 | { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X) | ||||||
2835 | { ISD::SMAX, MVT::v4i32, 1 }, | ||||||
2836 | { ISD::SMAX, MVT::v16i8, 1 }, | ||||||
2837 | { ISD::SMIN, MVT::v4i32, 1 }, | ||||||
2838 | { ISD::SMIN, MVT::v16i8, 1 }, | ||||||
2839 | { ISD::UMAX, MVT::v4i32, 1 }, | ||||||
2840 | { ISD::UMAX, MVT::v8i16, 1 }, | ||||||
2841 | { ISD::UMIN, MVT::v4i32, 1 }, | ||||||
2842 | { ISD::UMIN, MVT::v8i16, 1 }, | ||||||
2843 | }; | ||||||
2844 | static const CostTblEntry SSSE3CostTbl[] = { | ||||||
2845 | { ISD::ABS, MVT::v4i32, 1 }, | ||||||
2846 | { ISD::ABS, MVT::v8i16, 1 }, | ||||||
2847 | { ISD::ABS, MVT::v16i8, 1 }, | ||||||
2848 | { ISD::BITREVERSE, MVT::v2i64, 5 }, | ||||||
2849 | { ISD::BITREVERSE, MVT::v4i32, 5 }, | ||||||
2850 | { ISD::BITREVERSE, MVT::v8i16, 5 }, | ||||||
2851 | { ISD::BITREVERSE, MVT::v16i8, 5 }, | ||||||
2852 | { ISD::BSWAP, MVT::v2i64, 1 }, | ||||||
2853 | { ISD::BSWAP, MVT::v4i32, 1 }, | ||||||
2854 | { ISD::BSWAP, MVT::v8i16, 1 }, | ||||||
2855 | { ISD::CTLZ, MVT::v2i64, 23 }, | ||||||
2856 | { ISD::CTLZ, MVT::v4i32, 18 }, | ||||||
2857 | { ISD::CTLZ, MVT::v8i16, 14 }, | ||||||
2858 | { ISD::CTLZ, MVT::v16i8, 9 }, | ||||||
2859 | { ISD::CTPOP, MVT::v2i64, 7 }, | ||||||
2860 | { ISD::CTPOP, MVT::v4i32, 11 }, | ||||||
2861 | { ISD::CTPOP, MVT::v8i16, 9 }, | ||||||
2862 | { ISD::CTPOP, MVT::v16i8, 6 }, | ||||||
2863 | { ISD::CTTZ, MVT::v2i64, 10 }, | ||||||
2864 | { ISD::CTTZ, MVT::v4i32, 14 }, | ||||||
2865 | { ISD::CTTZ, MVT::v8i16, 12 }, | ||||||
2866 | { ISD::CTTZ, MVT::v16i8, 9 } | ||||||
2867 | }; | ||||||
2868 | static const CostTblEntry SSE2CostTbl[] = { | ||||||
2869 | { ISD::ABS, MVT::v2i64, 4 }, | ||||||
2870 | { ISD::ABS, MVT::v4i32, 3 }, | ||||||
2871 | { ISD::ABS, MVT::v8i16, 2 }, | ||||||
2872 | { ISD::ABS, MVT::v16i8, 2 }, | ||||||
2873 | { ISD::BITREVERSE, MVT::v2i64, 29 }, | ||||||
2874 | { ISD::BITREVERSE, MVT::v4i32, 27 }, | ||||||
2875 | { ISD::BITREVERSE, MVT::v8i16, 27 }, | ||||||
2876 | { ISD::BITREVERSE, MVT::v16i8, 20 }, | ||||||
2877 | { ISD::BSWAP, MVT::v2i64, 7 }, | ||||||
2878 | { ISD::BSWAP, MVT::v4i32, 7 }, | ||||||
2879 | { ISD::BSWAP, MVT::v8i16, 7 }, | ||||||
2880 | { ISD::CTLZ, MVT::v2i64, 25 }, | ||||||
2881 | { ISD::CTLZ, MVT::v4i32, 26 }, | ||||||
2882 | { ISD::CTLZ, MVT::v8i16, 20 }, | ||||||
2883 | { ISD::CTLZ, MVT::v16i8, 17 }, | ||||||
2884 | { ISD::CTPOP, MVT::v2i64, 12 }, | ||||||
2885 | { ISD::CTPOP, MVT::v4i32, 15 }, | ||||||
2886 | { ISD::CTPOP, MVT::v8i16, 13 }, | ||||||
2887 | { ISD::CTPOP, MVT::v16i8, 10 }, | ||||||
2888 | { ISD::CTTZ, MVT::v2i64, 14 }, | ||||||
2889 | { ISD::CTTZ, MVT::v4i32, 18 }, | ||||||
2890 | { ISD::CTTZ, MVT::v8i16, 16 }, | ||||||
2891 | { ISD::CTTZ, MVT::v16i8, 13 }, | ||||||
2892 | { ISD::SADDSAT, MVT::v8i16, 1 }, | ||||||
2893 | { ISD::SADDSAT, MVT::v16i8, 1 }, | ||||||
2894 | { ISD::SMAX, MVT::v8i16, 1 }, | ||||||
2895 | { ISD::SMIN, MVT::v8i16, 1 }, | ||||||
2896 | { ISD::SSUBSAT, MVT::v8i16, 1 }, | ||||||
2897 | { ISD::SSUBSAT, MVT::v16i8, 1 }, | ||||||
2898 | { ISD::UADDSAT, MVT::v8i16, 1 }, | ||||||
2899 | { ISD::UADDSAT, MVT::v16i8, 1 }, | ||||||
2900 | { ISD::UMAX, MVT::v8i16, 2 }, | ||||||
2901 | { ISD::UMAX, MVT::v16i8, 1 }, | ||||||
2902 | { ISD::UMIN, MVT::v8i16, 2 }, | ||||||
2903 | { ISD::UMIN, MVT::v16i8, 1 }, | ||||||
2904 | { ISD::USUBSAT, MVT::v8i16, 1 }, | ||||||
2905 | { ISD::USUBSAT, MVT::v16i8, 1 }, | ||||||
2906 | { ISD::FMAXNUM, MVT::f64, 4 }, | ||||||
2907 | { ISD::FMAXNUM, MVT::v2f64, 4 }, | ||||||
2908 | { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ | ||||||
2909 | { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ | ||||||
2910 | }; | ||||||
2911 | static const CostTblEntry SSE1CostTbl[] = { | ||||||
2912 | { ISD::FMAXNUM, MVT::f32, 4 }, | ||||||
2913 | { ISD::FMAXNUM, MVT::v4f32, 4 }, | ||||||
2914 | { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ | ||||||
2915 | { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ | ||||||
2916 | }; | ||||||
2917 | static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets | ||||||
2918 | { ISD::CTTZ, MVT::i64, 1 }, | ||||||
2919 | }; | ||||||
2920 | static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets | ||||||
2921 | { ISD::CTTZ, MVT::i32, 1 }, | ||||||
2922 | { ISD::CTTZ, MVT::i16, 1 }, | ||||||
2923 | { ISD::CTTZ, MVT::i8, 1 }, | ||||||
2924 | }; | ||||||
2925 | static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets | ||||||
2926 | { ISD::CTLZ, MVT::i64, 1 }, | ||||||
2927 | }; | ||||||
2928 | static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets | ||||||
2929 | { ISD::CTLZ, MVT::i32, 1 }, | ||||||
2930 | { ISD::CTLZ, MVT::i16, 1 }, | ||||||
2931 | { ISD::CTLZ, MVT::i8, 1 }, | ||||||
2932 | }; | ||||||
2933 | static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets | ||||||
2934 | { ISD::CTPOP, MVT::i64, 1 }, | ||||||
2935 | }; | ||||||
2936 | static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets | ||||||
2937 | { ISD::CTPOP, MVT::i32, 1 }, | ||||||
2938 | { ISD::CTPOP, MVT::i16, 1 }, | ||||||
2939 | { ISD::CTPOP, MVT::i8, 1 }, | ||||||
2940 | }; | ||||||
2941 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | ||||||
2942 | { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV | ||||||
2943 | { ISD::BITREVERSE, MVT::i64, 14 }, | ||||||
2944 | { ISD::BSWAP, MVT::i64, 1 }, | ||||||
2945 | { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV | ||||||
2946 | { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH | ||||||
2947 | { ISD::CTPOP, MVT::i64, 10 }, | ||||||
2948 | { ISD::SADDO, MVT::i64, 1 }, | ||||||
2949 | { ISD::UADDO, MVT::i64, 1 }, | ||||||
2950 | { ISD::UMULO, MVT::i64, 2 }, // mulq + seto | ||||||
2951 | }; | ||||||
2952 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | ||||||
2953 | { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV | ||||||
2954 | { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV | ||||||
2955 | { ISD::BITREVERSE, MVT::i32, 14 }, | ||||||
2956 | { ISD::BITREVERSE, MVT::i16, 14 }, | ||||||
2957 | { ISD::BITREVERSE, MVT::i8, 11 }, | ||||||
2958 | { ISD::BSWAP, MVT::i32, 1 }, | ||||||
2959 | { ISD::BSWAP, MVT::i16, 1 }, // ROL | ||||||
2960 | { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV | ||||||
2961 | { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV | ||||||
2962 | { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV | ||||||
2963 | { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH | ||||||
2964 | { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH | ||||||
2965 | { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH | ||||||
2966 | { ISD::CTPOP, MVT::i32, 8 }, | ||||||
2967 | { ISD::CTPOP, MVT::i16, 9 }, | ||||||
2968 | { ISD::CTPOP, MVT::i8, 7 }, | ||||||
2969 | { ISD::SADDO, MVT::i32, 1 }, | ||||||
2970 | { ISD::SADDO, MVT::i16, 1 }, | ||||||
2971 | { ISD::SADDO, MVT::i8, 1 }, | ||||||
2972 | { ISD::UADDO, MVT::i32, 1 }, | ||||||
2973 | { ISD::UADDO, MVT::i16, 1 }, | ||||||
2974 | { ISD::UADDO, MVT::i8, 1 }, | ||||||
2975 | { ISD::UMULO, MVT::i32, 2 }, // mul + seto | ||||||
2976 | { ISD::UMULO, MVT::i16, 2 }, | ||||||
2977 | { ISD::UMULO, MVT::i8, 2 }, | ||||||
2978 | }; | ||||||
2979 | |||||||
2980 | Type *RetTy = ICA.getReturnType(); | ||||||
2981 | Type *OpTy = RetTy; | ||||||
2982 | Intrinsic::ID IID = ICA.getID(); | ||||||
2983 | unsigned ISD = ISD::DELETED_NODE; | ||||||
2984 | switch (IID) { | ||||||
2985 | default: | ||||||
2986 | break; | ||||||
2987 | case Intrinsic::abs: | ||||||
2988 | ISD = ISD::ABS; | ||||||
2989 | break; | ||||||
2990 | case Intrinsic::bitreverse: | ||||||
2991 | ISD = ISD::BITREVERSE; | ||||||
2992 | break; | ||||||
2993 | case Intrinsic::bswap: | ||||||
2994 | ISD = ISD::BSWAP; | ||||||
2995 | break; | ||||||
2996 | case Intrinsic::ctlz: | ||||||
2997 | ISD = ISD::CTLZ; | ||||||
2998 | break; | ||||||
2999 | case Intrinsic::ctpop: | ||||||
3000 | ISD = ISD::CTPOP; | ||||||
3001 | break; | ||||||
3002 | case Intrinsic::cttz: | ||||||
3003 | ISD = ISD::CTTZ; | ||||||
3004 | break; | ||||||
3005 | case Intrinsic::maxnum: | ||||||
3006 | case Intrinsic::minnum: | ||||||
3007 | // FMINNUM has same costs so don't duplicate. | ||||||
3008 | ISD = ISD::FMAXNUM; | ||||||
3009 | break; | ||||||
3010 | case Intrinsic::sadd_sat: | ||||||
3011 | ISD = ISD::SADDSAT; | ||||||
3012 | break; | ||||||
3013 | case Intrinsic::smax: | ||||||
3014 | ISD = ISD::SMAX; | ||||||
3015 | break; | ||||||
3016 | case Intrinsic::smin: | ||||||
3017 | ISD = ISD::SMIN; | ||||||
3018 | break; | ||||||
3019 | case Intrinsic::ssub_sat: | ||||||
3020 | ISD = ISD::SSUBSAT; | ||||||
3021 | break; | ||||||
3022 | case Intrinsic::uadd_sat: | ||||||
3023 | ISD = ISD::UADDSAT; | ||||||
3024 | break; | ||||||
3025 | case Intrinsic::umax: | ||||||
3026 | ISD = ISD::UMAX; | ||||||
3027 | break; | ||||||
3028 | case Intrinsic::umin: | ||||||
3029 | ISD = ISD::UMIN; | ||||||
3030 | break; | ||||||
3031 | case Intrinsic::usub_sat: | ||||||
3032 | ISD = ISD::USUBSAT; | ||||||
3033 | break; | ||||||
3034 | case Intrinsic::sqrt: | ||||||
3035 | ISD = ISD::FSQRT; | ||||||
3036 | break; | ||||||
3037 | case Intrinsic::sadd_with_overflow: | ||||||
3038 | case Intrinsic::ssub_with_overflow: | ||||||
3039 | // SSUBO has same costs so don't duplicate. | ||||||
3040 | ISD = ISD::SADDO; | ||||||
3041 | OpTy = RetTy->getContainedType(0); | ||||||
3042 | break; | ||||||
3043 | case Intrinsic::uadd_with_overflow: | ||||||
3044 | case Intrinsic::usub_with_overflow: | ||||||
3045 | // USUBO has same costs so don't duplicate. | ||||||
3046 | ISD = ISD::UADDO; | ||||||
3047 | OpTy = RetTy->getContainedType(0); | ||||||
3048 | break; | ||||||
3049 | case Intrinsic::umul_with_overflow: | ||||||
3050 | case Intrinsic::smul_with_overflow: | ||||||
3051 | // SMULO has same costs so don't duplicate. | ||||||
3052 | ISD = ISD::UMULO; | ||||||
3053 | OpTy = RetTy->getContainedType(0); | ||||||
3054 | break; | ||||||
3055 | } | ||||||
3056 | |||||||
3057 | if (ISD != ISD::DELETED_NODE) { | ||||||
3058 | // Legalize the type. | ||||||
3059 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy); | ||||||
3060 | MVT MTy = LT.second; | ||||||
3061 | |||||||
3062 | // Attempt to lookup cost. | ||||||
3063 | if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && | ||||||
3064 | MTy.isVector()) { | ||||||
3065 | // With PSHUFB the code is very similar for all types. If we have integer | ||||||
3066 | // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types | ||||||
3067 | // we also need a PSHUFB. | ||||||
3068 | unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; | ||||||
3069 | |||||||
3070 | // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB | ||||||
3071 | // instructions. We also need an extract and an insert. | ||||||
3072 | if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || | ||||||
3073 | (ST->hasBWI() && MTy.is512BitVector()))) | ||||||
3074 | Cost = Cost * 2 + 2; | ||||||
3075 | |||||||
3076 | return LT.first * Cost; | ||||||
3077 | } | ||||||
3078 | |||||||
3079 | auto adjustTableCost = [](const CostTblEntry &Entry, | ||||||
3080 | InstructionCost LegalizationCost, | ||||||
3081 | FastMathFlags FMF) { | ||||||
3082 | // If there are no NANs to deal with, then these are reduced to a | ||||||
3083 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we | ||||||
3084 | // assume is used in the non-fast case. | ||||||
3085 | if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) { | ||||||
3086 | if (FMF.noNaNs()) | ||||||
3087 | return LegalizationCost * 1; | ||||||
3088 | } | ||||||
3089 | return LegalizationCost * (int)Entry.Cost; | ||||||
3090 | }; | ||||||
3091 | |||||||
3092 | if (ST->useGLMDivSqrtCosts()) | ||||||
3093 | if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) | ||||||
3094 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3095 | |||||||
3096 | if (ST->isSLM()) | ||||||
3097 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | ||||||
3098 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3099 | |||||||
3100 | if (ST->hasCDI()) | ||||||
3101 | if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) | ||||||
3102 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3103 | |||||||
3104 | if (ST->hasBWI()) | ||||||
3105 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | ||||||
3106 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3107 | |||||||
3108 | if (ST->hasAVX512()) | ||||||
3109 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | ||||||
3110 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3111 | |||||||
3112 | if (ST->hasXOP()) | ||||||
3113 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | ||||||
3114 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3115 | |||||||
3116 | if (ST->hasAVX2()) | ||||||
3117 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | ||||||
3118 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3119 | |||||||
3120 | if (ST->hasAVX()) | ||||||
3121 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | ||||||
3122 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3123 | |||||||
3124 | if (ST->hasSSE42()) | ||||||
3125 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | ||||||
3126 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3127 | |||||||
3128 | if (ST->hasSSE41()) | ||||||
3129 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | ||||||
3130 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3131 | |||||||
3132 | if (ST->hasSSSE3()) | ||||||
3133 | if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) | ||||||
3134 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3135 | |||||||
3136 | if (ST->hasSSE2()) | ||||||
3137 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | ||||||
3138 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3139 | |||||||
3140 | if (ST->hasSSE1()) | ||||||
3141 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | ||||||
3142 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3143 | |||||||
3144 | if (ST->hasBMI()) { | ||||||
3145 | if (ST->is64Bit()) | ||||||
3146 | if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) | ||||||
3147 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3148 | |||||||
3149 | if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) | ||||||
3150 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3151 | } | ||||||
3152 | |||||||
3153 | if (ST->hasLZCNT()) { | ||||||
3154 | if (ST->is64Bit()) | ||||||
3155 | if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) | ||||||
3156 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3157 | |||||||
3158 | if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) | ||||||
3159 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3160 | } | ||||||
3161 | |||||||
3162 | if (ST->hasPOPCNT()) { | ||||||
3163 | if (ST->is64Bit()) | ||||||
3164 | if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) | ||||||
3165 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3166 | |||||||
3167 | if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) | ||||||
3168 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3169 | } | ||||||
3170 | |||||||
3171 | if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { | ||||||
3172 | if (const Instruction *II = ICA.getInst()) { | ||||||
3173 | if (II->hasOneUse() && isa<StoreInst>(II->user_back())) | ||||||
3174 | return TTI::TCC_Free; | ||||||
3175 | if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { | ||||||
3176 | if (LI->hasOneUse()) | ||||||
3177 | return TTI::TCC_Free; | ||||||
3178 | } | ||||||
3179 | } | ||||||
3180 | } | ||||||
3181 | |||||||
3182 | // TODO - add BMI (TZCNT) scalar handling | ||||||
3183 | |||||||
3184 | if (ST->is64Bit()) | ||||||
3185 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | ||||||
3186 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3187 | |||||||
3188 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | ||||||
3189 | return adjustTableCost(*Entry, LT.first, ICA.getFlags()); | ||||||
3190 | } | ||||||
3191 | |||||||
3192 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||
3193 | } | ||||||
3194 | |||||||
3195 | InstructionCost | ||||||
3196 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||||
3197 | TTI::TargetCostKind CostKind) { | ||||||
3198 | if (ICA.isTypeBasedOnly()) | ||||||
3199 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); | ||||||
3200 | |||||||
3201 | static const CostTblEntry AVX512CostTbl[] = { | ||||||
3202 | { ISD::ROTL, MVT::v8i64, 1 }, | ||||||
3203 | { ISD::ROTL, MVT::v4i64, 1 }, | ||||||
3204 | { ISD::ROTL, MVT::v2i64, 1 }, | ||||||
3205 | { ISD::ROTL, MVT::v16i32, 1 }, | ||||||
3206 | { ISD::ROTL, MVT::v8i32, 1 }, | ||||||
3207 | { ISD::ROTL, MVT::v4i32, 1 }, | ||||||
3208 | { ISD::ROTR, MVT::v8i64, 1 }, | ||||||
3209 | { ISD::ROTR, MVT::v4i64, 1 }, | ||||||
3210 | { ISD::ROTR, MVT::v2i64, 1 }, | ||||||
3211 | { ISD::ROTR, MVT::v16i32, 1 }, | ||||||
3212 | { ISD::ROTR, MVT::v8i32, 1 }, | ||||||
3213 | { ISD::ROTR, MVT::v4i32, 1 } | ||||||
3214 | }; | ||||||
3215 | // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) | ||||||
3216 | static const CostTblEntry XOPCostTbl[] = { | ||||||
3217 | { ISD::ROTL, MVT::v4i64, 4 }, | ||||||
3218 | { ISD::ROTL, MVT::v8i32, 4 }, | ||||||
3219 | { ISD::ROTL, MVT::v16i16, 4 }, | ||||||
3220 | { ISD::ROTL, MVT::v32i8, 4 }, | ||||||
3221 | { ISD::ROTL, MVT::v2i64, 1 }, | ||||||
3222 | { ISD::ROTL, MVT::v4i32, 1 }, | ||||||
3223 | { ISD::ROTL, MVT::v8i16, 1 }, | ||||||
3224 | { ISD::ROTL, MVT::v16i8, 1 }, | ||||||
3225 | { ISD::ROTR, MVT::v4i64, 6 }, | ||||||
3226 | { ISD::ROTR, MVT::v8i32, 6 }, | ||||||
3227 | { ISD::ROTR, MVT::v16i16, 6 }, | ||||||
3228 | { ISD::ROTR, MVT::v32i8, 6 }, | ||||||
3229 | { ISD::ROTR, MVT::v2i64, 2 }, | ||||||
3230 | { ISD::ROTR, MVT::v4i32, 2 }, | ||||||
3231 | { ISD::ROTR, MVT::v8i16, 2 }, | ||||||
3232 | { ISD::ROTR, MVT::v16i8, 2 } | ||||||
3233 | }; | ||||||
3234 | static const CostTblEntry X64CostTbl[] = { // 64-bit targets | ||||||
3235 | { ISD::ROTL, MVT::i64, 1 }, | ||||||
3236 | { ISD::ROTR, MVT::i64, 1 }, | ||||||
3237 | { ISD::FSHL, MVT::i64, 4 } | ||||||
3238 | }; | ||||||
3239 | static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets | ||||||
3240 | { ISD::ROTL, MVT::i32, 1 }, | ||||||
3241 | { ISD::ROTL, MVT::i16, 1 }, | ||||||
3242 | { ISD::ROTL, MVT::i8, 1 }, | ||||||
3243 | { ISD::ROTR, MVT::i32, 1 }, | ||||||
3244 | { ISD::ROTR, MVT::i16, 1 }, | ||||||
3245 | { ISD::ROTR, MVT::i8, 1 }, | ||||||
3246 | { ISD::FSHL, MVT::i32, 4 }, | ||||||
3247 | { ISD::FSHL, MVT::i16, 4 }, | ||||||
3248 | { ISD::FSHL, MVT::i8, 4 } | ||||||
3249 | }; | ||||||
3250 | |||||||
3251 | Intrinsic::ID IID = ICA.getID(); | ||||||
3252 | Type *RetTy = ICA.getReturnType(); | ||||||
3253 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | ||||||
3254 | unsigned ISD = ISD::DELETED_NODE; | ||||||
3255 | switch (IID) { | ||||||
3256 | default: | ||||||
3257 | break; | ||||||
3258 | case Intrinsic::fshl: | ||||||
3259 | ISD = ISD::FSHL; | ||||||
3260 | if (Args[0] == Args[1]) | ||||||
3261 | ISD = ISD::ROTL; | ||||||
3262 | break; | ||||||
3263 | case Intrinsic::fshr: | ||||||
3264 | // FSHR has same costs so don't duplicate. | ||||||
3265 | ISD = ISD::FSHL; | ||||||
3266 | if (Args[0] == Args[1]) | ||||||
3267 | ISD = ISD::ROTR; | ||||||
3268 | break; | ||||||
3269 | } | ||||||
3270 | |||||||
3271 | if (ISD != ISD::DELETED_NODE) { | ||||||
3272 | // Legalize the type. | ||||||
3273 | std::pair<InstructionCost, MVT> LT = | ||||||
3274 | TLI->getTypeLegalizationCost(DL, RetTy); | ||||||
3275 | MVT MTy = LT.second; | ||||||
3276 | |||||||
3277 | // Attempt to lookup cost. | ||||||
3278 | if (ST->hasAVX512()) | ||||||
3279 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | ||||||
3280 | return LT.first * Entry->Cost; | ||||||
3281 | |||||||
3282 | if (ST->hasXOP()) | ||||||
3283 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | ||||||
3284 | return LT.first * Entry->Cost; | ||||||
3285 | |||||||
3286 | if (ST->is64Bit()) | ||||||
3287 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | ||||||
3288 | return LT.first * Entry->Cost; | ||||||
3289 | |||||||
3290 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | ||||||
3291 | return LT.first * Entry->Cost; | ||||||
3292 | } | ||||||
3293 | |||||||
3294 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||
3295 | } | ||||||
3296 | |||||||
3297 | InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | ||||||
3298 | unsigned Index) { | ||||||
3299 | static const CostTblEntry SLMCostTbl[] = { | ||||||
3300 | { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, | ||||||
3301 | { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, | ||||||
3302 | { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, | ||||||
3303 | { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } | ||||||
3304 | }; | ||||||
3305 | |||||||
3306 | assert(Val->isVectorTy() && "This must be a vector type")((void)0); | ||||||
3307 | Type *ScalarType = Val->getScalarType(); | ||||||
3308 | int RegisterFileMoveCost = 0; | ||||||
3309 | |||||||
3310 | // Non-immediate extraction/insertion can be handled as a sequence of | ||||||
3311 | // aliased loads+stores via the stack. | ||||||
3312 | if (Index == -1U && (Opcode == Instruction::ExtractElement || | ||||||
3313 | Opcode == Instruction::InsertElement)) { | ||||||
3314 | // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: | ||||||
3315 | // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. | ||||||
3316 | |||||||
3317 | // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. | ||||||
3318 | assert(isa<FixedVectorType>(Val) && "Fixed vector type expected")((void)0); | ||||||
3319 | Align VecAlign = DL.getPrefTypeAlign(Val); | ||||||
3320 | Align SclAlign = DL.getPrefTypeAlign(ScalarType); | ||||||
3321 | |||||||
3322 | // Extract - store vector to stack, load scalar. | ||||||
3323 | if (Opcode == Instruction::ExtractElement) { | ||||||
3324 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, | ||||||
3325 | TTI::TargetCostKind::TCK_RecipThroughput) + | ||||||
3326 | getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, | ||||||
3327 | TTI::TargetCostKind::TCK_RecipThroughput); | ||||||
3328 | } | ||||||
3329 | // Insert - store vector to stack, store scalar, load vector. | ||||||
3330 | if (Opcode == Instruction::InsertElement) { | ||||||
3331 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, | ||||||
3332 | TTI::TargetCostKind::TCK_RecipThroughput) + | ||||||
3333 | getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, | ||||||
3334 | TTI::TargetCostKind::TCK_RecipThroughput) + | ||||||
3335 | getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, | ||||||
3336 | TTI::TargetCostKind::TCK_RecipThroughput); | ||||||
3337 | } | ||||||
3338 | } | ||||||
3339 | |||||||
3340 | if (Index != -1U && (Opcode == Instruction::ExtractElement || | ||||||
3341 | Opcode == Instruction::InsertElement)) { | ||||||
3342 | // Legalize the type. | ||||||
3343 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); | ||||||
3344 | |||||||
3345 | // This type is legalized to a scalar type. | ||||||
3346 | if (!LT.second.isVector()) | ||||||
3347 | return 0; | ||||||
3348 | |||||||
3349 | // The type may be split. Normalize the index to the new type. | ||||||
3350 | unsigned NumElts = LT.second.getVectorNumElements(); | ||||||
3351 | unsigned SubNumElts = NumElts; | ||||||
3352 | Index = Index % NumElts; | ||||||
3353 | |||||||
3354 | // For >128-bit vectors, we need to extract higher 128-bit subvectors. | ||||||
3355 | // For inserts, we also need to insert the subvector back. | ||||||
3356 | if (LT.second.getSizeInBits() > 128) { | ||||||
3357 | assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector")((void)0); | ||||||
3358 | unsigned NumSubVecs = LT.second.getSizeInBits() / 128; | ||||||
3359 | SubNumElts = NumElts / NumSubVecs; | ||||||
3360 | if (SubNumElts <= Index) { | ||||||
3361 | RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); | ||||||
3362 | Index %= SubNumElts; | ||||||
3363 | } | ||||||
3364 | } | ||||||
3365 | |||||||
3366 | if (Index == 0) { | ||||||
3367 | // Floating point scalars are already located in index #0. | ||||||
3368 | // Many insertions to #0 can fold away for scalar fp-ops, so let's assume | ||||||
3369 | // true for all. | ||||||
3370 | if (ScalarType->isFloatingPointTy()) | ||||||
3371 | return RegisterFileMoveCost; | ||||||
3372 | |||||||
3373 | // Assume movd/movq XMM -> GPR is relatively cheap on all targets. | ||||||
3374 | if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) | ||||||
3375 | return 1 + RegisterFileMoveCost; | ||||||
3376 | } | ||||||
3377 | |||||||
3378 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
3379 | assert(ISD && "Unexpected vector opcode")((void)0); | ||||||
3380 | MVT MScalarTy = LT.second.getScalarType(); | ||||||
3381 | if (ST->isSLM()) | ||||||
3382 | if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) | ||||||
3383 | return Entry->Cost + RegisterFileMoveCost; | ||||||
3384 | |||||||
3385 | // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. | ||||||
3386 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | ||||||
3387 | (MScalarTy.isInteger() && ST->hasSSE41())) | ||||||
3388 | return 1 + RegisterFileMoveCost; | ||||||
3389 | |||||||
3390 | // Assume insertps is relatively cheap on all targets. | ||||||
3391 | if (MScalarTy == MVT::f32 && ST->hasSSE41() && | ||||||
3392 | Opcode == Instruction::InsertElement) | ||||||
3393 | return 1 + RegisterFileMoveCost; | ||||||
3394 | |||||||
3395 | // For extractions we just need to shuffle the element to index 0, which | ||||||
3396 | // should be very cheap (assume cost = 1). For insertions we need to shuffle | ||||||
3397 | // the elements to its destination. In both cases we must handle the | ||||||
3398 | // subvector move(s). | ||||||
3399 | // If the vector type is already less than 128-bits then don't reduce it. | ||||||
3400 | // TODO: Under what circumstances should we shuffle using the full width? | ||||||
3401 | InstructionCost ShuffleCost = 1; | ||||||
3402 | if (Opcode == Instruction::InsertElement) { | ||||||
3403 | auto *SubTy = cast<VectorType>(Val); | ||||||
3404 | EVT VT = TLI->getValueType(DL, Val); | ||||||
3405 | if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) | ||||||
3406 | SubTy = FixedVectorType::get(ScalarType, SubNumElts); | ||||||
3407 | ShuffleCost = | ||||||
3408 | getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy); | ||||||
3409 | } | ||||||
3410 | int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; | ||||||
3411 | return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; | ||||||
3412 | } | ||||||
3413 | |||||||
3414 | // Add to the base cost if we know that the extracted element of a vector is | ||||||
3415 | // destined to be moved to and used in the integer register file. | ||||||
3416 | if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) | ||||||
3417 | RegisterFileMoveCost += 1; | ||||||
3418 | |||||||
3419 | return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; | ||||||
3420 | } | ||||||
3421 | |||||||
3422 | InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, | ||||||
3423 | const APInt &DemandedElts, | ||||||
3424 | bool Insert, | ||||||
3425 | bool Extract) { | ||||||
3426 | InstructionCost Cost = 0; | ||||||
3427 | |||||||
3428 | // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much | ||||||
3429 | // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. | ||||||
3430 | if (Insert) { | ||||||
3431 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | ||||||
3432 | MVT MScalarTy = LT.second.getScalarType(); | ||||||
3433 | |||||||
3434 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | ||||||
3435 | (MScalarTy.isInteger() && ST->hasSSE41()) || | ||||||
3436 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { | ||||||
3437 | // For types we can insert directly, insertion into 128-bit sub vectors is | ||||||
3438 | // cheap, followed by a cheap chain of concatenations. | ||||||
3439 | if (LT.second.getSizeInBits() <= 128) { | ||||||
3440 | Cost += | ||||||
3441 | BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false); | ||||||
3442 | } else { | ||||||
3443 | // In each 128-lane, if at least one index is demanded but not all | ||||||
3444 | // indices are demanded and this 128-lane is not the first 128-lane of | ||||||
3445 | // the legalized-vector, then this 128-lane needs a extracti128; If in | ||||||
3446 | // each 128-lane, there is at least one demanded index, this 128-lane | ||||||
3447 | // needs a inserti128. | ||||||
3448 | |||||||
3449 | // The following cases will help you build a better understanding: | ||||||
3450 | // Assume we insert several elements into a v8i32 vector in avx2, | ||||||
3451 | // Case#1: inserting into 1th index needs vpinsrd + inserti128. | ||||||
3452 | // Case#2: inserting into 5th index needs extracti128 + vpinsrd + | ||||||
3453 | // inserti128. | ||||||
3454 | // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. | ||||||
3455 | const int CostValue = *LT.first.getValue(); | ||||||
3456 | assert(CostValue >= 0 && "Negative cost!")((void)0); | ||||||
3457 | unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue; | ||||||
3458 | unsigned NumElts = LT.second.getVectorNumElements() * CostValue; | ||||||
3459 | APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts); | ||||||
3460 | unsigned Scale = NumElts / Num128Lanes; | ||||||
3461 | // We iterate each 128-lane, and check if we need a | ||||||
3462 | // extracti128/inserti128 for this 128-lane. | ||||||
3463 | for (unsigned I = 0; I < NumElts; I += Scale) { | ||||||
3464 | APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale); | ||||||
3465 | APInt MaskedDE = Mask & WidenedDemandedElts; | ||||||
3466 | unsigned Population = MaskedDE.countPopulation(); | ||||||
3467 | Cost += (Population > 0 && Population != Scale && | ||||||
3468 | I % LT.second.getVectorNumElements() != 0); | ||||||
3469 | Cost += Population > 0; | ||||||
3470 | } | ||||||
3471 | Cost += DemandedElts.countPopulation(); | ||||||
3472 | |||||||
3473 | // For vXf32 cases, insertion into the 0'th index in each v4f32 | ||||||
3474 | // 128-bit vector is free. | ||||||
3475 | // NOTE: This assumes legalization widens vXf32 vectors. | ||||||
3476 | if (MScalarTy == MVT::f32) | ||||||
3477 | for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements(); | ||||||
3478 | i < e; i += 4) | ||||||
3479 | if (DemandedElts[i]) | ||||||
3480 | Cost--; | ||||||
3481 | } | ||||||
3482 | } else if (LT.second.isVector()) { | ||||||
3483 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded | ||||||
3484 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a | ||||||
3485 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be | ||||||
3486 | // considered cheap. | ||||||
3487 | if (Ty->isIntOrIntVectorTy()) | ||||||
3488 | Cost += DemandedElts.countPopulation(); | ||||||
3489 | |||||||
3490 | // Get the smaller of the legalized or original pow2-extended number of | ||||||
3491 | // vector elements, which represents the number of unpacks we'll end up | ||||||
3492 | // performing. | ||||||
3493 | unsigned NumElts = LT.second.getVectorNumElements(); | ||||||
3494 | unsigned Pow2Elts = | ||||||
3495 | PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); | ||||||
3496 | Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; | ||||||
3497 | } | ||||||
3498 | } | ||||||
3499 | |||||||
3500 | // TODO: Use default extraction for now, but we should investigate extending this | ||||||
3501 | // to handle repeated subvector extraction. | ||||||
3502 | if (Extract) | ||||||
3503 | Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); | ||||||
3504 | |||||||
3505 | return Cost; | ||||||
3506 | } | ||||||
3507 | |||||||
3508 | InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, | ||||||
3509 | MaybeAlign Alignment, | ||||||
3510 | unsigned AddressSpace, | ||||||
3511 | TTI::TargetCostKind CostKind, | ||||||
3512 | const Instruction *I) { | ||||||
3513 | // TODO: Handle other cost kinds. | ||||||
3514 | if (CostKind != TTI::TCK_RecipThroughput) { | ||||||
3515 | if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { | ||||||
3516 | // Store instruction with index and scale costs 2 Uops. | ||||||
3517 | // Check the preceding GEP to identify non-const indices. | ||||||
3518 | if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) { | ||||||
3519 | if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) | ||||||
3520 | return TTI::TCC_Basic * 2; | ||||||
3521 | } | ||||||
3522 | } | ||||||
3523 | return TTI::TCC_Basic; | ||||||
3524 | } | ||||||
3525 | |||||||
3526 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&((void)0) | ||||||
3527 | "Invalid Opcode")((void)0); | ||||||
3528 | // Type legalization can't handle structs | ||||||
3529 | if (TLI->getValueType(DL, Src, true) == MVT::Other) | ||||||
3530 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | ||||||
3531 | CostKind); | ||||||
3532 | |||||||
3533 | // Legalize the type. | ||||||
3534 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); | ||||||
3535 | |||||||
3536 | auto *VTy = dyn_cast<FixedVectorType>(Src); | ||||||
3537 | |||||||
3538 | // Handle the simple case of non-vectors. | ||||||
3539 | // NOTE: this assumes that legalization never creates vector from scalars! | ||||||
3540 | if (!VTy || !LT.second.isVector()) | ||||||
3541 | // Each load/store unit costs 1. | ||||||
3542 | return LT.first * 1; | ||||||
3543 | |||||||
3544 | bool IsLoad = Opcode == Instruction::Load; | ||||||
3545 | |||||||
3546 | Type *EltTy = VTy->getElementType(); | ||||||
3547 | |||||||
3548 | const int EltTyBits = DL.getTypeSizeInBits(EltTy); | ||||||
3549 | |||||||
3550 | InstructionCost Cost = 0; | ||||||
3551 | |||||||
3552 | // Source of truth: how many elements were there in the original IR vector? | ||||||
3553 | const unsigned SrcNumElt = VTy->getNumElements(); | ||||||
3554 | |||||||
3555 | // How far have we gotten? | ||||||
3556 | int NumEltRemaining = SrcNumElt; | ||||||
3557 | // Note that we intentionally capture by-reference, NumEltRemaining changes. | ||||||
3558 | auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; }; | ||||||
3559 | |||||||
3560 | const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8); | ||||||
3561 | |||||||
3562 | // Note that even if we can store 64 bits of an XMM, we still operate on XMM. | ||||||
3563 | const unsigned XMMBits = 128; | ||||||
3564 | if (XMMBits % EltTyBits != 0) | ||||||
3565 | // Vector size must be a multiple of the element size. I.e. no padding. | ||||||
3566 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | ||||||
3567 | CostKind); | ||||||
3568 | const int NumEltPerXMM = XMMBits / EltTyBits; | ||||||
3569 | |||||||
3570 | auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM); | ||||||
3571 | |||||||
3572 | for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0; | ||||||
3573 | NumEltRemaining > 0; CurrOpSizeBytes /= 2) { | ||||||
3574 | // How many elements would a single op deal with at once? | ||||||
3575 | if ((8 * CurrOpSizeBytes) % EltTyBits != 0) | ||||||
3576 | // Vector size must be a multiple of the element size. I.e. no padding. | ||||||
3577 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | ||||||
3578 | CostKind); | ||||||
3579 | int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits; | ||||||
3580 | |||||||
3581 | assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?")((void)0); | ||||||
3582 | assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||((void)0) | ||||||
3583 | (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&((void)0) | ||||||
3584 | "Unless we haven't halved the op size yet, "((void)0) | ||||||
3585 | "we have less than two op's sized units of work left.")((void)0); | ||||||
3586 | |||||||
3587 | auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM | ||||||
3588 | ? FixedVectorType::get(EltTy, CurrNumEltPerOp) | ||||||
3589 | : XMMVecTy; | ||||||
3590 | |||||||
3591 | assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&((void)0) | ||||||
3592 | "After halving sizes, the vector elt count is no longer a multiple "((void)0) | ||||||
3593 | "of number of elements per operation?")((void)0); | ||||||
3594 | auto *CoalescedVecTy = | ||||||
3595 | CurrNumEltPerOp == 1 | ||||||
3596 | ? CurrVecTy | ||||||
3597 | : FixedVectorType::get( | ||||||
3598 | IntegerType::get(Src->getContext(), | ||||||
3599 | EltTyBits * CurrNumEltPerOp), | ||||||
3600 | CurrVecTy->getNumElements() / CurrNumEltPerOp); | ||||||
3601 | assert(DL.getTypeSizeInBits(CoalescedVecTy) ==((void)0) | ||||||
3602 | DL.getTypeSizeInBits(CurrVecTy) &&((void)0) | ||||||
3603 | "coalesciing elements doesn't change vector width.")((void)0); | ||||||
3604 | |||||||
3605 | while (NumEltRemaining > 0) { | ||||||
3606 | assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?")((void)0); | ||||||
3607 | |||||||
3608 | // Can we use this vector size, as per the remaining element count? | ||||||
3609 | // Iff the vector is naturally aligned, we can do a wide load regardless. | ||||||
3610 | if (NumEltRemaining < CurrNumEltPerOp && | ||||||
3611 | (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) && | ||||||
3612 | CurrOpSizeBytes != 1) | ||||||
3613 | break; // Try smalled vector size. | ||||||
3614 | |||||||
3615 | bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0; | ||||||
3616 | |||||||
3617 | // If we have fully processed the previous reg, we need to replenish it. | ||||||
3618 | if (SubVecEltsLeft == 0) { | ||||||
3619 | SubVecEltsLeft += CurrVecTy->getNumElements(); | ||||||
3620 | // And that's free only for the 0'th subvector of a legalized vector. | ||||||
3621 | if (!Is0thSubVec) | ||||||
3622 | Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector | ||||||
3623 | : TTI::ShuffleKind::SK_ExtractSubvector, | ||||||
3624 | VTy, None, NumEltDone(), CurrVecTy); | ||||||
3625 | } | ||||||
3626 | |||||||
3627 | // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM, | ||||||
3628 | // for smaller widths (32/16/8) we have to insert/extract them separately. | ||||||
3629 | // Again, it's free for the 0'th subreg (if op is 32/64 bit wide, | ||||||
3630 | // but let's pretend that it is also true for 16/8 bit wide ops...) | ||||||
3631 | if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) { | ||||||
3632 | int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM; | ||||||
3633 | assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "")((void)0); | ||||||
3634 | int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp; | ||||||
3635 | APInt DemandedElts = | ||||||
3636 | APInt::getBitsSet(CoalescedVecTy->getNumElements(), | ||||||
3637 | CoalescedVecEltIdx, CoalescedVecEltIdx + 1); | ||||||
3638 | assert(DemandedElts.countPopulation() == 1 && "Inserting single value")((void)0); | ||||||
3639 | Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad, | ||||||
3640 | !IsLoad); | ||||||
3641 | } | ||||||
3642 | |||||||
3643 | // This isn't exactly right. We're using slow unaligned 32-byte accesses | ||||||
3644 | // as a proxy for a double-pumped AVX memory interface such as on | ||||||
3645 | // Sandybridge. | ||||||
3646 | if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow()) | ||||||
3647 | Cost += 2; | ||||||
3648 | else | ||||||
3649 | Cost += 1; | ||||||
3650 | |||||||
3651 | SubVecEltsLeft -= CurrNumEltPerOp; | ||||||
3652 | NumEltRemaining -= CurrNumEltPerOp; | ||||||
3653 | Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes); | ||||||
3654 | } | ||||||
3655 | } | ||||||
3656 | |||||||
3657 | assert(NumEltRemaining <= 0 && "Should have processed all the elements.")((void)0); | ||||||
3658 | |||||||
3659 | return Cost; | ||||||
3660 | } | ||||||
3661 | |||||||
/// Estimate the cost of a masked load or store of \p SrcTy.
///
/// \param Opcode       Instruction::Load or Instruction::Store; any other
///                     value leaves both IsLoad and IsStore false.
/// \param SrcTy        Type being loaded/stored. Anything that is not a
///                     FixedVectorType is costed as a plain memory op.
/// \param Alignment    Forwarded to the legality queries and memory-op costs.
/// \param AddressSpace Forwarded to the memory-op cost queries.
/// \param CostKind     Forwarded to the nested cost queries.
/// \returns Either the scalarized cost (when the masked op is not legal) or
///          the per-legalized-register maskmov cost plus any mask/data
///          shuffle overhead computed below.
3662 | InstructionCost | ||||||
3663 | X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, | ||||||
3664 | unsigned AddressSpace, | ||||||
3665 | TTI::TargetCostKind CostKind) { | ||||||
3666 | bool IsLoad = (Instruction::Load == Opcode); | ||||||
3667 | bool IsStore = (Instruction::Store == Opcode); | ||||||
3668 | |||||||
3669 | auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); | ||||||
3670 | if (!SrcVTy) | ||||||
3671 | // Not a fixed-width vector: take the regular memory op cost, without mask. | ||||||
3672 | return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); | ||||||
3673 | |||||||
3674 | unsigned NumElem = SrcVTy->getNumElements(); | ||||||
// The mask is modelled as a vector of NumElem i8 elements.
3675 | auto *MaskTy = | ||||||
3676 | FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); | ||||||
3677 | if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || | ||||||
3678 | (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { | ||||||
3679 | // Scalarization: extract every mask element, compare and branch on each one, | ||||||
// insert (load) or extract (store) every data element, and perform NumElem
// scalar memory operations.
3680 | APInt DemandedElts = APInt::getAllOnesValue(NumElem); | ||||||
3681 | InstructionCost MaskSplitCost = | ||||||
3682 | getScalarizationOverhead(MaskTy, DemandedElts, false, true); | ||||||
3683 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( | ||||||
3684 | Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, | ||||||
3685 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
3686 | InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); | ||||||
3687 | InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); | ||||||
3688 | InstructionCost ValueSplitCost = | ||||||
3689 | getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore); | ||||||
3690 | InstructionCost MemopCost = | ||||||
3691 | NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | ||||||
3692 | Alignment, AddressSpace, CostKind); | ||||||
3693 | return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; | ||||||
3694 | } | ||||||
3695 | |||||||
3696 | // Legalize the type. | ||||||
3697 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); | ||||||
3698 | auto VT = TLI->getValueType(DL, SrcVTy); | ||||||
3699 | InstructionCost Cost = 0; | ||||||
3700 | if (VT.isSimple() && LT.second != VT.getSimpleVT() && | ||||||
3701 | LT.second.getVectorNumElements() == NumElem) | ||||||
3702 | // Promotion (legal type differs but keeps the element count) requires | ||||||
// extend/truncate for data and a shuffle for mask.
3703 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) + | ||||||
3704 | getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr); | ||||||
3705 | |||||||
3706 | else if (LT.first * LT.second.getVectorNumElements() > NumElem) { | ||||||
3707 | auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), | ||||||
3708 | LT.second.getVectorNumElements()); | ||||||
3709 | // Expanding requires filling the widened mask with zeroes. | ||||||
3710 | Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy); | ||||||
3711 | } | ||||||
3712 | |||||||
3713 | // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. | ||||||
// Anything that is not a load (IsLoad == false) is charged the store factor.
3714 | if (!ST->hasAVX512()) | ||||||
3715 | return Cost + LT.first * (IsLoad ? 2 : 8); | ||||||
3716 | |||||||
3717 | // AVX-512 masked load/store is cheaper: one unit per legalized register. | ||||||
3718 | return Cost + LT.first; | ||||||
3719 | } | ||||||
3720 | |||||||
/// Estimate the cost of computing the address for an access of type \p Ty.
///
/// Only vector types with a non-null \p SE are special-cased here:
///  - a non-strided pointer costs NumVectorInstToHideOverhead (10);
///  - a strided pointer for which getConstantStrideStep() yields nothing
///    costs 1;
///  - everything else is deferred to BaseT::getAddressComputationCost().
3721 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, | ||||||
3722 | ScalarEvolution *SE, | ||||||
3723 | const SCEV *Ptr) { | ||||||
3724 | // Address computations in vectorized code with non-consecutive addresses will | ||||||
3725 | // likely result in more instructions compared to scalar code where the | ||||||
3726 | // computation can more often be merged into the index mode. The resulting | ||||||
3727 | // extra micro-ops can significantly decrease throughput. | ||||||
3728 | const unsigned NumVectorInstToHideOverhead = 10; | ||||||
3729 | |||||||
3730 | // Cost modeling of Strided Access Computation is hidden by the indexing | ||||||
3731 | // modes of X86 regardless of the stride value. We don't believe that there | ||||||
3732 | // is a difference between constant strided access in general and constant | ||||||
3733 | // strided value which is less than or equal to 64. | ||||||
3734 | // Even in the case of (loop invariant) stride whose value is not known at | ||||||
3735 | // compile time, the address computation will not incur more than one extra | ||||||
3736 | // ADD instruction. | ||||||
3737 | if (Ty->isVectorTy() && SE) { | ||||||
3738 | if (!BaseT::isStridedAccess(Ptr)) | ||||||
3739 | return NumVectorInstToHideOverhead; | ||||||
3740 | if (!BaseT::getConstantStrideStep(SE, Ptr)) | ||||||
3741 | return 1; | ||||||
3742 | } | ||||||
3743 | |||||||
3744 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | ||||||
3745 | } | ||||||
3746 | |||||||
/// Estimate the cost of reducing vector \p ValTy with the arithmetic
/// operation \p Opcode.
///
/// Order of evaluation, as written below:
///  1. Ordered reductions (per TTI::requiresOrderedReduction) go to BaseT.
///  2. The unlegalized value type, if simple, is looked up in the
///     SLM / AVX1 / SSE2 tables; a hit is returned as-is.
///  3. After legalization, an i8 multiply is costed as a zero-extend to a
///     vector of i16 plus a recursive call on the widened type.
///  4. The legalized type is looked up in the same tables; a hit is returned
///     together with the cost of combining the split registers.
///  5. i1 element types use the bool-reduction tables, else BaseT.
///  6. Element counts that are not a power of two, or legalization that
///     changed the scalar width, go to BaseT.
///  7. Otherwise the vector is halved step by step, adding a
///     shuffle/shift plus one arithmetic op per step and a final
///     extractelement.
///
/// \p ValTy is cast<> to FixedVectorType after step 2, so it must be a
/// fixed-width vector by then.
3747 | InstructionCost | ||||||
3748 | X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | ||||||
3749 | Optional<FastMathFlags> FMF, | ||||||
3750 | TTI::TargetCostKind CostKind) { | ||||||
3751 | if (TTI::requiresOrderedReduction(FMF)) | ||||||
3752 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | ||||||
3753 | |||||||
3754 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput | ||||||
3755 | // and make it as the cost. | ||||||
3756 | |||||||
3757 | static const CostTblEntry SLMCostTblNoPairWise[] = { | ||||||
3758 | { ISD::FADD, MVT::v2f64, 3 }, | ||||||
3759 | { ISD::ADD, MVT::v2i64, 5 }, | ||||||
3760 | }; | ||||||
3761 | |||||||
3762 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | ||||||
3763 | { ISD::FADD, MVT::v2f64, 2 }, | ||||||
3764 | { ISD::FADD, MVT::v2f32, 2 }, | ||||||
3765 | { ISD::FADD, MVT::v4f32, 4 }, | ||||||
3766 | { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". | ||||||
3767 | { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32 | ||||||
3768 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". | ||||||
3769 | { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3". | ||||||
3770 | { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3". | ||||||
3771 | { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". | ||||||
3772 | { ISD::ADD, MVT::v2i8, 2 }, | ||||||
3773 | { ISD::ADD, MVT::v4i8, 2 }, | ||||||
3774 | { ISD::ADD, MVT::v8i8, 2 }, | ||||||
3775 | { ISD::ADD, MVT::v16i8, 3 }, | ||||||
3776 | }; | ||||||
3777 | |||||||
3778 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | ||||||
3779 | { ISD::FADD, MVT::v4f64, 3 }, | ||||||
3780 | { ISD::FADD, MVT::v4f32, 3 }, | ||||||
3781 | { ISD::FADD, MVT::v8f32, 4 }, | ||||||
3782 | { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". | ||||||
3783 | { ISD::ADD, MVT::v4i64, 3 }, | ||||||
3784 | { ISD::ADD, MVT::v8i32, 5 }, | ||||||
3785 | { ISD::ADD, MVT::v16i16, 5 }, | ||||||
3786 | { ISD::ADD, MVT::v32i8, 4 }, | ||||||
3787 | }; | ||||||
3788 | |||||||
3789 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
3790 | assert(ISD && "Invalid opcode")((void)0); | ||||||
3791 | |||||||
3792 | // Before legalizing the type, give a chance to look up illegal narrow types | ||||||
3793 | // in the table. | ||||||
3794 | // FIXME: Is there a better way to do this? | ||||||
3795 | EVT VT = TLI->getValueType(DL, ValTy); | ||||||
3796 | if (VT.isSimple()) { | ||||||
3797 | MVT MTy = VT.getSimpleVT(); | ||||||
3798 | if (ST->isSLM()) | ||||||
3799 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | ||||||
3800 | return Entry->Cost; | ||||||
3801 | |||||||
3802 | if (ST->hasAVX()) | ||||||
3803 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | ||||||
3804 | return Entry->Cost; | ||||||
3805 | |||||||
3806 | if (ST->hasSSE2()) | ||||||
3807 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | ||||||
3808 | return Entry->Cost; | ||||||
3809 | } | ||||||
3810 | |||||||
// No direct hit: legalize the type. LT.first is the legalization cost factor
// and LT.second the legal machine type used for the lookups below.
3811 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | ||||||
3812 | |||||||
3813 | MVT MTy = LT.second; | ||||||
3814 | |||||||
3815 | auto *ValVTy = cast<FixedVectorType>(ValTy); | ||||||
3816 | |||||||
3817 | // Special case: vXi8 mul reductions are performed as vXi16, so charge the | ||||||
// zero-extend plus the reduction of the widened vector.
3818 | if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { | ||||||
3819 | auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16); | ||||||
3820 | auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements()); | ||||||
3821 | return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, | ||||||
3822 | TargetTransformInfo::CastContextHint::None, | ||||||
3823 | CostKind) + | ||||||
3824 | getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind); | ||||||
3825 | } | ||||||
3826 | |||||||
3827 | InstructionCost ArithmeticCost = 0; | ||||||
3828 | if (LT.first != 1 && MTy.isVector() && | ||||||
3829 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | ||||||
3830 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | ||||||
3831 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | ||||||
3832 | MTy.getVectorNumElements()); | ||||||
3833 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | ||||||
3834 | ArithmeticCost *= LT.first - 1; | ||||||
3835 | } | ||||||
3836 | |||||||
// Second table pass, now keyed on the legalized type; the split cost computed
// above is added to any hit.
3837 | if (ST->isSLM()) | ||||||
3838 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | ||||||
3839 | return ArithmeticCost + Entry->Cost; | ||||||
3840 | |||||||
3841 | if (ST->hasAVX()) | ||||||
3842 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | ||||||
3843 | return ArithmeticCost + Entry->Cost; | ||||||
3844 | |||||||
3845 | if (ST->hasSSE2()) | ||||||
3846 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | ||||||
3847 | return ArithmeticCost + Entry->Cost; | ||||||
3848 | |||||||
3849 | // FIXME: These assume a naive kshift+binop lowering, which is probably | ||||||
3850 | // conservative in most cases. | ||||||
3851 | static const CostTblEntry AVX512BoolReduction[] = { | ||||||
3852 | { ISD::AND, MVT::v2i1, 3 }, | ||||||
3853 | { ISD::AND, MVT::v4i1, 5 }, | ||||||
3854 | { ISD::AND, MVT::v8i1, 7 }, | ||||||
3855 | { ISD::AND, MVT::v16i1, 9 }, | ||||||
3856 | { ISD::AND, MVT::v32i1, 11 }, | ||||||
3857 | { ISD::AND, MVT::v64i1, 13 }, | ||||||
3858 | { ISD::OR, MVT::v2i1, 3 }, | ||||||
3859 | { ISD::OR, MVT::v4i1, 5 }, | ||||||
3860 | { ISD::OR, MVT::v8i1, 7 }, | ||||||
3861 | { ISD::OR, MVT::v16i1, 9 }, | ||||||
3862 | { ISD::OR, MVT::v32i1, 11 }, | ||||||
3863 | { ISD::OR, MVT::v64i1, 13 }, | ||||||
3864 | }; | ||||||
3865 | |||||||
3866 | static const CostTblEntry AVX2BoolReduction[] = { | ||||||
3867 | { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp | ||||||
3868 | { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp | ||||||
3869 | { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp | ||||||
3870 | { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp | ||||||
3871 | }; | ||||||
3872 | |||||||
3873 | static const CostTblEntry AVX1BoolReduction[] = { | ||||||
3874 | { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp | ||||||
3875 | { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp | ||||||
3876 | { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | ||||||
3877 | { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | ||||||
3878 | { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp | ||||||
3879 | { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp | ||||||
3880 | { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | ||||||
3881 | { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | ||||||
3882 | }; | ||||||
3883 | |||||||
3884 | static const CostTblEntry SSE2BoolReduction[] = { | ||||||
3885 | { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp | ||||||
3886 | { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp | ||||||
3887 | { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp | ||||||
3888 | { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp | ||||||
3889 | { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp | ||||||
3890 | { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp | ||||||
3891 | { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp | ||||||
3892 | { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp | ||||||
3893 | }; | ||||||
3894 | |||||||
3895 | // Handle bool allof/anyof patterns (i1 element type). | ||||||
3896 | if (ValVTy->getElementType()->isIntegerTy(1)) { | ||||||
// NOTE(review): this local shadows the ArithmeticCost declared above and is
// computed from the same inputs with the same expression; it looks redundant.
3897 | InstructionCost ArithmeticCost = 0; | ||||||
3898 | if (LT.first != 1 && MTy.isVector() && | ||||||
3899 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | ||||||
3900 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | ||||||
3901 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | ||||||
3902 | MTy.getVectorNumElements()); | ||||||
3903 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | ||||||
3904 | ArithmeticCost *= LT.first - 1; | ||||||
3905 | } | ||||||
3906 | |||||||
3907 | if (ST->hasAVX512()) | ||||||
3908 | if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) | ||||||
3909 | return ArithmeticCost + Entry->Cost; | ||||||
3910 | if (ST->hasAVX2()) | ||||||
3911 | if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) | ||||||
3912 | return ArithmeticCost + Entry->Cost; | ||||||
3913 | if (ST->hasAVX()) | ||||||
3914 | if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) | ||||||
3915 | return ArithmeticCost + Entry->Cost; | ||||||
3916 | if (ST->hasSSE2()) | ||||||
3917 | if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) | ||||||
3918 | return ArithmeticCost + Entry->Cost; | ||||||
3919 | |||||||
3920 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); | ||||||
3921 | } | ||||||
3922 | |||||||
3923 | unsigned NumVecElts = ValVTy->getNumElements(); | ||||||
3924 | unsigned ScalarSize = ValVTy->getScalarSizeInBits(); | ||||||
3925 | |||||||
3926 | // Special case power of 2 reductions where the scalar type isn't changed | ||||||
3927 | // by type legalization. | ||||||
3928 | if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) | ||||||
3929 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); | ||||||
3930 | |||||||
3931 | InstructionCost ReductionCost = 0; | ||||||
3932 | |||||||
// Ty tracks the vector type the per-level arithmetic op is costed on.
3933 | auto *Ty = ValVTy; | ||||||
3934 | if (LT.first != 1 && MTy.isVector() && | ||||||
3935 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | ||||||
3936 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | ||||||
3937 | Ty = FixedVectorType::get(ValVTy->getElementType(), | ||||||
3938 | MTy.getVectorNumElements()); | ||||||
3939 | ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); | ||||||
3940 | ReductionCost *= LT.first - 1; | ||||||
3941 | NumVecElts = MTy.getVectorNumElements(); | ||||||
3942 | } | ||||||
3943 | |||||||
3944 | // Now handle reduction with the legal type, taking into account size changes | ||||||
3945 | // at each level. | ||||||
3946 | while (NumVecElts > 1) { | ||||||
3947 | // Determine the size of the remaining vector we need to reduce. | ||||||
3948 | unsigned Size = NumVecElts * ScalarSize; | ||||||
3949 | NumVecElts /= 2; | ||||||
3950 | // If we're reducing from 256/512 bits, use an extract_subvector. | ||||||
// This is the only branch that narrows Ty.
3951 | if (Size > 128) { | ||||||
3952 | auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); | ||||||
3953 | ReductionCost += | ||||||
3954 | getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); | ||||||
3955 | Ty = SubTy; | ||||||
3956 | } else if (Size == 128) { | ||||||
3957 | // Reducing from 128 bits is a permute of v2f64/v2i64. | ||||||
// NOTE(review): ValVTy is a FixedVectorType*. If isFloatingPointTy() only
// matches scalar FP types, the FP branch is never taken here -- confirm
// whether isFPOrFPVectorTy() was intended.
3958 | FixedVectorType *ShufTy; | ||||||
3959 | if (ValVTy->isFloatingPointTy()) | ||||||
3960 | ShufTy = | ||||||
3961 | FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2); | ||||||
3962 | else | ||||||
3963 | ShufTy = | ||||||
3964 | FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); | ||||||
3965 | ReductionCost += | ||||||
3966 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); | ||||||
3967 | } else if (Size == 64) { | ||||||
3968 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. | ||||||
// NOTE(review): same isFloatingPointTy() question as the 128-bit case above.
3969 | FixedVectorType *ShufTy; | ||||||
3970 | if (ValVTy->isFloatingPointTy()) | ||||||
3971 | ShufTy = | ||||||
3972 | FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4); | ||||||
3973 | else | ||||||
3974 | ShufTy = | ||||||
3975 | FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); | ||||||
3976 | ReductionCost += | ||||||
3977 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); | ||||||
3978 | } else { | ||||||
3979 | // Reducing from smaller size is a shift by immediate, costed as an LShr on | ||||||
// a vector of (128 / Size) integers that are Size bits wide.
3980 | auto *ShiftTy = FixedVectorType::get( | ||||||
3981 | Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size); | ||||||
3982 | ReductionCost += getArithmeticInstrCost( | ||||||
3983 | Instruction::LShr, ShiftTy, CostKind, | ||||||
3984 | TargetTransformInfo::OK_AnyValue, | ||||||
3985 | TargetTransformInfo::OK_UniformConstantValue, | ||||||
3986 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | ||||||
3987 | } | ||||||
3988 | |||||||
3989 | // Add the arithmetic op for this level. | ||||||
3990 | ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); | ||||||
3991 | } | ||||||
3992 | |||||||
3993 | // Add the final extract element to the cost. | ||||||
3994 | return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | ||||||
3995 | } | ||||||
3996 | |||||||
/// Estimate the cost of a single min/max operation on \p Ty.
///
/// \param Ty         Integer or floating-point (scalar or vector) type.
/// \param CondTy     Condition type; only used by the cmp+select fallback.
/// \param IsUnsigned Selects UMIN over SMIN for integer types; ignored for
///                   floating-point types.
/// \returns LT.first times the table cost when a subtarget table has an entry
///          for the legalized type, otherwise the cost of a compare plus a
///          select.
///
/// Only the *MIN opcodes are used as table keys -- presumably the matching
/// max operation is assumed to cost the same; TODO confirm.
3997 | InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, | ||||||
3998 | bool IsUnsigned) { | ||||||
3999 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | ||||||
4000 | |||||||
4001 | MVT MTy = LT.second; | ||||||
4002 | |||||||
// Pick the ISD opcode that keys the cost tables below.
4003 | int ISD; | ||||||
4004 | if (Ty->isIntOrIntVectorTy()) { | ||||||
4005 | ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; | ||||||
4006 | } else { | ||||||
4007 | assert(Ty->isFPOrFPVectorTy() &&((void)0) | ||||||
4008 | "Expected float point or integer vector type.")((void)0); | ||||||
4009 | ISD = ISD::FMINNUM; | ||||||
4010 | } | ||||||
4011 | |||||||
4012 | static const CostTblEntry SSE1CostTbl[] = { | ||||||
4013 | {ISD::FMINNUM, MVT::v4f32, 1}, | ||||||
4014 | }; | ||||||
4015 | |||||||
4016 | static const CostTblEntry SSE2CostTbl[] = { | ||||||
4017 | {ISD::FMINNUM, MVT::v2f64, 1}, | ||||||
4018 | {ISD::SMIN, MVT::v8i16, 1}, | ||||||
4019 | {ISD::UMIN, MVT::v16i8, 1}, | ||||||
4020 | }; | ||||||
4021 | |||||||
4022 | static const CostTblEntry SSE41CostTbl[] = { | ||||||
4023 | {ISD::SMIN, MVT::v4i32, 1}, | ||||||
4024 | {ISD::UMIN, MVT::v4i32, 1}, | ||||||
4025 | {ISD::UMIN, MVT::v8i16, 1}, | ||||||
4026 | {ISD::SMIN, MVT::v16i8, 1}, | ||||||
4027 | }; | ||||||
4028 | |||||||
4029 | static const CostTblEntry SSE42CostTbl[] = { | ||||||
4030 | {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd | ||||||
4031 | }; | ||||||
4032 | |||||||
4033 | static const CostTblEntry AVX1CostTbl[] = { | ||||||
4034 | {ISD::FMINNUM, MVT::v8f32, 1}, | ||||||
4035 | {ISD::FMINNUM, MVT::v4f64, 1}, | ||||||
4036 | {ISD::SMIN, MVT::v8i32, 3}, | ||||||
4037 | {ISD::UMIN, MVT::v8i32, 3}, | ||||||
4038 | {ISD::SMIN, MVT::v16i16, 3}, | ||||||
4039 | {ISD::UMIN, MVT::v16i16, 3}, | ||||||
4040 | {ISD::SMIN, MVT::v32i8, 3}, | ||||||
4041 | {ISD::UMIN, MVT::v32i8, 3}, | ||||||
4042 | }; | ||||||
4043 | |||||||
4044 | static const CostTblEntry AVX2CostTbl[] = { | ||||||
4045 | {ISD::SMIN, MVT::v8i32, 1}, | ||||||
4046 | {ISD::UMIN, MVT::v8i32, 1}, | ||||||
4047 | {ISD::SMIN, MVT::v16i16, 1}, | ||||||
4048 | {ISD::UMIN, MVT::v16i16, 1}, | ||||||
4049 | {ISD::SMIN, MVT::v32i8, 1}, | ||||||
4050 | {ISD::UMIN, MVT::v32i8, 1}, | ||||||
4051 | }; | ||||||
4052 | |||||||
4053 | static const CostTblEntry AVX512CostTbl[] = { | ||||||
4054 | {ISD::FMINNUM, MVT::v16f32, 1}, | ||||||
4055 | {ISD::FMINNUM, MVT::v8f64, 1}, | ||||||
4056 | {ISD::SMIN, MVT::v2i64, 1}, | ||||||
4057 | {ISD::UMIN, MVT::v2i64, 1}, | ||||||
4058 | {ISD::SMIN, MVT::v4i64, 1}, | ||||||
4059 | {ISD::UMIN, MVT::v4i64, 1}, | ||||||
4060 | {ISD::SMIN, MVT::v8i64, 1}, | ||||||
4061 | {ISD::UMIN, MVT::v8i64, 1}, | ||||||
4062 | {ISD::SMIN, MVT::v16i32, 1}, | ||||||
4063 | {ISD::UMIN, MVT::v16i32, 1}, | ||||||
4064 | }; | ||||||
4065 | |||||||
4066 | static const CostTblEntry AVX512BWCostTbl[] = { | ||||||
4067 | {ISD::SMIN, MVT::v32i16, 1}, | ||||||
4068 | {ISD::UMIN, MVT::v32i16, 1}, | ||||||
4069 | {ISD::SMIN, MVT::v64i8, 1}, | ||||||
4070 | {ISD::UMIN, MVT::v64i8, 1}, | ||||||
4071 | }; | ||||||
4072 | |||||||
4073 | // If we have a native MIN/MAX instruction for this type, use it. The tables | ||||||
// are consulted in the order written below (BWI first, SSE1 last) and the
// first hit wins, scaled by the legalization factor LT.first.
4074 | if (ST->hasBWI()) | ||||||
4075 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | ||||||
4076 | return LT.first * Entry->Cost; | ||||||
4077 | |||||||
4078 | if (ST->hasAVX512()) | ||||||
4079 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | ||||||
4080 | return LT.first * Entry->Cost; | ||||||
4081 | |||||||
4082 | if (ST->hasAVX2()) | ||||||
4083 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | ||||||
4084 | return LT.first * Entry->Cost; | ||||||
4085 | |||||||
4086 | if (ST->hasAVX()) | ||||||
4087 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | ||||||
4088 | return LT.first * Entry->Cost; | ||||||
4089 | |||||||
4090 | if (ST->hasSSE42()) | ||||||
4091 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | ||||||
4092 | return LT.first * Entry->Cost; | ||||||
4093 | |||||||
4094 | if (ST->hasSSE41()) | ||||||
4095 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | ||||||
4096 | return LT.first * Entry->Cost; | ||||||
4097 | |||||||
4098 | if (ST->hasSSE2()) | ||||||
4099 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | ||||||
4100 | return LT.first * Entry->Cost; | ||||||
4101 | |||||||
4102 | if (ST->hasSSE1()) | ||||||
4103 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | ||||||
4104 | return LT.first * Entry->Cost; | ||||||
4105 | |||||||
// No table entry: choose the compare opcode for the cmp+select fallback.
4106 | unsigned CmpOpcode; | ||||||
4107 | if (Ty->isFPOrFPVectorTy()) { | ||||||
4108 | CmpOpcode = Instruction::FCmp; | ||||||
4109 | } else { | ||||||
4110 | assert(Ty->isIntOrIntVectorTy() &&((void)0) | ||||||
4111 | "expecting floating point or integer type for min/max reduction")((void)0); | ||||||
4112 | CmpOpcode = Instruction::ICmp; | ||||||
4113 | } | ||||||
4114 | |||||||
// The fallback is always costed as reciprocal throughput; this function takes
// no cost-kind parameter.
4115 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | ||||||
4116 | // Otherwise fall back to cmp+select. | ||||||
4117 | InstructionCost Result = | ||||||
4118 | getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, | ||||||
4119 | CostKind) + | ||||||
4120 | getCmpSelInstrCost(Instruction::Select, Ty, CondTy, | ||||||
4121 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
4122 | return Result; | ||||||
4123 | } | ||||||
4124 | |||||||
4125 | InstructionCost | ||||||
4126 | X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, | ||||||
4127 | bool IsUnsigned, | ||||||
4128 | TTI::TargetCostKind CostKind) { | ||||||
4129 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); | ||||||
4130 | |||||||
4131 | MVT MTy = LT.second; | ||||||
4132 | |||||||
4133 | int ISD; | ||||||
4134 | if (ValTy->isIntOrIntVectorTy()) { | ||||||
4135 | ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; | ||||||
4136 | } else { | ||||||
4137 | assert(ValTy->isFPOrFPVectorTy() &&((void)0) | ||||||
4138 | "Expected float point or integer vector type.")((void)0); | ||||||
4139 | ISD = ISD::FMINNUM; | ||||||
4140 | } | ||||||
4141 | |||||||
4142 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput | ||||||
4143 | // and make it as the cost. | ||||||
4144 | |||||||
4145 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | ||||||
4146 | {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw | ||||||
4147 | {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw | ||||||
4148 | {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw | ||||||
4149 | }; | ||||||
4150 | |||||||
4151 | static const CostTblEntry SSE41CostTblNoPairWise[] = { | ||||||
4152 | {ISD::SMIN, MVT::v2i16, 3}, // same as sse2 | ||||||
4153 | {ISD::SMIN, MVT::v4i16, 5}, // same as sse2 | ||||||
4154 | {ISD::UMIN, MVT::v2i16, 5}, // same as sse2 | ||||||
4155 | {ISD::UMIN, MVT::v4i16, 7}, // same as sse2 | ||||||
4156 | {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor | ||||||
4157 | {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax | ||||||
4158 | {ISD::SMIN, MVT::v2i8, 3}, // pminsb | ||||||
4159 | {ISD::SMIN, MVT::v4i8, 5}, // pminsb | ||||||
4160 | {ISD::SMIN, MVT::v8i8, 7}, // pminsb | ||||||
4161 | {ISD::SMIN, MVT::v16i8, 6}, | ||||||
4162 | {ISD::UMIN, MVT::v2i8, 3}, // same as sse2 | ||||||
4163 | {ISD::UMIN, MVT::v4i8, 5}, // same as sse2 | ||||||
4164 | {ISD::UMIN, MVT::v8i8, 7}, // same as sse2 | ||||||
4165 | {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax | ||||||
4166 | }; | ||||||
4167 | |||||||
4168 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | ||||||
4169 | {ISD::SMIN, MVT::v16i16, 6}, | ||||||
4170 | {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax | ||||||
4171 | {ISD::SMIN, MVT::v32i8, 8}, | ||||||
4172 | {ISD::UMIN, MVT::v32i8, 8}, | ||||||
4173 | }; | ||||||
4174 | |||||||
4175 | static const CostTblEntry AVX512BWCostTblNoPairWise[] = { | ||||||
4176 | {ISD::SMIN, MVT::v32i16, 8}, | ||||||
4177 | {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax | ||||||
4178 | {ISD::SMIN, MVT::v64i8, 10}, | ||||||
4179 | {ISD::UMIN, MVT::v64i8, 10}, | ||||||
4180 | }; | ||||||
4181 | |||||||
4182 | // Before legalizing the type, give a chance to look up illegal narrow types | ||||||
4183 | // in the table. | ||||||
4184 | // FIXME: Is there a better way to do this? | ||||||
4185 | EVT VT = TLI->getValueType(DL, ValTy); | ||||||
4186 | if (VT.isSimple()) { | ||||||
4187 | MVT MTy = VT.getSimpleVT(); | ||||||
4188 | if (ST->hasBWI()) | ||||||
4189 | if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) | ||||||
4190 | return Entry->Cost; | ||||||
4191 | |||||||
4192 | if (ST->hasAVX()) | ||||||
4193 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | ||||||
4194 | return Entry->Cost; | ||||||
4195 | |||||||
4196 | if (ST->hasSSE41()) | ||||||
4197 | if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) | ||||||
4198 | return Entry->Cost; | ||||||
4199 | |||||||
4200 | if (ST->hasSSE2()) | ||||||
4201 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | ||||||
4202 | return Entry->Cost; | ||||||
4203 | } | ||||||
4204 | |||||||
4205 | auto *ValVTy = cast<FixedVectorType>(ValTy); | ||||||
4206 | unsigned NumVecElts = ValVTy->getNumElements(); | ||||||
4207 | |||||||
4208 | auto *Ty = ValVTy; | ||||||
4209 | InstructionCost MinMaxCost = 0; | ||||||
4210 | if (LT.first != 1 && MTy.isVector() && | ||||||
4211 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | ||||||
4212 | // Type needs to be split. We need LT.first - 1 operations. | ||||||
4213 | Ty = FixedVectorType::get(ValVTy->getElementType(), | ||||||
4214 | MTy.getVectorNumElements()); | ||||||
4215 | auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(), | ||||||
4216 | MTy.getVectorNumElements()); | ||||||
4217 | MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned); | ||||||
4218 | MinMaxCost *= LT.first - 1; | ||||||
4219 | NumVecElts = MTy.getVectorNumElements(); | ||||||
4220 | } | ||||||
4221 | |||||||
4222 | if (ST->hasBWI()) | ||||||
4223 | if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) | ||||||
4224 | return MinMaxCost + Entry->Cost; | ||||||
4225 | |||||||
4226 | if (ST->hasAVX()) | ||||||
4227 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | ||||||
4228 | return MinMaxCost + Entry->Cost; | ||||||
4229 | |||||||
4230 | if (ST->hasSSE41()) | ||||||
4231 | if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) | ||||||
4232 | return MinMaxCost + Entry->Cost; | ||||||
4233 | |||||||
4234 | if (ST->hasSSE2()) | ||||||
4235 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | ||||||
4236 | return MinMaxCost + Entry->Cost; | ||||||
4237 | |||||||
4238 | unsigned ScalarSize = ValTy->getScalarSizeInBits(); | ||||||
4239 | |||||||
4240 | // Special case power of 2 reductions where the scalar type isn't changed | ||||||
4241 | // by type legalization. | ||||||
4242 | if (!isPowerOf2_32(ValVTy->getNumElements()) || | ||||||
4243 | ScalarSize != MTy.getScalarSizeInBits()) | ||||||
4244 | return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind); | ||||||
4245 | |||||||
4246 | // Now handle reduction with the legal type, taking into account size changes | ||||||
4247 | // at each level. | ||||||
4248 | while (NumVecElts > 1) { | ||||||
4249 | // Determine the size of the remaining vector we need to reduce. | ||||||
4250 | unsigned Size = NumVecElts * ScalarSize; | ||||||
4251 | NumVecElts /= 2; | ||||||
4252 | // If we're reducing from 256/512 bits, use an extract_subvector. | ||||||
4253 | if (Size > 128) { | ||||||
4254 | auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); | ||||||
4255 | MinMaxCost += | ||||||
4256 | getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); | ||||||
4257 | Ty = SubTy; | ||||||
4258 | } else if (Size == 128) { | ||||||
4259 | // Reducing from 128 bits is a permute of v2f64/v2i64. | ||||||
4260 | VectorType *ShufTy; | ||||||
4261 | if (ValTy->isFloatingPointTy()) | ||||||
4262 | ShufTy = | ||||||
4263 | FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); | ||||||
4264 | else | ||||||
4265 | ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); | ||||||
4266 | MinMaxCost += | ||||||
4267 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); | ||||||
4268 | } else if (Size == 64) { | ||||||
4269 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. | ||||||
4270 | FixedVectorType *ShufTy; | ||||||
4271 | if (ValTy->isFloatingPointTy()) | ||||||
4272 | ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); | ||||||
4273 | else | ||||||
4274 | ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); | ||||||
4275 | MinMaxCost += | ||||||
4276 | getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); | ||||||
4277 | } else { | ||||||
4278 | // Reducing from smaller size is a shift by immediate. | ||||||
4279 | auto *ShiftTy = FixedVectorType::get( | ||||||
4280 | Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); | ||||||
4281 | MinMaxCost += getArithmeticInstrCost( | ||||||
4282 | Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput, | ||||||
4283 | TargetTransformInfo::OK_AnyValue, | ||||||
4284 | TargetTransformInfo::OK_UniformConstantValue, | ||||||
4285 | TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); | ||||||
4286 | } | ||||||
4287 | |||||||
4288 | // Add the arithmetic op for this level. | ||||||
4289 | auto *SubCondTy = | ||||||
4290 | FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements()); | ||||||
4291 | MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned); | ||||||
4292 | } | ||||||
4293 | |||||||
4294 | // Add the final extract element to the cost. | ||||||
4295 | return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | ||||||
4296 | } | ||||||
4297 | |||||||
4298 | /// Calculate the cost of materializing a 64-bit value. This helper | ||||||
4299 | /// method might only calculate a fraction of a larger immediate. Therefore it | ||||||
4300 | /// is valid to return a cost of ZERO. | ||||||
4301 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { | ||||||
4302 | if (Val == 0) | ||||||
4303 | return TTI::TCC_Free; | ||||||
4304 | |||||||
4305 | if (isInt<32>(Val)) | ||||||
4306 | return TTI::TCC_Basic; | ||||||
4307 | |||||||
4308 | return 2 * TTI::TCC_Basic; | ||||||
4309 | } | ||||||
4310 | |||||||
4311 | InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | ||||||
4312 | TTI::TargetCostKind CostKind) { | ||||||
4313 | assert(Ty->isIntegerTy())((void)0); | ||||||
4314 | |||||||
4315 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | ||||||
4316 | if (BitSize == 0) | ||||||
4317 | return ~0U; | ||||||
4318 | |||||||
4319 | // Never hoist constants larger than 128bit, because this might lead to | ||||||
4320 | // incorrect code generation or assertions in codegen. | ||||||
4321 | // Fixme: Create a cost model for types larger than i128 once the codegen | ||||||
4322 | // issues have been fixed. | ||||||
4323 | if (BitSize > 128) | ||||||
4324 | return TTI::TCC_Free; | ||||||
4325 | |||||||
4326 | if (Imm == 0) | ||||||
4327 | return TTI::TCC_Free; | ||||||
4328 | |||||||
4329 | // Sign-extend all constants to a multiple of 64-bit. | ||||||
4330 | APInt ImmVal = Imm; | ||||||
4331 | if (BitSize % 64 != 0) | ||||||
4332 | ImmVal = Imm.sext(alignTo(BitSize, 64)); | ||||||
4333 | |||||||
4334 | // Split the constant into 64-bit chunks and calculate the cost for each | ||||||
4335 | // chunk. | ||||||
4336 | InstructionCost Cost = 0; | ||||||
4337 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | ||||||
4338 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | ||||||
4339 | int64_t Val = Tmp.getSExtValue(); | ||||||
4340 | Cost += getIntImmCost(Val); | ||||||
4341 | } | ||||||
4342 | // We need at least one instruction to materialize the constant. | ||||||
4343 | return std::max<InstructionCost>(1, Cost); | ||||||
4344 | } | ||||||
4345 | |||||||
4346 | InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | ||||||
4347 | const APInt &Imm, Type *Ty, | ||||||
4348 | TTI::TargetCostKind CostKind, | ||||||
4349 | Instruction *Inst) { | ||||||
4350 | assert(Ty->isIntegerTy())((void)0); | ||||||
4351 | |||||||
4352 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | ||||||
4353 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | ||||||
4354 | // here, so that constant hoisting will ignore this constant. | ||||||
4355 | if (BitSize == 0) | ||||||
4356 | return TTI::TCC_Free; | ||||||
4357 | |||||||
4358 | unsigned ImmIdx = ~0U; | ||||||
4359 | switch (Opcode) { | ||||||
4360 | default: | ||||||
4361 | return TTI::TCC_Free; | ||||||
4362 | case Instruction::GetElementPtr: | ||||||
4363 | // Always hoist the base address of a GetElementPtr. This prevents the | ||||||
4364 | // creation of new constants for every base constant that gets constant | ||||||
4365 | // folded with the offset. | ||||||
4366 | if (Idx == 0) | ||||||
4367 | return 2 * TTI::TCC_Basic; | ||||||
4368 | return TTI::TCC_Free; | ||||||
4369 | case Instruction::Store: | ||||||
4370 | ImmIdx = 0; | ||||||
4371 | break; | ||||||
4372 | case Instruction::ICmp: | ||||||
4373 | // This is an imperfect hack to prevent constant hoisting of | ||||||
4374 | // compares that might be trying to check if a 64-bit value fits in | ||||||
4375 | // 32-bits. The backend can optimize these cases using a right shift by 32. | ||||||
4376 | // Ideally we would check the compare predicate here. There also other | ||||||
4377 | // similar immediates the backend can use shifts for. | ||||||
4378 | if (Idx == 1 && Imm.getBitWidth() == 64) { | ||||||
4379 | uint64_t ImmVal = Imm.getZExtValue(); | ||||||
4380 | if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) | ||||||
4381 | return TTI::TCC_Free; | ||||||
4382 | } | ||||||
4383 | ImmIdx = 1; | ||||||
4384 | break; | ||||||
4385 | case Instruction::And: | ||||||
4386 | // We support 64-bit ANDs with immediates with 32-bits of leading zeroes | ||||||
4387 | // by using a 32-bit operation with implicit zero extension. Detect such | ||||||
4388 | // immediates here as the normal path expects bit 31 to be sign extended. | ||||||
4389 | if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) | ||||||
4390 | return TTI::TCC_Free; | ||||||
4391 | ImmIdx = 1; | ||||||
4392 | break; | ||||||
4393 | case Instruction::Add: | ||||||
4394 | case Instruction::Sub: | ||||||
4395 | // For add/sub, we can use the opposite instruction for INT32_MIN. | ||||||
4396 | if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) | ||||||
4397 | return TTI::TCC_Free; | ||||||
4398 | ImmIdx = 1; | ||||||
4399 | break; | ||||||
4400 | case Instruction::UDiv: | ||||||
4401 | case Instruction::SDiv: | ||||||
4402 | case Instruction::URem: | ||||||
4403 | case Instruction::SRem: | ||||||
4404 | // Division by constant is typically expanded later into a different | ||||||
4405 | // instruction sequence. This completely changes the constants. | ||||||
4406 | // Report them as "free" to stop ConstantHoist from marking them as opaque. | ||||||
4407 | return TTI::TCC_Free; | ||||||
4408 | case Instruction::Mul: | ||||||
4409 | case Instruction::Or: | ||||||
4410 | case Instruction::Xor: | ||||||
4411 | ImmIdx = 1; | ||||||
4412 | break; | ||||||
4413 | // Always return TCC_Free for the shift value of a shift instruction. | ||||||
4414 | case Instruction::Shl: | ||||||
4415 | case Instruction::LShr: | ||||||
4416 | case Instruction::AShr: | ||||||
4417 | if (Idx == 1) | ||||||
4418 | return TTI::TCC_Free; | ||||||
4419 | break; | ||||||
4420 | case Instruction::Trunc: | ||||||
4421 | case Instruction::ZExt: | ||||||
4422 | case Instruction::SExt: | ||||||
4423 | case Instruction::IntToPtr: | ||||||
4424 | case Instruction::PtrToInt: | ||||||
4425 | case Instruction::BitCast: | ||||||
4426 | case Instruction::PHI: | ||||||
4427 | case Instruction::Call: | ||||||
4428 | case Instruction::Select: | ||||||
4429 | case Instruction::Ret: | ||||||
4430 | case Instruction::Load: | ||||||
4431 | break; | ||||||
4432 | } | ||||||
4433 | |||||||
4434 | if (Idx == ImmIdx) { | ||||||
4435 | int NumConstants = divideCeil(BitSize, 64); | ||||||
4436 | InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | ||||||
4437 | return (Cost <= NumConstants * TTI::TCC_Basic) | ||||||
4438 | ? static_cast<int>(TTI::TCC_Free) | ||||||
4439 | : Cost; | ||||||
4440 | } | ||||||
4441 | |||||||
4442 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | ||||||
4443 | } | ||||||
4444 | |||||||
4445 | InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, | ||||||
4446 | const APInt &Imm, Type *Ty, | ||||||
4447 | TTI::TargetCostKind CostKind) { | ||||||
4448 | assert(Ty->isIntegerTy())((void)0); | ||||||
4449 | |||||||
4450 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | ||||||
4451 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | ||||||
4452 | // here, so that constant hoisting will ignore this constant. | ||||||
4453 | if (BitSize == 0) | ||||||
4454 | return TTI::TCC_Free; | ||||||
4455 | |||||||
4456 | switch (IID) { | ||||||
4457 | default: | ||||||
4458 | return TTI::TCC_Free; | ||||||
4459 | case Intrinsic::sadd_with_overflow: | ||||||
4460 | case Intrinsic::uadd_with_overflow: | ||||||
4461 | case Intrinsic::ssub_with_overflow: | ||||||
4462 | case Intrinsic::usub_with_overflow: | ||||||
4463 | case Intrinsic::smul_with_overflow: | ||||||
4464 | case Intrinsic::umul_with_overflow: | ||||||
4465 | if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) | ||||||
4466 | return TTI::TCC_Free; | ||||||
4467 | break; | ||||||
4468 | case Intrinsic::experimental_stackmap: | ||||||
4469 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | ||||||
4470 | return TTI::TCC_Free; | ||||||
4471 | break; | ||||||
4472 | case Intrinsic::experimental_patchpoint_void: | ||||||
4473 | case Intrinsic::experimental_patchpoint_i64: | ||||||
4474 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | ||||||
4475 | return TTI::TCC_Free; | ||||||
4476 | break; | ||||||
4477 | } | ||||||
4478 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | ||||||
4479 | } | ||||||
4480 | |||||||
4481 | InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, | ||||||
4482 | TTI::TargetCostKind CostKind, | ||||||
4483 | const Instruction *I) { | ||||||
4484 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
4485 | return Opcode == Instruction::PHI ? 0 : 1; | ||||||
4486 | // Branches are assumed to be predicted. | ||||||
4487 | return 0; | ||||||
4488 | } | ||||||
4489 | |||||||
4490 | int X86TTIImpl::getGatherOverhead() const { | ||||||
4491 | // Some CPUs have more overhead for gather. The specified overhead is relative | ||||||
4492 | // to the Load operation. "2" is the number provided by Intel architects. This | ||||||
4493 | // parameter is used for cost estimation of Gather Op and comparison with | ||||||
4494 | // other alternatives. | ||||||
4495 | // TODO: Remove the explicit hasAVX512()?, That would mean we would only | ||||||
4496 | // enable gather with a -march. | ||||||
4497 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) | ||||||
4498 | return 2; | ||||||
4499 | |||||||
4500 | return 1024; | ||||||
4501 | } | ||||||
4502 | |||||||
4503 | int X86TTIImpl::getScatterOverhead() const { | ||||||
4504 | if (ST->hasAVX512()) | ||||||
4505 | return 2; | ||||||
4506 | |||||||
4507 | return 1024; | ||||||
4508 | } | ||||||
4509 | |||||||
4510 | // Return an average cost of Gather / Scatter instruction, maybe improved later. | ||||||
4511 | // FIXME: Add TargetCostKind support. | ||||||
4512 | InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, | ||||||
4513 | const Value *Ptr, Align Alignment, | ||||||
4514 | unsigned AddressSpace) { | ||||||
4515 | |||||||
4516 | assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost")((void)0); | ||||||
4517 | unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); | ||||||
4518 | |||||||
4519 | // Try to reduce index size from 64 bit (default for GEP) | ||||||
4520 | // to 32. It is essential for VF 16. If the index can't be reduced to 32, the | ||||||
4521 | // operation will use 16 x 64 indices which do not fit in a zmm and needs | ||||||
4522 | // to split. Also check that the base pointer is the same for all lanes, | ||||||
4523 | // and that there's at most one variable index. | ||||||
4524 | auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) { | ||||||
4525 | unsigned IndexSize = DL.getPointerSizeInBits(); | ||||||
4526 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); | ||||||
4527 | if (IndexSize < 64 || !GEP) | ||||||
4528 | return IndexSize; | ||||||
4529 | |||||||
4530 | unsigned NumOfVarIndices = 0; | ||||||
4531 | const Value *Ptrs = GEP->getPointerOperand(); | ||||||
4532 | if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) | ||||||
4533 | return IndexSize; | ||||||
4534 | for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { | ||||||
4535 | if (isa<Constant>(GEP->getOperand(i))) | ||||||
4536 | continue; | ||||||
4537 | Type *IndxTy = GEP->getOperand(i)->getType(); | ||||||
4538 | if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy)) | ||||||
4539 | IndxTy = IndexVTy->getElementType(); | ||||||
4540 | if ((IndxTy->getPrimitiveSizeInBits() == 64 && | ||||||
4541 | !isa<SExtInst>(GEP->getOperand(i))) || | ||||||
4542 | ++NumOfVarIndices > 1) | ||||||
4543 | return IndexSize; // 64 | ||||||
4544 | } | ||||||
4545 | return (unsigned)32; | ||||||
4546 | }; | ||||||
4547 | |||||||
4548 | // Trying to reduce IndexSize to 32 bits for vector 16. | ||||||
4549 | // By default the IndexSize is equal to pointer size. | ||||||
4550 | unsigned IndexSize = (ST->hasAVX512() && VF >= 16) | ||||||
4551 | ? getIndexSizeInBits(Ptr, DL) | ||||||
4552 | : DL.getPointerSizeInBits(); | ||||||
4553 | |||||||
4554 | auto *IndexVTy = FixedVectorType::get( | ||||||
4555 | IntegerType::get(SrcVTy->getContext(), IndexSize), VF); | ||||||
4556 | std::pair<InstructionCost, MVT> IdxsLT = | ||||||
4557 | TLI->getTypeLegalizationCost(DL, IndexVTy); | ||||||
4558 | std::pair<InstructionCost, MVT> SrcLT = | ||||||
4559 | TLI->getTypeLegalizationCost(DL, SrcVTy); | ||||||
4560 | InstructionCost::CostType SplitFactor = | ||||||
4561 | *std::max(IdxsLT.first, SrcLT.first).getValue(); | ||||||
4562 | if (SplitFactor > 1) { | ||||||
4563 | // Handle splitting of vector of pointers | ||||||
4564 | auto *SplitSrcTy = | ||||||
4565 | FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); | ||||||
4566 | return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, | ||||||
4567 | AddressSpace); | ||||||
4568 | } | ||||||
4569 | |||||||
4570 | // The gather / scatter cost is given by Intel architects. It is a rough | ||||||
4571 | // number since we are looking at one instruction in a time. | ||||||
4572 | const int GSOverhead = (Opcode == Instruction::Load) | ||||||
4573 | ? getGatherOverhead() | ||||||
4574 | : getScatterOverhead(); | ||||||
4575 | return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | ||||||
4576 | MaybeAlign(Alignment), AddressSpace, | ||||||
4577 | TTI::TCK_RecipThroughput); | ||||||
4578 | } | ||||||
4579 | |||||||
4580 | /// Return the cost of full scalarization of gather / scatter operation. | ||||||
4581 | /// | ||||||
4582 | /// Opcode - Load or Store instruction. | ||||||
4583 | /// SrcVTy - The type of the data vector that should be gathered or scattered. | ||||||
4584 | /// VariableMask - The mask is non-constant at compile time. | ||||||
4585 | /// Alignment - Alignment for one element. | ||||||
4586 | /// AddressSpace - pointer[s] address space. | ||||||
4587 | /// | ||||||
4588 | /// FIXME: Add TargetCostKind support. | ||||||
4589 | InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, | ||||||
4590 | bool VariableMask, Align Alignment, | ||||||
4591 | unsigned AddressSpace) { | ||||||
4592 | unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); | ||||||
4593 | APInt DemandedElts = APInt::getAllOnesValue(VF); | ||||||
4594 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | ||||||
4595 | |||||||
4596 | InstructionCost MaskUnpackCost = 0; | ||||||
4597 | if (VariableMask) { | ||||||
4598 | auto *MaskTy = | ||||||
4599 | FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); | ||||||
4600 | MaskUnpackCost = | ||||||
4601 | getScalarizationOverhead(MaskTy, DemandedElts, false, true); | ||||||
4602 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( | ||||||
4603 | Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, | ||||||
4604 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
4605 | InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); | ||||||
4606 | MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); | ||||||
4607 | } | ||||||
4608 | |||||||
4609 | // The cost of the scalar loads/stores. | ||||||
4610 | InstructionCost MemoryOpCost = | ||||||
4611 | VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | ||||||
4612 | MaybeAlign(Alignment), AddressSpace, CostKind); | ||||||
4613 | |||||||
4614 | InstructionCost InsertExtractCost = 0; | ||||||
4615 | if (Opcode == Instruction::Load) | ||||||
4616 | for (unsigned i = 0; i < VF; ++i) | ||||||
4617 | // Add the cost of inserting each scalar load into the vector | ||||||
4618 | InsertExtractCost += | ||||||
4619 | getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); | ||||||
4620 | else | ||||||
4621 | for (unsigned i = 0; i < VF; ++i) | ||||||
4622 | // Add the cost of extracting each element out of the data vector | ||||||
4623 | InsertExtractCost += | ||||||
4624 | getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); | ||||||
4625 | |||||||
4626 | return MemoryOpCost + MaskUnpackCost + InsertExtractCost; | ||||||
4627 | } | ||||||
4628 | |||||||
4629 | /// Calculate the cost of Gather / Scatter operation | ||||||
4630 | InstructionCost X86TTIImpl::getGatherScatterOpCost( | ||||||
4631 | unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, | ||||||
4632 | Align Alignment, TTI::TargetCostKind CostKind, | ||||||
4633 | const Instruction *I = nullptr) { | ||||||
4634 | if (CostKind != TTI::TCK_RecipThroughput) { | ||||||
| |||||||
4635 | if ((Opcode == Instruction::Load && | ||||||
4636 | isLegalMaskedGather(SrcVTy, Align(Alignment))) || | ||||||
4637 | (Opcode == Instruction::Store && | ||||||
4638 | isLegalMaskedScatter(SrcVTy, Align(Alignment)))) | ||||||
4639 | return 1; | ||||||
4640 | return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, | ||||||
4641 | Alignment, CostKind, I); | ||||||
4642 | } | ||||||
4643 | |||||||
4644 | assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter")((void)0); | ||||||
4645 | PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); | ||||||
4646 | if (!PtrTy
| ||||||
4647 | PtrTy = dyn_cast<PointerType>( | ||||||
4648 | cast<VectorType>(Ptr->getType())->getElementType()); | ||||||
4649 | assert(PtrTy && "Unexpected type for Ptr argument")((void)0); | ||||||
4650 | unsigned AddressSpace = PtrTy->getAddressSpace(); | ||||||
4651 | |||||||
4652 | if ((Opcode == Instruction::Load && | ||||||
4653 | !isLegalMaskedGather(SrcVTy, Align(Alignment))) || | ||||||
4654 | (Opcode == Instruction::Store && | ||||||
4655 | !isLegalMaskedScatter(SrcVTy, Align(Alignment)))) | ||||||
4656 | return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, | ||||||
4657 | AddressSpace); | ||||||
4658 | |||||||
4659 | return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); | ||||||
4660 | } | ||||||
4661 | |||||||
4662 | bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, | ||||||
4663 | TargetTransformInfo::LSRCost &C2) { | ||||||
4664 | // X86 specific here are "instruction number 1st priority". | ||||||
4665 | return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, | ||||||
4666 | C1.NumIVMuls, C1.NumBaseAdds, | ||||||
4667 | C1.ScaleCost, C1.ImmCost, C1.SetupCost) < | ||||||
4668 | std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, | ||||||
4669 | C2.NumIVMuls, C2.NumBaseAdds, | ||||||
4670 | C2.ScaleCost, C2.ImmCost, C2.SetupCost); | ||||||
4671 | } | ||||||
4672 | |||||||
4673 | bool X86TTIImpl::canMacroFuseCmp() { | ||||||
4674 | return ST->hasMacroFusion() || ST->hasBranchFusion(); | ||||||
4675 | } | ||||||
4676 | |||||||
4677 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { | ||||||
4678 | if (!ST->hasAVX()) | ||||||
4679 | return false; | ||||||
4680 | |||||||
4681 | // The backend can't handle a single element vector. | ||||||
4682 | if (isa<VectorType>(DataTy) && | ||||||
4683 | cast<FixedVectorType>(DataTy)->getNumElements() == 1) | ||||||
4684 | return false; | ||||||
4685 | Type *ScalarTy = DataTy->getScalarType(); | ||||||
4686 | |||||||
4687 | if (ScalarTy->isPointerTy()) | ||||||
4688 | return true; | ||||||
4689 | |||||||
4690 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | ||||||
4691 | return true; | ||||||
4692 | |||||||
4693 | if (!ScalarTy->isIntegerTy()) | ||||||
4694 | return false; | ||||||
4695 | |||||||
4696 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | ||||||
4697 | return IntWidth == 32 || IntWidth == 64 || | ||||||
4698 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); | ||||||
4699 | } | ||||||
4700 | |||||||
4701 | bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) { | ||||||
4702 | return isLegalMaskedLoad(DataType, Alignment); | ||||||
4703 | } | ||||||
4704 | |||||||
4705 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { | ||||||
4706 | unsigned DataSize = DL.getTypeStoreSize(DataType); | ||||||
4707 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 | ||||||
4708 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 | ||||||
4709 | // (the equivalent stores only require AVX). | ||||||
4710 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) | ||||||
4711 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); | ||||||
4712 | |||||||
4713 | return false; | ||||||
4714 | } | ||||||
4715 | |||||||
4716 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { | ||||||
4717 | unsigned DataSize = DL.getTypeStoreSize(DataType); | ||||||
4718 | |||||||
4719 | // SSE4A supports nontemporal stores of float and double at arbitrary | ||||||
4720 | // alignment. | ||||||
4721 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) | ||||||
4722 | return true; | ||||||
4723 | |||||||
4724 | // Besides the SSE4A subtarget exception above, only aligned stores are | ||||||
4725 | // available nontemporaly on any other subtarget. And only stores with a size | ||||||
4726 | // of 4..32 bytes (powers of 2, only) are permitted. | ||||||
4727 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || | ||||||
4728 | !isPowerOf2_32(DataSize)) | ||||||
4729 | return false; | ||||||
4730 | |||||||
4731 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent | ||||||
4732 | // loads require AVX2). | ||||||
4733 | if (DataSize == 32) | ||||||
4734 | return ST->hasAVX(); | ||||||
4735 | else if (DataSize == 16) | ||||||
4736 | return ST->hasSSE1(); | ||||||
4737 | return true; | ||||||
4738 | } | ||||||
4739 | |||||||
4740 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { | ||||||
4741 | if (!isa<VectorType>(DataTy)) | ||||||
4742 | return false; | ||||||
4743 | |||||||
4744 | if (!ST->hasAVX512()) | ||||||
4745 | return false; | ||||||
4746 | |||||||
4747 | // The backend can't handle a single element vector. | ||||||
4748 | if (cast<FixedVectorType>(DataTy)->getNumElements() == 1) | ||||||
4749 | return false; | ||||||
4750 | |||||||
4751 | Type *ScalarTy = cast<VectorType>(DataTy)->getElementType(); | ||||||
4752 | |||||||
4753 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | ||||||
4754 | return true; | ||||||
4755 | |||||||
4756 | if (!ScalarTy->isIntegerTy()) | ||||||
4757 | return false; | ||||||
4758 | |||||||
4759 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | ||||||
4760 | return IntWidth == 32 || IntWidth == 64 || | ||||||
4761 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); | ||||||
4762 | } | ||||||
4763 | |||||||
4764 | bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { | ||||||
4765 | return isLegalMaskedExpandLoad(DataTy); | ||||||
4766 | } | ||||||
4767 | |||||||
4768 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { | ||||||
4769 | // Some CPUs have better gather performance than others. | ||||||
4770 | // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only | ||||||
4771 | // enable gather with a -march. | ||||||
4772 | if (!(ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()))) | ||||||
4773 | return false; | ||||||
4774 | |||||||
4775 | // This function is called now in two cases: from the Loop Vectorizer | ||||||
4776 | // and from the Scalarizer. | ||||||
4777 | // When the Loop Vectorizer asks about legality of the feature, | ||||||
4778 | // the vectorization factor is not calculated yet. The Loop Vectorizer | ||||||
4779 | // sends a scalar type and the decision is based on the width of the | ||||||
4780 | // scalar element. | ||||||
4781 | // Later on, the cost model will estimate usage this intrinsic based on | ||||||
4782 | // the vector type. | ||||||
4783 | // The Scalarizer asks again about legality. It sends a vector type. | ||||||
4784 | // In this case we can reject non-power-of-2 vectors. | ||||||
4785 | // We also reject single element vectors as the type legalizer can't | ||||||
4786 | // scalarize it. | ||||||
4787 | if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) { | ||||||
4788 | unsigned NumElts = DataVTy->getNumElements(); | ||||||
4789 | if (NumElts == 1) | ||||||
4790 | return false; | ||||||
4791 | // Gather / Scatter for vector 2 is not profitable on KNL / SKX | ||||||
4792 | // Vector-4 of gather/scatter instruction does not exist on KNL. | ||||||
4793 | // We can extend it to 8 elements, but zeroing upper bits of | ||||||
4794 | // the mask vector will add more instructions. Right now we give the scalar | ||||||
4795 | // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter | ||||||
4796 | // instruction is better in the VariableMask case. | ||||||
4797 | if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))) | ||||||
4798 | return false; | ||||||
4799 | } | ||||||
4800 | Type *ScalarTy = DataTy->getScalarType(); | ||||||
4801 | if (ScalarTy->isPointerTy()) | ||||||
4802 | return true; | ||||||
4803 | |||||||
4804 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | ||||||
4805 | return true; | ||||||
4806 | |||||||
4807 | if (!ScalarTy->isIntegerTy()) | ||||||
4808 | return false; | ||||||
4809 | |||||||
4810 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | ||||||
4811 | return IntWidth == 32 || IntWidth == 64; | ||||||
4812 | } | ||||||
4813 | |||||||
4814 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { | ||||||
4815 | // AVX2 doesn't support scatter | ||||||
4816 | if (!ST->hasAVX512()) | ||||||
4817 | return false; | ||||||
4818 | return isLegalMaskedGather(DataType, Alignment); | ||||||
4819 | } | ||||||
4820 | |||||||
4821 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { | ||||||
4822 | EVT VT = TLI->getValueType(DL, DataType); | ||||||
4823 | return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); | ||||||
4824 | } | ||||||
4825 | |||||||
4826 | bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { | ||||||
4827 | return false; | ||||||
4828 | } | ||||||
4829 | |||||||
4830 | bool X86TTIImpl::areInlineCompatible(const Function *Caller, | ||||||
4831 | const Function *Callee) const { | ||||||
4832 | const TargetMachine &TM = getTLI()->getTargetMachine(); | ||||||
4833 | |||||||
4834 | // Work this as a subsetting of subtarget features. | ||||||
4835 | const FeatureBitset &CallerBits = | ||||||
4836 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | ||||||
4837 | const FeatureBitset &CalleeBits = | ||||||
4838 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | ||||||
4839 | |||||||
4840 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; | ||||||
4841 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; | ||||||
4842 | return (RealCallerBits & RealCalleeBits) == RealCalleeBits; | ||||||
4843 | } | ||||||
4844 | |||||||
4845 | bool X86TTIImpl::areFunctionArgsABICompatible( | ||||||
4846 | const Function *Caller, const Function *Callee, | ||||||
4847 | SmallPtrSetImpl<Argument *> &Args) const { | ||||||
4848 | if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) | ||||||
4849 | return false; | ||||||
4850 | |||||||
4851 | // If we get here, we know the target features match. If one function | ||||||
4852 | // considers 512-bit vectors legal and the other does not, consider them | ||||||
4853 | // incompatible. | ||||||
4854 | const TargetMachine &TM = getTLI()->getTargetMachine(); | ||||||
4855 | |||||||
4856 | if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == | ||||||
4857 | TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) | ||||||
4858 | return true; | ||||||
4859 | |||||||
4860 | // Consider the arguments compatible if they aren't vectors or aggregates. | ||||||
4861 | // FIXME: Look at the size of vectors. | ||||||
4862 | // FIXME: Look at the element types of aggregates to see if there are vectors. | ||||||
4863 | // FIXME: The API of this function seems intended to allow arguments | ||||||
4864 | // to be removed from the set, but the caller doesn't check if the set | ||||||
4865 | // becomes empty so that may not work in practice. | ||||||
4866 | return llvm::none_of(Args, [](Argument *A) { | ||||||
4867 | auto *EltTy = cast<PointerType>(A->getType())->getElementType(); | ||||||
4868 | return EltTy->isVectorTy() || EltTy->isAggregateType(); | ||||||
4869 | }); | ||||||
4870 | } | ||||||
4871 | |||||||
4872 | X86TTIImpl::TTI::MemCmpExpansionOptions | ||||||
4873 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | ||||||
4874 | TTI::MemCmpExpansionOptions Options; | ||||||
4875 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | ||||||
4876 | Options.NumLoadsPerBlock = 2; | ||||||
4877 | // All GPR and vector loads can be unaligned. | ||||||
4878 | Options.AllowOverlappingLoads = true; | ||||||
4879 | if (IsZeroCmp) { | ||||||
4880 | // Only enable vector loads for equality comparison. Right now the vector | ||||||
4881 | // version is not as fast for three way compare (see #33329). | ||||||
4882 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); | ||||||
4883 | if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); | ||||||
4884 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); | ||||||
4885 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); | ||||||
4886 | } | ||||||
4887 | if (ST->is64Bit()) { | ||||||
4888 | Options.LoadSizes.push_back(8); | ||||||
4889 | } | ||||||
4890 | Options.LoadSizes.push_back(4); | ||||||
4891 | Options.LoadSizes.push_back(2); | ||||||
4892 | Options.LoadSizes.push_back(1); | ||||||
4893 | return Options; | ||||||
4894 | } | ||||||
4895 | |||||||
4896 | bool X86TTIImpl::enableInterleavedAccessVectorization() { | ||||||
4897 | // TODO: We expect this to be beneficial regardless of arch, | ||||||
4898 | // but there are currently some unexplained performance artifacts on Atom. | ||||||
4899 | // As a temporary solution, disable on Atom. | ||||||
4900 | return !(ST->isAtom()); | ||||||
4901 | } | ||||||
4902 | |||||||
4903 | // Get estimation for interleaved load/store operations for AVX2. | ||||||
4904 | // \p Factor is the interleaved-access factor (stride) - number of | ||||||
4905 | // (interleaved) elements in the group. | ||||||
4906 | // \p Indices contains the indices for a strided load: when the | ||||||
4907 | // interleaved load has gaps they indicate which elements are used. | ||||||
4908 | // If Indices is empty (or if the number of indices is equal to the size | ||||||
4909 | // of the interleaved-access as given in \p Factor) the access has no gaps. | ||||||
4910 | // | ||||||
4911 | // As opposed to AVX-512, AVX2 does not have generic shuffles that allow | ||||||
4912 | // computing the cost using a generic formula as a function of generic | ||||||
4913 | // shuffles. We therefore use a lookup table instead, filled according to | ||||||
4914 | // the instruction sequences that codegen currently generates. | ||||||
4915 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2( | ||||||
4916 | unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, | ||||||
4917 | ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, | ||||||
4918 | TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { | ||||||
4919 | |||||||
4920 | if (UseMaskForCond || UseMaskForGaps) | ||||||
4921 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
4922 | Alignment, AddressSpace, CostKind, | ||||||
4923 | UseMaskForCond, UseMaskForGaps); | ||||||
4924 | |||||||
4925 | // We currently Support only fully-interleaved groups, with no gaps. | ||||||
4926 | // TODO: Support also strided loads (interleaved-groups with gaps). | ||||||
4927 | if (Indices.size() && Indices.size() != Factor) | ||||||
4928 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
4929 | Alignment, AddressSpace, CostKind); | ||||||
4930 | |||||||
4931 | // VecTy for interleave memop is <VF*Factor x Elt>. | ||||||
4932 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | ||||||
4933 | // VecTy = <12 x i32>. | ||||||
4934 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; | ||||||
4935 | |||||||
4936 | // This function can be called with VecTy=<6xi128>, Factor=3, in which case | ||||||
4937 | // the VF=2, while v2i128 is an unsupported MVT vector type | ||||||
4938 | // (see MachineValueType.h::getVectorVT()). | ||||||
4939 | if (!LegalVT.isVector()) | ||||||
4940 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
4941 | Alignment, AddressSpace, CostKind); | ||||||
4942 | |||||||
4943 | unsigned VF = VecTy->getNumElements() / Factor; | ||||||
4944 | Type *ScalarTy = VecTy->getElementType(); | ||||||
4945 | // Deduplicate entries, model floats/pointers as appropriately-sized integers. | ||||||
4946 | if (!ScalarTy->isIntegerTy()) | ||||||
4947 | ScalarTy = | ||||||
4948 | Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy)); | ||||||
4949 | |||||||
4950 | // Get the cost of all the memory operations. | ||||||
4951 | InstructionCost MemOpCosts = getMemoryOpCost( | ||||||
4952 | Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); | ||||||
4953 | |||||||
4954 | auto *VT = FixedVectorType::get(ScalarTy, VF); | ||||||
4955 | EVT ETy = TLI->getValueType(DL, VT); | ||||||
4956 | if (!ETy.isSimple()) | ||||||
4957 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
4958 | Alignment, AddressSpace, CostKind); | ||||||
4959 | |||||||
4960 | // TODO: Complete for other data-types and strides. | ||||||
4961 | // Each combination of Stride, element bit width and VF results in a different | ||||||
4962 | // sequence; The cost tables are therefore accessed with: | ||||||
4963 | // Factor (stride) and VectorType=VFxiN. | ||||||
4964 | // The Cost accounts only for the shuffle sequence; | ||||||
4965 | // The cost of the loads/stores is accounted for separately. | ||||||
4966 | // | ||||||
4967 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { | ||||||
4968 | {2, MVT::v4i64, 6}, // (load 8i64 and) deinterleave into 2 x 4i64 | ||||||
4969 | |||||||
4970 | {3, MVT::v2i8, 10}, // (load 6i8 and) deinterleave into 3 x 2i8 | ||||||
4971 | {3, MVT::v4i8, 4}, // (load 12i8 and) deinterleave into 3 x 4i8 | ||||||
4972 | {3, MVT::v8i8, 9}, // (load 24i8 and) deinterleave into 3 x 8i8 | ||||||
4973 | {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8 | ||||||
4974 | {3, MVT::v32i8, 13}, // (load 96i8 and) deinterleave into 3 x 32i8 | ||||||
4975 | |||||||
4976 | {3, MVT::v8i32, 17}, // (load 24i32 and) deinterleave into 3 x 8i32 | ||||||
4977 | |||||||
4978 | {4, MVT::v2i8, 12}, // (load 8i8 and) deinterleave into 4 x 2i8 | ||||||
4979 | {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8 | ||||||
4980 | {4, MVT::v8i8, 20}, // (load 32i8 and) deinterleave into 4 x 8i8 | ||||||
4981 | {4, MVT::v16i8, 39}, // (load 64i8 and) deinterleave into 4 x 16i8 | ||||||
4982 | {4, MVT::v32i8, 80}, // (load 128i8 and) deinterleave into 4 x 32i8 | ||||||
4983 | |||||||
4984 | {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32 | ||||||
4985 | }; | ||||||
4986 | |||||||
4987 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { | ||||||
4988 | {2, MVT::v4i64, 6}, // interleave 2 x 4i64 into 8i64 (and store) | ||||||
4989 | |||||||
4990 | {3, MVT::v2i8, 7}, // interleave 3 x 2i8 into 6i8 (and store) | ||||||
4991 | {3, MVT::v4i8, 8}, // interleave 3 x 4i8 into 12i8 (and store) | ||||||
4992 | {3, MVT::v8i8, 11}, // interleave 3 x 8i8 into 24i8 (and store) | ||||||
4993 | {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store) | ||||||
4994 | {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store) | ||||||
4995 | |||||||
4996 | {4, MVT::v2i8, 12}, // interleave 4 x 2i8 into 8i8 (and store) | ||||||
4997 | {4, MVT::v4i8, 9}, // interleave 4 x 4i8 into 16i8 (and store) | ||||||
4998 | {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) | ||||||
4999 | {4, MVT::v16i8, 10}, // interleave 4 x 16i8 into 64i8 (and store) | ||||||
5000 | {4, MVT::v32i8, 12} // interleave 4 x 32i8 into 128i8 (and store) | ||||||
5001 | }; | ||||||
5002 | |||||||
5003 | if (Opcode == Instruction::Load) { | ||||||
5004 | if (const auto *Entry = | ||||||
5005 | CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT())) | ||||||
5006 | return MemOpCosts + Entry->Cost; | ||||||
5007 | } else { | ||||||
5008 | assert(Opcode == Instruction::Store &&((void)0) | ||||||
5009 | "Expected Store Instruction at this point")((void)0); | ||||||
5010 | if (const auto *Entry = | ||||||
5011 | CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) | ||||||
5012 | return MemOpCosts + Entry->Cost; | ||||||
5013 | } | ||||||
5014 | |||||||
5015 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
5016 | Alignment, AddressSpace, CostKind); | ||||||
5017 | } | ||||||
5018 | |||||||
5019 | // Get estimation for interleaved load/store operations and strided load. | ||||||
5020 | // \p Indices contains indices for strided load. | ||||||
5021 | // \p Factor - the factor of interleaving. | ||||||
5022 | // AVX-512 provides 3-src shuffles that significantly reduces the cost. | ||||||
5023 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( | ||||||
5024 | unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, | ||||||
5025 | ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, | ||||||
5026 | TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { | ||||||
5027 | |||||||
5028 | if (UseMaskForCond || UseMaskForGaps) | ||||||
5029 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
5030 | Alignment, AddressSpace, CostKind, | ||||||
5031 | UseMaskForCond, UseMaskForGaps); | ||||||
5032 | |||||||
5033 | // VecTy for interleave memop is <VF*Factor x Elt>. | ||||||
5034 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | ||||||
5035 | // VecTy = <12 x i32>. | ||||||
5036 | |||||||
5037 | // Calculate the number of memory operations (NumOfMemOps), required | ||||||
5038 | // for load/store the VecTy. | ||||||
5039 | MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; | ||||||
5040 | unsigned VecTySize = DL.getTypeStoreSize(VecTy); | ||||||
5041 | unsigned LegalVTSize = LegalVT.getStoreSize(); | ||||||
5042 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; | ||||||
5043 | |||||||
5044 | // Get the cost of one memory operation. | ||||||
5045 | auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), | ||||||
5046 | LegalVT.getVectorNumElements()); | ||||||
5047 | InstructionCost MemOpCost = getMemoryOpCost( | ||||||
5048 | Opcode, SingleMemOpTy, MaybeAlign(Alignment), AddressSpace, CostKind); | ||||||
5049 | |||||||
5050 | unsigned VF = VecTy->getNumElements() / Factor; | ||||||
5051 | MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); | ||||||
5052 | |||||||
5053 | if (Opcode == Instruction::Load) { | ||||||
5054 | // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) | ||||||
5055 | // contain the cost of the optimized shuffle sequence that the | ||||||
5056 | // X86InterleavedAccess pass will generate. | ||||||
5057 | // The cost of loads and stores are computed separately from the table. | ||||||
5058 | |||||||
5059 | // X86InterleavedAccess support only the following interleaved-access group. | ||||||
5060 | static const CostTblEntry AVX512InterleavedLoadTbl[] = { | ||||||
5061 | {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8 | ||||||
5062 | {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8 | ||||||
5063 | {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8 | ||||||
5064 | }; | ||||||
5065 | |||||||
5066 | if (const auto *Entry = | ||||||
5067 | CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) | ||||||
5068 | return NumOfMemOps * MemOpCost + Entry->Cost; | ||||||
5069 | //If an entry does not exist, fallback to the default implementation. | ||||||
5070 | |||||||
5071 | // Kind of shuffle depends on number of loaded values. | ||||||
5072 | // If we load the entire data in one register, we can use a 1-src shuffle. | ||||||
5073 | // Otherwise, we'll merge 2 sources in each operation. | ||||||
5074 | TTI::ShuffleKind ShuffleKind = | ||||||
5075 | (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; | ||||||
5076 | |||||||
5077 | InstructionCost ShuffleCost = | ||||||
5078 | getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr); | ||||||
5079 | |||||||
5080 | unsigned NumOfLoadsInInterleaveGrp = | ||||||
5081 | Indices.size() ? Indices.size() : Factor; | ||||||
5082 | auto *ResultTy = FixedVectorType::get(VecTy->getElementType(), | ||||||
5083 | VecTy->getNumElements() / Factor); | ||||||
5084 | InstructionCost NumOfResults = | ||||||
5085 | getTLI()->getTypeLegalizationCost(DL, ResultTy).first * | ||||||
5086 | NumOfLoadsInInterleaveGrp; | ||||||
5087 | |||||||
5088 | // About a half of the loads may be folded in shuffles when we have only | ||||||
5089 | // one result. If we have more than one result, we do not fold loads at all. | ||||||
5090 | unsigned NumOfUnfoldedLoads = | ||||||
5091 | NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; | ||||||
5092 | |||||||
5093 | // Get a number of shuffle operations per result. | ||||||
5094 | unsigned NumOfShufflesPerResult = | ||||||
5095 | std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); | ||||||
5096 | |||||||
5097 | // The SK_MergeTwoSrc shuffle clobbers one of src operands. | ||||||
5098 | // When we have more than one destination, we need additional instructions | ||||||
5099 | // to keep sources. | ||||||
5100 | InstructionCost NumOfMoves = 0; | ||||||
5101 | if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) | ||||||
5102 | NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; | ||||||
5103 | |||||||
5104 | InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + | ||||||
5105 | NumOfUnfoldedLoads * MemOpCost + NumOfMoves; | ||||||
5106 | |||||||
5107 | return Cost; | ||||||
5108 | } | ||||||
5109 | |||||||
5110 | // Store. | ||||||
5111 | assert(Opcode == Instruction::Store &&((void)0) | ||||||
5112 | "Expected Store Instruction at this point")((void)0); | ||||||
5113 | // X86InterleavedAccess support only the following interleaved-access group. | ||||||
5114 | static const CostTblEntry AVX512InterleavedStoreTbl[] = { | ||||||
5115 | {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store) | ||||||
5116 | {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store) | ||||||
5117 | {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store) | ||||||
5118 | |||||||
5119 | {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) | ||||||
5120 | {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store) | ||||||
5121 | {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store) | ||||||
5122 | {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store) | ||||||
5123 | }; | ||||||
5124 | |||||||
5125 | if (const auto *Entry = | ||||||
5126 | CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT)) | ||||||
5127 | return NumOfMemOps * MemOpCost + Entry->Cost; | ||||||
5128 | //If an entry does not exist, fallback to the default implementation. | ||||||
5129 | |||||||
5130 | // There is no strided stores meanwhile. And store can't be folded in | ||||||
5131 | // shuffle. | ||||||
5132 | unsigned NumOfSources = Factor; // The number of values to be merged. | ||||||
5133 | InstructionCost ShuffleCost = | ||||||
5134 | getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr); | ||||||
5135 | unsigned NumOfShufflesPerStore = NumOfSources - 1; | ||||||
5136 | |||||||
5137 | // The SK_MergeTwoSrc shuffle clobbers one of src operands. | ||||||
5138 | // We need additional instructions to keep sources. | ||||||
5139 | unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; | ||||||
5140 | InstructionCost Cost = | ||||||
5141 | NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + | ||||||
5142 | NumOfMoves; | ||||||
5143 | return Cost; | ||||||
5144 | } | ||||||
5145 | |||||||
5146 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( | ||||||
5147 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, | ||||||
5148 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | ||||||
5149 | bool UseMaskForCond, bool UseMaskForGaps) { | ||||||
5150 | auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { | ||||||
5151 | Type *EltTy = cast<VectorType>(VecTy)->getElementType(); | ||||||
5152 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || | ||||||
5153 | EltTy->isIntegerTy(32) || EltTy->isPointerTy()) | ||||||
5154 | return true; | ||||||
5155 | if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) | ||||||
5156 | return HasBW; | ||||||
5157 | return false; | ||||||
5158 | }; | ||||||
5159 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) | ||||||
5160 | return getInterleavedMemoryOpCostAVX512( | ||||||
5161 | Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment, | ||||||
5162 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); | ||||||
5163 | if (ST->hasAVX2()) | ||||||
5164 | return getInterleavedMemoryOpCostAVX2( | ||||||
5165 | Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment, | ||||||
5166 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); | ||||||
5167 | |||||||
5168 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | ||||||
5169 | Alignment, AddressSpace, CostKind, | ||||||
5170 | UseMaskForCond, UseMaskForGaps); | ||||||
5171 | } |
1 | //===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | // This file declares the X86 specific subclass of TargetSubtargetInfo. | ||||||
10 | // | ||||||
11 | //===----------------------------------------------------------------------===// | ||||||
12 | |||||||
13 | #ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H | ||||||
14 | #define LLVM_LIB_TARGET_X86_X86SUBTARGET_H | ||||||
15 | |||||||
16 | #include "X86FrameLowering.h" | ||||||
17 | #include "X86ISelLowering.h" | ||||||
18 | #include "X86InstrInfo.h" | ||||||
19 | #include "X86SelectionDAGInfo.h" | ||||||
20 | #include "llvm/ADT/Triple.h" | ||||||
21 | #include "llvm/CodeGen/TargetSubtargetInfo.h" | ||||||
22 | #include "llvm/IR/CallingConv.h" | ||||||
23 | #include <climits> | ||||||
24 | #include <memory> | ||||||
25 | |||||||
26 | #define GET_SUBTARGETINFO_HEADER | ||||||
27 | #include "X86GenSubtargetInfo.inc" | ||||||
28 | |||||||
29 | namespace llvm { | ||||||
30 | |||||||
31 | class CallLowering; | ||||||
32 | class GlobalValue; | ||||||
33 | class InstructionSelector; | ||||||
34 | class LegalizerInfo; | ||||||
35 | class RegisterBankInfo; | ||||||
36 | class StringRef; | ||||||
37 | class TargetMachine; | ||||||
38 | |||||||
/// The X86 backend supports a number of different styles of PIC.
///
namespace PICStyles {

enum class Style {
  /// Used on i386-darwin in PIC mode.
  StubPIC,
  /// Used on 32-bit ELF when in PIC mode.
  GOT,
  /// Used on X86-64 when in PIC mode.
  RIPRel,
  /// Set when not in PIC mode.
  None
};

} // end namespace PICStyles
51 | |||||||
52 | class X86Subtarget final : public X86GenSubtargetInfo { | ||||||
53 | // NOTE: Do not add anything new to this list. Coarse, CPU name based flags | ||||||
54 | // are not a good idea. We should be migrating away from these. | ||||||
55 | enum X86ProcFamilyEnum { | ||||||
56 | Others, | ||||||
57 | IntelAtom, | ||||||
58 | IntelSLM | ||||||
59 | }; | ||||||
60 | |||||||
61 | enum X86SSEEnum { | ||||||
62 | NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F | ||||||
63 | }; | ||||||
64 | |||||||
65 | enum X863DNowEnum { | ||||||
66 | NoThreeDNow, MMX, ThreeDNow, ThreeDNowA | ||||||
67 | }; | ||||||
68 | |||||||
69 | /// X86 processor family: Intel Atom, and others | ||||||
70 | X86ProcFamilyEnum X86ProcFamily = Others; | ||||||
71 | |||||||
72 | /// Which PIC style to use | ||||||
73 | PICStyles::Style PICStyle; | ||||||
74 | |||||||
75 | const TargetMachine &TM; | ||||||
76 | |||||||
77 | /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. | ||||||
78 | X86SSEEnum X86SSELevel = NoSSE; | ||||||
79 | |||||||
80 | /// MMX, 3DNow, 3DNow Athlon, or none supported. | ||||||
81 | X863DNowEnum X863DNowLevel = NoThreeDNow; | ||||||
82 | |||||||
83 | /// True if the processor supports X87 instructions. | ||||||
84 | bool HasX87 = false; | ||||||
85 | |||||||
86 | /// True if the processor supports CMPXCHG8B. | ||||||
87 | bool HasCmpxchg8b = false; | ||||||
88 | |||||||
89 | /// True if this processor has NOPL instruction | ||||||
90 | /// (generally pentium pro+). | ||||||
91 | bool HasNOPL = false; | ||||||
92 | |||||||
93 | /// True if this processor has conditional move instructions | ||||||
94 | /// (generally pentium pro+). | ||||||
95 | bool HasCMov = false; | ||||||
96 | |||||||
97 | /// True if the processor supports X86-64 instructions. | ||||||
98 | bool HasX86_64 = false; | ||||||
99 | |||||||
100 | /// True if the processor supports POPCNT. | ||||||
101 | bool HasPOPCNT = false; | ||||||
102 | |||||||
103 | /// True if the processor supports SSE4A instructions. | ||||||
104 | bool HasSSE4A = false; | ||||||
105 | |||||||
106 | /// Target has AES instructions | ||||||
107 | bool HasAES = false; | ||||||
108 | bool HasVAES = false; | ||||||
109 | |||||||
110 | /// Target has FXSAVE/FXRESTOR instructions | ||||||
111 | bool HasFXSR = false; | ||||||
112 | |||||||
113 | /// Target has XSAVE instructions | ||||||
114 | bool HasXSAVE = false; | ||||||
115 | |||||||
116 | /// Target has XSAVEOPT instructions | ||||||
117 | bool HasXSAVEOPT = false; | ||||||
118 | |||||||
119 | /// Target has XSAVEC instructions | ||||||
120 | bool HasXSAVEC = false; | ||||||
121 | |||||||
122 | /// Target has XSAVES instructions | ||||||
123 | bool HasXSAVES = false; | ||||||
124 | |||||||
125 | /// Target has carry-less multiplication | ||||||
126 | bool HasPCLMUL = false; | ||||||
127 | bool HasVPCLMULQDQ = false; | ||||||
128 | |||||||
129 | /// Target has Galois Field Arithmetic instructions | ||||||
130 | bool HasGFNI = false; | ||||||
131 | |||||||
132 | /// Target has 3-operand fused multiply-add | ||||||
133 | bool HasFMA = false; | ||||||
134 | |||||||
135 | /// Target has 4-operand fused multiply-add | ||||||
136 | bool HasFMA4 = false; | ||||||
137 | |||||||
138 | /// Target has XOP instructions | ||||||
139 | bool HasXOP = false; | ||||||
140 | |||||||
141 | /// Target has TBM instructions. | ||||||
142 | bool HasTBM = false; | ||||||
143 | |||||||
144 | /// Target has LWP instructions | ||||||
145 | bool HasLWP = false; | ||||||
146 | |||||||
147 | /// True if the processor has the MOVBE instruction. | ||||||
148 | bool HasMOVBE = false; | ||||||
149 | |||||||
150 | /// True if the processor has the RDRAND instruction. | ||||||
151 | bool HasRDRAND = false; | ||||||
152 | |||||||
153 | /// Processor has 16-bit floating point conversion instructions. | ||||||
154 | bool HasF16C = false; | ||||||
155 | |||||||
156 | /// Processor has FS/GS base instructions. | ||||||
157 | bool HasFSGSBase = false; | ||||||
158 | |||||||
159 | /// Processor has LZCNT instruction. | ||||||
160 | bool HasLZCNT = false; | ||||||
161 | |||||||
162 | /// Processor has BMI1 instructions. | ||||||
163 | bool HasBMI = false; | ||||||
164 | |||||||
165 | /// Processor has BMI2 instructions. | ||||||
166 | bool HasBMI2 = false; | ||||||
167 | |||||||
168 | /// Processor has VBMI instructions. | ||||||
169 | bool HasVBMI = false; | ||||||
170 | |||||||
171 | /// Processor has VBMI2 instructions. | ||||||
172 | bool HasVBMI2 = false; | ||||||
173 | |||||||
174 | /// Processor has Integer Fused Multiply Add | ||||||
175 | bool HasIFMA = false; | ||||||
176 | |||||||
177 | /// Processor has RTM instructions. | ||||||
178 | bool HasRTM = false; | ||||||
179 | |||||||
180 | /// Processor has ADX instructions. | ||||||
181 | bool HasADX = false; | ||||||
182 | |||||||
183 | /// Processor has SHA instructions. | ||||||
184 | bool HasSHA = false; | ||||||
185 | |||||||
186 | /// Processor has PRFCHW instructions. | ||||||
187 | bool HasPRFCHW = false; | ||||||
188 | |||||||
189 | /// Processor has RDSEED instructions. | ||||||
190 | bool HasRDSEED = false; | ||||||
191 | |||||||
192 | /// Processor has LAHF/SAHF instructions in 64-bit mode. | ||||||
193 | bool HasLAHFSAHF64 = false; | ||||||
194 | |||||||
195 | /// Processor has MONITORX/MWAITX instructions. | ||||||
196 | bool HasMWAITX = false; | ||||||
197 | |||||||
198 | /// Processor has Cache Line Zero instruction | ||||||
199 | bool HasCLZERO = false; | ||||||
200 | |||||||
201 | /// Processor has Cache Line Demote instruction | ||||||
202 | bool HasCLDEMOTE = false; | ||||||
203 | |||||||
204 | /// Processor has MOVDIRI instruction (direct store integer). | ||||||
205 | bool HasMOVDIRI = false; | ||||||
206 | |||||||
207 | /// Processor has MOVDIR64B instruction (direct store 64 bytes). | ||||||
208 | bool HasMOVDIR64B = false; | ||||||
209 | |||||||
210 | /// Processor has ptwrite instruction. | ||||||
211 | bool HasPTWRITE = false; | ||||||
212 | |||||||
213 | /// Processor has Prefetch with intent to Write instruction | ||||||
214 | bool HasPREFETCHWT1 = false; | ||||||
215 | |||||||
216 | /// True if SHLD instructions are slow. | ||||||
217 | bool IsSHLDSlow = false; | ||||||
218 | |||||||
219 | /// True if the PMULLD instruction is slow compared to PMULLW/PMULHW and | ||||||
220 | // PMULUDQ. | ||||||
221 | bool IsPMULLDSlow = false; | ||||||
222 | |||||||
223 | /// True if the PMADDWD instruction is slow compared to PMULLD. | ||||||
224 | bool IsPMADDWDSlow = false; | ||||||
225 | |||||||
226 | /// True if unaligned memory accesses of 16-bytes are slow. | ||||||
227 | bool IsUAMem16Slow = false; | ||||||
228 | |||||||
229 | /// True if unaligned memory accesses of 32-bytes are slow. | ||||||
230 | bool IsUAMem32Slow = false; | ||||||
231 | |||||||
232 | /// True if SSE operations can have unaligned memory operands. | ||||||
233 | /// This may require setting a configuration bit in the processor. | ||||||
234 | bool HasSSEUnalignedMem = false; | ||||||
235 | |||||||
236 | /// True if this processor has the CMPXCHG16B instruction; | ||||||
237 | /// this is true for most x86-64 chips, but not the first AMD chips. | ||||||
238 | bool HasCmpxchg16b = false; | ||||||
239 | |||||||
240 | /// True if the LEA instruction should be used for adjusting | ||||||
241 | /// the stack pointer. This is an optimization for Intel Atom processors. | ||||||
242 | bool UseLeaForSP = false; | ||||||
243 | |||||||
244 | /// True if POPCNT instruction has a false dependency on the destination register. | ||||||
245 | bool HasPOPCNTFalseDeps = false; | ||||||
246 | |||||||
247 | /// True if LZCNT/TZCNT instructions have a false dependency on the destination register. | ||||||
248 | bool HasLZCNTFalseDeps = false; | ||||||
249 | |||||||
250 | /// True if it's preferable to combine to a single cross-lane shuffle | ||||||
251 | /// using a variable mask over multiple fixed shuffles. | ||||||
252 | bool HasFastVariableCrossLaneShuffle = false; | ||||||
253 | |||||||
254 | /// True if it's preferable to combine to a single per-lane shuffle | ||||||
255 | /// using a variable mask over multiple fixed shuffles. | ||||||
256 | bool HasFastVariablePerLaneShuffle = false; | ||||||
257 | |||||||
258 | /// True if vzeroupper instructions should be inserted after code that uses | ||||||
259 | /// ymm or zmm registers. | ||||||
260 | bool InsertVZEROUPPER = false; | ||||||
261 | |||||||
262 | /// True if there is no performance penalty for writing NOPs with up to | ||||||
263 | /// 7 bytes. | ||||||
264 | bool HasFast7ByteNOP = false; | ||||||
265 | |||||||
266 | /// True if there is no performance penalty for writing NOPs with up to | ||||||
267 | /// 11 bytes. | ||||||
268 | bool HasFast11ByteNOP = false; | ||||||
269 | |||||||
270 | /// True if there is no performance penalty for writing NOPs with up to | ||||||
271 | /// 15 bytes. | ||||||
272 | bool HasFast15ByteNOP = false; | ||||||
273 | |||||||
274 | /// True if gather is reasonably fast. This is true for Skylake client and | ||||||
275 | /// all AVX-512 CPUs. | ||||||
276 | bool HasFastGather = false; | ||||||
277 | |||||||
278 | /// True if hardware SQRTSS instruction is at least as fast (latency) as | ||||||
279 | /// RSQRTSS followed by a Newton-Raphson iteration. | ||||||
280 | bool HasFastScalarFSQRT = false; | ||||||
281 | |||||||
282 | /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast | ||||||
283 | /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration. | ||||||
284 | bool HasFastVectorFSQRT = false; | ||||||
285 | |||||||
286 | /// True if 8-bit divisions are significantly faster than | ||||||
287 | /// 32-bit divisions and should be used when possible. | ||||||
288 | bool HasSlowDivide32 = false; | ||||||
289 | |||||||
290 | /// True if 32-bit divides are significantly faster than | ||||||
291 | /// 64-bit divisions and should be used when possible. | ||||||
292 | bool HasSlowDivide64 = false; | ||||||
293 | |||||||
294 | /// True if LZCNT instruction is fast. | ||||||
295 | bool HasFastLZCNT = false; | ||||||
296 | |||||||
297 | /// True if SHLD based rotate is fast. | ||||||
298 | bool HasFastSHLDRotate = false; | ||||||
299 | |||||||
300 | /// True if the processor supports macrofusion. | ||||||
301 | bool HasMacroFusion = false; | ||||||
302 | |||||||
303 | /// True if the processor supports branch fusion. | ||||||
304 | bool HasBranchFusion = false; | ||||||
305 | |||||||
306 | /// True if the processor has enhanced REP MOVSB/STOSB. | ||||||
307 | bool HasERMSB = false; | ||||||
308 | |||||||
309 | /// True if the processor has fast short REP MOV. | ||||||
310 | bool HasFSRM = false; | ||||||
311 | |||||||
312 | /// True if the short functions should be padded to prevent | ||||||
313 | /// a stall when returning too early. | ||||||
314 | bool PadShortFunctions = false; | ||||||
315 | |||||||
316 | /// True if two memory operand instructions should use a temporary register | ||||||
317 | /// instead. | ||||||
318 | bool SlowTwoMemOps = false; | ||||||
319 | |||||||
320 | /// True if the LEA instruction inputs have to be ready at address generation | ||||||
321 | /// (AG) time. | ||||||
322 | bool LEAUsesAG = false; | ||||||
323 | |||||||
324 | /// True if the LEA instruction with certain arguments is slow | ||||||
325 | bool SlowLEA = false; | ||||||
326 | |||||||
327 | /// True if the LEA instruction has all three source operands: base, index, | ||||||
328 | /// and offset or if the LEA instruction uses base and index registers where | ||||||
329 | /// the base is EBP, RBP, or R13 | ||||||
330 | bool Slow3OpsLEA = false; | ||||||
331 | |||||||
332 | /// True if INC and DEC instructions are slow when writing to flags | ||||||
333 | bool SlowIncDec = false; | ||||||
334 | |||||||
335 | /// Processor has AVX-512 PreFetch Instructions | ||||||
336 | bool HasPFI = false; | ||||||
337 | |||||||
338 | /// Processor has AVX-512 Exponential and Reciprocal Instructions | ||||||
339 | bool HasERI = false; | ||||||
340 | |||||||
341 | /// Processor has AVX-512 Conflict Detection Instructions | ||||||
342 | bool HasCDI = false; | ||||||
343 | |||||||
344 | /// Processor has AVX-512 population count Instructions | ||||||
345 | bool HasVPOPCNTDQ = false; | ||||||
346 | |||||||
347 | /// Processor has AVX-512 Doubleword and Quadword instructions | ||||||
348 | bool HasDQI = false; | ||||||
349 | |||||||
350 | /// Processor has AVX-512 Byte and Word instructions | ||||||
351 | bool HasBWI = false; | ||||||
352 | |||||||
353 | /// Processor has AVX-512 Vector Length eXtensions | ||||||
354 | bool HasVLX = false; | ||||||
355 | |||||||
356 | /// Processor has PKU extensions | ||||||
357 | bool HasPKU = false; | ||||||
358 | |||||||
359 | /// Processor has AVX-512 Vector Neural Network Instructions | ||||||
360 | bool HasVNNI = false; | ||||||
361 | |||||||
362 | /// Processor has AVX Vector Neural Network Instructions | ||||||
363 | bool HasAVXVNNI = false; | ||||||
364 | |||||||
365 | /// Processor has AVX-512 bfloat16 floating-point extensions | ||||||
366 | bool HasBF16 = false; | ||||||
367 | |||||||
368 | /// Processor supports ENQCMD instructions | ||||||
369 | bool HasENQCMD = false; | ||||||
370 | |||||||
371 | /// Processor has AVX-512 Bit Algorithms instructions | ||||||
372 | bool HasBITALG = false; | ||||||
373 | |||||||
374 | /// Processor has AVX-512 vp2intersect instructions | ||||||
375 | bool HasVP2INTERSECT = false; | ||||||
376 | |||||||
377 | /// Processor supports CET SHSTK - Control-Flow Enforcement Technology | ||||||
378 | /// using Shadow Stack | ||||||
379 | bool HasSHSTK = false; | ||||||
380 | |||||||
381 | /// Processor supports Invalidate Process-Context Identifier | ||||||
382 | bool HasINVPCID = false; | ||||||
383 | |||||||
384 | /// Processor has Software Guard Extensions | ||||||
385 | bool HasSGX = false; | ||||||
386 | |||||||
387 | /// Processor supports Flush Cache Line instruction | ||||||
388 | bool HasCLFLUSHOPT = false; | ||||||
389 | |||||||
390 | /// Processor supports Cache Line Write Back instruction | ||||||
391 | bool HasCLWB = false; | ||||||
392 | |||||||
393 | /// Processor supports Write Back No Invalidate instruction | ||||||
394 | bool HasWBNOINVD = false; | ||||||
395 | |||||||
396 | /// Processor supports RDPID instruction | ||||||
397 | bool HasRDPID = false; | ||||||
398 | |||||||
399 | /// Processor supports WaitPKG instructions | ||||||
400 | bool HasWAITPKG = false; | ||||||
401 | |||||||
402 | /// Processor supports PCONFIG instruction | ||||||
403 | bool HasPCONFIG = false; | ||||||
404 | |||||||
405 | /// Processor supports key locker instructions | ||||||
406 | bool HasKL = false; | ||||||
407 | |||||||
408 | /// Processor supports key locker wide instructions | ||||||
409 | bool HasWIDEKL = false; | ||||||
410 | |||||||
411 | /// Processor supports HRESET instruction | ||||||
412 | bool HasHRESET = false; | ||||||
413 | |||||||
414 | /// Processor supports SERIALIZE instruction | ||||||
415 | bool HasSERIALIZE = false; | ||||||
416 | |||||||
417 | /// Processor supports TSXLDTRK instruction | ||||||
418 | bool HasTSXLDTRK = false; | ||||||
419 | |||||||
420 | /// Processor has AMX support | ||||||
421 | bool HasAMXTILE = false; | ||||||
422 | bool HasAMXBF16 = false; | ||||||
423 | bool HasAMXINT8 = false; | ||||||
424 | |||||||
425 | /// Processor supports User Level Interrupt instructions | ||||||
426 | bool HasUINTR = false; | ||||||
427 | |||||||
428 | /// Processor has a single uop BEXTR implementation. | ||||||
429 | bool HasFastBEXTR = false; | ||||||
430 | |||||||
431 | /// Try harder to combine to horizontal vector ops if they are fast. | ||||||
432 | bool HasFastHorizontalOps = false; | ||||||
433 | |||||||
434 | /// Prefer a left/right scalar logical shifts pair over a shift+and pair. | ||||||
435 | bool HasFastScalarShiftMasks = false; | ||||||
436 | |||||||
437 | /// Prefer a left/right vector logical shifts pair over a shift+and pair. | ||||||
438 | bool HasFastVectorShiftMasks = false; | ||||||
439 | |||||||
440 | /// Prefer a movbe over a single-use load + bswap / single-use bswap + store. | ||||||
441 | bool HasFastMOVBE = false; | ||||||
442 | |||||||
443 | /// Use a retpoline thunk rather than indirect calls to block speculative | ||||||
444 | /// execution. | ||||||
445 | bool UseRetpolineIndirectCalls = false; | ||||||
446 | |||||||
447 | /// Use a retpoline thunk or remove any indirect branch to block speculative | ||||||
448 | /// execution. | ||||||
449 | bool UseRetpolineIndirectBranches = false; | ||||||
450 | |||||||
451 | /// Deprecated flag, query `UseRetpolineIndirectCalls` and | ||||||
452 | /// `UseRetpolineIndirectBranches` instead. | ||||||
453 | bool DeprecatedUseRetpoline = false; | ||||||
454 | |||||||
455 | /// When using a retpoline thunk, call an externally provided thunk rather | ||||||
456 | /// than emitting one inside the compiler. | ||||||
457 | bool UseRetpolineExternalThunk = false; | ||||||
458 | |||||||
459 | /// Prevent generation of indirect call/branch instructions from memory, | ||||||
460 | /// and force all indirect call/branch instructions from a register to be | ||||||
461 | /// preceded by an LFENCE. Also decompose RET instructions into a | ||||||
462 | /// POP+LFENCE+JMP sequence. | ||||||
463 | bool UseLVIControlFlowIntegrity = false; | ||||||
464 | |||||||
465 | /// Enable Speculative Execution Side Effect Suppression | ||||||
466 | bool UseSpeculativeExecutionSideEffectSuppression = false; | ||||||
467 | |||||||
468 | /// Insert LFENCE instructions to prevent data speculatively injected into | ||||||
469 | /// loads from being used maliciously. | ||||||
470 | bool UseLVILoadHardening = false; | ||||||
471 | |||||||
472 | /// Use software floating point for code generation. | ||||||
473 | bool UseSoftFloat = false; | ||||||
474 | |||||||
475 | /// Use alias analysis during code generation. | ||||||
476 | bool UseAA = false; | ||||||
477 | |||||||
478 | /// The minimum alignment known to hold of the stack frame on | ||||||
479 | /// entry to the function and which must be maintained by every function. | ||||||
480 | Align stackAlignment = Align(4); | ||||||
481 | |||||||
482 | Align TileConfigAlignment = Align(4); | ||||||
483 | |||||||
484 | /// Whether function prologues should save register arguments on the stack. | ||||||
485 | bool SaveArgs = false; | ||||||
486 | |||||||
487 | /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. | ||||||
488 | /// | ||||||
489 | // FIXME: this is a known good value for Yonah. How about others? | ||||||
490 | unsigned MaxInlineSizeThreshold = 128; | ||||||
491 | |||||||
492 | /// Indicates target prefers 128 bit instructions. | ||||||
493 | bool Prefer128Bit = false; | ||||||
494 | |||||||
495 | /// Indicates target prefers 256 bit instructions. | ||||||
496 | bool Prefer256Bit = false; | ||||||
497 | |||||||
498 | /// Indicates target prefers AVX512 mask registers. | ||||||
499 | bool PreferMaskRegisters = false; | ||||||
500 | |||||||
501 | /// Use Goldmont specific floating point div/sqrt costs. | ||||||
502 | bool UseGLMDivSqrtCosts = false; | ||||||
503 | |||||||
504 | /// What processor and OS we're targeting. | ||||||
505 | Triple TargetTriple; | ||||||
506 | |||||||
507 | /// GlobalISel related APIs. | ||||||
508 | std::unique_ptr<CallLowering> CallLoweringInfo; | ||||||
509 | std::unique_ptr<LegalizerInfo> Legalizer; | ||||||
510 | std::unique_ptr<RegisterBankInfo> RegBankInfo; | ||||||
511 | std::unique_ptr<InstructionSelector> InstSelector; | ||||||
512 | |||||||
513 | private: | ||||||
514 | /// Override the stack alignment. | ||||||
515 | MaybeAlign StackAlignOverride; | ||||||
516 | |||||||
517 | /// Preferred vector width from function attribute. | ||||||
518 | unsigned PreferVectorWidthOverride; | ||||||
519 | |||||||
520 | /// Resolved preferred vector width from function attribute and subtarget | ||||||
521 | /// features. | ||||||
522 | unsigned PreferVectorWidth = UINT32_MAX; | ||||||
523 | |||||||
524 | /// Required vector width from function attribute. | ||||||
525 | unsigned RequiredVectorWidth; | ||||||
526 | |||||||
527 | /// True if compiling for 64-bit, false for 16-bit or 32-bit. | ||||||
528 | bool In64BitMode = false; | ||||||
529 | |||||||
530 | /// True if compiling for 32-bit, false for 16-bit or 64-bit. | ||||||
531 | bool In32BitMode = false; | ||||||
532 | |||||||
533 | /// True if compiling for 16-bit, false for 32-bit or 64-bit. | ||||||
534 | bool In16BitMode = false; | ||||||
535 | |||||||
536 | X86SelectionDAGInfo TSInfo; | ||||||
537 | // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which | ||||||
538 | // X86TargetLowering needs. | ||||||
539 | X86InstrInfo InstrInfo; | ||||||
540 | X86TargetLowering TLInfo; | ||||||
541 | X86FrameLowering FrameLowering; | ||||||
542 | |||||||
543 | public: | ||||||
544 | /// This constructor initializes the data members to match that | ||||||
545 | /// of the specified triple. | ||||||
546 | /// | ||||||
547 | X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, | ||||||
548 | const X86TargetMachine &TM, MaybeAlign StackAlignOverride, | ||||||
549 | unsigned PreferVectorWidthOverride, | ||||||
550 | unsigned RequiredVectorWidth); | ||||||
551 | |||||||
552 | const X86TargetLowering *getTargetLowering() const override { | ||||||
553 | return &TLInfo; | ||||||
554 | } | ||||||
555 | |||||||
556 | const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; } | ||||||
557 | |||||||
558 | const X86FrameLowering *getFrameLowering() const override { | ||||||
559 | return &FrameLowering; | ||||||
560 | } | ||||||
561 | |||||||
562 | const X86SelectionDAGInfo *getSelectionDAGInfo() const override { | ||||||
563 | return &TSInfo; | ||||||
564 | } | ||||||
565 | |||||||
566 | const X86RegisterInfo *getRegisterInfo() const override { | ||||||
567 | return &getInstrInfo()->getRegisterInfo(); | ||||||
568 | } | ||||||
569 | |||||||
570 | bool getSaveArgs() const { return SaveArgs; } | ||||||
571 | |||||||
572 | unsigned getTileConfigSize() const { return 64; } | ||||||
573 | Align getTileConfigAlignment() const { return TileConfigAlignment; } | ||||||
574 | |||||||
575 | /// Returns the minimum alignment known to hold of the | ||||||
576 | /// stack frame on entry to the function and which must be maintained by every | ||||||
577 | /// function for this subtarget. | ||||||
578 | Align getStackAlignment() const { return stackAlignment; } | ||||||
579 | |||||||
580 | /// Returns the maximum memset / memcpy size | ||||||
581 | /// that still makes it profitable to inline the call. | ||||||
582 | unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; } | ||||||
583 | |||||||
584 | /// ParseSubtargetFeatures - Parses features string setting specified | ||||||
585 | /// subtarget options. Definition of function is auto generated by tblgen. | ||||||
586 | void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); | ||||||
587 | |||||||
588 | /// Methods used by Global ISel | ||||||
589 | const CallLowering *getCallLowering() const override; | ||||||
590 | InstructionSelector *getInstructionSelector() const override; | ||||||
591 | const LegalizerInfo *getLegalizerInfo() const override; | ||||||
592 | const RegisterBankInfo *getRegBankInfo() const override; | ||||||
593 | |||||||
594 | private: | ||||||
595 | /// Initialize the full set of dependencies so we can use an initializer | ||||||
596 | /// list for X86Subtarget. | ||||||
597 | X86Subtarget &initializeSubtargetDependencies(StringRef CPU, | ||||||
598 | StringRef TuneCPU, | ||||||
599 | StringRef FS); | ||||||
600 | void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); | ||||||
601 | |||||||
602 | public: | ||||||
603 | /// Is this x86_64? (disregarding specific ABI / programming model) | ||||||
604 | bool is64Bit() const { | ||||||
605 | return In64BitMode; | ||||||
606 | } | ||||||
607 | |||||||
608 | bool is32Bit() const { | ||||||
609 | return In32BitMode; | ||||||
610 | } | ||||||
611 | |||||||
612 | bool is16Bit() const { | ||||||
613 | return In16BitMode; | ||||||
614 | } | ||||||
615 | |||||||
616 | /// Is this x86_64 with the ILP32 programming model (x32 ABI)? | ||||||
617 | bool isTarget64BitILP32() const { | ||||||
618 | return In64BitMode && (TargetTriple.isX32() || TargetTriple.isOSNaCl()); | ||||||
619 | } | ||||||
620 | |||||||
621 | /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? | ||||||
622 | bool isTarget64BitLP64() const { | ||||||
623 | return In64BitMode && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl()); | ||||||
624 | } | ||||||
625 | |||||||
626 | PICStyles::Style getPICStyle() const { return PICStyle; } | ||||||
627 | void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } | ||||||
628 | |||||||
629 | bool hasX87() const { return HasX87; } | ||||||
630 | bool hasCmpxchg8b() const { return HasCmpxchg8b; } | ||||||
631 | bool hasNOPL() const { return HasNOPL; } | ||||||
632 | // SSE codegen depends on cmovs, and all SSE1+ processors support them. | ||||||
633 | // All 64-bit processors support cmov. | ||||||
634 | bool hasCMov() const { return HasCMov || X86SSELevel >= SSE1 || is64Bit(); } | ||||||
635 | bool hasSSE1() const { return X86SSELevel >= SSE1; } | ||||||
636 | bool hasSSE2() const { return X86SSELevel >= SSE2; } | ||||||
637 | bool hasSSE3() const { return X86SSELevel >= SSE3; } | ||||||
638 | bool hasSSSE3() const { return X86SSELevel >= SSSE3; } | ||||||
639 | bool hasSSE41() const { return X86SSELevel >= SSE41; } | ||||||
640 | bool hasSSE42() const { return X86SSELevel >= SSE42; } | ||||||
641 | bool hasAVX() const { return X86SSELevel >= AVX; } | ||||||
642 | bool hasAVX2() const { return X86SSELevel >= AVX2; } | ||||||
643 | bool hasAVX512() const { return X86SSELevel >= AVX512F; } | ||||||
644 | bool hasInt256() const { return hasAVX2(); } | ||||||
645 | bool hasSSE4A() const { return HasSSE4A; } | ||||||
646 | bool hasMMX() const { return X863DNowLevel >= MMX; } | ||||||
647 | bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } | ||||||
648 | bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } | ||||||
649 | bool hasPOPCNT() const { return HasPOPCNT; } | ||||||
650 | bool hasAES() const { return HasAES; } | ||||||
651 | bool hasVAES() const { return HasVAES; } | ||||||
652 | bool hasFXSR() const { return HasFXSR; } | ||||||
653 | bool hasXSAVE() const { return HasXSAVE; } | ||||||
654 | bool hasXSAVEOPT() const { return HasXSAVEOPT; } | ||||||
655 | bool hasXSAVEC() const { return HasXSAVEC; } | ||||||
656 | bool hasXSAVES() const { return HasXSAVES; } | ||||||
657 | bool hasPCLMUL() const { return HasPCLMUL; } | ||||||
658 | bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; } | ||||||
659 | bool hasGFNI() const { return HasGFNI; } | ||||||
660 | // Prefer FMA4 to FMA - it's better for commutation/memory folding and | ||||||
661 | // has equal or better performance on all supported targets. | ||||||
662 | bool hasFMA() const { return HasFMA; } | ||||||
663 | bool hasFMA4() const { return HasFMA4; } | ||||||
664 | bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } | ||||||
665 | bool hasXOP() const { return HasXOP; } | ||||||
666 | bool hasTBM() const { return HasTBM; } | ||||||
667 | bool hasLWP() const { return HasLWP; } | ||||||
668 | bool hasMOVBE() const { return HasMOVBE; } | ||||||
669 | bool hasRDRAND() const { return HasRDRAND; } | ||||||
670 | bool hasF16C() const { return HasF16C; } | ||||||
671 | bool hasFSGSBase() const { return HasFSGSBase; } | ||||||
672 | bool hasLZCNT() const { return HasLZCNT; } | ||||||
673 | bool hasBMI() const { return HasBMI; } | ||||||
674 | bool hasBMI2() const { return HasBMI2; } | ||||||
675 | bool hasVBMI() const { return HasVBMI; } | ||||||
676 | bool hasVBMI2() const { return HasVBMI2; } | ||||||
677 | bool hasIFMA() const { return HasIFMA; } | ||||||
678 | bool hasRTM() const { return HasRTM; } | ||||||
679 | bool hasADX() const { return HasADX; } | ||||||
680 | bool hasSHA() const { return HasSHA; } | ||||||
681 | bool hasPRFCHW() const { return HasPRFCHW; } | ||||||
682 | bool hasPREFETCHWT1() const { return HasPREFETCHWT1; } | ||||||
683 | bool hasPrefetchW() const { | ||||||
684 | // The PREFETCHW instruction was added with 3DNow but later CPUs gave it | ||||||
685 | // its own CPUID bit as part of deprecating 3DNow. Intel eventually added | ||||||
686 | // it and KNL has another that prefetches to L2 cache. We assume the | ||||||
687 | // L1 version exists if the L2 version does. | ||||||
688 | return has3DNow() || hasPRFCHW() || hasPREFETCHWT1(); | ||||||
689 | } | ||||||
690 | bool hasSSEPrefetch() const { | ||||||
691 | // We implicitly enable these when we have a write prefix supporting cache | ||||||
692 | // level OR if we have prfchw, but don't already have a read prefetch from | ||||||
693 | // 3dnow. | ||||||
694 | return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1(); | ||||||
695 | } | ||||||
696 | bool hasRDSEED() const { return HasRDSEED; } | ||||||
697 | bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); } | ||||||
698 | bool hasMWAITX() const { return HasMWAITX; } | ||||||
699 | bool hasCLZERO() const { return HasCLZERO; } | ||||||
700 | bool hasCLDEMOTE() const { return HasCLDEMOTE; } | ||||||
701 | bool hasMOVDIRI() const { return HasMOVDIRI; } | ||||||
702 | bool hasMOVDIR64B() const { return HasMOVDIR64B; } | ||||||
703 | bool hasPTWRITE() const { return HasPTWRITE; } | ||||||
704 | bool isSHLDSlow() const { return IsSHLDSlow; } | ||||||
705 | bool isPMULLDSlow() const { return IsPMULLDSlow; } | ||||||
706 | bool isPMADDWDSlow() const { return IsPMADDWDSlow; } | ||||||
707 | bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } | ||||||
708 | bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } | ||||||
709 | bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } | ||||||
710 | bool hasCmpxchg16b() const { return HasCmpxchg16b && is64Bit(); } | ||||||
711 | bool useLeaForSP() const { return UseLeaForSP; } | ||||||
712 | bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; } | ||||||
713 | bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; } | ||||||
714 | bool hasFastVariableCrossLaneShuffle() const { | ||||||
715 | return HasFastVariableCrossLaneShuffle; | ||||||
716 | } | ||||||
717 | bool hasFastVariablePerLaneShuffle() const { | ||||||
718 | return HasFastVariablePerLaneShuffle; | ||||||
719 | } | ||||||
720 | bool insertVZEROUPPER() const { return InsertVZEROUPPER; } | ||||||
721 | bool hasFastGather() const { return HasFastGather; } | ||||||
722 | bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } | ||||||
723 | bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } | ||||||
724 | bool hasFastLZCNT() const { return HasFastLZCNT; } | ||||||
725 | bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } | ||||||
726 | bool hasFastBEXTR() const { return HasFastBEXTR; } | ||||||
727 | bool hasFastHorizontalOps() const { return HasFastHorizontalOps; } | ||||||
728 | bool hasFastScalarShiftMasks() const { return HasFastScalarShiftMasks; } | ||||||
729 | bool hasFastVectorShiftMasks() const { return HasFastVectorShiftMasks; } | ||||||
730 | bool hasFastMOVBE() const { return HasFastMOVBE; } | ||||||
731 | bool hasMacroFusion() const { return HasMacroFusion; } | ||||||
732 | bool hasBranchFusion() const { return HasBranchFusion; } | ||||||
733 | bool hasERMSB() const { return HasERMSB; } | ||||||
734 | bool hasFSRM() const { return HasFSRM; } | ||||||
735 | bool hasSlowDivide32() const { return HasSlowDivide32; } | ||||||
736 | bool hasSlowDivide64() const { return HasSlowDivide64; } | ||||||
737 | bool padShortFunctions() const { return PadShortFunctions; } | ||||||
738 | bool slowTwoMemOps() const { return SlowTwoMemOps; } | ||||||
739 | bool LEAusesAG() const { return LEAUsesAG; } | ||||||
740 | bool slowLEA() const { return SlowLEA; } | ||||||
741 | bool slow3OpsLEA() const { return Slow3OpsLEA; } | ||||||
742 | bool slowIncDec() const { return SlowIncDec; } | ||||||
743 | bool hasCDI() const { return HasCDI; } | ||||||
744 | bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; } | ||||||
745 | bool hasPFI() const { return HasPFI; } | ||||||
746 | bool hasERI() const { return HasERI; } | ||||||
747 | bool hasDQI() const { return HasDQI; } | ||||||
748 | bool hasBWI() const { return HasBWI; } | ||||||
749 | bool hasVLX() const { return HasVLX; } | ||||||
750 | bool hasPKU() const { return HasPKU; } | ||||||
751 | bool hasVNNI() const { return HasVNNI; } | ||||||
752 | bool hasBF16() const { return HasBF16; } | ||||||
753 | bool hasVP2INTERSECT() const { return HasVP2INTERSECT; } | ||||||
754 | bool hasBITALG() const { return HasBITALG; } | ||||||
755 | bool hasSHSTK() const { return HasSHSTK; } | ||||||
756 | bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } | ||||||
757 | bool hasCLWB() const { return HasCLWB; } | ||||||
758 | bool hasWBNOINVD() const { return HasWBNOINVD; } | ||||||
759 | bool hasRDPID() const { return HasRDPID; } | ||||||
760 | bool hasWAITPKG() const { return HasWAITPKG; } | ||||||
761 | bool hasPCONFIG() const { return HasPCONFIG; } | ||||||
762 | bool hasSGX() const { return HasSGX; } | ||||||
763 | bool hasINVPCID() const { return HasINVPCID; } | ||||||
764 | bool hasENQCMD() const { return HasENQCMD; } | ||||||
765 | bool hasKL() const { return HasKL; } | ||||||
766 | bool hasWIDEKL() const { return HasWIDEKL; } | ||||||
767 | bool hasHRESET() const { return HasHRESET; } | ||||||
768 | bool hasSERIALIZE() const { return HasSERIALIZE; } | ||||||
769 | bool hasTSXLDTRK() const { return HasTSXLDTRK; } | ||||||
770 | bool hasUINTR() const { return HasUINTR; } | ||||||
771 | bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; } | ||||||
772 | bool useRetpolineIndirectBranches() const { | ||||||
773 | return UseRetpolineIndirectBranches; | ||||||
774 | } | ||||||
775 | bool hasAVXVNNI() const { return HasAVXVNNI; } | ||||||
776 | bool hasAMXTILE() const { return HasAMXTILE; } | ||||||
777 | bool hasAMXBF16() const { return HasAMXBF16; } | ||||||
778 | bool hasAMXINT8() const { return HasAMXINT8; } | ||||||
779 | bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } | ||||||
780 | |||||||
781 | // These are generic getters that OR together all of the thunk types | ||||||
782 | // supported by the subtarget. Therefore useIndirectThunk*() will return true | ||||||
783 | // if any respective thunk feature is enabled. | ||||||
784 | bool useIndirectThunkCalls() const { | ||||||
785 | return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity(); | ||||||
786 | } | ||||||
787 | bool useIndirectThunkBranches() const { | ||||||
788 | return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity(); | ||||||
789 | } | ||||||
790 | |||||||
791 | bool preferMaskRegisters() const { return PreferMaskRegisters; } | ||||||
792 | bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; } | ||||||
793 | bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; } | ||||||
794 | bool useLVILoadHardening() const { return UseLVILoadHardening; } | ||||||
795 | bool useSpeculativeExecutionSideEffectSuppression() const { | ||||||
796 | return UseSpeculativeExecutionSideEffectSuppression; | ||||||
797 | } | ||||||
798 | |||||||
799 | unsigned getPreferVectorWidth() const { return PreferVectorWidth; } | ||||||
800 | unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } | ||||||
801 | |||||||
802 | // Helper functions to determine when we should allow widening to 512-bit | ||||||
803 | // during codegen. | ||||||
804 | // TODO: Currently we're always allowing widening on CPUs without VLX, | ||||||
805 | // because for many cases we don't have a better option. | ||||||
806 | bool canExtendTo512DQ() const { | ||||||
807 | return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512); | ||||||
808 | } | ||||||
809 | bool canExtendTo512BW() const { | ||||||
810 | return hasBWI() && canExtendTo512DQ(); | ||||||
811 | } | ||||||
812 | |||||||
813 | // If there are no 512-bit vectors and we prefer not to use 512-bit registers, | ||||||
814 | // disable them in the legalizer. | ||||||
815 | bool useAVX512Regs() const { | ||||||
816 | return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256); | ||||||
817 | } | ||||||
818 | |||||||
819 | bool useBWIRegs() const { | ||||||
820 | return hasBWI() && useAVX512Regs(); | ||||||
821 | } | ||||||
822 | |||||||
823 | bool isXRaySupported() const override { return is64Bit(); } | ||||||
824 | |||||||
825 | /// TODO: to be removed later and replaced with suitable properties | ||||||
826 | bool isAtom() const { return X86ProcFamily == IntelAtom; } | ||||||
827 | bool isSLM() const { return X86ProcFamily == IntelSLM; } | ||||||
828 | bool useSoftFloat() const { return UseSoftFloat; } | ||||||
829 | bool useAA() const override { return UseAA; } | ||||||
830 | |||||||
831 | /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for | ||||||
832 | /// no-sse2). There isn't any reason to disable it if the target processor | ||||||
833 | /// supports it. | ||||||
834 | bool hasMFence() const { return hasSSE2() || is64Bit(); } | ||||||
835 | |||||||
836 | const Triple &getTargetTriple() const { return TargetTriple; } | ||||||
837 | |||||||
838 | bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } | ||||||
839 | bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } | ||||||
840 | bool isTargetOpenBSD() const { return TargetTriple.isOSOpenBSD(); } | ||||||
841 | bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } | ||||||
842 | bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); } | ||||||
843 | bool isTargetPS4() const { return TargetTriple.isPS4CPU(); } | ||||||
844 | |||||||
845 | bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } | ||||||
846 | bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } | ||||||
847 | bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } | ||||||
848 | |||||||
849 | bool isTargetLinux() const { return TargetTriple.isOSLinux(); } | ||||||
850 | bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); } | ||||||
851 | bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); } | ||||||
852 | bool isTargetAndroid() const { return TargetTriple.isAndroid(); } | ||||||
853 | bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } | ||||||
854 | bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } | ||||||
855 | bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } | ||||||
856 | bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } | ||||||
857 | bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } | ||||||
858 | |||||||
859 | bool isTargetWindowsMSVC() const { | ||||||
860 | return TargetTriple.isWindowsMSVCEnvironment(); | ||||||
861 | } | ||||||
862 | |||||||
863 | bool isTargetWindowsCoreCLR() const { | ||||||
864 | return TargetTriple.isWindowsCoreCLREnvironment(); | ||||||
865 | } | ||||||
866 | |||||||
867 | bool isTargetWindowsCygwin() const { | ||||||
868 | return TargetTriple.isWindowsCygwinEnvironment(); | ||||||
869 | } | ||||||
870 | |||||||
871 | bool isTargetWindowsGNU() const { | ||||||
872 | return TargetTriple.isWindowsGNUEnvironment(); | ||||||
873 | } | ||||||
874 | |||||||
875 | bool isTargetWindowsItanium() const { | ||||||
876 | return TargetTriple.isWindowsItaniumEnvironment(); | ||||||
877 | } | ||||||
878 | |||||||
879 | bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } | ||||||
880 | |||||||
881 | bool isOSWindows() const { return TargetTriple.isOSWindows(); } | ||||||
882 | |||||||
883 | bool isTargetWin64() const { return In64BitMode && isOSWindows(); } | ||||||
884 | |||||||
885 | bool isTargetWin32() const { return !In64BitMode && isOSWindows(); } | ||||||
886 | |||||||
887 | bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; } | ||||||
888 | bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; } | ||||||
889 | |||||||
890 | bool isPICStyleStubPIC() const { | ||||||
891 | return PICStyle == PICStyles::Style::StubPIC; | ||||||
892 | } | ||||||
893 | |||||||
894 | bool isPositionIndependent() const; | ||||||
895 | |||||||
896 | bool isCallingConvWin64(CallingConv::ID CC) const { | ||||||
897 | switch (CC) { | ||||||
898 | // On Win64, all these conventions just use the default convention. | ||||||
899 | case CallingConv::C: | ||||||
900 | case CallingConv::Fast: | ||||||
901 | case CallingConv::Tail: | ||||||
902 | case CallingConv::Swift: | ||||||
903 | case CallingConv::SwiftTail: | ||||||
904 | case CallingConv::X86_FastCall: | ||||||
905 | case CallingConv::X86_StdCall: | ||||||
906 | case CallingConv::X86_ThisCall: | ||||||
907 | case CallingConv::X86_VectorCall: | ||||||
908 | case CallingConv::Intel_OCL_BI: | ||||||
909 | return isTargetWin64(); | ||||||
910 | // This convention allows using the Win64 convention on other targets. | ||||||
911 | case CallingConv::Win64: | ||||||
912 | return true; | ||||||
913 | // This convention allows using the SysV convention on Windows targets. | ||||||
914 | case CallingConv::X86_64_SysV: | ||||||
915 | return false; | ||||||
916 | // Otherwise, who knows what this is. | ||||||
917 | default: | ||||||
918 | return false; | ||||||
919 | } | ||||||
920 | } | ||||||
921 | |||||||
922 | /// Classify a global variable reference for the current subtarget according | ||||||
923 | /// to how we should reference it in a non-pcrel context. | ||||||
924 | unsigned char classifyLocalReference(const GlobalValue *GV) const; | ||||||
925 | |||||||
926 | unsigned char classifyGlobalReference(const GlobalValue *GV, | ||||||
927 | const Module &M) const; | ||||||
928 | unsigned char classifyGlobalReference(const GlobalValue *GV) const; | ||||||
929 | |||||||
930 | /// Classify a global function reference for the current subtarget. | ||||||
931 | unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, | ||||||
932 | const Module &M) const; | ||||||
933 | unsigned char classifyGlobalFunctionReference(const GlobalValue *GV) const; | ||||||
934 | |||||||
935 | /// Classify a blockaddress reference for the current subtarget according to | ||||||
936 | /// how we should reference it in a non-pcrel context. | ||||||
937 | unsigned char classifyBlockAddressReference() const; | ||||||
938 | |||||||
939 | /// Return true if the subtarget allows calls to immediate address. | ||||||
940 | bool isLegalToCallImmediateAddr() const; | ||||||
941 | |||||||
942 | /// If we are using indirect thunks, we need to expand indirectbr to avoid it | ||||||
943 | /// lowering to an actual indirect jump. | ||||||
944 | bool enableIndirectBrExpand() const override { | ||||||
945 | return useIndirectThunkBranches(); | ||||||
946 | } | ||||||
947 | |||||||
948 | /// Enable the MachineScheduler pass for all X86 subtargets. | ||||||
949 | bool enableMachineScheduler() const override { return true; } | ||||||
950 | |||||||
951 | bool enableEarlyIfConversion() const override; | ||||||
952 | |||||||
953 | void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>> | ||||||
954 | &Mutations) const override; | ||||||
955 | |||||||
956 | AntiDepBreakMode getAntiDepBreakMode() const override { | ||||||
957 | return TargetSubtargetInfo::ANTIDEP_CRITICAL; | ||||||
958 | } | ||||||
959 | |||||||
960 | bool enableAdvancedRASplitCost() const override { return false; } | ||||||
961 | }; | ||||||
962 | |||||||
963 | } // end namespace llvm | ||||||
964 | |||||||
965 | #endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H |
1 | //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// | ||||||
2 | // | ||||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
6 | // | ||||||
7 | //===----------------------------------------------------------------------===// | ||||||
8 | // | ||||||
9 | /// \file | ||||||
10 | /// This file provides a helper that implements much of the TTI interface in | ||||||
11 | /// terms of the target-independent code generator and TargetLowering | ||||||
12 | /// interfaces. | ||||||
13 | // | ||||||
14 | //===----------------------------------------------------------------------===// | ||||||
15 | |||||||
16 | #ifndef LLVM_CODEGEN_BASICTTIIMPL_H | ||||||
17 | #define LLVM_CODEGEN_BASICTTIIMPL_H | ||||||
18 | |||||||
19 | #include "llvm/ADT/APInt.h" | ||||||
20 | #include "llvm/ADT/ArrayRef.h" | ||||||
21 | #include "llvm/ADT/BitVector.h" | ||||||
22 | #include "llvm/ADT/SmallPtrSet.h" | ||||||
23 | #include "llvm/ADT/SmallVector.h" | ||||||
24 | #include "llvm/Analysis/LoopInfo.h" | ||||||
25 | #include "llvm/Analysis/TargetTransformInfo.h" | ||||||
26 | #include "llvm/Analysis/TargetTransformInfoImpl.h" | ||||||
27 | #include "llvm/CodeGen/ISDOpcodes.h" | ||||||
28 | #include "llvm/CodeGen/TargetLowering.h" | ||||||
29 | #include "llvm/CodeGen/TargetSubtargetInfo.h" | ||||||
30 | #include "llvm/CodeGen/ValueTypes.h" | ||||||
31 | #include "llvm/IR/BasicBlock.h" | ||||||
32 | #include "llvm/IR/Constant.h" | ||||||
33 | #include "llvm/IR/Constants.h" | ||||||
34 | #include "llvm/IR/DataLayout.h" | ||||||
35 | #include "llvm/IR/DerivedTypes.h" | ||||||
36 | #include "llvm/IR/InstrTypes.h" | ||||||
37 | #include "llvm/IR/Instruction.h" | ||||||
38 | #include "llvm/IR/Instructions.h" | ||||||
39 | #include "llvm/IR/Intrinsics.h" | ||||||
40 | #include "llvm/IR/Operator.h" | ||||||
41 | #include "llvm/IR/Type.h" | ||||||
42 | #include "llvm/IR/Value.h" | ||||||
43 | #include "llvm/Support/Casting.h" | ||||||
44 | #include "llvm/Support/CommandLine.h" | ||||||
45 | #include "llvm/Support/ErrorHandling.h" | ||||||
46 | #include "llvm/Support/MachineValueType.h" | ||||||
47 | #include "llvm/Support/MathExtras.h" | ||||||
48 | #include "llvm/Target/TargetMachine.h" | ||||||
49 | #include <algorithm> | ||||||
50 | #include <cassert> | ||||||
51 | #include <cstdint> | ||||||
52 | #include <limits> | ||||||
53 | #include <utility> | ||||||
54 | |||||||
55 | namespace llvm { | ||||||
56 | |||||||
57 | class Function; | ||||||
58 | class GlobalValue; | ||||||
59 | class LLVMContext; | ||||||
60 | class ScalarEvolution; | ||||||
61 | class SCEV; | ||||||
62 | class TargetMachine; | ||||||
63 | |||||||
64 | extern cl::opt<unsigned> PartialUnrollingThreshold; | ||||||
65 | |||||||
66 | /// Base class which can be used to help build a TTI implementation. | ||||||
67 | /// | ||||||
68 | /// This class provides as much implementation of the TTI interface as is | ||||||
69 | /// possible using the target independent parts of the code generator. | ||||||
70 | /// | ||||||
71 | /// In order to subclass it, your class must implement a getST() method to | ||||||
72 | /// return the subtarget, and a getTLI() method to return the target lowering. | ||||||
73 | /// We need these methods implemented in the derived class so that this class | ||||||
74 | /// doesn't have to duplicate storage for them. | ||||||
75 | template <typename T> | ||||||
76 | class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { | ||||||
77 | private: | ||||||
78 | using BaseT = TargetTransformInfoImplCRTPBase<T>; | ||||||
79 | using TTI = TargetTransformInfo; | ||||||
80 | |||||||
81 | /// Helper function to access this as a T. | ||||||
82 | T *thisT() { return static_cast<T *>(this); } | ||||||
83 | |||||||
84 | /// Estimate a cost of Broadcast as an extract and sequence of insert | ||||||
85 | /// operations. | ||||||
86 | InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) { | ||||||
87 | InstructionCost Cost = 0; | ||||||
88 | // Broadcast cost is equal to the cost of extracting the zero'th element | ||||||
89 | // plus the cost of inserting it into every element of the result vector. | ||||||
90 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0); | ||||||
91 | |||||||
92 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { | ||||||
93 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); | ||||||
94 | } | ||||||
95 | return Cost; | ||||||
96 | } | ||||||
97 | |||||||
98 | /// Estimate a cost of shuffle as a sequence of extract and insert | ||||||
99 | /// operations. | ||||||
100 | InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) { | ||||||
101 | InstructionCost Cost = 0; | ||||||
102 | // Shuffle cost is equal to the cost of extracting element from its argument | ||||||
103 | // plus the cost of inserting them onto the result vector. | ||||||
104 | |||||||
105 | // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from | ||||||
106 | // index 0 of first vector, index 1 of second vector,index 2 of first | ||||||
107 | // vector and finally index 3 of second vector and insert them at index | ||||||
108 | // <0,1,2,3> of result vector. | ||||||
109 | for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { | ||||||
110 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); | ||||||
111 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i); | ||||||
112 | } | ||||||
113 | return Cost; | ||||||
114 | } | ||||||
115 | |||||||
116 | /// Estimate a cost of subvector extraction as a sequence of extract and | ||||||
117 | /// insert operations. | ||||||
118 | InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index, | ||||||
119 | FixedVectorType *SubVTy) { | ||||||
120 | assert(VTy && SubVTy &&((void)0) | ||||||
121 | "Can only extract subvectors from vectors")((void)0); | ||||||
122 | int NumSubElts = SubVTy->getNumElements(); | ||||||
123 | assert((!isa<FixedVectorType>(VTy) ||((void)0) | ||||||
124 | (Index + NumSubElts) <=((void)0) | ||||||
125 | (int)cast<FixedVectorType>(VTy)->getNumElements()) &&((void)0) | ||||||
126 | "SK_ExtractSubvector index out of range")((void)0); | ||||||
127 | |||||||
128 | InstructionCost Cost = 0; | ||||||
129 | // Subvector extraction cost is equal to the cost of extracting element from | ||||||
130 | // the source type plus the cost of inserting them into the result vector | ||||||
131 | // type. | ||||||
132 | for (int i = 0; i != NumSubElts; ++i) { | ||||||
133 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, | ||||||
134 | i + Index); | ||||||
135 | Cost += | ||||||
136 | thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i); | ||||||
137 | } | ||||||
138 | return Cost; | ||||||
139 | } | ||||||
140 | |||||||
141 | /// Estimate a cost of subvector insertion as a sequence of extract and | ||||||
142 | /// insert operations. | ||||||
143 | InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index, | ||||||
144 | FixedVectorType *SubVTy) { | ||||||
145 | assert(VTy && SubVTy &&((void)0) | ||||||
146 | "Can only insert subvectors into vectors")((void)0); | ||||||
147 | int NumSubElts = SubVTy->getNumElements(); | ||||||
148 | assert((!isa<FixedVectorType>(VTy) ||((void)0) | ||||||
149 | (Index + NumSubElts) <=((void)0) | ||||||
150 | (int)cast<FixedVectorType>(VTy)->getNumElements()) &&((void)0) | ||||||
151 | "SK_InsertSubvector index out of range")((void)0); | ||||||
152 | |||||||
153 | InstructionCost Cost = 0; | ||||||
154 | // Subvector insertion cost is equal to the cost of extracting element from | ||||||
155 | // the source type plus the cost of inserting them into the result vector | ||||||
156 | // type. | ||||||
157 | for (int i = 0; i != NumSubElts; ++i) { | ||||||
158 | Cost += | ||||||
159 | thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i); | ||||||
160 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, | ||||||
161 | i + Index); | ||||||
162 | } | ||||||
163 | return Cost; | ||||||
164 | } | ||||||
165 | |||||||
166 | /// Local query method delegates up to T which *must* implement this! | ||||||
167 | const TargetSubtargetInfo *getST() const { | ||||||
168 | return static_cast<const T *>(this)->getST(); | ||||||
169 | } | ||||||
170 | |||||||
171 | /// Local query method delegates up to T which *must* implement this! | ||||||
172 | const TargetLoweringBase *getTLI() const { | ||||||
173 | return static_cast<const T *>(this)->getTLI(); | ||||||
174 | } | ||||||
175 | |||||||
176 | static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { | ||||||
177 | switch (M) { | ||||||
178 | case TTI::MIM_Unindexed: | ||||||
179 | return ISD::UNINDEXED; | ||||||
180 | case TTI::MIM_PreInc: | ||||||
181 | return ISD::PRE_INC; | ||||||
182 | case TTI::MIM_PreDec: | ||||||
183 | return ISD::PRE_DEC; | ||||||
184 | case TTI::MIM_PostInc: | ||||||
185 | return ISD::POST_INC; | ||||||
186 | case TTI::MIM_PostDec: | ||||||
187 | return ISD::POST_DEC; | ||||||
188 | } | ||||||
189 | llvm_unreachable("Unexpected MemIndexedMode")__builtin_unreachable(); | ||||||
190 | } | ||||||
191 | |||||||
192 | InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, | ||||||
193 | Align Alignment, | ||||||
194 | bool VariableMask, | ||||||
195 | bool IsGatherScatter, | ||||||
196 | TTI::TargetCostKind CostKind) { | ||||||
197 | auto *VT = cast<FixedVectorType>(DataTy); | ||||||
198 | // Assume the target does not have support for gather/scatter operations | ||||||
199 | // and provide a rough estimate. | ||||||
200 | // | ||||||
201 | // First, compute the cost of the individual memory operations. | ||||||
202 | InstructionCost AddrExtractCost = | ||||||
203 | IsGatherScatter | ||||||
204 | ? getVectorInstrCost(Instruction::ExtractElement, | ||||||
205 | FixedVectorType::get( | ||||||
206 | PointerType::get(VT->getElementType(), 0), | ||||||
207 | VT->getNumElements()), | ||||||
208 | -1) | ||||||
209 | : 0; | ||||||
210 | InstructionCost LoadCost = | ||||||
211 | VT->getNumElements() * | ||||||
212 | (AddrExtractCost + | ||||||
213 | getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind)); | ||||||
214 | |||||||
215 | // Next, compute the cost of packing the result in a vector. | ||||||
216 | InstructionCost PackingCost = getScalarizationOverhead( | ||||||
217 | VT, Opcode != Instruction::Store, Opcode == Instruction::Store); | ||||||
218 | |||||||
219 | InstructionCost ConditionalCost = 0; | ||||||
220 | if (VariableMask) { | ||||||
221 | // Compute the cost of conditionally executing the memory operations with | ||||||
222 | // variable masks. This includes extracting the individual conditions, a | ||||||
223 | // branches and PHIs to combine the results. | ||||||
224 | // NOTE: Estimating the cost of conditionally executing the memory | ||||||
225 | // operations accurately is quite difficult and the current solution | ||||||
226 | // provides a very rough estimate only. | ||||||
227 | ConditionalCost = | ||||||
228 | VT->getNumElements() * | ||||||
229 | (getVectorInstrCost( | ||||||
230 | Instruction::ExtractElement, | ||||||
231 | FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), | ||||||
232 | VT->getNumElements()), | ||||||
233 | -1) + | ||||||
234 | getCFInstrCost(Instruction::Br, CostKind) + | ||||||
235 | getCFInstrCost(Instruction::PHI, CostKind)); | ||||||
236 | } | ||||||
237 | |||||||
238 | return LoadCost + PackingCost + ConditionalCost; | ||||||
239 | } | ||||||
240 | |||||||
241 | protected: | ||||||
242 | explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) | ||||||
243 | : BaseT(DL) {} | ||||||
244 | virtual ~BasicTTIImplBase() = default; | ||||||
245 | |||||||
246 | using TargetTransformInfoImplBase::DL; | ||||||
247 | |||||||
248 | public: | ||||||
249 | /// \name Scalar TTI Implementations | ||||||
250 | /// @{ | ||||||
251 | bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, | ||||||
252 | unsigned AddressSpace, Align Alignment, | ||||||
253 | bool *Fast) const { | ||||||
254 | EVT E = EVT::getIntegerVT(Context, BitWidth); | ||||||
255 | return getTLI()->allowsMisalignedMemoryAccesses( | ||||||
256 | E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); | ||||||
257 | } | ||||||
258 | |||||||
259 | bool hasBranchDivergence() { return false; } | ||||||
260 | |||||||
261 | bool useGPUDivergenceAnalysis() { return false; } | ||||||
262 | |||||||
263 | bool isSourceOfDivergence(const Value *V) { return false; } | ||||||
264 | |||||||
265 | bool isAlwaysUniform(const Value *V) { return false; } | ||||||
266 | |||||||
267 | unsigned getFlatAddressSpace() { | ||||||
268 | // Return an invalid address space. | ||||||
269 | return -1; | ||||||
270 | } | ||||||
271 | |||||||
272 | bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, | ||||||
273 | Intrinsic::ID IID) const { | ||||||
274 | return false; | ||||||
275 | } | ||||||
276 | |||||||
277 | bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { | ||||||
278 | return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); | ||||||
279 | } | ||||||
280 | |||||||
281 | unsigned getAssumedAddrSpace(const Value *V) const { | ||||||
282 | return getTLI()->getTargetMachine().getAssumedAddrSpace(V); | ||||||
283 | } | ||||||
284 | |||||||
285 | Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, | ||||||
286 | Value *NewV) const { | ||||||
287 | return nullptr; | ||||||
288 | } | ||||||
289 | |||||||
290 | bool isLegalAddImmediate(int64_t imm) { | ||||||
291 | return getTLI()->isLegalAddImmediate(imm); | ||||||
292 | } | ||||||
293 | |||||||
294 | bool isLegalICmpImmediate(int64_t imm) { | ||||||
295 | return getTLI()->isLegalICmpImmediate(imm); | ||||||
296 | } | ||||||
297 | |||||||
298 | bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, | ||||||
299 | bool HasBaseReg, int64_t Scale, | ||||||
300 | unsigned AddrSpace, Instruction *I = nullptr) { | ||||||
301 | TargetLoweringBase::AddrMode AM; | ||||||
302 | AM.BaseGV = BaseGV; | ||||||
303 | AM.BaseOffs = BaseOffset; | ||||||
304 | AM.HasBaseReg = HasBaseReg; | ||||||
305 | AM.Scale = Scale; | ||||||
306 | return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); | ||||||
307 | } | ||||||
308 | |||||||
309 | bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, | ||||||
310 | const DataLayout &DL) const { | ||||||
311 | EVT VT = getTLI()->getValueType(DL, Ty); | ||||||
312 | return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); | ||||||
313 | } | ||||||
314 | |||||||
315 | bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, | ||||||
316 | const DataLayout &DL) const { | ||||||
317 | EVT VT = getTLI()->getValueType(DL, Ty); | ||||||
318 | return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); | ||||||
319 | } | ||||||
320 | |||||||
321 | bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { | ||||||
322 | return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); | ||||||
323 | } | ||||||
324 | |||||||
325 | bool isNumRegsMajorCostOfLSR() { | ||||||
326 | return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); | ||||||
327 | } | ||||||
328 | |||||||
329 | bool isProfitableLSRChainElement(Instruction *I) { | ||||||
330 | return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); | ||||||
331 | } | ||||||
332 | |||||||
333 | InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, | ||||||
334 | int64_t BaseOffset, bool HasBaseReg, | ||||||
335 | int64_t Scale, unsigned AddrSpace) { | ||||||
336 | TargetLoweringBase::AddrMode AM; | ||||||
337 | AM.BaseGV = BaseGV; | ||||||
338 | AM.BaseOffs = BaseOffset; | ||||||
339 | AM.HasBaseReg = HasBaseReg; | ||||||
340 | AM.Scale = Scale; | ||||||
341 | return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); | ||||||
342 | } | ||||||
343 | |||||||
344 | bool isTruncateFree(Type *Ty1, Type *Ty2) { | ||||||
345 | return getTLI()->isTruncateFree(Ty1, Ty2); | ||||||
346 | } | ||||||
347 | |||||||
348 | bool isProfitableToHoist(Instruction *I) { | ||||||
349 | return getTLI()->isProfitableToHoist(I); | ||||||
350 | } | ||||||
351 | |||||||
352 | bool useAA() const { return getST()->useAA(); } | ||||||
353 | |||||||
354 | bool isTypeLegal(Type *Ty) { | ||||||
355 | EVT VT = getTLI()->getValueType(DL, Ty); | ||||||
356 | return getTLI()->isTypeLegal(VT); | ||||||
357 | } | ||||||
358 | |||||||
359 | InstructionCost getRegUsageForType(Type *Ty) { | ||||||
360 | InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first; | ||||||
361 | assert(Val >= 0 && "Negative cost!")((void)0); | ||||||
362 | return Val; | ||||||
363 | } | ||||||
364 | |||||||
365 | InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, | ||||||
366 | ArrayRef<const Value *> Operands) { | ||||||
367 | return BaseT::getGEPCost(PointeeType, Ptr, Operands); | ||||||
368 | } | ||||||
369 | |||||||
370 | unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, | ||||||
371 | unsigned &JumpTableSize, | ||||||
372 | ProfileSummaryInfo *PSI, | ||||||
373 | BlockFrequencyInfo *BFI) { | ||||||
374 | /// Try to find the estimated number of clusters. Note that the number of | ||||||
375 | /// clusters identified in this function could be different from the actual | ||||||
376 | /// numbers found in lowering. This function ignore switches that are | ||||||
377 | /// lowered with a mix of jump table / bit test / BTree. This function was | ||||||
378 | /// initially intended to be used when estimating the cost of switch in | ||||||
379 | /// inline cost heuristic, but it's a generic cost model to be used in other | ||||||
380 | /// places (e.g., in loop unrolling). | ||||||
381 | unsigned N = SI.getNumCases(); | ||||||
382 | const TargetLoweringBase *TLI = getTLI(); | ||||||
383 | const DataLayout &DL = this->getDataLayout(); | ||||||
384 | |||||||
385 | JumpTableSize = 0; | ||||||
386 | bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent()); | ||||||
387 | |||||||
388 | // Early exit if both a jump table and bit test are not allowed. | ||||||
389 | if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N)) | ||||||
390 | return N; | ||||||
391 | |||||||
392 | APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); | ||||||
393 | APInt MinCaseVal = MaxCaseVal; | ||||||
394 | for (auto CI : SI.cases()) { | ||||||
395 | const APInt &CaseVal = CI.getCaseValue()->getValue(); | ||||||
396 | if (CaseVal.sgt(MaxCaseVal)) | ||||||
397 | MaxCaseVal = CaseVal; | ||||||
398 | if (CaseVal.slt(MinCaseVal)) | ||||||
399 | MinCaseVal = CaseVal; | ||||||
400 | } | ||||||
401 | |||||||
402 | // Check if suitable for a bit test | ||||||
403 | if (N <= DL.getIndexSizeInBits(0u)) { | ||||||
404 | SmallPtrSet<const BasicBlock *, 4> Dests; | ||||||
405 | for (auto I : SI.cases()) | ||||||
406 | Dests.insert(I.getCaseSuccessor()); | ||||||
407 | |||||||
408 | if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, | ||||||
409 | DL)) | ||||||
410 | return 1; | ||||||
411 | } | ||||||
412 | |||||||
413 | // Check if suitable for a jump table. | ||||||
414 | if (IsJTAllowed) { | ||||||
415 | if (N < 2 || N < TLI->getMinimumJumpTableEntries()) | ||||||
416 | return N; | ||||||
417 | uint64_t Range = | ||||||
418 | (MaxCaseVal - MinCaseVal) | ||||||
419 | .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1; | ||||||
420 | // Check whether a range of clusters is dense enough for a jump table | ||||||
421 | if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { | ||||||
422 | JumpTableSize = Range; | ||||||
423 | return 1; | ||||||
424 | } | ||||||
425 | } | ||||||
426 | return N; | ||||||
427 | } | ||||||
428 | |||||||
429 | bool shouldBuildLookupTables() { | ||||||
430 | const TargetLoweringBase *TLI = getTLI(); | ||||||
431 | return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || | ||||||
432 | TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); | ||||||
433 | } | ||||||
434 | |||||||
435 | bool shouldBuildRelLookupTables() const { | ||||||
436 | const TargetMachine &TM = getTLI()->getTargetMachine(); | ||||||
437 | // If non-PIC mode, do not generate a relative lookup table. | ||||||
438 | if (!TM.isPositionIndependent()) | ||||||
439 | return false; | ||||||
440 | |||||||
441 | /// Relative lookup table entries consist of 32-bit offsets. | ||||||
442 | /// Do not generate relative lookup tables for large code models | ||||||
443 | /// in 64-bit achitectures where 32-bit offsets might not be enough. | ||||||
444 | if (TM.getCodeModel() == CodeModel::Medium || | ||||||
445 | TM.getCodeModel() == CodeModel::Large) | ||||||
446 | return false; | ||||||
447 | |||||||
448 | Triple TargetTriple = TM.getTargetTriple(); | ||||||
449 | if (!TargetTriple.isArch64Bit()) | ||||||
450 | return false; | ||||||
451 | |||||||
452 | // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it | ||||||
453 | // there. | ||||||
454 | if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) | ||||||
455 | return false; | ||||||
456 | |||||||
457 | return true; | ||||||
458 | } | ||||||
459 | |||||||
460 | bool haveFastSqrt(Type *Ty) { | ||||||
461 | const TargetLoweringBase *TLI = getTLI(); | ||||||
462 | EVT VT = TLI->getValueType(DL, Ty); | ||||||
463 | return TLI->isTypeLegal(VT) && | ||||||
464 | TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); | ||||||
465 | } | ||||||
466 | |||||||
467 | bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { | ||||||
468 | return true; | ||||||
469 | } | ||||||
470 | |||||||
471 | InstructionCost getFPOpCost(Type *Ty) { | ||||||
472 | // Check whether FADD is available, as a proxy for floating-point in | ||||||
473 | // general. | ||||||
474 | const TargetLoweringBase *TLI = getTLI(); | ||||||
475 | EVT VT = TLI->getValueType(DL, Ty); | ||||||
476 | if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) | ||||||
477 | return TargetTransformInfo::TCC_Basic; | ||||||
478 | return TargetTransformInfo::TCC_Expensive; | ||||||
479 | } | ||||||
480 | |||||||
481 | unsigned getInliningThresholdMultiplier() { return 1; } | ||||||
482 | unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } | ||||||
483 | |||||||
484 | int getInlinerVectorBonusPercent() { return 150; } | ||||||
485 | |||||||
486 | void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, | ||||||
487 | TTI::UnrollingPreferences &UP) { | ||||||
488 | // This unrolling functionality is target independent, but to provide some | ||||||
489 | // motivation for its intended use, for x86: | ||||||
490 | |||||||
491 | // According to the Intel 64 and IA-32 Architectures Optimization Reference | ||||||
492 | // Manual, Intel Core models and later have a loop stream detector (and | ||||||
493 | // associated uop queue) that can benefit from partial unrolling. | ||||||
494 | // The relevant requirements are: | ||||||
495 | // - The loop must have no more than 4 (8 for Nehalem and later) branches | ||||||
496 | // taken, and none of them may be calls. | ||||||
497 | // - The loop can have no more than 18 (28 for Nehalem and later) uops. | ||||||
498 | |||||||
499 | // According to the Software Optimization Guide for AMD Family 15h | ||||||
500 | // Processors, models 30h-4fh (Steamroller and later) have a loop predictor | ||||||
501 | // and loop buffer which can benefit from partial unrolling. | ||||||
502 | // The relevant requirements are: | ||||||
503 | // - The loop must have fewer than 16 branches | ||||||
504 | // - The loop must have less than 40 uops in all executed loop branches | ||||||
505 | |||||||
506 | // The number of taken branches in a loop is hard to estimate here, and | ||||||
507 | // benchmarking has revealed that it is better not to be conservative when | ||||||
508 | // estimating the branch count. As a result, we'll ignore the branch limits | ||||||
509 | // until someone finds a case where it matters in practice. | ||||||
510 | |||||||
511 | unsigned MaxOps; | ||||||
512 | const TargetSubtargetInfo *ST = getST(); | ||||||
513 | if (PartialUnrollingThreshold.getNumOccurrences() > 0) | ||||||
514 | MaxOps = PartialUnrollingThreshold; | ||||||
515 | else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) | ||||||
516 | MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; | ||||||
517 | else | ||||||
518 | return; | ||||||
519 | |||||||
520 | // Scan the loop: don't unroll loops with calls. | ||||||
521 | for (BasicBlock *BB : L->blocks()) { | ||||||
522 | for (Instruction &I : *BB) { | ||||||
523 | if (isa<CallInst>(I) || isa<InvokeInst>(I)) { | ||||||
524 | if (const Function *F = cast<CallBase>(I).getCalledFunction()) { | ||||||
525 | if (!thisT()->isLoweredToCall(F)) | ||||||
526 | continue; | ||||||
527 | } | ||||||
528 | |||||||
529 | return; | ||||||
530 | } | ||||||
531 | } | ||||||
532 | } | ||||||
533 | |||||||
534 | // Enable runtime and partial unrolling up to the specified size. | ||||||
535 | // Enable using trip count upper bound to unroll loops. | ||||||
536 | UP.Partial = UP.Runtime = UP.UpperBound = true; | ||||||
537 | UP.PartialThreshold = MaxOps; | ||||||
538 | |||||||
539 | // Avoid unrolling when optimizing for size. | ||||||
540 | UP.OptSizeThreshold = 0; | ||||||
541 | UP.PartialOptSizeThreshold = 0; | ||||||
542 | |||||||
543 | // Set number of instructions optimized when "back edge" | ||||||
544 | // becomes "fall through" to default value of 2. | ||||||
545 | UP.BEInsns = 2; | ||||||
546 | } | ||||||
547 | |||||||
548 | void getPeelingPreferences(Loop *L, ScalarEvolution &SE, | ||||||
549 | TTI::PeelingPreferences &PP) { | ||||||
550 | PP.PeelCount = 0; | ||||||
551 | PP.AllowPeeling = true; | ||||||
552 | PP.AllowLoopNestsPeeling = false; | ||||||
553 | PP.PeelProfiledIterations = true; | ||||||
554 | } | ||||||
555 | |||||||
556 | bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, | ||||||
557 | AssumptionCache &AC, | ||||||
558 | TargetLibraryInfo *LibInfo, | ||||||
559 | HardwareLoopInfo &HWLoopInfo) { | ||||||
560 | return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); | ||||||
561 | } | ||||||
562 | |||||||
563 | bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, | ||||||
564 | AssumptionCache &AC, TargetLibraryInfo *TLI, | ||||||
565 | DominatorTree *DT, | ||||||
566 | const LoopAccessInfo *LAI) { | ||||||
567 | return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); | ||||||
568 | } | ||||||
569 | |||||||
570 | bool emitGetActiveLaneMask() { | ||||||
571 | return BaseT::emitGetActiveLaneMask(); | ||||||
572 | } | ||||||
573 | |||||||
574 | Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, | ||||||
575 | IntrinsicInst &II) { | ||||||
576 | return BaseT::instCombineIntrinsic(IC, II); | ||||||
577 | } | ||||||
578 | |||||||
579 | Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, | ||||||
580 | IntrinsicInst &II, | ||||||
581 | APInt DemandedMask, | ||||||
582 | KnownBits &Known, | ||||||
583 | bool &KnownBitsComputed) { | ||||||
584 | return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, | ||||||
585 | KnownBitsComputed); | ||||||
586 | } | ||||||
587 | |||||||
588 | Optional<Value *> simplifyDemandedVectorEltsIntrinsic( | ||||||
589 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, | ||||||
590 | APInt &UndefElts2, APInt &UndefElts3, | ||||||
591 | std::function<void(Instruction *, unsigned, APInt, APInt &)> | ||||||
592 | SimplifyAndSetOp) { | ||||||
593 | return BaseT::simplifyDemandedVectorEltsIntrinsic( | ||||||
594 | IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, | ||||||
595 | SimplifyAndSetOp); | ||||||
596 | } | ||||||
597 | |||||||
598 | InstructionCost getInstructionLatency(const Instruction *I) { | ||||||
599 | if (isa<LoadInst>(I)) | ||||||
600 | return getST()->getSchedModel().DefaultLoadLatency; | ||||||
601 | |||||||
602 | return BaseT::getInstructionLatency(I); | ||||||
603 | } | ||||||
604 | |||||||
605 | virtual Optional<unsigned> | ||||||
606 | getCacheSize(TargetTransformInfo::CacheLevel Level) const { | ||||||
607 | return Optional<unsigned>( | ||||||
608 | getST()->getCacheSize(static_cast<unsigned>(Level))); | ||||||
609 | } | ||||||
610 | |||||||
611 | virtual Optional<unsigned> | ||||||
612 | getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { | ||||||
613 | Optional<unsigned> TargetResult = | ||||||
614 | getST()->getCacheAssociativity(static_cast<unsigned>(Level)); | ||||||
615 | |||||||
616 | if (TargetResult) | ||||||
617 | return TargetResult; | ||||||
618 | |||||||
619 | return BaseT::getCacheAssociativity(Level); | ||||||
620 | } | ||||||
621 | |||||||
622 | virtual unsigned getCacheLineSize() const { | ||||||
623 | return getST()->getCacheLineSize(); | ||||||
624 | } | ||||||
625 | |||||||
626 | virtual unsigned getPrefetchDistance() const { | ||||||
627 | return getST()->getPrefetchDistance(); | ||||||
628 | } | ||||||
629 | |||||||
630 | virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, | ||||||
631 | unsigned NumStridedMemAccesses, | ||||||
632 | unsigned NumPrefetches, | ||||||
633 | bool HasCall) const { | ||||||
634 | return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, | ||||||
635 | NumPrefetches, HasCall); | ||||||
636 | } | ||||||
637 | |||||||
638 | virtual unsigned getMaxPrefetchIterationsAhead() const { | ||||||
639 | return getST()->getMaxPrefetchIterationsAhead(); | ||||||
640 | } | ||||||
641 | |||||||
642 | virtual bool enableWritePrefetching() const { | ||||||
643 | return getST()->enableWritePrefetching(); | ||||||
644 | } | ||||||
645 | |||||||
646 | /// @} | ||||||
647 | |||||||
648 | /// \name Vector TTI Implementations | ||||||
649 | /// @{ | ||||||
650 | |||||||
651 | TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { | ||||||
652 | return TypeSize::getFixed(32); | ||||||
653 | } | ||||||
654 | |||||||
655 | Optional<unsigned> getMaxVScale() const { return None; } | ||||||
656 | |||||||
657 | /// Estimate the overhead of scalarizing an instruction. Insert and Extract | ||||||
658 | /// are set if the demanded result elements need to be inserted and/or | ||||||
659 | /// extracted from vectors. | ||||||
660 | InstructionCost getScalarizationOverhead(VectorType *InTy, | ||||||
661 | const APInt &DemandedElts, | ||||||
662 | bool Insert, bool Extract) { | ||||||
663 | /// FIXME: a bitfield is not a reasonable abstraction for talking about | ||||||
664 | /// which elements are needed from a scalable vector | ||||||
665 | auto *Ty = cast<FixedVectorType>(InTy); | ||||||
666 | |||||||
667 | assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&((void)0) | ||||||
668 | "Vector size mismatch")((void)0); | ||||||
669 | |||||||
670 | InstructionCost Cost = 0; | ||||||
671 | |||||||
672 | for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { | ||||||
673 | if (!DemandedElts[i]) | ||||||
674 | continue; | ||||||
675 | if (Insert) | ||||||
676 | Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i); | ||||||
677 | if (Extract) | ||||||
678 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i); | ||||||
679 | } | ||||||
680 | |||||||
681 | return Cost; | ||||||
682 | } | ||||||
683 | |||||||
684 | /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. | ||||||
685 | InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, | ||||||
686 | bool Extract) { | ||||||
687 | auto *Ty = cast<FixedVectorType>(InTy); | ||||||
688 | |||||||
689 | APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements()); | ||||||
690 | return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); | ||||||
691 | } | ||||||
692 | |||||||
693 | /// Estimate the overhead of scalarizing an instructions unique | ||||||
694 | /// non-constant operands. The (potentially vector) types to use for each of | ||||||
695 | /// argument are passes via Tys. | ||||||
696 | InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, | ||||||
697 | ArrayRef<Type *> Tys) { | ||||||
698 | assert(Args.size() == Tys.size() && "Expected matching Args and Tys")((void)0); | ||||||
699 | |||||||
700 | InstructionCost Cost = 0; | ||||||
701 | SmallPtrSet<const Value*, 4> UniqueOperands; | ||||||
702 | for (int I = 0, E = Args.size(); I != E; I++) { | ||||||
703 | // Disregard things like metadata arguments. | ||||||
704 | const Value *A = Args[I]; | ||||||
705 | Type *Ty = Tys[I]; | ||||||
706 | if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && | ||||||
707 | !Ty->isPtrOrPtrVectorTy()) | ||||||
708 | continue; | ||||||
709 | |||||||
710 | if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { | ||||||
711 | if (auto *VecTy = dyn_cast<VectorType>(Ty)) | ||||||
712 | Cost += getScalarizationOverhead(VecTy, false, true); | ||||||
713 | } | ||||||
714 | } | ||||||
715 | |||||||
716 | return Cost; | ||||||
717 | } | ||||||
718 | |||||||
719 | /// Estimate the overhead of scalarizing the inputs and outputs of an | ||||||
720 | /// instruction, with return type RetTy and arguments Args of type Tys. If | ||||||
721 | /// Args are unknown (empty), then the cost associated with one argument is | ||||||
722 | /// added as a heuristic. | ||||||
723 | InstructionCost getScalarizationOverhead(VectorType *RetTy, | ||||||
724 | ArrayRef<const Value *> Args, | ||||||
725 | ArrayRef<Type *> Tys) { | ||||||
726 | InstructionCost Cost = getScalarizationOverhead(RetTy, true, false); | ||||||
727 | if (!Args.empty()) | ||||||
728 | Cost += getOperandsScalarizationOverhead(Args, Tys); | ||||||
729 | else | ||||||
730 | // When no information on arguments is provided, we add the cost | ||||||
731 | // associated with one argument as a heuristic. | ||||||
732 | Cost += getScalarizationOverhead(RetTy, false, true); | ||||||
733 | |||||||
734 | return Cost; | ||||||
735 | } | ||||||
736 | |||||||
/// Default maximum interleave factor: always 1, regardless of \p VF.
unsigned getMaxInterleaveFactor(unsigned VF) {
  constexpr unsigned NoInterleaving = 1;
  return NoInterleaving;
}
738 | |||||||
739 | InstructionCost getArithmeticInstrCost( | ||||||
740 | unsigned Opcode, Type *Ty, | ||||||
741 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, | ||||||
742 | TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, | ||||||
743 | TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, | ||||||
744 | TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, | ||||||
745 | TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, | ||||||
746 | ArrayRef<const Value *> Args = ArrayRef<const Value *>(), | ||||||
747 | const Instruction *CxtI = nullptr) { | ||||||
748 | // Check if any of the operands are vector operands. | ||||||
749 | const TargetLoweringBase *TLI = getTLI(); | ||||||
750 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
751 | assert(ISD && "Invalid opcode")((void)0); | ||||||
752 | |||||||
753 | // TODO: Handle more cost kinds. | ||||||
754 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
755 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, | ||||||
756 | Opd1Info, Opd2Info, | ||||||
757 | Opd1PropInfo, Opd2PropInfo, | ||||||
758 | Args, CxtI); | ||||||
759 | |||||||
760 | std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); | ||||||
761 | |||||||
762 | bool IsFloat = Ty->isFPOrFPVectorTy(); | ||||||
763 | // Assume that floating point arithmetic operations cost twice as much as | ||||||
764 | // integer operations. | ||||||
765 | InstructionCost OpCost = (IsFloat ? 2 : 1); | ||||||
766 | |||||||
767 | if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { | ||||||
768 | // The operation is legal. Assume it costs 1. | ||||||
769 | // TODO: Once we have extract/insert subvector cost we need to use them. | ||||||
770 | return LT.first * OpCost; | ||||||
771 | } | ||||||
772 | |||||||
773 | if (!TLI->isOperationExpand(ISD, LT.second)) { | ||||||
774 | // If the operation is custom lowered, then assume that the code is twice | ||||||
775 | // as expensive. | ||||||
776 | return LT.first * 2 * OpCost; | ||||||
777 | } | ||||||
778 | |||||||
779 | // An 'Expand' of URem and SRem is special because it may default | ||||||
780 | // to expanding the operation into a sequence of sub-operations | ||||||
781 | // i.e. X % Y -> X-(X/Y)*Y. | ||||||
782 | if (ISD == ISD::UREM || ISD == ISD::SREM) { | ||||||
783 | bool IsSigned = ISD == ISD::SREM; | ||||||
784 | if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, | ||||||
785 | LT.second) || | ||||||
786 | TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV, | ||||||
787 | LT.second)) { | ||||||
788 | unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; | ||||||
789 | InstructionCost DivCost = thisT()->getArithmeticInstrCost( | ||||||
790 | DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, | ||||||
791 | Opd2PropInfo); | ||||||
792 | InstructionCost MulCost = | ||||||
793 | thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); | ||||||
794 | InstructionCost SubCost = | ||||||
795 | thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); | ||||||
796 | return DivCost + MulCost + SubCost; | ||||||
797 | } | ||||||
798 | } | ||||||
799 | |||||||
800 | // We cannot scalarize scalable vectors, so return Invalid. | ||||||
801 | if (isa<ScalableVectorType>(Ty)) | ||||||
802 | return InstructionCost::getInvalid(); | ||||||
803 | |||||||
804 | // Else, assume that we need to scalarize this op. | ||||||
805 | // TODO: If one of the types get legalized by splitting, handle this | ||||||
806 | // similarly to what getCastInstrCost() does. | ||||||
807 | if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { | ||||||
808 | InstructionCost Cost = thisT()->getArithmeticInstrCost( | ||||||
809 | Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, | ||||||
810 | Opd1PropInfo, Opd2PropInfo, Args, CxtI); | ||||||
811 | // Return the cost of multiple scalar invocation plus the cost of | ||||||
812 | // inserting and extracting the values. | ||||||
813 | SmallVector<Type *> Tys(Args.size(), Ty); | ||||||
814 | return getScalarizationOverhead(VTy, Args, Tys) + | ||||||
815 | VTy->getNumElements() * Cost; | ||||||
816 | } | ||||||
817 | |||||||
818 | // We don't know anything about this scalar instruction. | ||||||
819 | return OpCost; | ||||||
820 | } | ||||||
821 | |||||||
822 | TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, | ||||||
823 | ArrayRef<int> Mask) const { | ||||||
824 | int Limit = Mask.size() * 2; | ||||||
825 | if (Mask.empty() || | ||||||
826 | // Extra check required by isSingleSourceMaskImpl function (called by | ||||||
827 | // ShuffleVectorInst::isSingleSourceMask). | ||||||
828 | any_of(Mask, [Limit](int I) { return I >= Limit; })) | ||||||
829 | return Kind; | ||||||
830 | switch (Kind) { | ||||||
831 | case TTI::SK_PermuteSingleSrc: | ||||||
832 | if (ShuffleVectorInst::isReverseMask(Mask)) | ||||||
833 | return TTI::SK_Reverse; | ||||||
834 | if (ShuffleVectorInst::isZeroEltSplatMask(Mask)) | ||||||
835 | return TTI::SK_Broadcast; | ||||||
836 | break; | ||||||
837 | case TTI::SK_PermuteTwoSrc: | ||||||
838 | if (ShuffleVectorInst::isSelectMask(Mask)) | ||||||
839 | return TTI::SK_Select; | ||||||
840 | if (ShuffleVectorInst::isTransposeMask(Mask)) | ||||||
841 | return TTI::SK_Transpose; | ||||||
842 | break; | ||||||
843 | case TTI::SK_Select: | ||||||
844 | case TTI::SK_Reverse: | ||||||
845 | case TTI::SK_Broadcast: | ||||||
846 | case TTI::SK_Transpose: | ||||||
847 | case TTI::SK_InsertSubvector: | ||||||
848 | case TTI::SK_ExtractSubvector: | ||||||
849 | case TTI::SK_Splice: | ||||||
850 | break; | ||||||
851 | } | ||||||
852 | return Kind; | ||||||
853 | } | ||||||
854 | |||||||
855 | InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, | ||||||
856 | ArrayRef<int> Mask, int Index, | ||||||
857 | VectorType *SubTp) { | ||||||
858 | |||||||
859 | switch (improveShuffleKindFromMask(Kind, Mask)) { | ||||||
860 | case TTI::SK_Broadcast: | ||||||
861 | return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp)); | ||||||
862 | case TTI::SK_Select: | ||||||
863 | case TTI::SK_Splice: | ||||||
864 | case TTI::SK_Reverse: | ||||||
865 | case TTI::SK_Transpose: | ||||||
866 | case TTI::SK_PermuteSingleSrc: | ||||||
867 | case TTI::SK_PermuteTwoSrc: | ||||||
868 | return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp)); | ||||||
869 | case TTI::SK_ExtractSubvector: | ||||||
870 | return getExtractSubvectorOverhead(Tp, Index, | ||||||
871 | cast<FixedVectorType>(SubTp)); | ||||||
872 | case TTI::SK_InsertSubvector: | ||||||
873 | return getInsertSubvectorOverhead(Tp, Index, | ||||||
874 | cast<FixedVectorType>(SubTp)); | ||||||
875 | } | ||||||
876 | llvm_unreachable("Unknown TTI::ShuffleKind")__builtin_unreachable(); | ||||||
877 | } | ||||||
878 | |||||||
879 | InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, | ||||||
880 | TTI::CastContextHint CCH, | ||||||
881 | TTI::TargetCostKind CostKind, | ||||||
882 | const Instruction *I = nullptr) { | ||||||
883 | if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) | ||||||
884 | return 0; | ||||||
885 | |||||||
886 | const TargetLoweringBase *TLI = getTLI(); | ||||||
887 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
888 | assert(ISD && "Invalid opcode")((void)0); | ||||||
889 | std::pair<InstructionCost, MVT> SrcLT = | ||||||
890 | TLI->getTypeLegalizationCost(DL, Src); | ||||||
891 | std::pair<InstructionCost, MVT> DstLT = | ||||||
892 | TLI->getTypeLegalizationCost(DL, Dst); | ||||||
893 | |||||||
894 | TypeSize SrcSize = SrcLT.second.getSizeInBits(); | ||||||
895 | TypeSize DstSize = DstLT.second.getSizeInBits(); | ||||||
896 | bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); | ||||||
897 | bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); | ||||||
898 | |||||||
899 | switch (Opcode) { | ||||||
900 | default: | ||||||
901 | break; | ||||||
902 | case Instruction::Trunc: | ||||||
903 | // Check for NOOP conversions. | ||||||
904 | if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) | ||||||
905 | return 0; | ||||||
906 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | ||||||
907 | case Instruction::BitCast: | ||||||
908 | // Bitcast between types that are legalized to the same type are free and | ||||||
909 | // assume int to/from ptr of the same size is also free. | ||||||
910 | if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && | ||||||
911 | SrcSize == DstSize) | ||||||
912 | return 0; | ||||||
913 | break; | ||||||
914 | case Instruction::FPExt: | ||||||
915 | if (I && getTLI()->isExtFree(I)) | ||||||
916 | return 0; | ||||||
917 | break; | ||||||
918 | case Instruction::ZExt: | ||||||
919 | if (TLI->isZExtFree(SrcLT.second, DstLT.second)) | ||||||
920 | return 0; | ||||||
921 | LLVM_FALLTHROUGH[[gnu::fallthrough]]; | ||||||
922 | case Instruction::SExt: | ||||||
923 | if (I && getTLI()->isExtFree(I)) | ||||||
924 | return 0; | ||||||
925 | |||||||
926 | // If this is a zext/sext of a load, return 0 if the corresponding | ||||||
927 | // extending load exists on target and the result type is legal. | ||||||
928 | if (CCH == TTI::CastContextHint::Normal) { | ||||||
929 | EVT ExtVT = EVT::getEVT(Dst); | ||||||
930 | EVT LoadVT = EVT::getEVT(Src); | ||||||
931 | unsigned LType = | ||||||
932 | ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); | ||||||
933 | if (DstLT.first == SrcLT.first && | ||||||
934 | TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) | ||||||
935 | return 0; | ||||||
936 | } | ||||||
937 | break; | ||||||
938 | case Instruction::AddrSpaceCast: | ||||||
939 | if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), | ||||||
940 | Dst->getPointerAddressSpace())) | ||||||
941 | return 0; | ||||||
942 | break; | ||||||
943 | } | ||||||
944 | |||||||
945 | auto *SrcVTy = dyn_cast<VectorType>(Src); | ||||||
946 | auto *DstVTy = dyn_cast<VectorType>(Dst); | ||||||
947 | |||||||
948 | // If the cast is marked as legal (or promote) then assume low cost. | ||||||
949 | if (SrcLT.first == DstLT.first && | ||||||
950 | TLI->isOperationLegalOrPromote(ISD, DstLT.second)) | ||||||
951 | return SrcLT.first; | ||||||
952 | |||||||
953 | // Handle scalar conversions. | ||||||
954 | if (!SrcVTy && !DstVTy) { | ||||||
955 | // Just check the op cost. If the operation is legal then assume it costs | ||||||
956 | // 1. | ||||||
957 | if (!TLI->isOperationExpand(ISD, DstLT.second)) | ||||||
958 | return 1; | ||||||
959 | |||||||
960 | // Assume that illegal scalar instruction are expensive. | ||||||
961 | return 4; | ||||||
962 | } | ||||||
963 | |||||||
964 | // Check vector-to-vector casts. | ||||||
965 | if (DstVTy && SrcVTy) { | ||||||
966 | // If the cast is between same-sized registers, then the check is simple. | ||||||
967 | if (SrcLT.first == DstLT.first && SrcSize == DstSize) { | ||||||
968 | |||||||
969 | // Assume that Zext is done using AND. | ||||||
970 | if (Opcode == Instruction::ZExt) | ||||||
971 | return SrcLT.first; | ||||||
972 | |||||||
973 | // Assume that sext is done using SHL and SRA. | ||||||
974 | if (Opcode == Instruction::SExt) | ||||||
975 | return SrcLT.first * 2; | ||||||
976 | |||||||
977 | // Just check the op cost. If the operation is legal then assume it | ||||||
978 | // costs | ||||||
979 | // 1 and multiply by the type-legalization overhead. | ||||||
980 | if (!TLI->isOperationExpand(ISD, DstLT.second)) | ||||||
981 | return SrcLT.first * 1; | ||||||
982 | } | ||||||
983 | |||||||
984 | // If we are legalizing by splitting, query the concrete TTI for the cost | ||||||
985 | // of casting the original vector twice. We also need to factor in the | ||||||
986 | // cost of the split itself. Count that as 1, to be consistent with | ||||||
987 | // TLI->getTypeLegalizationCost(). | ||||||
988 | bool SplitSrc = | ||||||
989 | TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == | ||||||
990 | TargetLowering::TypeSplitVector; | ||||||
991 | bool SplitDst = | ||||||
992 | TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == | ||||||
993 | TargetLowering::TypeSplitVector; | ||||||
994 | if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && | ||||||
995 | DstVTy->getElementCount().isVector()) { | ||||||
996 | Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); | ||||||
997 | Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); | ||||||
998 | T *TTI = static_cast<T *>(this); | ||||||
999 | // If both types need to be split then the split is free. | ||||||
1000 | InstructionCost SplitCost = | ||||||
1001 | (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; | ||||||
1002 | return SplitCost + | ||||||
1003 | (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, | ||||||
1004 | CostKind, I)); | ||||||
1005 | } | ||||||
1006 | |||||||
1007 | // Scalarization cost is Invalid, can't assume any num elements. | ||||||
1008 | if (isa<ScalableVectorType>(DstVTy)) | ||||||
1009 | return InstructionCost::getInvalid(); | ||||||
1010 | |||||||
1011 | // In other cases where the source or destination are illegal, assume | ||||||
1012 | // the operation will get scalarized. | ||||||
1013 | unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); | ||||||
1014 | InstructionCost Cost = thisT()->getCastInstrCost( | ||||||
1015 | Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); | ||||||
1016 | |||||||
1017 | // Return the cost of multiple scalar invocation plus the cost of | ||||||
1018 | // inserting and extracting the values. | ||||||
1019 | return getScalarizationOverhead(DstVTy, true, true) + Num * Cost; | ||||||
1020 | } | ||||||
1021 | |||||||
1022 | // We already handled vector-to-vector and scalar-to-scalar conversions. | ||||||
1023 | // This | ||||||
1024 | // is where we handle bitcast between vectors and scalars. We need to assume | ||||||
1025 | // that the conversion is scalarized in one way or another. | ||||||
1026 | if (Opcode == Instruction::BitCast) { | ||||||
1027 | // Illegal bitcasts are done by storing and loading from a stack slot. | ||||||
1028 | return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) + | ||||||
1029 | (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0); | ||||||
1030 | } | ||||||
1031 | |||||||
1032 | llvm_unreachable("Unhandled cast")__builtin_unreachable(); | ||||||
1033 | } | ||||||
1034 | |||||||
1035 | InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, | ||||||
1036 | VectorType *VecTy, unsigned Index) { | ||||||
1037 | return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, | ||||||
1038 | Index) + | ||||||
1039 | thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), | ||||||
1040 | TTI::CastContextHint::None, | ||||||
1041 | TTI::TCK_RecipThroughput); | ||||||
1042 | } | ||||||
1043 | |||||||
1044 | InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, | ||||||
1045 | const Instruction *I = nullptr) { | ||||||
1046 | return BaseT::getCFInstrCost(Opcode, CostKind, I); | ||||||
1047 | } | ||||||
1048 | |||||||
1049 | InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, | ||||||
1050 | CmpInst::Predicate VecPred, | ||||||
1051 | TTI::TargetCostKind CostKind, | ||||||
1052 | const Instruction *I = nullptr) { | ||||||
1053 | const TargetLoweringBase *TLI = getTLI(); | ||||||
1054 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | ||||||
1055 | assert(ISD && "Invalid opcode")((void)0); | ||||||
1056 | |||||||
1057 | // TODO: Handle other cost kinds. | ||||||
1058 | if (CostKind
| ||||||
1059 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | ||||||
1060 | I); | ||||||
1061 | |||||||
1062 | // Selects on vectors are actually vector selects. | ||||||
1063 | if (ISD == ISD::SELECT) { | ||||||
1064 | assert(CondTy && "CondTy must exist")((void)0); | ||||||
1065 | if (CondTy->isVectorTy()) | ||||||
| |||||||
1066 | ISD = ISD::VSELECT; | ||||||
1067 | } | ||||||
1068 | std::pair<InstructionCost, MVT> LT = | ||||||
1069 | TLI->getTypeLegalizationCost(DL, ValTy); | ||||||
1070 | |||||||
1071 | if (!(ValTy->isVectorTy() && !LT.second.isVector()) && | ||||||
1072 | !TLI->isOperationExpand(ISD, LT.second)) { | ||||||
1073 | // The operation is legal. Assume it costs 1. Multiply | ||||||
1074 | // by the type-legalization overhead. | ||||||
1075 | return LT.first * 1; | ||||||
1076 | } | ||||||
1077 | |||||||
1078 | // Otherwise, assume that the cast is scalarized. | ||||||
1079 | // TODO: If one of the types get legalized by splitting, handle this | ||||||
1080 | // similarly to what getCastInstrCost() does. | ||||||
1081 | if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { | ||||||
1082 | unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); | ||||||
1083 | if (CondTy) | ||||||
1084 | CondTy = CondTy->getScalarType(); | ||||||
1085 | InstructionCost Cost = thisT()->getCmpSelInstrCost( | ||||||
1086 | Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); | ||||||
1087 | |||||||
1088 | // Return the cost of multiple scalar invocation plus the cost of | ||||||
1089 | // inserting and extracting the values. | ||||||
1090 | return getScalarizationOverhead(ValVTy, true, false) + Num * Cost; | ||||||
1091 | } | ||||||
1092 | |||||||
1093 | // Unknown scalar opcode. | ||||||
1094 | return 1; | ||||||
1095 | } | ||||||
1096 | |||||||
1097 | InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, | ||||||
1098 | unsigned Index) { | ||||||
1099 | std::pair<InstructionCost, MVT> LT = | ||||||
1100 | getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); | ||||||
1101 | |||||||
1102 | return LT.first; | ||||||
1103 | } | ||||||
1104 | |||||||
1105 | InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, | ||||||
1106 | MaybeAlign Alignment, unsigned AddressSpace, | ||||||
1107 | TTI::TargetCostKind CostKind, | ||||||
1108 | const Instruction *I = nullptr) { | ||||||
1109 | assert(!Src->isVoidTy() && "Invalid type")((void)0); | ||||||
1110 | // Assume types, such as structs, are expensive. | ||||||
1111 | if (getTLI()->getValueType(DL, Src, true) == MVT::Other) | ||||||
1112 | return 4; | ||||||
1113 | std::pair<InstructionCost, MVT> LT = | ||||||
1114 | getTLI()->getTypeLegalizationCost(DL, Src); | ||||||
1115 | |||||||
1116 | // Assuming that all loads of legal types cost 1. | ||||||
1117 | InstructionCost Cost = LT.first; | ||||||
1118 | if (CostKind != TTI::TCK_RecipThroughput) | ||||||
1119 | return Cost; | ||||||
1120 | |||||||
1121 | if (Src->isVectorTy() && | ||||||
1122 | // In practice it's not currently possible to have a change in lane | ||||||
1123 | // length for extending loads or truncating stores so both types should | ||||||
1124 | // have the same scalable property. | ||||||
1125 | TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(), | ||||||
1126 | LT.second.getSizeInBits())) { | ||||||
1127 | // This is a vector load that legalizes to a larger type than the vector | ||||||
1128 | // itself. Unless the corresponding extending load or truncating store is | ||||||
1129 | // legal, then this will scalarize. | ||||||
1130 | TargetLowering::LegalizeAction LA = TargetLowering::Expand; | ||||||
1131 | EVT MemVT = getTLI()->getValueType(DL, Src); | ||||||
1132 | if (Opcode == Instruction::Store) | ||||||
1133 | LA = getTLI()->getTruncStoreAction(LT.second, MemVT); | ||||||
1134 | else | ||||||
1135 | LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); | ||||||
1136 | |||||||
1137 | if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { | ||||||
1138 | // This is a vector load/store for some illegal type that is scalarized. | ||||||
1139 | // We must account for the cost of building or decomposing the vector. | ||||||
1140 | Cost += getScalarizationOverhead(cast<VectorType>(Src), | ||||||
1141 | Opcode != Instruction::Store, | ||||||
1142 | Opcode == Instruction::Store); | ||||||
1143 | } | ||||||
1144 | } | ||||||
1145 | |||||||
1146 | return Cost; | ||||||
1147 | } | ||||||
1148 | |||||||
1149 | InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, | ||||||
1150 | Align Alignment, unsigned AddressSpace, | ||||||
1151 | TTI::TargetCostKind CostKind) { | ||||||
1152 | return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, | ||||||
1153 | CostKind); | ||||||
1154 | } | ||||||
1155 | |||||||
1156 | InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, | ||||||
1157 | const Value *Ptr, bool VariableMask, | ||||||
1158 | Align Alignment, | ||||||
1159 | TTI::TargetCostKind CostKind, | ||||||
1160 | const Instruction *I = nullptr) { | ||||||
1161 | return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, | ||||||
1162 | true, CostKind); | ||||||
1163 | } | ||||||
1164 | |||||||
1165 | InstructionCost getInterleavedMemoryOpCost( | ||||||
1166 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, | ||||||
1167 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | ||||||
1168 | bool UseMaskForCond = false, bool UseMaskForGaps = false) { | ||||||
1169 | auto *VT = cast<FixedVectorType>(VecTy); | ||||||
1170 | |||||||
1171 | unsigned NumElts = VT->getNumElements(); | ||||||
1172 | assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor")((void)0); | ||||||
1173 | |||||||
1174 | unsigned NumSubElts = NumElts / Factor; | ||||||
1175 | auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); | ||||||
1176 | |||||||
1177 | // Firstly, the cost of load/store operation. | ||||||
1178 | InstructionCost Cost; | ||||||
1179 | if (UseMaskForCond || UseMaskForGaps) | ||||||
1180 | Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, | ||||||
1181 | AddressSpace, CostKind); | ||||||
1182 | else | ||||||
1183 | Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, | ||||||
1184 | CostKind); | ||||||
1185 | |||||||
1186 | // Legalize the vector type, and get the legalized and unlegalized type | ||||||
1187 | // sizes. | ||||||
1188 | MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; | ||||||
1189 | unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); | ||||||
1190 | unsigned VecTyLTSize = VecTyLT.getStoreSize(); | ||||||
1191 | |||||||
1192 | // Scale the cost of the memory operation by the fraction of legalized | ||||||
1193 | // instructions that will actually be used. We shouldn't account for the | ||||||
1194 | // cost of dead instructions since they will be removed. | ||||||
1195 | // | ||||||
1196 | // E.g., An interleaved load of factor 8: | ||||||
1197 | // %vec = load <16 x i64>, <16 x i64>* %ptr | ||||||
1198 | // %v0 = shufflevector %vec, undef, <0, 8> | ||||||
1199 | // | ||||||
1200 | // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be | ||||||
1201 | // used (those corresponding to elements [0:1] and [8:9] of the unlegalized | ||||||
1202 | // type). The other loads are unused. | ||||||
1203 | // | ||||||
1204 | // We only scale the cost of loads since interleaved store groups aren't | ||||||
1205 | // allowed to have gaps. | ||||||
1206 | if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) { | ||||||
1207 | // The number of loads of a legal type it will take to represent a load | ||||||
1208 | // of the unlegalized vector type. | ||||||
1209 | unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); | ||||||
1210 | |||||||
1211 | // The number of elements of the unlegalized type that correspond to a | ||||||
1212 | // single legal instruction. | ||||||
1213 | unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts); | ||||||
1214 | |||||||
1215 | // Determine which legal instructions will be used. | ||||||
1216 | BitVector UsedInsts(NumLegalInsts, false); | ||||||
1217 | for (unsigned Index : Indices) | ||||||
1218 | for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) | ||||||
1219 | UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); | ||||||
1220 | |||||||
1221 | // Scale the cost of the load by the fraction of legal instructions that | ||||||
1222 | // will be used. | ||||||
1223 | Cost *= UsedInsts.count() / NumLegalInsts; | ||||||
1224 | } | ||||||
1225 | |||||||
1226 | // Then plus the cost of interleave operation. | ||||||
1227 | if (Opcode == Instruction::Load) { | ||||||
1228 | // The interleave cost is similar to extract sub vectors' elements | ||||||
1229 | // from the wide vector, and insert them into sub vectors. | ||||||
1230 | // | ||||||
1231 | // E.g. An interleaved load of factor 2 (with one member of index 0): | ||||||
1232 | // %vec = load <8 x i32>, <8 x i32>* %ptr | ||||||
1233 | // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 | ||||||
1234 | // The cost is estimated as extract elements at 0, 2, 4, 6 from the | ||||||
1235 | // <8 x i32> vector and insert them into a <4 x i32> vector. | ||||||
1236 | |||||||
1237 | assert(Indices.size() <= Factor &&((void)0) | ||||||
1238 | "Interleaved memory op has too many members")((void)0); | ||||||
1239 | |||||||
1240 | for (unsigned Index : Indices) { | ||||||
1241 | assert(Index < Factor && "Invalid index for interleaved memory op")((void)0); | ||||||
1242 | |||||||
1243 | // Extract elements from loaded vector for each sub vector. | ||||||
1244 | for (unsigned i = 0; i < NumSubElts; i++) | ||||||
1245 | Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT, | ||||||
1246 | Index + i * Factor); | ||||||
1247 | } | ||||||
1248 | |||||||
1249 | InstructionCost InsSubCost = 0; | ||||||
1250 | for (unsigned i = 0; i < NumSubElts; i++) | ||||||
1251 | InsSubCost += | ||||||
1252 | thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i); | ||||||
1253 | |||||||
1254 | Cost += Indices.size() * InsSubCost; | ||||||
1255 | } else { | ||||||
1256 | // The interleave cost is extract all elements from sub vectors, and | ||||||
1257 | // insert them into the wide vector. | ||||||
1258 | // | ||||||
1259 | // E.g. An interleaved store of factor 2: | ||||||
1260 | // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7> | ||||||
1261 | // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr | ||||||
1262 | // The cost is estimated as extract all elements from both <4 x i32> | ||||||
1263 | // vectors and insert into the <8 x i32> vector. | ||||||
1264 | |||||||
1265 | InstructionCost ExtSubCost = 0; | ||||||
1266 | for (unsigned i = 0; i < NumSubElts; i++) | ||||||
1267 | ExtSubCost += | ||||||
1268 | thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i); | ||||||
1269 | Cost += ExtSubCost * Factor; | ||||||
1270 | |||||||
1271 | for (unsigned i = 0; i < NumElts; i++) | ||||||
1272 | Cost += static_cast<T *>(this) | ||||||
1273 | ->getVectorInstrCost(Instruction::InsertElement, VT, i); | ||||||
1274 | } | ||||||
1275 | |||||||
1276 | if (!UseMaskForCond) | ||||||
1277 | return Cost; | ||||||
1278 | |||||||
1279 | Type *I8Type = Type::getInt8Ty(VT->getContext()); | ||||||
1280 | auto *MaskVT = FixedVectorType::get(I8Type, NumElts); | ||||||
1281 | SubVT = FixedVectorType::get(I8Type, NumSubElts); | ||||||
1282 | |||||||
1283 | // The Mask shuffling cost is extract all the elements of the Mask | ||||||
1284 | // and insert each of them Factor times into the wide vector: | ||||||
1285 | // | ||||||
1286 | // E.g. an interleaved group with factor 3: | ||||||
1287 | // %mask = icmp ult <8 x i32> %vec1, %vec2 | ||||||
1288 | // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, | ||||||
1289 | // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> | ||||||
1290 | // The cost is estimated as extract all mask elements from the <8xi1> mask | ||||||
1291 | // vector and insert them factor times into the <24xi1> shuffled mask | ||||||
1292 | // vector. | ||||||
1293 | for (unsigned i = 0; i < NumSubElts; i++) | ||||||
1294 | Cost += | ||||||
1295 | thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i); | ||||||
1296 | |||||||
1297 | for (unsigned i = 0; i < NumElts; i++) | ||||||
1298 | Cost += | ||||||
1299 | thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i); | ||||||
1300 | |||||||
1301 | // The Gaps mask is invariant and created outside the loop, therefore the | ||||||
1302 | // cost of creating it is not accounted for here. However if we have both | ||||||
1303 | // a MaskForGaps and some other mask that guards the execution of the | ||||||
1304 | // memory access, we need to account for the cost of And-ing the two masks | ||||||
1305 | // inside the loop. | ||||||
1306 | if (UseMaskForGaps) | ||||||
1307 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, | ||||||
1308 | CostKind); | ||||||
1309 | |||||||
1310 | return Cost; | ||||||
1311 | } | ||||||
1312 | |||||||
1313 | /// Get intrinsic cost based on arguments. | ||||||
1314 | InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||||
1315 | TTI::TargetCostKind CostKind) { | ||||||
1316 | // Check for generically free intrinsics. | ||||||
1317 | if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) | ||||||
1318 | return 0; | ||||||
1319 | |||||||
1320 | // Assume that target intrinsics are cheap. | ||||||
1321 | Intrinsic::ID IID = ICA.getID(); | ||||||
1322 | if (Function::isTargetIntrinsic(IID)) | ||||||
1323 | return TargetTransformInfo::TCC_Basic; | ||||||
1324 | |||||||
1325 | if (ICA.isTypeBasedOnly()) | ||||||
1326 | return getTypeBasedIntrinsicInstrCost(ICA, CostKind); | ||||||
1327 | |||||||
1328 | Type *RetTy = ICA.getReturnType(); | ||||||
1329 | |||||||
1330 | ElementCount RetVF = | ||||||
1331 | (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() | ||||||
1332 | : ElementCount::getFixed(1)); | ||||||
1333 | const IntrinsicInst *I = ICA.getInst(); | ||||||
1334 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | ||||||
1335 | FastMathFlags FMF = ICA.getFlags(); | ||||||
1336 | switch (IID) { | ||||||
1337 | default: | ||||||
1338 | break; | ||||||
1339 | |||||||
1340 | case Intrinsic::cttz: | ||||||
1341 | // FIXME: If necessary, this should go in target-specific overrides. | ||||||
1342 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz()) | ||||||
1343 | return TargetTransformInfo::TCC_Basic; | ||||||
1344 | break; | ||||||
1345 | |||||||
1346 | case Intrinsic::ctlz: | ||||||
1347 | // FIXME: If necessary, this should go in target-specific overrides. | ||||||
1348 | if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz()) | ||||||
1349 | return TargetTransformInfo::TCC_Basic; | ||||||
1350 | break; | ||||||
1351 | |||||||
1352 | case Intrinsic::memcpy: | ||||||
1353 | return thisT()->getMemcpyCost(ICA.getInst()); | ||||||
1354 | |||||||
1355 | case Intrinsic::masked_scatter: { | ||||||
1356 | const Value *Mask = Args[3]; | ||||||
1357 | bool VarMask = !isa<Constant>(Mask); | ||||||
1358 | Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); | ||||||
1359 | return thisT()->getGatherScatterOpCost(Instruction::Store, | ||||||
1360 | ICA.getArgTypes()[0], Args[1], | ||||||
1361 | VarMask, Alignment, CostKind, I); | ||||||
1362 | } | ||||||
1363 | case Intrinsic::masked_gather: { | ||||||
1364 | const Value *Mask = Args[2]; | ||||||
1365 | bool VarMask = !isa<Constant>(Mask); | ||||||
1366 | Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); | ||||||
1367 | return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], | ||||||
1368 | VarMask, Alignment, CostKind, I); | ||||||
1369 | } | ||||||
1370 | case Intrinsic::experimental_stepvector: { | ||||||
1371 | if (isa<ScalableVectorType>(RetTy)) | ||||||
1372 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||
1373 | // The cost of materialising a constant integer vector. | ||||||
1374 | return TargetTransformInfo::TCC_Basic; | ||||||
1375 | } | ||||||
1376 | case Intrinsic::experimental_vector_extract: { | ||||||
1377 | // FIXME: Handle case where a scalable vector is extracted from a scalable | ||||||
1378 | // vector | ||||||
1379 | if (isa<ScalableVectorType>(RetTy)) | ||||||
1380 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||
1381 | unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); | ||||||
1382 | return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, | ||||||
1383 | cast<VectorType>(Args[0]->getType()), None, | ||||||
1384 | Index, cast<VectorType>(RetTy)); | ||||||
1385 | } | ||||||
1386 | case Intrinsic::experimental_vector_insert: { | ||||||
1387 | // FIXME: Handle case where a scalable vector is inserted into a scalable | ||||||
1388 | // vector | ||||||
1389 | if (isa<ScalableVectorType>(Args[1]->getType())) | ||||||
1390 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||
1391 | unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); | ||||||
1392 | return thisT()->getShuffleCost( | ||||||
1393 | TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None, | ||||||
1394 | Index, cast<VectorType>(Args[1]->getType())); | ||||||
1395 | } | ||||||
1396 | case Intrinsic::experimental_vector_reverse: { | ||||||
1397 | return thisT()->getShuffleCost(TTI::SK_Reverse, | ||||||
1398 | cast<VectorType>(Args[0]->getType()), None, | ||||||
1399 | 0, cast<VectorType>(RetTy)); | ||||||
1400 | } | ||||||
1401 | case Intrinsic::experimental_vector_splice: { | ||||||
1402 | unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); | ||||||
1403 | return thisT()->getShuffleCost(TTI::SK_Splice, | ||||||
1404 | cast<VectorType>(Args[0]->getType()), None, | ||||||
1405 | Index, cast<VectorType>(RetTy)); | ||||||
1406 | } | ||||||
1407 | case Intrinsic::vector_reduce_add: | ||||||
1408 | case Intrinsic::vector_reduce_mul: | ||||||
1409 | case Intrinsic::vector_reduce_and: | ||||||
1410 | case Intrinsic::vector_reduce_or: | ||||||
1411 | case Intrinsic::vector_reduce_xor: | ||||||
1412 | case Intrinsic::vector_reduce_smax: | ||||||
1413 | case Intrinsic::vector_reduce_smin: | ||||||
1414 | case Intrinsic::vector_reduce_fmax: | ||||||
1415 | case Intrinsic::vector_reduce_fmin: | ||||||
1416 | case Intrinsic::vector_reduce_umax: | ||||||
1417 | case Intrinsic::vector_reduce_umin: { | ||||||
1418 | IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); | ||||||
1419 | return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); | ||||||
1420 | } | ||||||
1421 | case Intrinsic::vector_reduce_fadd: | ||||||
1422 | case Intrinsic::vector_reduce_fmul: { | ||||||
1423 | IntrinsicCostAttributes Attrs( | ||||||
1424 | IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); | ||||||
1425 | return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); | ||||||
1426 | } | ||||||
1427 | case Intrinsic::fshl: | ||||||
1428 | case Intrinsic::fshr: { | ||||||
1429 | if (isa<ScalableVectorType>(RetTy)) | ||||||
1430 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | ||||||
1431 | const Value *X = Args[0]; | ||||||
1432 | const Value *Y = Args[1]; | ||||||
1433 | const Value *Z = Args[2]; | ||||||
1434 | TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW; | ||||||
1435 | TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX); | ||||||
1436 | TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY); | ||||||
1437 | TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ); | ||||||
1438 | TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue; | ||||||
1439 | OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 | ||||||
1440 | : TTI::OP_None; | ||||||
1441 | // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) | ||||||
1442 | // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) | ||||||
1443 | InstructionCost Cost = 0; | ||||||
1444 | Cost += | ||||||
1445 | thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); | ||||||
1446 | Cost += | ||||||
1447 | thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); | ||||||
1448 | Cost += thisT()->getArithmeticInstrCost( | ||||||
1449 | BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX); | ||||||
1450 | Cost += thisT()->getArithmeticInstrCost( | ||||||
1451 | BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY); | ||||||
1452 | // Non-constant shift amounts requires a modulo. | ||||||
1453 | if (OpKindZ != TTI::OK_UniformConstantValue && | ||||||
1454 | OpKindZ != TTI::OK_NonUniformConstantValue) | ||||||
1455 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, | ||||||
1456 | CostKind, OpKindZ, OpKindBW, | ||||||
1457 | OpPropsZ, OpPropsBW); | ||||||
1458 | // For non-rotates (X != Y) we must add shift-by-zero handling costs. | ||||||
1459 | if (X != Y) { | ||||||
1460 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||
1461 | Cost += | ||||||
1462 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, | ||||||
1463 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1464 | Cost += | ||||||
1465 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, | ||||||
1466 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1467 | } | ||||||
1468 | return Cost; | ||||||
1469 | } | ||||||
1470 | } | ||||||
1471 | |||||||
1472 | // Assume that we need to scalarize this intrinsic. | ||||||
1473 | // Compute the scalarization overhead based on Args for a vector | ||||||
1474 | // intrinsic. | ||||||
1475 | InstructionCost ScalarizationCost = InstructionCost::getInvalid(); | ||||||
1476 | if (RetVF.isVector() && !RetVF.isScalable()) { | ||||||
1477 | ScalarizationCost = 0; | ||||||
1478 | if (!RetTy->isVoidTy()) | ||||||
1479 | ScalarizationCost += | ||||||
1480 | getScalarizationOverhead(cast<VectorType>(RetTy), true, false); | ||||||
1481 | ScalarizationCost += | ||||||
1482 | getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); | ||||||
1483 | } | ||||||
1484 | |||||||
1485 | IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, | ||||||
1486 | ScalarizationCost); | ||||||
1487 | return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); | ||||||
1488 | } | ||||||
1489 | |||||||
1490 | /// Get intrinsic cost based on argument types. | ||||||
1491 | /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the | ||||||
1492 | /// cost of scalarizing the arguments and the return value will be computed | ||||||
1493 | /// based on types. | ||||||
1494 | InstructionCost | ||||||
1495 | getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||||||
1496 | TTI::TargetCostKind CostKind) { | ||||||
1497 | Intrinsic::ID IID = ICA.getID(); | ||||||
1498 | Type *RetTy = ICA.getReturnType(); | ||||||
1499 | const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); | ||||||
1500 | FastMathFlags FMF = ICA.getFlags(); | ||||||
1501 | InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); | ||||||
1502 | bool SkipScalarizationCost = ICA.skipScalarizationCost(); | ||||||
1503 | |||||||
1504 | VectorType *VecOpTy = nullptr; | ||||||
1505 | if (!Tys.empty()) { | ||||||
1506 | // The vector reduction operand is operand 0 except for fadd/fmul. | ||||||
1507 | // Their operand 0 is a scalar start value, so the vector op is operand 1. | ||||||
1508 | unsigned VecTyIndex = 0; | ||||||
1509 | if (IID == Intrinsic::vector_reduce_fadd || | ||||||
1510 | IID == Intrinsic::vector_reduce_fmul) | ||||||
1511 | VecTyIndex = 1; | ||||||
1512 | assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes")((void)0); | ||||||
1513 | VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); | ||||||
1514 | } | ||||||
1515 | |||||||
1516 | // Library call cost - other than size, make it expensive. | ||||||
1517 | unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; | ||||||
1518 | SmallVector<unsigned, 2> ISDs; | ||||||
1519 | switch (IID) { | ||||||
1520 | default: { | ||||||
1521 | // Scalable vectors cannot be scalarized, so return Invalid. | ||||||
1522 | if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { | ||||||
1523 | return isa<ScalableVectorType>(Ty); | ||||||
1524 | })) | ||||||
1525 | return InstructionCost::getInvalid(); | ||||||
1526 | |||||||
1527 | // Assume that we need to scalarize this intrinsic. | ||||||
1528 | InstructionCost ScalarizationCost = | ||||||
1529 | SkipScalarizationCost ? ScalarizationCostPassed : 0; | ||||||
1530 | unsigned ScalarCalls = 1; | ||||||
1531 | Type *ScalarRetTy = RetTy; | ||||||
1532 | if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { | ||||||
1533 | if (!SkipScalarizationCost) | ||||||
1534 | ScalarizationCost = getScalarizationOverhead(RetVTy, true, false); | ||||||
1535 | ScalarCalls = std::max(ScalarCalls, | ||||||
1536 | cast<FixedVectorType>(RetVTy)->getNumElements()); | ||||||
1537 | ScalarRetTy = RetTy->getScalarType(); | ||||||
1538 | } | ||||||
1539 | SmallVector<Type *, 4> ScalarTys; | ||||||
1540 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { | ||||||
1541 | Type *Ty = Tys[i]; | ||||||
1542 | if (auto *VTy = dyn_cast<VectorType>(Ty)) { | ||||||
1543 | if (!SkipScalarizationCost) | ||||||
1544 | ScalarizationCost += getScalarizationOverhead(VTy, false, true); | ||||||
1545 | ScalarCalls = std::max(ScalarCalls, | ||||||
1546 | cast<FixedVectorType>(VTy)->getNumElements()); | ||||||
1547 | Ty = Ty->getScalarType(); | ||||||
1548 | } | ||||||
1549 | ScalarTys.push_back(Ty); | ||||||
1550 | } | ||||||
1551 | if (ScalarCalls == 1) | ||||||
1552 | return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. | ||||||
1553 | |||||||
1554 | IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); | ||||||
1555 | InstructionCost ScalarCost = | ||||||
1556 | thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); | ||||||
1557 | |||||||
1558 | return ScalarCalls * ScalarCost + ScalarizationCost; | ||||||
1559 | } | ||||||
1560 | // Look for intrinsics that can be lowered directly or turned into a scalar | ||||||
1561 | // intrinsic call. | ||||||
1562 | case Intrinsic::sqrt: | ||||||
1563 | ISDs.push_back(ISD::FSQRT); | ||||||
1564 | break; | ||||||
1565 | case Intrinsic::sin: | ||||||
1566 | ISDs.push_back(ISD::FSIN); | ||||||
1567 | break; | ||||||
1568 | case Intrinsic::cos: | ||||||
1569 | ISDs.push_back(ISD::FCOS); | ||||||
1570 | break; | ||||||
1571 | case Intrinsic::exp: | ||||||
1572 | ISDs.push_back(ISD::FEXP); | ||||||
1573 | break; | ||||||
1574 | case Intrinsic::exp2: | ||||||
1575 | ISDs.push_back(ISD::FEXP2); | ||||||
1576 | break; | ||||||
1577 | case Intrinsic::log: | ||||||
1578 | ISDs.push_back(ISD::FLOG); | ||||||
1579 | break; | ||||||
1580 | case Intrinsic::log10: | ||||||
1581 | ISDs.push_back(ISD::FLOG10); | ||||||
1582 | break; | ||||||
1583 | case Intrinsic::log2: | ||||||
1584 | ISDs.push_back(ISD::FLOG2); | ||||||
1585 | break; | ||||||
1586 | case Intrinsic::fabs: | ||||||
1587 | ISDs.push_back(ISD::FABS); | ||||||
1588 | break; | ||||||
1589 | case Intrinsic::canonicalize: | ||||||
1590 | ISDs.push_back(ISD::FCANONICALIZE); | ||||||
1591 | break; | ||||||
1592 | case Intrinsic::minnum: | ||||||
1593 | ISDs.push_back(ISD::FMINNUM); | ||||||
1594 | break; | ||||||
1595 | case Intrinsic::maxnum: | ||||||
1596 | ISDs.push_back(ISD::FMAXNUM); | ||||||
1597 | break; | ||||||
1598 | case Intrinsic::minimum: | ||||||
1599 | ISDs.push_back(ISD::FMINIMUM); | ||||||
1600 | break; | ||||||
1601 | case Intrinsic::maximum: | ||||||
1602 | ISDs.push_back(ISD::FMAXIMUM); | ||||||
1603 | break; | ||||||
1604 | case Intrinsic::copysign: | ||||||
1605 | ISDs.push_back(ISD::FCOPYSIGN); | ||||||
1606 | break; | ||||||
1607 | case Intrinsic::floor: | ||||||
1608 | ISDs.push_back(ISD::FFLOOR); | ||||||
1609 | break; | ||||||
1610 | case Intrinsic::ceil: | ||||||
1611 | ISDs.push_back(ISD::FCEIL); | ||||||
1612 | break; | ||||||
1613 | case Intrinsic::trunc: | ||||||
1614 | ISDs.push_back(ISD::FTRUNC); | ||||||
1615 | break; | ||||||
1616 | case Intrinsic::nearbyint: | ||||||
1617 | ISDs.push_back(ISD::FNEARBYINT); | ||||||
1618 | break; | ||||||
1619 | case Intrinsic::rint: | ||||||
1620 | ISDs.push_back(ISD::FRINT); | ||||||
1621 | break; | ||||||
1622 | case Intrinsic::round: | ||||||
1623 | ISDs.push_back(ISD::FROUND); | ||||||
1624 | break; | ||||||
1625 | case Intrinsic::roundeven: | ||||||
1626 | ISDs.push_back(ISD::FROUNDEVEN); | ||||||
1627 | break; | ||||||
1628 | case Intrinsic::pow: | ||||||
1629 | ISDs.push_back(ISD::FPOW); | ||||||
1630 | break; | ||||||
1631 | case Intrinsic::fma: | ||||||
1632 | ISDs.push_back(ISD::FMA); | ||||||
1633 | break; | ||||||
1634 | case Intrinsic::fmuladd: | ||||||
1635 | ISDs.push_back(ISD::FMA); | ||||||
1636 | break; | ||||||
1637 | case Intrinsic::experimental_constrained_fmuladd: | ||||||
1638 | ISDs.push_back(ISD::STRICT_FMA); | ||||||
1639 | break; | ||||||
1640 | // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. | ||||||
1641 | case Intrinsic::lifetime_start: | ||||||
1642 | case Intrinsic::lifetime_end: | ||||||
1643 | case Intrinsic::sideeffect: | ||||||
1644 | case Intrinsic::pseudoprobe: | ||||||
1645 | case Intrinsic::arithmetic_fence: | ||||||
1646 | return 0; | ||||||
1647 | case Intrinsic::masked_store: { | ||||||
1648 | Type *Ty = Tys[0]; | ||||||
1649 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); | ||||||
1650 | return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, | ||||||
1651 | CostKind); | ||||||
1652 | } | ||||||
1653 | case Intrinsic::masked_load: { | ||||||
1654 | Type *Ty = RetTy; | ||||||
1655 | Align TyAlign = thisT()->DL.getABITypeAlign(Ty); | ||||||
1656 | return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, | ||||||
1657 | CostKind); | ||||||
1658 | } | ||||||
1659 | case Intrinsic::vector_reduce_add: | ||||||
1660 | return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, | ||||||
1661 | None, CostKind); | ||||||
1662 | case Intrinsic::vector_reduce_mul: | ||||||
1663 | return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, | ||||||
1664 | None, CostKind); | ||||||
1665 | case Intrinsic::vector_reduce_and: | ||||||
1666 | return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, | ||||||
1667 | None, CostKind); | ||||||
1668 | case Intrinsic::vector_reduce_or: | ||||||
1669 | return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None, | ||||||
1670 | CostKind); | ||||||
1671 | case Intrinsic::vector_reduce_xor: | ||||||
1672 | return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, | ||||||
1673 | None, CostKind); | ||||||
1674 | case Intrinsic::vector_reduce_fadd: | ||||||
1675 | return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, | ||||||
1676 | FMF, CostKind); | ||||||
1677 | case Intrinsic::vector_reduce_fmul: | ||||||
1678 | return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, | ||||||
1679 | FMF, CostKind); | ||||||
1680 | case Intrinsic::vector_reduce_smax: | ||||||
1681 | case Intrinsic::vector_reduce_smin: | ||||||
1682 | case Intrinsic::vector_reduce_fmax: | ||||||
1683 | case Intrinsic::vector_reduce_fmin: | ||||||
1684 | return thisT()->getMinMaxReductionCost( | ||||||
1685 | VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), | ||||||
1686 | /*IsUnsigned=*/false, CostKind); | ||||||
1687 | case Intrinsic::vector_reduce_umax: | ||||||
1688 | case Intrinsic::vector_reduce_umin: | ||||||
1689 | return thisT()->getMinMaxReductionCost( | ||||||
1690 | VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), | ||||||
1691 | /*IsUnsigned=*/true, CostKind); | ||||||
1692 | case Intrinsic::abs: | ||||||
1693 | case Intrinsic::smax: | ||||||
1694 | case Intrinsic::smin: | ||||||
1695 | case Intrinsic::umax: | ||||||
1696 | case Intrinsic::umin: { | ||||||
1697 | // abs(X) = select(icmp(X,0),X,sub(0,X)) | ||||||
1698 | // minmax(X,Y) = select(icmp(X,Y),X,Y) | ||||||
1699 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||
1700 | InstructionCost Cost = 0; | ||||||
1701 | // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code. | ||||||
1702 | Cost += | ||||||
1703 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, | ||||||
1704 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1705 | Cost += | ||||||
1706 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, | ||||||
1707 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1708 | // TODO: Should we add an OperandValueProperties::OP_Zero property? | ||||||
1709 | if (IID == Intrinsic::abs) | ||||||
1710 | Cost += thisT()->getArithmeticInstrCost( | ||||||
1711 | BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue); | ||||||
1712 | return Cost; | ||||||
1713 | } | ||||||
1714 | case Intrinsic::sadd_sat: | ||||||
1715 | case Intrinsic::ssub_sat: { | ||||||
1716 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||
1717 | |||||||
1718 | Type *OpTy = StructType::create({RetTy, CondTy}); | ||||||
1719 | Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat | ||||||
1720 | ? Intrinsic::sadd_with_overflow | ||||||
1721 | : Intrinsic::ssub_with_overflow; | ||||||
1722 | |||||||
1723 | // SatMax -> Overflow && SumDiff < 0 | ||||||
1724 | // SatMin -> Overflow && SumDiff >= 0 | ||||||
1725 | InstructionCost Cost = 0; | ||||||
1726 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, | ||||||
1727 | nullptr, ScalarizationCostPassed); | ||||||
1728 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); | ||||||
1729 | Cost += | ||||||
1730 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, | ||||||
1731 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1732 | Cost += 2 * thisT()->getCmpSelInstrCost( | ||||||
1733 | BinaryOperator::Select, RetTy, CondTy, | ||||||
1734 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1735 | return Cost; | ||||||
1736 | } | ||||||
1737 | case Intrinsic::uadd_sat: | ||||||
1738 | case Intrinsic::usub_sat: { | ||||||
1739 | Type *CondTy = RetTy->getWithNewBitWidth(1); | ||||||
1740 | |||||||
1741 | Type *OpTy = StructType::create({RetTy, CondTy}); | ||||||
1742 | Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat | ||||||
1743 | ? Intrinsic::uadd_with_overflow | ||||||
1744 | : Intrinsic::usub_with_overflow; | ||||||
1745 | |||||||
1746 | InstructionCost Cost = 0; | ||||||
1747 | IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, | ||||||
1748 | nullptr, ScalarizationCostPassed); | ||||||
1749 | Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); | ||||||
1750 | Cost += | ||||||
1751 | thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, | ||||||
1752 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1753 | return Cost; | ||||||
1754 | } | ||||||
1755 | case Intrinsic::smul_fix: | ||||||
1756 | case Intrinsic::umul_fix: { | ||||||
1757 | unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; | ||||||
1758 | Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); | ||||||
1759 | |||||||
1760 | unsigned ExtOp = | ||||||
1761 | IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; | ||||||
1762 | TTI::CastContextHint CCH = TTI::CastContextHint::None; | ||||||
1763 | |||||||
1764 | InstructionCost Cost = 0; | ||||||
1765 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); | ||||||
1766 | Cost += | ||||||
1767 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); | ||||||
1768 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, | ||||||
1769 | CCH, CostKind); | ||||||
1770 | Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, | ||||||
1771 | CostKind, TTI::OK_AnyValue, | ||||||
1772 | TTI::OK_UniformConstantValue); | ||||||
1773 | Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, | ||||||
1774 | TTI::OK_AnyValue, | ||||||
1775 | TTI::OK_UniformConstantValue); | ||||||
1776 | Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); | ||||||
1777 | return Cost; | ||||||
1778 | } | ||||||
1779 | case Intrinsic::sadd_with_overflow: | ||||||
1780 | case Intrinsic::ssub_with_overflow: { | ||||||
1781 | Type *SumTy = RetTy->getContainedType(0); | ||||||
1782 | Type *OverflowTy = RetTy->getContainedType(1); | ||||||
1783 | unsigned Opcode = IID == Intrinsic::sadd_with_overflow | ||||||
1784 | ? BinaryOperator::Add | ||||||
1785 | : BinaryOperator::Sub; | ||||||
1786 | |||||||
1787 | // LHSSign -> LHS >= 0 | ||||||
1788 | // RHSSign -> RHS >= 0 | ||||||
1789 | // SumSign -> Sum >= 0 | ||||||
1790 | // | ||||||
1791 | // Add: | ||||||
1792 | // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) | ||||||
1793 | // Sub: | ||||||
1794 | // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) | ||||||
1795 | InstructionCost Cost = 0; | ||||||
1796 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); | ||||||
1797 | Cost += 3 * thisT()->getCmpSelInstrCost( | ||||||
1798 | Instruction::ICmp, SumTy, OverflowTy, | ||||||
1799 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1800 | Cost += 2 * thisT()->getCmpSelInstrCost( | ||||||
1801 | Instruction::Select, OverflowTy, OverflowTy, | ||||||
1802 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1803 | Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy, | ||||||
1804 | CostKind); | ||||||
1805 | return Cost; | ||||||
1806 | } | ||||||
1807 | case Intrinsic::uadd_with_overflow: | ||||||
1808 | case Intrinsic::usub_with_overflow: { | ||||||
1809 | Type *SumTy = RetTy->getContainedType(0); | ||||||
1810 | Type *OverflowTy = RetTy->getContainedType(1); | ||||||
1811 | unsigned Opcode = IID == Intrinsic::uadd_with_overflow | ||||||
1812 | ? BinaryOperator::Add | ||||||
1813 | : BinaryOperator::Sub; | ||||||
1814 | |||||||
1815 | InstructionCost Cost = 0; | ||||||
1816 | Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); | ||||||
1817 | Cost += | ||||||
1818 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, | ||||||
1819 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1820 | return Cost; | ||||||
1821 | } | ||||||
1822 | case Intrinsic::smul_with_overflow: | ||||||
1823 | case Intrinsic::umul_with_overflow: { | ||||||
1824 | Type *MulTy = RetTy->getContainedType(0); | ||||||
1825 | Type *OverflowTy = RetTy->getContainedType(1); | ||||||
1826 | unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; | ||||||
1827 | Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); | ||||||
1828 | |||||||
1829 | unsigned ExtOp = | ||||||
1830 | IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; | ||||||
1831 | TTI::CastContextHint CCH = TTI::CastContextHint::None; | ||||||
1832 | |||||||
1833 | InstructionCost Cost = 0; | ||||||
1834 | Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); | ||||||
1835 | Cost += | ||||||
1836 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); | ||||||
1837 | Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, | ||||||
1838 | CCH, CostKind); | ||||||
1839 | Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy, | ||||||
1840 | CostKind, TTI::OK_AnyValue, | ||||||
1841 | TTI::OK_UniformConstantValue); | ||||||
1842 | |||||||
1843 | if (IID == Intrinsic::smul_with_overflow) | ||||||
1844 | Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, | ||||||
1845 | CostKind, TTI::OK_AnyValue, | ||||||
1846 | TTI::OK_UniformConstantValue); | ||||||
1847 | |||||||
1848 | Cost += | ||||||
1849 | thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy, | ||||||
1850 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
1851 | return Cost; | ||||||
1852 | } | ||||||
1853 | case Intrinsic::ctpop: | ||||||
1854 | ISDs.push_back(ISD::CTPOP); | ||||||
1855 | // In case of legalization use TCC_Expensive. This is cheaper than a | ||||||
1856 | // library call but still not a cheap instruction. | ||||||
1857 | SingleCallCost = TargetTransformInfo::TCC_Expensive; | ||||||
1858 | break; | ||||||
1859 | case Intrinsic::ctlz: | ||||||
1860 | ISDs.push_back(ISD::CTLZ); | ||||||
1861 | break; | ||||||
1862 | case Intrinsic::cttz: | ||||||
1863 | ISDs.push_back(ISD::CTTZ); | ||||||
1864 | break; | ||||||
1865 | case Intrinsic::bswap: | ||||||
1866 | ISDs.push_back(ISD::BSWAP); | ||||||
1867 | break; | ||||||
1868 | case Intrinsic::bitreverse: | ||||||
1869 | ISDs.push_back(ISD::BITREVERSE); | ||||||
1870 | break; | ||||||
1871 | } | ||||||
1872 | |||||||
1873 | const TargetLoweringBase *TLI = getTLI(); | ||||||
1874 | std::pair<InstructionCost, MVT> LT = | ||||||
1875 | TLI->getTypeLegalizationCost(DL, RetTy); | ||||||
1876 | |||||||
1877 | SmallVector<InstructionCost, 2> LegalCost; | ||||||
1878 | SmallVector<InstructionCost, 2> CustomCost; | ||||||
1879 | for (unsigned ISD : ISDs) { | ||||||
1880 | if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { | ||||||
1881 | if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && | ||||||
1882 | TLI->isFAbsFree(LT.second)) { | ||||||
1883 | return 0; | ||||||
1884 | } | ||||||
1885 | |||||||
1886 | // The operation is legal. Assume it costs 1. | ||||||
1887 | // If the type is split to multiple registers, assume that there is some | ||||||
1888 | // overhead to this. | ||||||
1889 | // TODO: Once we have extract/insert subvector cost we need to use them. | ||||||
1890 | if (LT.first > 1) | ||||||
1891 | LegalCost.push_back(LT.first * 2); | ||||||
1892 | else | ||||||
1893 | LegalCost.push_back(LT.first * 1); | ||||||
1894 | } else if (!TLI->isOperationExpand(ISD, LT.second)) { | ||||||
1895 | // If the operation is custom lowered then assume | ||||||
1896 | // that the code is twice as expensive. | ||||||
1897 | CustomCost.push_back(LT.first * 2); | ||||||
1898 | } | ||||||
1899 | } | ||||||
1900 | |||||||
1901 | auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); | ||||||
1902 | if (MinLegalCostI != LegalCost.end()) | ||||||
1903 | return *MinLegalCostI; | ||||||
1904 | |||||||
1905 | auto MinCustomCostI = | ||||||
1906 | std::min_element(CustomCost.begin(), CustomCost.end()); | ||||||
1907 | if (MinCustomCostI != CustomCost.end()) | ||||||
1908 | return *MinCustomCostI; | ||||||
1909 | |||||||
1910 | // If we can't lower fmuladd into an FMA estimate the cost as a floating | ||||||
1911 | // point mul followed by an add. | ||||||
1912 | if (IID == Intrinsic::fmuladd) | ||||||
1913 | return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, | ||||||
1914 | CostKind) + | ||||||
1915 | thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, | ||||||
1916 | CostKind); | ||||||
1917 | if (IID == Intrinsic::experimental_constrained_fmuladd) { | ||||||
1918 | IntrinsicCostAttributes FMulAttrs( | ||||||
1919 | Intrinsic::experimental_constrained_fmul, RetTy, Tys); | ||||||
1920 | IntrinsicCostAttributes FAddAttrs( | ||||||
1921 | Intrinsic::experimental_constrained_fadd, RetTy, Tys); | ||||||
1922 | return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + | ||||||
1923 | thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); | ||||||
1924 | } | ||||||
1925 | |||||||
1926 | // Else, assume that we need to scalarize this intrinsic. For math builtins | ||||||
1927 | // this will emit a costly libcall, adding call overhead and spills. Make it | ||||||
1928 | // very expensive. | ||||||
1929 | if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { | ||||||
1930 | // Scalable vectors cannot be scalarized, so return Invalid. | ||||||
1931 | if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { | ||||||
1932 | return isa<ScalableVectorType>(Ty); | ||||||
1933 | })) | ||||||
1934 | return InstructionCost::getInvalid(); | ||||||
1935 | |||||||
1936 | InstructionCost ScalarizationCost = | ||||||
1937 | SkipScalarizationCost ? ScalarizationCostPassed | ||||||
1938 | : getScalarizationOverhead(RetVTy, true, false); | ||||||
1939 | |||||||
1940 | unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); | ||||||
1941 | SmallVector<Type *, 4> ScalarTys; | ||||||
1942 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { | ||||||
1943 | Type *Ty = Tys[i]; | ||||||
1944 | if (Ty->isVectorTy()) | ||||||
1945 | Ty = Ty->getScalarType(); | ||||||
1946 | ScalarTys.push_back(Ty); | ||||||
1947 | } | ||||||
1948 | IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); | ||||||
1949 | InstructionCost ScalarCost = | ||||||
1950 | thisT()->getIntrinsicInstrCost(Attrs, CostKind); | ||||||
1951 | for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { | ||||||
1952 | if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { | ||||||
1953 | if (!ICA.skipScalarizationCost()) | ||||||
1954 | ScalarizationCost += getScalarizationOverhead(VTy, false, true); | ||||||
1955 | ScalarCalls = std::max(ScalarCalls, | ||||||
1956 | cast<FixedVectorType>(VTy)->getNumElements()); | ||||||
1957 | } | ||||||
1958 | } | ||||||
1959 | return ScalarCalls * ScalarCost + ScalarizationCost; | ||||||
1960 | } | ||||||
1961 | |||||||
1962 | // This is going to be turned into a library call, make it expensive. | ||||||
1963 | return SingleCallCost; | ||||||
1964 | } | ||||||
1965 | |||||||
1966 | /// Compute a cost of the given call instruction. | ||||||
1967 | /// | ||||||
1968 | /// Compute the cost of calling function F with return type RetTy and | ||||||
1969 | /// argument types Tys. F might be nullptr, in this case the cost of an | ||||||
1970 | /// arbitrary call with the specified signature will be returned. | ||||||
1971 | /// This is used, for instance, when we estimate call of a vector | ||||||
1972 | /// counterpart of the given function. | ||||||
1973 | /// \param F Called function, might be nullptr. | ||||||
1974 | /// \param RetTy Return value types. | ||||||
1975 | /// \param Tys Argument types. | ||||||
1976 | /// \returns The cost of Call instruction. | ||||||
1977 | InstructionCost | ||||||
1978 | getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys, | ||||||
1979 | TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) { | ||||||
1980 | return 10; | ||||||
1981 | } | ||||||
1982 | |||||||
1983 | unsigned getNumberOfParts(Type *Tp) { | ||||||
1984 | std::pair<InstructionCost, MVT> LT = | ||||||
1985 | getTLI()->getTypeLegalizationCost(DL, Tp); | ||||||
1986 | return *LT.first.getValue(); | ||||||
1987 | } | ||||||
1988 | |||||||
1989 | InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, | ||||||
1990 | const SCEV *) { | ||||||
1991 | return 0; | ||||||
1992 | } | ||||||
1993 | |||||||
1994 | /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. | ||||||
1995 | /// We're assuming that reduction operation are performing the following way: | ||||||
1996 | /// | ||||||
1997 | /// %val1 = shufflevector<n x t> %val, <n x t> %undef, | ||||||
1998 | /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> | ||||||
1999 | /// \----------------v-------------/ \----------v------------/ | ||||||
2000 | /// n/2 elements n/2 elements | ||||||
2001 | /// %red1 = op <n x t> %val, <n x t> val1 | ||||||
2002 | /// After this operation we have a vector %red1 where only the first n/2 | ||||||
2003 | /// elements are meaningful, the second n/2 elements are undefined and can be | ||||||
2004 | /// dropped. All other operations are actually working with the vector of | ||||||
2005 | /// length n/2, not n, though the real vector length is still n. | ||||||
2006 | /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, | ||||||
2007 | /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> | ||||||
2008 | /// \----------------v-------------/ \----------v------------/ | ||||||
2009 | /// n/4 elements 3*n/4 elements | ||||||
2010 | /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of | ||||||
2011 | /// length n/2, the resulting vector has length n/4 etc. | ||||||
2012 | /// | ||||||
2013 | /// The cost model should take into account that the actual length of the | ||||||
2014 | /// vector is reduced on each iteration. | ||||||
2015 | InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, | ||||||
2016 | TTI::TargetCostKind CostKind) { | ||||||
2017 | Type *ScalarTy = Ty->getElementType(); | ||||||
2018 | unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); | ||||||
2019 | if ((Opcode == Instruction::Or || Opcode == Instruction::And) && | ||||||
2020 | ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && | ||||||
2021 | NumVecElts >= 2) { | ||||||
2022 | // Or reduction for i1 is represented as: | ||||||
2023 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth | ||||||
2024 | // %res = cmp ne iReduxWidth %val, 0 | ||||||
2025 | // And reduction for i1 is represented as: | ||||||
2026 | // %val = bitcast <ReduxWidth x i1> to iReduxWidth | ||||||
2027 | // %res = cmp eq iReduxWidth %val, 11111 | ||||||
2028 | Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); | ||||||
2029 | return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, | ||||||
2030 | TTI::CastContextHint::None, CostKind) + | ||||||
2031 | thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, | ||||||
2032 | CmpInst::makeCmpResultType(ValTy), | ||||||
2033 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
2034 | } | ||||||
2035 | unsigned NumReduxLevels = Log2_32(NumVecElts); | ||||||
2036 | InstructionCost ArithCost = 0; | ||||||
2037 | InstructionCost ShuffleCost = 0; | ||||||
2038 | std::pair<InstructionCost, MVT> LT = | ||||||
2039 | thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); | ||||||
2040 | unsigned LongVectorCount = 0; | ||||||
2041 | unsigned MVTLen = | ||||||
2042 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; | ||||||
2043 | while (NumVecElts > MVTLen) { | ||||||
2044 | NumVecElts /= 2; | ||||||
2045 | VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); | ||||||
2046 | ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, | ||||||
2047 | NumVecElts, SubTy); | ||||||
2048 | ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); | ||||||
2049 | Ty = SubTy; | ||||||
2050 | ++LongVectorCount; | ||||||
2051 | } | ||||||
2052 | |||||||
2053 | NumReduxLevels -= LongVectorCount; | ||||||
2054 | |||||||
2055 | // The minimal length of the vector is limited by the real length of vector | ||||||
2056 | // operations performed on the current platform. That's why several final | ||||||
2057 | // reduction operations are performed on the vectors with the same | ||||||
2058 | // architecture-dependent length. | ||||||
2059 | |||||||
2060 | // By default reductions need one shuffle per reduction level. | ||||||
2061 | ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( | ||||||
2062 | TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); | ||||||
2063 | ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty); | ||||||
2064 | return ShuffleCost + ArithCost + | ||||||
2065 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | ||||||
2066 | } | ||||||
2067 | |||||||
2068 | /// Try to calculate the cost of performing strict (in-order) reductions, | ||||||
2069 | /// which involves doing a sequence of floating point additions in lane | ||||||
2070 | /// order, starting with an initial value. For example, consider a scalar | ||||||
2071 | /// initial value 'InitVal' of type float and a vector of type <4 x float>: | ||||||
2072 | /// | ||||||
2073 | /// Vector = <float %v0, float %v1, float %v2, float %v3> | ||||||
2074 | /// | ||||||
2075 | /// %add1 = %InitVal + %v0 | ||||||
2076 | /// %add2 = %add1 + %v1 | ||||||
2077 | /// %add3 = %add2 + %v2 | ||||||
2078 | /// %add4 = %add3 + %v3 | ||||||
2079 | /// | ||||||
2080 | /// As a simple estimate we can say the cost of such a reduction is 4 times | ||||||
2081 | /// the cost of a scalar FP addition. We can only estimate the costs for | ||||||
2082 | /// fixed-width vectors here because for scalable vectors we do not know the | ||||||
2083 | /// runtime number of operations. | ||||||
2084 | InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, | ||||||
2085 | TTI::TargetCostKind CostKind) { | ||||||
2086 | // Targets must implement a default value for the scalable case, since | ||||||
2087 | // we don't know how many lanes the vector has. | ||||||
2088 | if (isa<ScalableVectorType>(Ty)) | ||||||
2089 | return InstructionCost::getInvalid(); | ||||||
2090 | |||||||
2091 | auto *VTy = cast<FixedVectorType>(Ty); | ||||||
2092 | InstructionCost ExtractCost = | ||||||
2093 | getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true); | ||||||
2094 | InstructionCost ArithCost = thisT()->getArithmeticInstrCost( | ||||||
2095 | Opcode, VTy->getElementType(), CostKind); | ||||||
2096 | ArithCost *= VTy->getNumElements(); | ||||||
2097 | |||||||
2098 | return ExtractCost + ArithCost; | ||||||
2099 | } | ||||||
2100 | |||||||
2101 | InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, | ||||||
2102 | Optional<FastMathFlags> FMF, | ||||||
2103 | TTI::TargetCostKind CostKind) { | ||||||
2104 | if (TTI::requiresOrderedReduction(FMF)) | ||||||
2105 | return getOrderedReductionCost(Opcode, Ty, CostKind); | ||||||
2106 | return getTreeReductionCost(Opcode, Ty, CostKind); | ||||||
2107 | } | ||||||
2108 | |||||||
2109 | /// Try to calculate op costs for min/max reduction operations. | ||||||
2110 | /// \param CondTy Conditional type for the Select instruction. | ||||||
2111 | InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, | ||||||
2112 | bool IsUnsigned, | ||||||
2113 | TTI::TargetCostKind CostKind) { | ||||||
2114 | Type *ScalarTy = Ty->getElementType(); | ||||||
2115 | Type *ScalarCondTy = CondTy->getElementType(); | ||||||
2116 | unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); | ||||||
2117 | unsigned NumReduxLevels = Log2_32(NumVecElts); | ||||||
2118 | unsigned CmpOpcode; | ||||||
2119 | if (Ty->isFPOrFPVectorTy()) { | ||||||
2120 | CmpOpcode = Instruction::FCmp; | ||||||
2121 | } else { | ||||||
2122 | assert(Ty->isIntOrIntVectorTy() &&((void)0) | ||||||
2123 | "expecting floating point or integer type for min/max reduction")((void)0); | ||||||
2124 | CmpOpcode = Instruction::ICmp; | ||||||
2125 | } | ||||||
2126 | InstructionCost MinMaxCost = 0; | ||||||
2127 | InstructionCost ShuffleCost = 0; | ||||||
2128 | std::pair<InstructionCost, MVT> LT = | ||||||
2129 | thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); | ||||||
2130 | unsigned LongVectorCount = 0; | ||||||
2131 | unsigned MVTLen = | ||||||
2132 | LT.second.isVector() ? LT.second.getVectorNumElements() : 1; | ||||||
2133 | while (NumVecElts > MVTLen) { | ||||||
2134 | NumVecElts /= 2; | ||||||
2135 | auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); | ||||||
2136 | CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); | ||||||
2137 | |||||||
2138 | ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, | ||||||
2139 | NumVecElts, SubTy); | ||||||
2140 | MinMaxCost += | ||||||
2141 | thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, | ||||||
2142 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + | ||||||
2143 | thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, | ||||||
2144 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | ||||||
2145 | Ty = SubTy; | ||||||
2146 | ++LongVectorCount; | ||||||
2147 | } | ||||||
2148 | |||||||
2149 | NumReduxLevels -= LongVectorCount; | ||||||
2150 | |||||||
2151 | // The minimal length of the vector is limited by the real length of vector | ||||||
2152 | // operations performed on the current platform. That's why several final | ||||||
2153 | // reduction opertions are perfomed on the vectors with the same | ||||||
2154 | // architecture-dependent length. | ||||||
2155 | ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( | ||||||
2156 | TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); | ||||||
2157 | MinMaxCost += | ||||||
2158 | NumReduxLevels * | ||||||
2159 | (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, | ||||||
2160 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + | ||||||
2161 | thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, | ||||||
2162 | CmpInst::BAD_ICMP_PREDICATE, CostKind)); | ||||||
2163 | // The last min/max should be in vector registers and we counted it above. | ||||||
2164 | // So just need a single extractelement. | ||||||
2165 | return ShuffleCost + MinMaxCost + | ||||||
2166 | thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | ||||||
2167 | } | ||||||
2168 | |||||||
2169 | InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, | ||||||
2170 | Type *ResTy, VectorType *Ty, | ||||||
2171 | TTI::TargetCostKind CostKind) { | ||||||
2172 | // Without any native support, this is equivalent to the cost of | ||||||
2173 | // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) | ||||||
2174 | VectorType *ExtTy = VectorType::get(ResTy, Ty); | ||||||
2175 | InstructionCost RedCost = thisT()->getArithmeticReductionCost( | ||||||
2176 | Instruction::Add, ExtTy, None, CostKind); | ||||||
2177 | InstructionCost MulCost = 0; | ||||||
2178 | InstructionCost ExtCost = thisT()->getCastInstrCost( | ||||||
2179 | IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, | ||||||
2180 | TTI::CastContextHint::None, CostKind); | ||||||
2181 | if (IsMLA) { | ||||||
2182 | MulCost = | ||||||
2183 | thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); | ||||||
2184 | ExtCost *= 2; | ||||||
2185 | } | ||||||
2186 | |||||||
2187 | return RedCost + MulCost + ExtCost; | ||||||
2188 | } | ||||||
2189 | |||||||
2190 | InstructionCost getVectorSplitCost() { return 1; } | ||||||
2191 | |||||||
2192 | /// @} | ||||||
2193 | }; | ||||||
2194 | |||||||
2195 | /// Concrete BasicTTIImpl that can be used if no further customization | ||||||
2196 | /// is needed. | ||||||
2197 | class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { | ||||||
2198 | using BaseT = BasicTTIImplBase<BasicTTIImpl>; | ||||||
2199 | |||||||
2200 | friend class BasicTTIImplBase<BasicTTIImpl>; | ||||||
2201 | |||||||
2202 | const TargetSubtargetInfo *ST; | ||||||
2203 | const TargetLoweringBase *TLI; | ||||||
2204 | |||||||
2205 | const TargetSubtargetInfo *getST() const { return ST; } | ||||||
2206 | const TargetLoweringBase *getTLI() const { return TLI; } | ||||||
2207 | |||||||
2208 | public: | ||||||
2209 | explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); | ||||||
2210 | }; | ||||||
2211 | |||||||
2212 | } // end namespace llvm | ||||||
2213 | |||||||
2214 | #endif // LLVM_CODEGEN_BASICTTIIMPL_H |