1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | #include "X86ISelLowering.h" |
17 | #include "X86Subtarget.h" |
18 | #include "llvm/ADT/ArrayRef.h" |
19 | #include "llvm/ADT/SmallVector.h" |
20 | #include "llvm/Analysis/VectorUtils.h" |
21 | #include "llvm/IR/Constants.h" |
22 | #include "llvm/IR/DataLayout.h" |
23 | #include "llvm/IR/DerivedTypes.h" |
24 | #include "llvm/IR/IRBuilder.h" |
25 | #include "llvm/IR/Instruction.h" |
26 | #include "llvm/IR/Instructions.h" |
27 | #include "llvm/IR/Module.h" |
28 | #include "llvm/IR/Type.h" |
29 | #include "llvm/IR/Value.h" |
30 | #include "llvm/Support/Casting.h" |
31 | #include "llvm/Support/MachineValueType.h" |
32 | #include <algorithm> |
33 | #include <cassert> |
34 | #include <cmath> |
35 | #include <cstdint> |
36 | |
37 | using namespace llvm; |
38 | |
39 | namespace { |
40 | |
41 | |
42 | |
43 | |
44 | |
45 | |
46 | |
47 | |
48 | |
/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
///  E.g. A group of interleaving access loads (Factor = 2; accessing every
///       other element)
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group (or to the wide store for the store case).
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// The interleaving stride factor in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedMatrix.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  /// Interleave i8 vectors with stride 4 (store path), for VF 16/32/64.
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  /// Interleave i8 vectors with stride 4 for the VF = 8 special case.
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  /// Interleave i8 vectors with stride 3 (store path).
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  /// Deinterleave i8 vectors with stride 3 (load path).
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group, X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, a reference to the first indices of each interleaved-vector
  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  /// target-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};
125 | |
126 | } |
127 | |
128 | bool X86InterleavedAccessGroup::isSupported() const { |
129 | VectorType *ShuffleVecTy = Shuffles[0]->getType(); |
130 | Type *ShuffleEltTy = ShuffleVecTy->getElementType(); |
131 | unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy); |
132 | unsigned WideInstSize; |
133 | |
134 | |
135 | |
136 | |
137 | |
138 | |
139 | |
140 | if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3)) |
141 | return false; |
142 | |
143 | if (isa<LoadInst>(Inst)) { |
144 | WideInstSize = DL.getTypeSizeInBits(Inst->getType()); |
145 | if (cast<LoadInst>(Inst)->getPointerAddressSpace()) |
146 | return false; |
147 | } else |
148 | WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType()); |
149 | |
150 | |
151 | |
152 | if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4) |
153 | return true; |
154 | |
155 | if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 && |
156 | (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 || |
157 | WideInstSize == 2048)) |
158 | return true; |
159 | |
160 | if (ShuffleElemSize == 8 && Factor == 3 && |
161 | (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536)) |
162 | return true; |
163 | |
164 | return false; |
165 | } |
166 | |
// Breaks a wide load or wide shuffle into NumSubVectors pieces of SubVecTy,
// appending the pieces to DecomposedVectors.
void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  // Shuffle case (store path): slice the wide shuffle into sequential
  // sub-shuffles starting at each group index.
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Load case: decompose the wide load into narrower loads.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy, *VecBasePtrTy;
  Value *VecBasePtr;
  unsigned int NumLoads = NumSubVectors;

  // For the 768/1536-bit stride-3 cases, load in 128-bit (16 x i8) chunks
  // instead of SubVecTy-sized chunks, so the number of loads scales with
  // the total width.
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  if (VecLength == 768 || VecLength == 1536) {
    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
    VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
  }

  // Generate N loads of T type. Only the first load can use the original
  // alignment; subsequent loads get the alignment implied by the element
  // offset.
  assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
         "VecBaseTy's size must be a multiple of 8");
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}
230 | |
231 | |
232 | |
233 | static MVT scaleVectorType(MVT VT) { |
234 | unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2; |
235 | return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), |
236 | VT.getVectorNumElements() / 2); |
237 | } |
238 | |
// Sequential indices 0..63. Prefixes of this array serve as identity /
// concatenation shuffle masks of various lengths (see makeArrayRef uses).
static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
244 | |
245 | |
246 | |
247 | |
248 | |
249 | |
250 | |
251 | |
252 | |
253 | |
254 | |
255 | |
256 | |
257 | |
258 | |
259 | |
260 | |
261 | static void genShuffleBland(MVT VT, ArrayRef<int> Mask, |
262 | SmallVectorImpl<int> &Out, int LowOffset, |
263 | int HighOffset) { |
264 | assert(VT.getSizeInBits() >= 256 && |
265 | "This function doesn't accept width smaller then 256"); |
266 | unsigned NumOfElm = VT.getVectorNumElements(); |
267 | for (unsigned i = 0; i < Mask.size(); i++) |
268 | Out.push_back(Mask[i] + LowOffset); |
269 | for (unsigned i = 0; i < Mask.size(); i++) |
270 | Out.push_back(Mask[i] + HighOffset + NumOfElm); |
271 | } |
272 | |
273 | |
274 | |
275 | |
276 | |
277 | |
278 | |
279 | |
280 | |
281 | |
282 | |
283 | |
284 | |
285 | |
286 | |
287 | |
288 | |
289 | |
290 | |
// Restores the lane ordering of the interleaved vectors in \p Vec, writing
// the final results into \p TransposedMatrix. De-facto the inverse of
// concatSubVector (defined later in this file).
// NOTE(review): Temp[8] suffices only for Stride <= 4 and VecElems <= 64 —
// confirm all callers stay within those bounds.
static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {

  // 128-bit vectors: a single per-vector shuffle is enough.
  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  // Blend pairs of vectors using masks derived from VPShuf via
  // genShuffleBland, one blend per pair of 128-bit lanes.
  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    // 256-bit case: the blends already are the results.
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  } else
    // 512-bit case: concatenate pairs of 256-bit halves.
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] =
          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}
321 | |
// Interleaves 4 vectors of 8 x i8 (stride 4, VF = 8) into two 16-element
// results.
void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 c5 c6 c7
  // Matrix[1]= m0 m1 m2 m3 m4 m5 m6 m7
  // Matrix[2]= y0 y1 y2 y3 y4 y5 y6 y7
  // Matrix[3]= k0 k1 k2 k3 k4 k5 k6 k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<int, 16> MaskLow;
  SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<int, 32> MaskHighTemp1, MaskHighWord;

  // MaskLow = <0,8,1,9,...,7,15>: byte-interleave two 8-element vectors.
  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  // Build i16 unpack low/high masks, then widen each index to a pair of i8
  // indices so the masks operate on 16-bit units of the byte vectors.
  createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
  narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
  narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);

  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}
361 | |
// Interleaves 4 i8 vectors (stride 4) of NumOfElm elements each, using the
// x86 unpack byte/dword patterns, for VF 16/32/64.
void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 ...
  // Matrix[1]= m0 m1 m2 m3 ...
  // Matrix[2]= y0 y1 y2 y3 ...
  // Matrix[3]= k0 k1 k2 k3 ...

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<int, 32> MaskHigh;
  SmallVector<int, 32> MaskLow;
  SmallVector<int, 32> LowHighMask[2];
  SmallVector<int, 32> MaskHighTemp;
  SmallVector<int, 32> MaskLowTemp;

  // MaskLow and MaskHigh follow the vpunpcklbw and vpunpckhbw X86 shuffle
  // patterns.
  createUnpackShuffleMask(VT, MaskLow, true, false);
  createUnpackShuffleMask(VT, MaskHigh, false, false);

  // LowHighMask[0..1] follow the vpunpckldw and vpunpckhdw patterns on the
  // half-width type, widened back to pairs of byte indices.
  createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
  narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
  narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);

  // Stage 1: byte-interleave c/m and y/k pairs.
  // IntrVec[0] = c0 m0 c1 m1 ...   IntrVec[1] = high-half counterpart
  // IntrVec[2] = y0 k0 y1 k1 ...   IntrVec[3] = high-half counterpart
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // Stage 2: word-interleave the cm and yk vectors, producing full cmyk
  // quads in each 32-bit group.
  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // 128-bit case: the two unpack stages already yield the final layout.
  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  // Wider cases need a cross-lane fixup of the 128-bit sub-lanes.
  reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
                   NumOfElm, 4, Builder);
}
429 | |
430 | |
431 | |
432 | |
433 | |
434 | |
435 | |
436 | |
437 | |
438 | |
439 | |
440 | static void createShuffleStride(MVT VT, int Stride, |
441 | SmallVectorImpl<int> &Mask) { |
442 | int VectorSize = VT.getSizeInBits(); |
443 | int VF = VT.getVectorNumElements(); |
444 | int LaneCount = std::max(VectorSize / 128, 1); |
445 | for (int Lane = 0; Lane < LaneCount; Lane++) |
446 | for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i) |
447 | Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane); |
448 | } |
449 | |
450 | |
451 | |
452 | |
453 | |
454 | static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) { |
455 | int VectorSize = VT.getSizeInBits(); |
456 | int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1); |
457 | for (int i = 0, FirstGroupElement = 0; i < 3; i++) { |
458 | int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0); |
459 | SizeInfo.push_back(GroupSize); |
460 | FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF; |
461 | } |
462 | } |
463 | |
464 | |
465 | |
466 | |
467 | |
468 | |
469 | |
470 | |
471 | |
472 | |
473 | |
474 | |
475 | |
476 | |
// Builds the shuffle mask of an X86 PALIGNR: per 128-bit lane, the two
// sources are concatenated and shifted right by \p Imm elements.
// \p AlignDirection == false flips the shift amount to (NumLaneElts - Imm).
// \p Unary == true treats the op as single-source: out-of-lane indices wrap
// within the same source instead of selecting from the second operand.
// NOTE(review): presumably adapted from X86ShuffleDecode — keep in sync.
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  // Byte offset of the shift; for i8 element types this equals Imm.
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i+Offset is out of this lane then we actually need the other
      // source (or, for Unary, wrap around within the same source).
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}
498 | |
499 | |
500 | |
501 | |
502 | |
503 | |
504 | |
505 | |
506 | |
507 | |
508 | |
509 | |
510 | |
511 | |
512 | |
513 | |
514 | |
515 | |
516 | |
517 | |
518 | |
519 | |
520 | |
521 | |
522 | |
523 | |
524 | |
525 | |
526 | static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec, |
527 | unsigned VecElems, IRBuilder<> &Builder) { |
528 | if (VecElems == 16) { |
| 5 | | Assuming 'VecElems' is not equal to 16 | |
|
| |
529 | for (int i = 0; i < 3; i++) |
530 | Vec[i] = InVec[i]; |
531 | return; |
532 | } |
533 | |
534 | for (unsigned j = 0; j < VecElems / 32; j++) |
| 7 | | Assuming the condition is false | |
|
| 8 | | Loop condition is false. Execution continues on line 539 | |
|
535 | for (int i = 0; i < 3; i++) |
536 | Vec[i + j * 3] = Builder.CreateShuffleVector( |
537 | InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32)); |
538 | |
539 | if (VecElems == 32) |
| 9 | | Assuming 'VecElems' is not equal to 32 | |
|
| |
540 | return; |
541 | |
542 | for (int i = 0; i < 3; i++) |
| |
| 12 | | Loop condition is true. Entering loop body | |
|
543 | Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat); |
| 13 | | 1st function call argument is an uninitialized value |
|
544 | } |
545 | |
546 | void X86InterleavedAccessGroup::deinterleave8bitStride3( |
547 | ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix, |
548 | unsigned VecElems) { |
549 | |
550 | |
551 | |
552 | |
553 | |
554 | TransposedMatrix.resize(3); |
555 | SmallVector<int, 32> VPShuf; |
556 | SmallVector<int, 32> VPAlign[2]; |
557 | SmallVector<int, 32> VPAlign2; |
558 | SmallVector<int, 32> VPAlign3; |
559 | SmallVector<int, 3> GroupSize; |
560 | Value *Vec[6], *TempVector[3]; |
561 | |
562 | MVT VT = MVT::getVT(Shuffles[0]->getType()); |
563 | |
564 | createShuffleStride(VT, 3, VPShuf); |
565 | setGroupSize(VT, GroupSize); |
566 | |
567 | for (int i = 0; i < 2; i++) |
| 1 | Loop condition is true. Entering loop body | |
|
| 2 | | Loop condition is true. Entering loop body | |
|
| 3 | | Loop condition is false. Execution continues on line 570 | |
|
568 | DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false); |
569 | |
570 | DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true); |
571 | DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true); |
572 | |
573 | concatSubVector(Vec, InVec, VecElems, Builder); |
| 4 | | Calling 'concatSubVector' | |
|
574 | |
575 | |
576 | |
577 | |
578 | for (int i = 0; i < 3; i++) |
579 | Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf); |
580 | |
581 | |
582 | |
583 | |
584 | |
585 | for (int i = 0; i < 3; i++) |
586 | TempVector[i] = |
587 | Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]); |
588 | |
589 | |
590 | |
591 | |
592 | |
593 | for (int i = 0; i < 3; i++) |
594 | Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i], |
595 | VPAlign[1]); |
596 | |
597 | |
598 | |
599 | |
600 | |
601 | Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3); |
602 | TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2); |
603 | TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec; |
604 | TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2]; |
605 | } |
606 | |
607 | |
608 | |
609 | |
610 | static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask, |
611 | SmallVectorImpl<int> &Output) { |
612 | int IndexGroup[3] = {0, 0, 0}; |
613 | int Index = 0; |
614 | int VectorWidth = VT.getSizeInBits(); |
615 | int VF = VT.getVectorNumElements(); |
616 | |
617 | int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1; |
618 | for (int i = 0; i < 3; i++) { |
619 | IndexGroup[(Index * 3) % (VF / Lane)] = Index; |
620 | Index += Mask[i]; |
621 | } |
622 | |
623 | for (int i = 0; i < VF / Lane; i++) { |
624 | Output.push_back(IndexGroup[i % 3]); |
625 | IndexGroup[i % 3]++; |
626 | } |
627 | } |
628 | |
// Interleaves three i8 input vectors (stride 3, store path) into three
// output vectors, using palignr-style rotations plus a final reorder.
void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 3> GroupSize;
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[3];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  // Rotation masks (inverse direction of the deinterleave path).
  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Pre-rotate the first two inputs so that subsequent blends line up.
  Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
  Vec[2] = InVec[2];

  // First palignr-style blend between neighboring vectors.
  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Second palignr-style blend.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // Scatter the grouped elements into interleaved order and fix up the
  // 128-bit lanes for the wide cases.
  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}
687 | |
688 | void X86InterleavedAccessGroup::transpose_4x4( |
689 | ArrayRef<Instruction *> Matrix, |
690 | SmallVectorImpl<Value *> &TransposedMatrix) { |
691 | assert(Matrix.size() == 4 && "Invalid matrix size"); |
692 | TransposedMatrix.resize(4); |
693 | |
694 | |
695 | static constexpr int IntMask1[] = {0, 1, 4, 5}; |
696 | ArrayRef<int> Mask = makeArrayRef(IntMask1, 4); |
697 | Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); |
698 | Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask); |
699 | |
700 | |
701 | static constexpr int IntMask2[] = {2, 3, 6, 7}; |
702 | Mask = makeArrayRef(IntMask2, 4); |
703 | Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); |
704 | Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask); |
705 | |
706 | |
707 | static constexpr int IntMask3[] = {0, 4, 2, 6}; |
708 | Mask = makeArrayRef(IntMask3, 4); |
709 | TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask); |
710 | TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); |
711 | |
712 | |
713 | static constexpr int IntMask4[] = {1, 5, 3, 7}; |
714 | Mask = makeArrayRef(IntMask4, 4); |
715 | TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask); |
716 | TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); |
717 | } |
718 | |
719 | |
720 | |
// Lowers this interleaved access group into an X86-optimized sequence.
// Returns true on success; false leaves the IR untouched.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());

  if (isa<LoadInst>(Inst)) {
    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
    unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
    // Only handle the cases where each user shuffle extracts exactly one
    // full sub-vector of a supported width.
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
    case 8:
    case 16:
    case 32:
    case 64:
      if (ShuffleTy->getNumElements() != NumSubVecElems)
        return false;
      break;
    }

    // Try to generate target-sized register(/instruction)-wide loads.
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.
    if (NumSubVecElems == 4)
      transpose_4x4(DecomposedVectors, TransposedVectors);
    else
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  // Store path.
  Type *ShuffleEltTy = ShuffleTy->getElementType();
  unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor,
            FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for the wide vector.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());

  return true;
}
805 | |
806 | |
807 | |
808 | |
809 | |
810 | bool X86TargetLowering::lowerInterleavedLoad( |
811 | LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, |
812 | ArrayRef<unsigned> Indices, unsigned Factor) const { |
813 | assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && |
814 | "Invalid interleave factor"); |
815 | assert(!Shuffles.empty() && "Empty shufflevector input"); |
816 | assert(Shuffles.size() == Indices.size() && |
817 | "Unmatched number of shufflevectors and indices"); |
818 | |
819 | |
820 | IRBuilder<> Builder(LI); |
821 | X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, |
822 | Builder); |
823 | |
824 | return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); |
825 | } |
826 | |
827 | bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, |
828 | ShuffleVectorInst *SVI, |
829 | unsigned Factor) const { |
830 | assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && |
831 | "Invalid interleave factor"); |
832 | |
833 | assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor == |
834 | 0 && |
835 | "Invalid interleaved store"); |
836 | |
837 | |
838 | |
839 | SmallVector<unsigned, 4> Indices; |
840 | auto Mask = SVI->getShuffleMask(); |
841 | for (unsigned i = 0; i < Factor; i++) |
842 | Indices.push_back(Mask[i]); |
843 | |
844 | ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI); |
845 | |
846 | |
847 | IRBuilder<> Builder(SI); |
848 | X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget, |
849 | Builder); |
850 | |
851 | return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); |
852 | } |