File: | src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp |
Warning: | line 628, column 7 Forming reference to null pointer |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//
23 | ||||
24 | #include "AMDGPU.h" | |||
25 | #include "GCNSubtarget.h" | |||
26 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | |||
27 | #include "llvm/CodeGen/MachineDominators.h" | |||
28 | #include "llvm/CodeGen/MachineFunctionPass.h" | |||
29 | #include "llvm/CodeGen/MachinePostDominators.h" | |||
30 | #include "llvm/CodeGen/MachineSSAUpdater.h" | |||
31 | #include "llvm/InitializePasses.h" | |||
32 | ||||
33 | #define DEBUG_TYPE"si-i1-copies" "si-i1-copies" | |||
34 | ||||
35 | using namespace llvm; | |||
36 | ||||
37 | static unsigned createLaneMaskReg(MachineFunction &MF); | |||
38 | static unsigned insertUndefLaneMask(MachineBasicBlock &MBB); | |||
39 | ||||
40 | namespace { | |||
41 | ||||
42 | class SILowerI1Copies : public MachineFunctionPass { | |||
43 | public: | |||
44 | static char ID; | |||
45 | ||||
46 | private: | |||
47 | bool IsWave32 = false; | |||
48 | MachineFunction *MF = nullptr; | |||
49 | MachineDominatorTree *DT = nullptr; | |||
50 | MachinePostDominatorTree *PDT = nullptr; | |||
51 | MachineRegisterInfo *MRI = nullptr; | |||
52 | const GCNSubtarget *ST = nullptr; | |||
53 | const SIInstrInfo *TII = nullptr; | |||
54 | ||||
55 | unsigned ExecReg; | |||
56 | unsigned MovOp; | |||
57 | unsigned AndOp; | |||
58 | unsigned OrOp; | |||
59 | unsigned XorOp; | |||
60 | unsigned AndN2Op; | |||
61 | unsigned OrN2Op; | |||
62 | ||||
63 | DenseSet<unsigned> ConstrainRegs; | |||
64 | ||||
65 | public: | |||
66 | SILowerI1Copies() : MachineFunctionPass(ID) { | |||
67 | initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); | |||
68 | } | |||
69 | ||||
70 | bool runOnMachineFunction(MachineFunction &MF) override; | |||
71 | ||||
72 | StringRef getPassName() const override { return "SI Lower i1 Copies"; } | |||
73 | ||||
74 | void getAnalysisUsage(AnalysisUsage &AU) const override { | |||
75 | AU.setPreservesCFG(); | |||
76 | AU.addRequired<MachineDominatorTree>(); | |||
77 | AU.addRequired<MachinePostDominatorTree>(); | |||
78 | MachineFunctionPass::getAnalysisUsage(AU); | |||
79 | } | |||
80 | ||||
81 | private: | |||
82 | void lowerCopiesFromI1(); | |||
83 | void lowerPhis(); | |||
84 | void lowerCopiesToI1(); | |||
85 | bool isConstantLaneMask(Register Reg, bool &Val) const; | |||
86 | void buildMergeLaneMasks(MachineBasicBlock &MBB, | |||
87 | MachineBasicBlock::iterator I, const DebugLoc &DL, | |||
88 | unsigned DstReg, unsigned PrevReg, unsigned CurReg); | |||
89 | MachineBasicBlock::iterator | |||
90 | getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; | |||
91 | ||||
92 | bool isVreg1(Register Reg) const { | |||
93 | return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass; | |||
94 | } | |||
95 | ||||
96 | bool isLaneMaskReg(unsigned Reg) const { | |||
97 | return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) && | |||
98 | TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) == | |||
99 | ST->getWavefrontSize(); | |||
100 | } | |||
101 | }; | |||
102 | ||||
/// Helper class that determines the relationship between incoming values of a
/// phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
125 | class PhiIncomingAnalysis { | |||
126 | MachinePostDominatorTree &PDT; | |||
127 | ||||
128 | // For each reachable basic block, whether it is a source in the induced | |||
129 | // subgraph of the CFG. | |||
130 | DenseMap<MachineBasicBlock *, bool> ReachableMap; | |||
131 | SmallVector<MachineBasicBlock *, 4> ReachableOrdered; | |||
132 | SmallVector<MachineBasicBlock *, 4> Stack; | |||
133 | SmallVector<MachineBasicBlock *, 4> Predecessors; | |||
134 | ||||
135 | public: | |||
136 | PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {} | |||
137 | ||||
138 | /// Returns whether \p MBB is a source in the induced subgraph of reachable | |||
139 | /// blocks. | |||
140 | bool isSource(MachineBasicBlock &MBB) const { | |||
141 | return ReachableMap.find(&MBB)->second; | |||
142 | } | |||
143 | ||||
144 | ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; } | |||
145 | ||||
146 | void analyze(MachineBasicBlock &DefBlock, | |||
147 | ArrayRef<MachineBasicBlock *> IncomingBlocks) { | |||
148 | assert(Stack.empty())((void)0); | |||
149 | ReachableMap.clear(); | |||
150 | ReachableOrdered.clear(); | |||
151 | Predecessors.clear(); | |||
152 | ||||
153 | // Insert the def block first, so that it acts as an end point for the | |||
154 | // traversal. | |||
155 | ReachableMap.try_emplace(&DefBlock, false); | |||
156 | ReachableOrdered.push_back(&DefBlock); | |||
157 | ||||
158 | for (MachineBasicBlock *MBB : IncomingBlocks) { | |||
159 | if (MBB == &DefBlock) { | |||
160 | ReachableMap[&DefBlock] = true; // self-loop on DefBlock | |||
161 | continue; | |||
162 | } | |||
163 | ||||
164 | ReachableMap.try_emplace(MBB, false); | |||
165 | ReachableOrdered.push_back(MBB); | |||
166 | ||||
167 | // If this block has a divergent terminator and the def block is its | |||
168 | // post-dominator, the wave may first visit the other successors. | |||
169 | bool Divergent = false; | |||
170 | for (MachineInstr &MI : MBB->terminators()) { | |||
171 | if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO || | |||
172 | MI.getOpcode() == AMDGPU::SI_IF || | |||
173 | MI.getOpcode() == AMDGPU::SI_ELSE || | |||
174 | MI.getOpcode() == AMDGPU::SI_LOOP) { | |||
175 | Divergent = true; | |||
176 | break; | |||
177 | } | |||
178 | } | |||
179 | ||||
180 | if (Divergent && PDT.dominates(&DefBlock, MBB)) | |||
181 | append_range(Stack, MBB->successors()); | |||
182 | } | |||
183 | ||||
184 | while (!Stack.empty()) { | |||
185 | MachineBasicBlock *MBB = Stack.pop_back_val(); | |||
186 | if (!ReachableMap.try_emplace(MBB, false).second) | |||
187 | continue; | |||
188 | ReachableOrdered.push_back(MBB); | |||
189 | ||||
190 | append_range(Stack, MBB->successors()); | |||
191 | } | |||
192 | ||||
193 | for (MachineBasicBlock *MBB : ReachableOrdered) { | |||
194 | bool HaveReachablePred = false; | |||
195 | for (MachineBasicBlock *Pred : MBB->predecessors()) { | |||
196 | if (ReachableMap.count(Pred)) { | |||
197 | HaveReachablePred = true; | |||
198 | } else { | |||
199 | Stack.push_back(Pred); | |||
200 | } | |||
201 | } | |||
202 | if (!HaveReachablePred) | |||
203 | ReachableMap[MBB] = true; | |||
204 | if (HaveReachablePred) { | |||
205 | for (MachineBasicBlock *UnreachablePred : Stack) { | |||
206 | if (!llvm::is_contained(Predecessors, UnreachablePred)) | |||
207 | Predecessors.push_back(UnreachablePred); | |||
208 | } | |||
209 | } | |||
210 | Stack.clear(); | |||
211 | } | |||
212 | } | |||
213 | }; | |||
214 | ||||
/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
247 | class LoopFinder { | |||
248 | MachineDominatorTree &DT; | |||
249 | MachinePostDominatorTree &PDT; | |||
250 | ||||
251 | // All visited / reachable block, tagged by level (level 0 is the def block, | |||
252 | // level 1 are all blocks reachable including but not going through the def | |||
253 | // block's IPDOM, etc.). | |||
254 | DenseMap<MachineBasicBlock *, unsigned> Visited; | |||
255 | ||||
256 | // Nearest common dominator of all visited blocks by level (level 0 is the | |||
257 | // def block). Used for seeding the SSAUpdater. | |||
258 | SmallVector<MachineBasicBlock *, 4> CommonDominators; | |||
259 | ||||
260 | // Post-dominator of all visited blocks. | |||
261 | MachineBasicBlock *VisitedPostDom = nullptr; | |||
262 | ||||
263 | // Level at which a loop was found: 0 is not possible; 1 = a backward edge is | |||
264 | // reachable without going through the IPDOM of the def block (if the IPDOM | |||
265 | // itself has an edge to the def block, the loop level is 2), etc. | |||
266 | unsigned FoundLoopLevel = ~0u; | |||
267 | ||||
268 | MachineBasicBlock *DefBlock = nullptr; | |||
269 | SmallVector<MachineBasicBlock *, 4> Stack; | |||
270 | SmallVector<MachineBasicBlock *, 4> NextLevel; | |||
271 | ||||
272 | public: | |||
273 | LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT) | |||
274 | : DT(DT), PDT(PDT) {} | |||
275 | ||||
276 | void initialize(MachineBasicBlock &MBB) { | |||
277 | Visited.clear(); | |||
278 | CommonDominators.clear(); | |||
279 | Stack.clear(); | |||
280 | NextLevel.clear(); | |||
281 | VisitedPostDom = nullptr; | |||
282 | FoundLoopLevel = ~0u; | |||
283 | ||||
284 | DefBlock = &MBB; | |||
285 | } | |||
286 | ||||
287 | /// Check whether a backward edge can be reached without going through the | |||
288 | /// given \p PostDom of the def block. | |||
289 | /// | |||
290 | /// Return the level of \p PostDom if a loop was found, or 0 otherwise. | |||
291 | unsigned findLoop(MachineBasicBlock *PostDom) { | |||
292 | MachineDomTreeNode *PDNode = PDT.getNode(DefBlock); | |||
293 | ||||
294 | if (!VisitedPostDom) | |||
295 | advanceLevel(); | |||
296 | ||||
297 | unsigned Level = 0; | |||
298 | while (PDNode->getBlock() != PostDom) { | |||
299 | if (PDNode->getBlock() == VisitedPostDom) | |||
300 | advanceLevel(); | |||
301 | PDNode = PDNode->getIDom(); | |||
302 | Level++; | |||
303 | if (FoundLoopLevel == Level) | |||
304 | return Level; | |||
305 | } | |||
306 | ||||
307 | return 0; | |||
308 | } | |||
309 | ||||
310 | /// Add undef values dominating the loop and the optionally given additional | |||
311 | /// blocks, so that the SSA updater doesn't have to search all the way to the | |||
312 | /// function entry. | |||
313 | void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater, | |||
314 | ArrayRef<MachineBasicBlock *> Blocks = {}) { | |||
315 | assert(LoopLevel < CommonDominators.size())((void)0); | |||
316 | ||||
317 | MachineBasicBlock *Dom = CommonDominators[LoopLevel]; | |||
318 | for (MachineBasicBlock *MBB : Blocks) | |||
319 | Dom = DT.findNearestCommonDominator(Dom, MBB); | |||
320 | ||||
321 | if (!inLoopLevel(*Dom, LoopLevel, Blocks)) { | |||
322 | SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom)); | |||
323 | } else { | |||
324 | // The dominator is part of the loop or the given blocks, so add the | |||
325 | // undef value to unreachable predecessors instead. | |||
326 | for (MachineBasicBlock *Pred : Dom->predecessors()) { | |||
327 | if (!inLoopLevel(*Pred, LoopLevel, Blocks)) | |||
328 | SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred)); | |||
329 | } | |||
330 | } | |||
331 | } | |||
332 | ||||
333 | private: | |||
334 | bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel, | |||
335 | ArrayRef<MachineBasicBlock *> Blocks) const { | |||
336 | auto DomIt = Visited.find(&MBB); | |||
337 | if (DomIt != Visited.end() && DomIt->second <= LoopLevel) | |||
338 | return true; | |||
339 | ||||
340 | if (llvm::is_contained(Blocks, &MBB)) | |||
341 | return true; | |||
342 | ||||
343 | return false; | |||
344 | } | |||
345 | ||||
346 | void advanceLevel() { | |||
347 | MachineBasicBlock *VisitedDom; | |||
348 | ||||
349 | if (!VisitedPostDom) { | |||
350 | VisitedPostDom = DefBlock; | |||
351 | VisitedDom = DefBlock; | |||
352 | Stack.push_back(DefBlock); | |||
353 | } else { | |||
354 | VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock(); | |||
355 | VisitedDom = CommonDominators.back(); | |||
356 | ||||
357 | for (unsigned i = 0; i < NextLevel.size();) { | |||
358 | if (PDT.dominates(VisitedPostDom, NextLevel[i])) { | |||
359 | Stack.push_back(NextLevel[i]); | |||
360 | ||||
361 | NextLevel[i] = NextLevel.back(); | |||
362 | NextLevel.pop_back(); | |||
363 | } else { | |||
364 | i++; | |||
365 | } | |||
366 | } | |||
367 | } | |||
368 | ||||
369 | unsigned Level = CommonDominators.size(); | |||
370 | while (!Stack.empty()) { | |||
371 | MachineBasicBlock *MBB = Stack.pop_back_val(); | |||
372 | if (!PDT.dominates(VisitedPostDom, MBB)) | |||
373 | NextLevel.push_back(MBB); | |||
374 | ||||
375 | Visited[MBB] = Level; | |||
376 | VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB); | |||
377 | ||||
378 | for (MachineBasicBlock *Succ : MBB->successors()) { | |||
379 | if (Succ == DefBlock) { | |||
380 | if (MBB == VisitedPostDom) | |||
381 | FoundLoopLevel = std::min(FoundLoopLevel, Level + 1); | |||
382 | else | |||
383 | FoundLoopLevel = std::min(FoundLoopLevel, Level); | |||
384 | continue; | |||
385 | } | |||
386 | ||||
387 | if (Visited.try_emplace(Succ, ~0u).second) { | |||
388 | if (MBB == VisitedPostDom) | |||
389 | NextLevel.push_back(Succ); | |||
390 | else | |||
391 | Stack.push_back(Succ); | |||
392 | } | |||
393 | } | |||
394 | } | |||
395 | ||||
396 | CommonDominators.push_back(VisitedDom); | |||
397 | } | |||
398 | }; | |||
399 | ||||
400 | } // End anonymous namespace. | |||
401 | ||||
402 | INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,static void *initializeSILowerI1CopiesPassOnce(PassRegistry & Registry) { | |||
403 | false)static void *initializeSILowerI1CopiesPassOnce(PassRegistry & Registry) { | |||
404 | INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)initializeMachineDominatorTreePass(Registry); | |||
405 | INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)initializeMachinePostDominatorTreePass(Registry); | |||
406 | INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,PassInfo *PI = new PassInfo( "SI Lower i1 Copies", "si-i1-copies" , &SILowerI1Copies::ID, PassInfo::NormalCtor_t(callDefaultCtor <SILowerI1Copies>), false, false); Registry.registerPass (*PI, true); return PI; } static llvm::once_flag InitializeSILowerI1CopiesPassFlag ; void llvm::initializeSILowerI1CopiesPass(PassRegistry & Registry) { llvm::call_once(InitializeSILowerI1CopiesPassFlag , initializeSILowerI1CopiesPassOnce, std::ref(Registry)); } | |||
407 | false)PassInfo *PI = new PassInfo( "SI Lower i1 Copies", "si-i1-copies" , &SILowerI1Copies::ID, PassInfo::NormalCtor_t(callDefaultCtor <SILowerI1Copies>), false, false); Registry.registerPass (*PI, true); return PI; } static llvm::once_flag InitializeSILowerI1CopiesPassFlag ; void llvm::initializeSILowerI1CopiesPass(PassRegistry & Registry) { llvm::call_once(InitializeSILowerI1CopiesPassFlag , initializeSILowerI1CopiesPassOnce, std::ref(Registry)); } | |||
408 | ||||
409 | char SILowerI1Copies::ID = 0; | |||
410 | ||||
411 | char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; | |||
412 | ||||
413 | FunctionPass *llvm::createSILowerI1CopiesPass() { | |||
414 | return new SILowerI1Copies(); | |||
415 | } | |||
416 | ||||
417 | static unsigned createLaneMaskReg(MachineFunction &MF) { | |||
418 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
419 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
420 | return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass | |||
421 | : &AMDGPU::SReg_64RegClass); | |||
422 | } | |||
423 | ||||
424 | static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) { | |||
425 | MachineFunction &MF = *MBB.getParent(); | |||
426 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); | |||
427 | const SIInstrInfo *TII = ST.getInstrInfo(); | |||
428 | unsigned UndefReg = createLaneMaskReg(MF); | |||
429 | BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF), | |||
430 | UndefReg); | |||
431 | return UndefReg; | |||
432 | } | |||
433 | ||||
/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
443 | bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { | |||
444 | // Only need to run this in SelectionDAG path. | |||
445 | if (TheMF.getProperties().hasProperty( | |||
| ||||
446 | MachineFunctionProperties::Property::Selected)) | |||
447 | return false; | |||
448 | ||||
449 | MF = &TheMF; | |||
450 | MRI = &MF->getRegInfo(); | |||
451 | DT = &getAnalysis<MachineDominatorTree>(); | |||
452 | PDT = &getAnalysis<MachinePostDominatorTree>(); | |||
453 | ||||
454 | ST = &MF->getSubtarget<GCNSubtarget>(); | |||
455 | TII = ST->getInstrInfo(); | |||
456 | IsWave32 = ST->isWave32(); | |||
457 | ||||
458 | if (IsWave32
| |||
459 | ExecReg = AMDGPU::EXEC_LO; | |||
460 | MovOp = AMDGPU::S_MOV_B32; | |||
461 | AndOp = AMDGPU::S_AND_B32; | |||
462 | OrOp = AMDGPU::S_OR_B32; | |||
463 | XorOp = AMDGPU::S_XOR_B32; | |||
464 | AndN2Op = AMDGPU::S_ANDN2_B32; | |||
465 | OrN2Op = AMDGPU::S_ORN2_B32; | |||
466 | } else { | |||
467 | ExecReg = AMDGPU::EXEC; | |||
468 | MovOp = AMDGPU::S_MOV_B64; | |||
469 | AndOp = AMDGPU::S_AND_B64; | |||
470 | OrOp = AMDGPU::S_OR_B64; | |||
471 | XorOp = AMDGPU::S_XOR_B64; | |||
472 | AndN2Op = AMDGPU::S_ANDN2_B64; | |||
473 | OrN2Op = AMDGPU::S_ORN2_B64; | |||
474 | } | |||
475 | ||||
476 | lowerCopiesFromI1(); | |||
477 | lowerPhis(); | |||
478 | lowerCopiesToI1(); | |||
479 | ||||
480 | for (unsigned Reg : ConstrainRegs) | |||
481 | MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass); | |||
482 | ConstrainRegs.clear(); | |||
483 | ||||
484 | return true; | |||
485 | } | |||
486 | ||||
487 | #ifndef NDEBUG1 | |||
488 | static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, | |||
489 | const MachineRegisterInfo &MRI, | |||
490 | Register Reg) { | |||
491 | unsigned Size = TRI.getRegSizeInBits(Reg, MRI); | |||
492 | return Size == 1 || Size == 32; | |||
493 | } | |||
494 | #endif | |||
495 | ||||
496 | void SILowerI1Copies::lowerCopiesFromI1() { | |||
497 | SmallVector<MachineInstr *, 4> DeadCopies; | |||
498 | ||||
499 | for (MachineBasicBlock &MBB : *MF) { | |||
500 | for (MachineInstr &MI : MBB) { | |||
501 | if (MI.getOpcode() != AMDGPU::COPY) | |||
502 | continue; | |||
503 | ||||
504 | Register DstReg = MI.getOperand(0).getReg(); | |||
505 | Register SrcReg = MI.getOperand(1).getReg(); | |||
506 | if (!isVreg1(SrcReg)) | |||
507 | continue; | |||
508 | ||||
509 | if (isLaneMaskReg(DstReg) || isVreg1(DstReg)) | |||
510 | continue; | |||
511 | ||||
512 | // Copy into a 32-bit vector register. | |||
513 | LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI)do { } while (false); | |||
514 | DebugLoc DL = MI.getDebugLoc(); | |||
515 | ||||
516 | assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg))((void)0); | |||
517 | assert(!MI.getOperand(0).getSubReg())((void)0); | |||
518 | ||||
519 | ConstrainRegs.insert(SrcReg); | |||
520 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg) | |||
521 | .addImm(0) | |||
522 | .addImm(0) | |||
523 | .addImm(0) | |||
524 | .addImm(-1) | |||
525 | .addReg(SrcReg); | |||
526 | DeadCopies.push_back(&MI); | |||
527 | } | |||
528 | ||||
529 | for (MachineInstr *MI : DeadCopies) | |||
530 | MI->eraseFromParent(); | |||
531 | DeadCopies.clear(); | |||
532 | } | |||
533 | } | |||
534 | ||||
535 | void SILowerI1Copies::lowerPhis() { | |||
536 | MachineSSAUpdater SSAUpdater(*MF); | |||
537 | LoopFinder LF(*DT, *PDT); | |||
538 | PhiIncomingAnalysis PIA(*PDT); | |||
539 | SmallVector<MachineInstr *, 4> Vreg1Phis; | |||
540 | SmallVector<MachineBasicBlock *, 4> IncomingBlocks; | |||
541 | SmallVector<unsigned, 4> IncomingRegs; | |||
542 | SmallVector<unsigned, 4> IncomingUpdated; | |||
543 | #ifndef NDEBUG1 | |||
544 | DenseSet<unsigned> PhiRegisters; | |||
545 | #endif | |||
546 | ||||
547 | for (MachineBasicBlock &MBB : *MF) { | |||
548 | for (MachineInstr &MI : MBB.phis()) { | |||
549 | if (isVreg1(MI.getOperand(0).getReg())) | |||
550 | Vreg1Phis.push_back(&MI); | |||
551 | } | |||
552 | } | |||
553 | ||||
554 | MachineBasicBlock *PrevMBB = nullptr; | |||
555 | for (MachineInstr *MI : Vreg1Phis) { | |||
556 | MachineBasicBlock &MBB = *MI->getParent(); | |||
557 | if (&MBB != PrevMBB) { | |||
558 | LF.initialize(MBB); | |||
559 | PrevMBB = &MBB; | |||
560 | } | |||
561 | ||||
562 | LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI)do { } while (false); | |||
563 | ||||
564 | Register DstReg = MI->getOperand(0).getReg(); | |||
565 | MRI->setRegClass(DstReg, IsWave32
| |||
566 | : &AMDGPU::SReg_64RegClass); | |||
567 | ||||
568 | // Collect incoming values. | |||
569 | for (unsigned i = 1; i < MI->getNumOperands(); i += 2) { | |||
570 | assert(i + 1 < MI->getNumOperands())((void)0); | |||
571 | Register IncomingReg = MI->getOperand(i).getReg(); | |||
572 | MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB(); | |||
573 | MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg); | |||
574 | ||||
575 | if (IncomingDef->getOpcode() == AMDGPU::COPY) { | |||
576 | IncomingReg = IncomingDef->getOperand(1).getReg(); | |||
577 | assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg))((void)0); | |||
578 | assert(!IncomingDef->getOperand(1).getSubReg())((void)0); | |||
579 | } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) { | |||
580 | continue; | |||
581 | } else { | |||
582 | assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg))((void)0); | |||
583 | } | |||
584 | ||||
585 | IncomingBlocks.push_back(IncomingMBB); | |||
586 | IncomingRegs.push_back(IncomingReg); | |||
587 | } | |||
588 | ||||
589 | #ifndef NDEBUG1 | |||
590 | PhiRegisters.insert(DstReg); | |||
591 | #endif | |||
592 | ||||
593 | // Phis in a loop that are observed outside the loop receive a simple but | |||
594 | // conservatively correct treatment. | |||
595 | std::vector<MachineBasicBlock *> DomBlocks = {&MBB}; | |||
596 | for (MachineInstr &Use : MRI->use_instructions(DstReg)) | |||
597 | DomBlocks.push_back(Use.getParent()); | |||
598 | ||||
599 | MachineBasicBlock *PostDomBound = | |||
600 | PDT->findNearestCommonDominator(DomBlocks); | |||
601 | ||||
602 | // FIXME: This fails to find irreducible cycles. If we have a def (other | |||
603 | // than a constant) in a pair of blocks that end up looping back to each | |||
604 | // other, it will be mishandle. Due to structurization this shouldn't occur | |||
605 | // in practice. | |||
606 | unsigned FoundLoopLevel = LF.findLoop(PostDomBound); | |||
607 | ||||
608 | SSAUpdater.Initialize(DstReg); | |||
609 | ||||
610 | if (FoundLoopLevel
| |||
611 | LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks); | |||
612 | ||||
613 | for (unsigned i = 0; i < IncomingRegs.size(); ++i) { | |||
614 | IncomingUpdated.push_back(createLaneMaskReg(*MF)); | |||
615 | SSAUpdater.AddAvailableValue(IncomingBlocks[i], | |||
616 | IncomingUpdated.back()); | |||
617 | } | |||
618 | ||||
619 | for (unsigned i = 0; i < IncomingRegs.size(); ++i) { | |||
620 | MachineBasicBlock &IMBB = *IncomingBlocks[i]; | |||
621 | buildMergeLaneMasks( | |||
622 | IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i], | |||
623 | SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]); | |||
624 | } | |||
625 | } else { | |||
626 | // The phi is not observed from outside a loop. Use a more accurate | |||
627 | // lowering. | |||
628 | PIA.analyze(MBB, IncomingBlocks); | |||
| ||||
629 | ||||
630 | for (MachineBasicBlock *MBB : PIA.predecessors()) | |||
631 | SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB)); | |||
632 | ||||
633 | for (unsigned i = 0; i < IncomingRegs.size(); ++i) { | |||
634 | MachineBasicBlock &IMBB = *IncomingBlocks[i]; | |||
635 | if (PIA.isSource(IMBB)) { | |||
636 | IncomingUpdated.push_back(0); | |||
637 | SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]); | |||
638 | } else { | |||
639 | IncomingUpdated.push_back(createLaneMaskReg(*MF)); | |||
640 | SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back()); | |||
641 | } | |||
642 | } | |||
643 | ||||
644 | for (unsigned i = 0; i < IncomingRegs.size(); ++i) { | |||
645 | if (!IncomingUpdated[i]) | |||
646 | continue; | |||
647 | ||||
648 | MachineBasicBlock &IMBB = *IncomingBlocks[i]; | |||
649 | buildMergeLaneMasks( | |||
650 | IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i], | |||
651 | SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]); | |||
652 | } | |||
653 | } | |||
654 | ||||
655 | Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB); | |||
656 | if (NewReg != DstReg) { | |||
657 | MRI->replaceRegWith(NewReg, DstReg); | |||
658 | MI->eraseFromParent(); | |||
659 | } | |||
660 | ||||
661 | IncomingBlocks.clear(); | |||
662 | IncomingRegs.clear(); | |||
663 | IncomingUpdated.clear(); | |||
664 | } | |||
665 | } | |||
666 | ||||
667 | void SILowerI1Copies::lowerCopiesToI1() { | |||
668 | MachineSSAUpdater SSAUpdater(*MF); | |||
669 | LoopFinder LF(*DT, *PDT); | |||
670 | SmallVector<MachineInstr *, 4> DeadCopies; | |||
671 | ||||
672 | for (MachineBasicBlock &MBB : *MF) { | |||
673 | LF.initialize(MBB); | |||
674 | ||||
675 | for (MachineInstr &MI : MBB) { | |||
676 | if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF && | |||
677 | MI.getOpcode() != AMDGPU::COPY) | |||
678 | continue; | |||
679 | ||||
680 | Register DstReg = MI.getOperand(0).getReg(); | |||
681 | if (!isVreg1(DstReg)) | |||
682 | continue; | |||
683 | ||||
684 | if (MRI->use_empty(DstReg)) { | |||
685 | DeadCopies.push_back(&MI); | |||
686 | continue; | |||
687 | } | |||
688 | ||||
689 | LLVM_DEBUG(dbgs() << "Lower Other: " << MI)do { } while (false); | |||
690 | ||||
691 | MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass | |||
692 | : &AMDGPU::SReg_64RegClass); | |||
693 | if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) | |||
694 | continue; | |||
695 | ||||
696 | DebugLoc DL = MI.getDebugLoc(); | |||
697 | Register SrcReg = MI.getOperand(1).getReg(); | |||
698 | assert(!MI.getOperand(1).getSubReg())((void)0); | |||
699 | ||||
700 | if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) { | |||
701 | assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32)((void)0); | |||
702 | unsigned TmpReg = createLaneMaskReg(*MF); | |||
703 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg) | |||
704 | .addReg(SrcReg) | |||
705 | .addImm(0); | |||
706 | MI.getOperand(1).setReg(TmpReg); | |||
707 | SrcReg = TmpReg; | |||
708 | } | |||
709 | ||||
710 | // Defs in a loop that are observed outside the loop must be transformed | |||
711 | // into appropriate bit manipulation. | |||
712 | std::vector<MachineBasicBlock *> DomBlocks = {&MBB}; | |||
713 | for (MachineInstr &Use : MRI->use_instructions(DstReg)) | |||
714 | DomBlocks.push_back(Use.getParent()); | |||
715 | ||||
716 | MachineBasicBlock *PostDomBound = | |||
717 | PDT->findNearestCommonDominator(DomBlocks); | |||
718 | unsigned FoundLoopLevel = LF.findLoop(PostDomBound); | |||
719 | if (FoundLoopLevel) { | |||
720 | SSAUpdater.Initialize(DstReg); | |||
721 | SSAUpdater.AddAvailableValue(&MBB, DstReg); | |||
722 | LF.addLoopEntries(FoundLoopLevel, SSAUpdater); | |||
723 | ||||
724 | buildMergeLaneMasks(MBB, MI, DL, DstReg, | |||
725 | SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg); | |||
726 | DeadCopies.push_back(&MI); | |||
727 | } | |||
728 | } | |||
729 | ||||
730 | for (MachineInstr *MI : DeadCopies) | |||
731 | MI->eraseFromParent(); | |||
732 | DeadCopies.clear(); | |||
733 | } | |||
734 | } | |||
735 | ||||
736 | bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const { | |||
737 | const MachineInstr *MI; | |||
738 | for (;;) { | |||
739 | MI = MRI->getUniqueVRegDef(Reg); | |||
740 | if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF) | |||
741 | return true; | |||
742 | ||||
743 | if (MI->getOpcode() != AMDGPU::COPY) | |||
744 | break; | |||
745 | ||||
746 | Reg = MI->getOperand(1).getReg(); | |||
747 | if (!Reg.isVirtual()) | |||
748 | return false; | |||
749 | if (!isLaneMaskReg(Reg)) | |||
750 | return false; | |||
751 | } | |||
752 | ||||
753 | if (MI->getOpcode() != MovOp) | |||
754 | return false; | |||
755 | ||||
756 | if (!MI->getOperand(1).isImm()) | |||
757 | return false; | |||
758 | ||||
759 | int64_t Imm = MI->getOperand(1).getImm(); | |||
760 | if (Imm == 0) { | |||
761 | Val = false; | |||
762 | return true; | |||
763 | } | |||
764 | if (Imm == -1) { | |||
765 | Val = true; | |||
766 | return true; | |||
767 | } | |||
768 | ||||
769 | return false; | |||
770 | } | |||
771 | ||||
772 | static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) { | |||
773 | Def = false; | |||
774 | Use = false; | |||
775 | ||||
776 | for (const MachineOperand &MO : MI.operands()) { | |||
777 | if (MO.isReg() && MO.getReg() == AMDGPU::SCC) { | |||
778 | if (MO.isUse()) | |||
779 | Use = true; | |||
780 | else | |||
781 | Def = true; | |||
782 | } | |||
783 | } | |||
784 | } | |||
785 | ||||
786 | /// Return a point at the end of the given \p MBB to insert SALU instructions | |||
787 | /// for lane mask calculation. Take terminators and SCC into account. | |||
788 | MachineBasicBlock::iterator | |||
789 | SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const { | |||
790 | auto InsertionPt = MBB.getFirstTerminator(); | |||
791 | bool TerminatorsUseSCC = false; | |||
792 | for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) { | |||
793 | bool DefsSCC; | |||
794 | instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC); | |||
795 | if (TerminatorsUseSCC || DefsSCC) | |||
796 | break; | |||
797 | } | |||
798 | ||||
799 | if (!TerminatorsUseSCC) | |||
800 | return InsertionPt; | |||
801 | ||||
802 | while (InsertionPt != MBB.begin()) { | |||
803 | InsertionPt--; | |||
804 | ||||
805 | bool DefSCC, UseSCC; | |||
806 | instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC); | |||
807 | if (DefSCC) | |||
808 | return InsertionPt; | |||
809 | } | |||
810 | ||||
811 | // We should have at least seen an IMPLICIT_DEF or COPY | |||
812 | llvm_unreachable("SCC used by terminator but no def in block")__builtin_unreachable(); | |||
813 | } | |||
814 | ||||
/// Merge the lane mask produced in this block (\p CurReg) with the incoming
/// mask (\p PrevReg) into \p DstReg, inserting code before \p I.
///
/// The intended result is DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC):
/// lanes active in EXEC take the new value, inactive lanes keep the old one.
/// (This relies on S_ANDN2/S_ORN2 computing src0 & ~src1 / src0 | ~src1 —
/// standard AMDGPU SALU semantics.) When either input is a constant lane
/// mask (per isConstantLaneMask) the masking is simplified or elided.
void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I,
                                          const DebugLoc &DL, unsigned DstReg,
                                          unsigned PrevReg, unsigned CurReg) {
  // Pre-initialize: isConstantLaneMask leaves Val untouched for
  // IMPLICIT_DEF inputs.
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

  // Both sides constant: no per-lane masking is needed at all.
  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      // Identical constants; just forward the current value.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      // Prev = 0, Cur = 1: the result is exactly the active lanes, i.e. EXEC.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      // Prev = 1, Cur = 0: the result is the inactive lanes, i.e. ~EXEC.
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  // At least one side is non-constant. Mask each non-constant input
  // (Prev with ~EXEC, Cur with EXEC) unless the other side's constant value
  // makes the masking redundant. A register left at 0 below encodes "the
  // corresponding constant case was taken".
  unsigned PrevMaskedReg = 0;
  unsigned CurMaskedReg = 0;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      // Final step will be Or(Prev, EXEC), which already overrides all
      // active lanes — masking Prev with ~EXEC would be redundant.
      PrevMaskedReg = PrevReg;
    } else {
      // PrevMasked = Prev & ~EXEC: keep only the inactive lanes of Prev.
      PrevMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      // Final step will be OrN2(Cur, EXEC) = Cur | ~EXEC, which already
      // overrides all inactive lanes — masking Cur with EXEC is redundant.
      CurMaskedReg = CurReg;
    } else {
      // CurMasked = Cur & EXEC: keep only the active lanes of Cur.
      CurMaskedReg = createLaneMaskReg(*MF);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    // Prev contributes nothing: Dst = Cur & EXEC (already computed above,
    // since CurConstant must be false here).
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    // Cur contributes nothing: Dst = Prev & ~EXEC.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    // Prev is all-ones: Dst = Cur | ~EXEC.
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    // General case: Dst = (Prev & ~EXEC) | (Cur & EXEC). CurMaskedReg is 0
    // only when Cur is the all-ones constant, in which case Cur & EXEC is
    // simply EXEC.
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}