File: src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support/Alignment.h
Warning: line 85, column 47: The result of the left shift is undefined due to shifting by '255', which is greater or equal to the width of type 'uint64_t'
1 | //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// | ||||
2 | // | ||||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||||
4 | // See https://llvm.org/LICENSE.txt for license information. | ||||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||
6 | // | ||||
7 | //===----------------------------------------------------------------------===// | ||||
8 | // | ||||
9 | // This pass eliminates allocas by either converting them into vectors or | ||||
10 | // by migrating them to local address space. | ||||
11 | // | ||||
12 | //===----------------------------------------------------------------------===// | ||||
13 | |||||
14 | #include "AMDGPU.h" | ||||
15 | #include "GCNSubtarget.h" | ||||
16 | #include "llvm/Analysis/CaptureTracking.h" | ||||
17 | #include "llvm/Analysis/ValueTracking.h" | ||||
18 | #include "llvm/CodeGen/TargetPassConfig.h" | ||||
19 | #include "llvm/IR/IRBuilder.h" | ||||
20 | #include "llvm/IR/IntrinsicsAMDGPU.h" | ||||
21 | #include "llvm/IR/IntrinsicsR600.h" | ||||
22 | #include "llvm/Pass.h" | ||||
23 | #include "llvm/Target/TargetMachine.h" | ||||
24 | |||||
25 | #define DEBUG_TYPE"amdgpu-promote-alloca" "amdgpu-promote-alloca" | ||||
26 | |||||
27 | using namespace llvm; | ||||
28 | |||||
29 | namespace { | ||||
30 | |||||
31 | static cl::opt<bool> DisablePromoteAllocaToVector( | ||||
32 | "disable-promote-alloca-to-vector", | ||||
33 | cl::desc("Disable promote alloca to vector"), | ||||
34 | cl::init(false)); | ||||
35 | |||||
36 | static cl::opt<bool> DisablePromoteAllocaToLDS( | ||||
37 | "disable-promote-alloca-to-lds", | ||||
38 | cl::desc("Disable promote alloca to LDS"), | ||||
39 | cl::init(false)); | ||||
40 | |||||
41 | static cl::opt<unsigned> PromoteAllocaToVectorLimit( | ||||
42 | "amdgpu-promote-alloca-to-vector-limit", | ||||
43 | cl::desc("Maximum byte size to consider promote alloca to vector"), | ||||
44 | cl::init(0)); | ||||
45 | |||||
46 | // FIXME: This can create globals so should be a module pass. | ||||
47 | class AMDGPUPromoteAlloca : public FunctionPass { | ||||
48 | public: | ||||
49 | static char ID; | ||||
50 | |||||
51 | AMDGPUPromoteAlloca() : FunctionPass(ID) {} | ||||
52 | |||||
53 | bool runOnFunction(Function &F) override; | ||||
54 | |||||
55 | StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } | ||||
56 | |||||
57 | bool handleAlloca(AllocaInst &I, bool SufficientLDS); | ||||
58 | |||||
59 | void getAnalysisUsage(AnalysisUsage &AU) const override { | ||||
60 | AU.setPreservesCFG(); | ||||
61 | FunctionPass::getAnalysisUsage(AU); | ||||
62 | } | ||||
63 | }; | ||||
64 | |||||
65 | class AMDGPUPromoteAllocaImpl { | ||||
66 | private: | ||||
67 | const TargetMachine &TM; | ||||
68 | Module *Mod = nullptr; | ||||
69 | const DataLayout *DL = nullptr; | ||||
70 | |||||
71 | // FIXME: This should be per-kernel. | ||||
72 | uint32_t LocalMemLimit = 0; | ||||
73 | uint32_t CurrentLocalMemUsage = 0; | ||||
74 | unsigned MaxVGPRs; | ||||
75 | |||||
76 | bool IsAMDGCN = false; | ||||
77 | bool IsAMDHSA = false; | ||||
78 | |||||
79 | std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); | ||||
80 | Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); | ||||
81 | |||||
82 | /// BaseAlloca is the alloca root the search started from. | ||||
83 | /// Val may be that alloca or a recursive user of it. | ||||
84 | bool collectUsesWithPtrTypes(Value *BaseAlloca, | ||||
85 | Value *Val, | ||||
86 | std::vector<Value*> &WorkList) const; | ||||
87 | |||||
88 | /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand | ||||
89 | /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). | ||||
90 | /// Returns true if both operands are derived from the same alloca. Val should | ||||
91 | /// be the same value as one of the input operands of UseInst. | ||||
92 | bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, | ||||
93 | Instruction *UseInst, | ||||
94 | int OpIdx0, int OpIdx1) const; | ||||
95 | |||||
96 | /// Check whether we have enough local memory for promotion. | ||||
97 | bool hasSufficientLocalMem(const Function &F); | ||||
98 | |||||
99 | bool handleAlloca(AllocaInst &I, bool SufficientLDS); | ||||
100 | |||||
101 | public: | ||||
102 | AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {} | ||||
103 | bool run(Function &F); | ||||
104 | }; | ||||
105 | |||||
106 | class AMDGPUPromoteAllocaToVector : public FunctionPass { | ||||
107 | public: | ||||
108 | static char ID; | ||||
109 | |||||
110 | AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {} | ||||
111 | |||||
112 | bool runOnFunction(Function &F) override; | ||||
113 | |||||
114 | StringRef getPassName() const override { | ||||
115 | return "AMDGPU Promote Alloca to vector"; | ||||
116 | } | ||||
117 | |||||
118 | void getAnalysisUsage(AnalysisUsage &AU) const override { | ||||
119 | AU.setPreservesCFG(); | ||||
120 | FunctionPass::getAnalysisUsage(AU); | ||||
121 | } | ||||
122 | }; | ||||
123 | |||||
124 | } // end anonymous namespace | ||||
125 | |||||
126 | char AMDGPUPromoteAlloca::ID = 0; | ||||
127 | char AMDGPUPromoteAllocaToVector::ID = 0; | ||||
128 | |||||
129 | INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE, | ||||
130 | "AMDGPU promote alloca to vector or LDS", false, false) | ||||
131 | // Move LDS uses from functions to kernels before promote alloca for accurate | ||||
132 | // estimation of LDS available | ||||
133 | INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS) | ||||
134 | INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE, | ||||
135 | "AMDGPU promote alloca to vector or LDS", false, false) | ||||
136 | |||||
137 | INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector", | ||||
138 | "AMDGPU promote alloca to vector", false, false) | ||||
139 | |||||
140 | char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; | ||||
141 | char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID; | ||||
142 | |||||
143 | bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { | ||||
144 | if (skipFunction(F)) | ||||
145 | return false; | ||||
146 | |||||
147 | if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { | ||||
148 | return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F); | ||||
149 | } | ||||
150 | return false; | ||||
151 | } | ||||
152 | |||||
153 | PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F, | ||||
154 | FunctionAnalysisManager &AM) { | ||||
155 | bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F); | ||||
| |||||
156 | if (Changed) { | ||||
157 | PreservedAnalyses PA; | ||||
158 | PA.preserveSet<CFGAnalyses>(); | ||||
159 | return PA; | ||||
160 | } | ||||
161 | return PreservedAnalyses::all(); | ||||
162 | } | ||||
163 | |||||
164 | bool AMDGPUPromoteAllocaImpl::run(Function &F) { | ||||
165 | Mod = F.getParent(); | ||||
166 | DL = &Mod->getDataLayout(); | ||||
167 | |||||
168 | const Triple &TT = TM.getTargetTriple(); | ||||
169 | IsAMDGCN = TT.getArch() == Triple::amdgcn; | ||||
170 | IsAMDHSA = TT.getOS() == Triple::AMDHSA; | ||||
171 | |||||
172 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); | ||||
173 | if (!ST.isPromoteAllocaEnabled()) | ||||
174 | return false; | ||||
175 | |||||
176 | if (IsAMDGCN) { | ||||
177 | const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); | ||||
178 | MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); | ||||
179 | } else { | ||||
180 | MaxVGPRs = 128; | ||||
181 | } | ||||
182 | |||||
183 | bool SufficientLDS = hasSufficientLocalMem(F); | ||||
184 | bool Changed = false; | ||||
185 | BasicBlock &EntryBB = *F.begin(); | ||||
186 | |||||
187 | SmallVector<AllocaInst *, 16> Allocas; | ||||
188 | for (Instruction &I : EntryBB) { | ||||
189 | if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) | ||||
190 | Allocas.push_back(AI); | ||||
191 | } | ||||
192 | |||||
193 | for (AllocaInst *AI : Allocas) { | ||||
194 | if (handleAlloca(*AI, SufficientLDS)) | ||||
195 | Changed = true; | ||||
196 | } | ||||
197 | |||||
198 | return Changed; | ||||
199 | } | ||||
200 | |||||
201 | std::pair<Value *, Value *> | ||||
202 | AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { | ||||
203 | const Function &F = *Builder.GetInsertBlock()->getParent(); | ||||
204 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); | ||||
205 | |||||
206 | if (!IsAMDHSA) { | ||||
207 | Function *LocalSizeYFn | ||||
208 | = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); | ||||
209 | Function *LocalSizeZFn | ||||
210 | = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z); | ||||
211 | |||||
212 | CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); | ||||
213 | CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); | ||||
214 | |||||
215 | ST.makeLIDRangeMetadata(LocalSizeY); | ||||
216 | ST.makeLIDRangeMetadata(LocalSizeZ); | ||||
217 | |||||
218 | return std::make_pair(LocalSizeY, LocalSizeZ); | ||||
219 | } | ||||
220 | |||||
221 | // We must read the size out of the dispatch pointer. | ||||
222 | assert(IsAMDGCN); | ||||
223 | |||||
224 | // We are indexing into this struct, and want to extract the workgroup_size_* | ||||
225 | // fields. | ||||
226 | // | ||||
227 | // typedef struct hsa_kernel_dispatch_packet_s { | ||||
228 | // uint16_t header; | ||||
229 | // uint16_t setup; | ||||
230 | // uint16_t workgroup_size_x; | ||||
231 | // uint16_t workgroup_size_y; | ||||
232 | // uint16_t workgroup_size_z; | ||||
233 | // uint16_t reserved0; | ||||
234 | // uint32_t grid_size_x; | ||||
235 | // uint32_t grid_size_y; | ||||
236 | // uint32_t grid_size_z; | ||||
237 | // | ||||
238 | // uint32_t private_segment_size; | ||||
239 | // uint32_t group_segment_size; | ||||
240 | // uint64_t kernel_object; | ||||
241 | // | ||||
242 | // #ifdef HSA_LARGE_MODEL | ||||
243 | // void *kernarg_address; | ||||
244 | // #elif defined HSA_LITTLE_ENDIAN | ||||
245 | // void *kernarg_address; | ||||
246 | // uint32_t reserved1; | ||||
247 | // #else | ||||
248 | // uint32_t reserved1; | ||||
249 | // void *kernarg_address; | ||||
250 | // #endif | ||||
251 | // uint64_t reserved2; | ||||
252 | // hsa_signal_t completion_signal; // uint64_t wrapper | ||||
253 | // } hsa_kernel_dispatch_packet_t | ||||
254 | // | ||||
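  | // Byte offsets used below (derived from the layout above): header 0-1, | ||||
  | // setup 2-3, workgroup_size_x 4-5, _y 6-7, _z 8-9, reserved0 10-11. The | ||||
  | // i32 load at byte offset 4 ("LoadXY") therefore carries x in its low and | ||||
  | // y in its high 16 bits, while the load at offset 8 ("LoadZU") carries z | ||||
  | // in its low 16 bits under the zeroed reserved0 field, which is why y is | ||||
  | // extracted with a lshr by 16 and z is usable as loaded. | ||||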
255 | Function *DispatchPtrFn | ||||
256 | = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); | ||||
257 | |||||
258 | CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); | ||||
259 | DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); | ||||
260 | DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); | ||||
261 | |||||
262 | // Size of the dispatch packet struct. | ||||
263 | DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64); | ||||
264 | |||||
265 | Type *I32Ty = Type::getInt32Ty(Mod->getContext()); | ||||
266 | Value *CastDispatchPtr = Builder.CreateBitCast( | ||||
267 | DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); | ||||
268 | |||||
269 | // We could do a single 64-bit load here, but it's likely that the basic | ||||
270 | // 32-bit and extract sequence is already present, and it is probably easier | ||||
271 | // to CSE this. The loads should be mergeable later anyway. | ||||
272 | Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1); | ||||
273 | LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4)); | ||||
274 | |||||
275 | Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2); | ||||
276 | LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4)); | ||||
277 | |||||
278 | MDNode *MD = MDNode::get(Mod->getContext(), None); | ||||
279 | LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); | ||||
280 | LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); | ||||
281 | ST.makeLIDRangeMetadata(LoadZU); | ||||
282 | |||||
283 | // Extract y component. Upper half of LoadZU should be zero already. | ||||
284 | Value *Y = Builder.CreateLShr(LoadXY, 16); | ||||
285 | |||||
286 | return std::make_pair(Y, LoadZU); | ||||
287 | } | ||||
288 | |||||
289 | Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, | ||||
290 | unsigned N) { | ||||
291 | const AMDGPUSubtarget &ST = | ||||
292 | AMDGPUSubtarget::get(TM, *Builder.GetInsertBlock()->getParent()); | ||||
293 | Intrinsic::ID IntrID = Intrinsic::not_intrinsic; | ||||
294 | |||||
295 | switch (N) { | ||||
296 | case 0: | ||||
297 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x | ||||
298 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_x; | ||||
299 | break; | ||||
300 | case 1: | ||||
301 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y | ||||
302 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_y; | ||||
303 | break; | ||||
304 | |||||
305 | case 2: | ||||
306 | IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z | ||||
307 | : (Intrinsic::ID)Intrinsic::r600_read_tidig_z; | ||||
308 | break; | ||||
309 | default: | ||||
310 | llvm_unreachable("invalid dimension")__builtin_unreachable(); | ||||
311 | } | ||||
312 | |||||
313 | Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); | ||||
314 | CallInst *CI = Builder.CreateCall(WorkitemIdFn); | ||||
315 | ST.makeLIDRangeMetadata(CI); | ||||
316 | |||||
317 | return CI; | ||||
318 | } | ||||
319 | |||||
320 | static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) { | ||||
321 | return FixedVectorType::get(ArrayTy->getElementType(), | ||||
322 | ArrayTy->getNumElements()); | ||||
323 | } | ||||
324 | |||||
325 | static Value *stripBitcasts(Value *V) { | ||||
326 | while (Instruction *I = dyn_cast<Instruction>(V)) { | ||||
327 | if (I->getOpcode() != Instruction::BitCast) | ||||
328 | break; | ||||
329 | V = I->getOperand(0); | ||||
330 | } | ||||
331 | return V; | ||||
332 | } | ||||
333 | |||||
334 | static Value * | ||||
335 | calculateVectorIndex(Value *Ptr, | ||||
336 | const std::map<GetElementPtrInst *, Value *> &GEPIdx) { | ||||
337 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr)); | ||||
338 | if (!GEP) | ||||
339 | return nullptr; | ||||
340 | |||||
341 | auto I = GEPIdx.find(GEP); | ||||
342 | return I == GEPIdx.end() ? nullptr : I->second; | ||||
343 | } | ||||
344 | |||||
345 | static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { | ||||
346 | // FIXME we only support simple cases | ||||
347 | if (GEP->getNumOperands() != 3) | ||||
348 | return nullptr; | ||||
349 | |||||
350 | ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1)); | ||||
351 | if (!I0 || !I0->isZero()) | ||||
352 | return nullptr; | ||||
353 | |||||
354 | return GEP->getOperand(2); | ||||
355 | } | ||||
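  | // Illustrative shape of a GEP accepted above (assumed IR, not from the | ||||
  | // original source): | ||||
  | //   %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %a, | ||||
  | //                                 i32 0, i32 %idx | ||||
  | // Operand 1 must be the constant zero selecting the array itself; operand | ||||
  | // 2 (%idx) becomes the vector lane index. | ||||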
356 | |||||
357 | // Returns true only for the instructions handled below, which we know how | ||||
358 | // to rewrite when turning the alloca into a vector. | ||||
359 | // TODO: Check isTriviallyVectorizable for calls and handle other | ||||
360 | // instructions. | ||||
361 | static bool canVectorizeInst(Instruction *Inst, User *User, | ||||
362 | const DataLayout &DL) { | ||||
363 | switch (Inst->getOpcode()) { | ||||
364 | case Instruction::Load: { | ||||
365 | // Currently we only handle the case where the pointer operand is a GEP. | ||||
366 | // Also, we cannot vectorize volatile or atomic loads. | ||||
367 | LoadInst *LI = cast<LoadInst>(Inst); | ||||
368 | if (isa<AllocaInst>(User) && | ||||
369 | LI->getPointerOperandType() == User->getType() && | ||||
370 | isa<VectorType>(LI->getType())) | ||||
371 | return true; | ||||
372 | |||||
373 | Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand()); | ||||
374 | if (!PtrInst) | ||||
375 | return false; | ||||
376 | |||||
377 | return (PtrInst->getOpcode() == Instruction::GetElementPtr || | ||||
378 | PtrInst->getOpcode() == Instruction::BitCast) && | ||||
379 | LI->isSimple(); | ||||
380 | } | ||||
381 | case Instruction::BitCast: | ||||
382 | return true; | ||||
383 | case Instruction::Store: { | ||||
384 | // Must be the stored pointer operand, not a stored value; since the IR | ||||
385 | // should be in canonical form, the user should be a GEP. | ||||
386 | // Also, we cannot vectorize volatile or atomic stores. | ||||
387 | StoreInst *SI = cast<StoreInst>(Inst); | ||||
388 | if (isa<AllocaInst>(User) && | ||||
389 | SI->getPointerOperandType() == User->getType() && | ||||
390 | isa<VectorType>(SI->getValueOperand()->getType())) | ||||
391 | return true; | ||||
392 | |||||
393 | Instruction *UserInst = dyn_cast<Instruction>(User); | ||||
394 | if (!UserInst) | ||||
395 | return false; | ||||
396 | |||||
397 | return (SI->getPointerOperand() == User) && | ||||
398 | (UserInst->getOpcode() == Instruction::GetElementPtr || | ||||
399 | UserInst->getOpcode() == Instruction::BitCast) && | ||||
400 | SI->isSimple(); | ||||
401 | } | ||||
402 | default: | ||||
403 | return false; | ||||
404 | } | ||||
405 | } | ||||
406 | |||||
407 | static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, | ||||
408 | unsigned MaxVGPRs) { | ||||
409 | |||||
410 | if (DisablePromoteAllocaToVector) { | ||||
411 | LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n"); | ||||
412 | return false; | ||||
413 | } | ||||
414 | |||||
415 | Type *AllocaTy = Alloca->getAllocatedType(); | ||||
416 | auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy); | ||||
417 | if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) { | ||||
418 | if (VectorType::isValidElementType(ArrayTy->getElementType()) && | ||||
419 | ArrayTy->getNumElements() > 0) | ||||
420 | VectorTy = arrayTypeToVecType(ArrayTy); | ||||
421 | } | ||||
422 | |||||
423 | // Use up to 1/4 of available register budget for vectorization. | ||||
424 | unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8 | ||||
425 | : (MaxVGPRs * 32); | ||||
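  | // Illustrative arithmetic (assumed values, not from the original source): | ||||
  | // Limit is in bits. Each VGPR holds 32 bits per lane, so MaxVGPRs * 32 is | ||||
  | // the full per-lane register budget; the option value is in bytes, hence | ||||
  | // the * 8. The comparison below tests size * 4 > Limit, i.e. an alloca may | ||||
  | // use at most 1/4 of the budget: with MaxVGPRs = 256, Limit = 8192 bits, | ||||
  | // so allocas larger than 256 bytes (64 VGPRs) are rejected. | ||||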
426 | |||||
427 | if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) { | ||||
428 | LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " | ||||
429 | << MaxVGPRs << " registers available\n"); | ||||
430 | return false; | ||||
431 | } | ||||
432 | |||||
433 | LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n"); | ||||
434 | |||||
435 | // FIXME: There is no reason why we can't support larger arrays, we | ||||
436 | // are just being conservative for now. | ||||
437 | // FIXME: We also reject allocas of the form [2 x [2 x i32]] or equivalent. | ||||
438 | // Potentially these could also be promoted, but we don't currently handle this case. | ||||
439 | if (!VectorTy || VectorTy->getNumElements() > 16 || | ||||
440 | VectorTy->getNumElements() < 2) { | ||||
441 | LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); | ||||
442 | return false; | ||||
443 | } | ||||
444 | |||||
445 | std::map<GetElementPtrInst*, Value*> GEPVectorIdx; | ||||
446 | std::vector<Value *> WorkList; | ||||
447 | SmallVector<User *, 8> Users(Alloca->users()); | ||||
448 | SmallVector<User *, 8> UseUsers(Users.size(), Alloca); | ||||
449 | Type *VecEltTy = VectorTy->getElementType(); | ||||
450 | while (!Users.empty()) { | ||||
451 | User *AllocaUser = Users.pop_back_val(); | ||||
452 | User *UseUser = UseUsers.pop_back_val(); | ||||
453 | Instruction *Inst = dyn_cast<Instruction>(AllocaUser); | ||||
454 | |||||
455 | GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); | ||||
456 | if (!GEP) { | ||||
457 | if (!canVectorizeInst(Inst, UseUser, DL)) | ||||
458 | return false; | ||||
459 | |||||
460 | if (Inst->getOpcode() == Instruction::BitCast) { | ||||
461 | Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType(); | ||||
462 | Type *ToTy = Inst->getType()->getPointerElementType(); | ||||
463 | if (FromTy->isAggregateType() || ToTy->isAggregateType() || | ||||
464 | DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy)) | ||||
465 | continue; | ||||
466 | |||||
467 | for (User *CastUser : Inst->users()) { | ||||
468 | if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser))) | ||||
469 | continue; | ||||
470 | Users.push_back(CastUser); | ||||
471 | UseUsers.push_back(Inst); | ||||
472 | } | ||||
473 | |||||
474 | continue; | ||||
475 | } | ||||
476 | |||||
477 | WorkList.push_back(AllocaUser); | ||||
478 | continue; | ||||
479 | } | ||||
480 | |||||
481 | Value *Index = GEPToVectorIndex(GEP); | ||||
482 | |||||
483 | // If we can't compute a vector index from this GEP, then we can't | ||||
484 | // promote this alloca to vector. | ||||
485 | if (!Index) { | ||||
486 | LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP | ||||
487 | << '\n'); | ||||
488 | return false; | ||||
489 | } | ||||
490 | |||||
491 | GEPVectorIdx[GEP] = Index; | ||||
492 | Users.append(GEP->user_begin(), GEP->user_end()); | ||||
493 | UseUsers.append(GEP->getNumUses(), GEP); | ||||
494 | } | ||||
495 | |||||
496 | LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " | ||||
497 | << *VectorTy << '\n'); | ||||
498 | |||||
499 | for (Value *V : WorkList) { | ||||
500 | Instruction *Inst = cast<Instruction>(V); | ||||
501 | IRBuilder<> Builder(Inst); | ||||
502 | switch (Inst->getOpcode()) { | ||||
503 | case Instruction::Load: { | ||||
504 | if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy()) | ||||
505 | break; | ||||
506 | |||||
507 | Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand(); | ||||
508 | Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); | ||||
509 | if (!Index) | ||||
510 | break; | ||||
511 | |||||
512 | Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); | ||||
513 | Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); | ||||
514 | Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); | ||||
515 | Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); | ||||
516 | if (Inst->getType() != VecEltTy) | ||||
517 | ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType()); | ||||
518 | Inst->replaceAllUsesWith(ExtractElement); | ||||
519 | Inst->eraseFromParent(); | ||||
520 | break; | ||||
521 | } | ||||
522 | case Instruction::Store: { | ||||
523 | StoreInst *SI = cast<StoreInst>(Inst); | ||||
524 | if (SI->getValueOperand()->getType() == AllocaTy || | ||||
525 | SI->getValueOperand()->getType()->isVectorTy()) | ||||
526 | break; | ||||
527 | |||||
528 | Value *Ptr = SI->getPointerOperand(); | ||||
529 | Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); | ||||
530 | if (!Index) | ||||
531 | break; | ||||
532 | |||||
533 | Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); | ||||
534 | Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); | ||||
535 | Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); | ||||
536 | Value *Elt = SI->getValueOperand(); | ||||
537 | if (Elt->getType() != VecEltTy) | ||||
538 | Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy); | ||||
539 | Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index); | ||||
540 | Builder.CreateStore(NewVecValue, BitCast); | ||||
541 | Inst->eraseFromParent(); | ||||
542 | break; | ||||
543 | } | ||||
544 | |||||
545 | default: | ||||
546 | llvm_unreachable("Inconsistency in instructions promotable to vector")__builtin_unreachable(); | ||||
547 | } | ||||
548 | } | ||||
549 | return true; | ||||
550 | } | ||||
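  | // Illustrative before/after for the rewrite above (assumed IR, not from | ||||
  | // the original source), for a [4 x i32] alloca in the private address | ||||
  | // space: | ||||
  | //   %v = load i32, i32 addrspace(5)* %gep | ||||
  | // becomes | ||||
  | //   %cast = bitcast [4 x i32] addrspace(5)* %a to <4 x i32> addrspace(5)* | ||||
  | //   %vec  = load <4 x i32>, <4 x i32> addrspace(5)* %cast | ||||
  | //   %v    = extractelement <4 x i32> %vec, i32 %idx | ||||
  | // A store is rewritten as a whole-vector load, insertelement, and a | ||||
  | // whole-vector store. | ||||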
551 | |||||
552 | static bool isCallPromotable(CallInst *CI) { | ||||
553 | IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); | ||||
554 | if (!II) | ||||
555 | return false; | ||||
556 | |||||
557 | switch (II->getIntrinsicID()) { | ||||
558 | case Intrinsic::memcpy: | ||||
559 | case Intrinsic::memmove: | ||||
560 | case Intrinsic::memset: | ||||
561 | case Intrinsic::lifetime_start: | ||||
562 | case Intrinsic::lifetime_end: | ||||
563 | case Intrinsic::invariant_start: | ||||
564 | case Intrinsic::invariant_end: | ||||
565 | case Intrinsic::launder_invariant_group: | ||||
566 | case Intrinsic::strip_invariant_group: | ||||
567 | case Intrinsic::objectsize: | ||||
568 | return true; | ||||
569 | default: | ||||
570 | return false; | ||||
571 | } | ||||
572 | } | ||||
573 | |||||
574 | bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( | ||||
575 | Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0, | ||||
576 | int OpIdx1) const { | ||||
577 | // Figure out which operand is the one we might not be promoting. | ||||
578 | Value *OtherOp = Inst->getOperand(OpIdx0); | ||||
579 | if (Val == OtherOp) | ||||
580 | OtherOp = Inst->getOperand(OpIdx1); | ||||
581 | |||||
582 | if (isa<ConstantPointerNull>(OtherOp)) | ||||
583 | return true; | ||||
584 | |||||
585 | Value *OtherObj = getUnderlyingObject(OtherOp); | ||||
586 | if (!isa<AllocaInst>(OtherObj)) | ||||
587 | return false; | ||||
588 | |||||
589 | // TODO: We should be able to replace undefs with the right pointer type. | ||||
590 | |||||
591 | // TODO: If we know the other base object is another promotable | ||||
592 | // alloca, not necessarily this alloca, we can do this. The | ||||
593 | // important part is both must have the same address space at | ||||
594 | // the end. | ||||
595 | if (OtherObj != BaseAlloca) { | ||||
596 | LLVM_DEBUG( | ||||
597 | dbgs() << "Found a binary instruction with another alloca object\n"); | ||||
598 | return false; | ||||
599 | } | ||||
600 | |||||
601 | return true; | ||||
602 | } | ||||
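  | // Illustrative cases for the check above (assumed IR, not from the | ||||
  | // original source): | ||||
  | //   select i1 %c, i32 addrspace(5)* %a.gep, i32 addrspace(5)* null   ; OK: | ||||
  | //     null operands are rewritten to the new address space later | ||||
  | //   select i1 %c, i32 addrspace(5)* %a.gep, i32 addrspace(5)* %b.gep ; rejected: | ||||
  | //     %b is a different object and may not end up in the same address space | ||||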
603 | |||||
604 | bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( | ||||
605 | Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const { | ||||
606 | |||||
607 | for (User *User : Val->users()) { | ||||
608 | if (is_contained(WorkList, User)) | ||||
609 | continue; | ||||
610 | |||||
611 | if (CallInst *CI = dyn_cast<CallInst>(User)) { | ||||
612 | if (!isCallPromotable(CI)) | ||||
613 | return false; | ||||
614 | |||||
615 | WorkList.push_back(User); | ||||
616 | continue; | ||||
617 | } | ||||
618 | |||||
619 | Instruction *UseInst = cast<Instruction>(User); | ||||
620 | if (UseInst->getOpcode() == Instruction::PtrToInt) | ||||
621 | return false; | ||||
622 | |||||
623 | if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) { | ||||
624 | if (LI->isVolatile()) | ||||
625 | return false; | ||||
626 | |||||
627 | continue; | ||||
628 | } | ||||
629 | |||||
630 | if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) { | ||||
631 | if (SI->isVolatile()) | ||||
632 | return false; | ||||
633 | |||||
634 | // Reject stores where Val is the stored value rather than the pointer operand. | ||||
635 | if (SI->getPointerOperand() != Val) | ||||
636 | return false; | ||||
637 | } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) { | ||||
638 | if (RMW->isVolatile()) | ||||
639 | return false; | ||||
640 | } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) { | ||||
641 | if (CAS->isVolatile()) | ||||
642 | return false; | ||||
643 | } | ||||
644 | |||||
645 | // Only promote a select if we know that the other select operand | ||||
646 | // is from another pointer that will also be promoted. | ||||
647 | if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { | ||||
648 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1)) | ||||
649 | return false; | ||||
650 | |||||
651 | // May need to rewrite constant operands. | ||||
652 | WorkList.push_back(ICmp); | ||||
653 | } | ||||
654 | |||||
655 | if (UseInst->getOpcode() == Instruction::AddrSpaceCast) { | ||||
656 | // Give up if the pointer may be captured. | ||||
657 | if (PointerMayBeCaptured(UseInst, true, true)) | ||||
658 | return false; | ||||
659 | // Don't collect the users of this. | ||||
660 | WorkList.push_back(User); | ||||
661 | continue; | ||||
662 | } | ||||
663 | |||||
664 | // Do not promote vector/aggregate type instructions. It is hard to track | ||||
665 | // their users. | ||||
666 | if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User)) | ||||
667 | return false; | ||||
668 | |||||
669 | if (!User->getType()->isPointerTy()) | ||||
670 | continue; | ||||
671 | |||||
672 | if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) { | ||||
673 | // Be conservative if an address could be computed outside the bounds of | ||||
674 | // the alloca. | ||||
675 | if (!GEP->isInBounds()) | ||||
676 | return false; | ||||
677 | } | ||||
678 | |||||
679 | // Only promote a select if we know that the other select operand is from | ||||
680 | // another pointer that will also be promoted. | ||||
681 | if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) { | ||||
682 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) | ||||
683 | return false; | ||||
684 | } | ||||
685 | |||||
686 | // Repeat for phis. | ||||
687 | if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) { | ||||
688 | // TODO: Handle more complex cases. We should be able to replace loops | ||||
689 | // over arrays. | ||||
690 | switch (Phi->getNumIncomingValues()) { | ||||
691 | case 1: | ||||
692 | break; | ||||
693 | case 2: | ||||
694 | if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1)) | ||||
695 | return false; | ||||
696 | break; | ||||
697 | default: | ||||
698 | return false; | ||||
699 | } | ||||
700 | } | ||||
701 | |||||
702 | WorkList.push_back(User); | ||||
703 | if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList)) | ||||
704 | return false; | ||||
705 | } | ||||
706 | |||||
707 | return true; | ||||
708 | } | ||||
709 | |||||
710 | bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { | ||||
711 | |||||
712 | FunctionType *FTy = F.getFunctionType(); | ||||
713 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); | ||||
714 | |||||
715 | // If the function has any arguments in the local address space, then it's | ||||
716 | // possible these arguments require the entire local memory space, so | ||||
717 | // we cannot use local memory in the pass. | ||||
718 | for (Type *ParamTy : FTy->params()) { | ||||
719 | PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); | ||||
720 | if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { | ||||
721 | LocalMemLimit = 0; | ||||
722 | LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to " | ||||
723 | "local memory disabled.\n"); | ||||
724 | return false; | ||||
725 | } | ||||
726 | } | ||||
727 | |||||
728 | LocalMemLimit = ST.getLocalMemorySize(); | ||||
729 | if (LocalMemLimit == 0) | ||||
730 | return false; | ||||
731 | |||||
732 | SmallVector<const Constant *, 16> Stack; | ||||
733 | SmallPtrSet<const Constant *, 8> VisitedConstants; | ||||
734 | SmallPtrSet<const GlobalVariable *, 8> UsedLDS; | ||||
735 | |||||
736 | auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool { | ||||
737 | for (const User *U : Val->users()) { | ||||
738 | if (const Instruction *Use = dyn_cast<Instruction>(U)) { | ||||
739 | if (Use->getParent()->getParent() == &F) | ||||
740 | return true; | ||||
741 | } else { | ||||
742 | const Constant *C = cast<Constant>(U); | ||||
743 | if (VisitedConstants.insert(C).second) | ||||
744 | Stack.push_back(C); | ||||
745 | } | ||||
746 | } | ||||
747 | |||||
748 | return false; | ||||
749 | }; | ||||
750 | |||||
751 | for (GlobalVariable &GV : Mod->globals()) { | ||||
752 | if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) | ||||
753 | continue; | ||||
754 | |||||
755 | if (visitUsers(&GV, &GV)) { | ||||
756 | UsedLDS.insert(&GV); | ||||
757 | Stack.clear(); | ||||
758 | continue; | ||||
759 | } | ||||
760 | |||||
761 | // For any ConstantExpr uses, we need to recursively search the users until | ||||
762 | // we see a function. | ||||
763 | while (!Stack.empty()) { | ||||
764 | const Constant *C = Stack.pop_back_val(); | ||||
765 | if (visitUsers(&GV, C)) { | ||||
766 | UsedLDS.insert(&GV); | ||||
767 | Stack.clear(); | ||||
768 | break; | ||||
769 | } | ||||
770 | } | ||||
771 | } | ||||
772 | |||||
773 | const DataLayout &DL = Mod->getDataLayout(); | ||||
774 | SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes; | ||||
775 | AllocatedSizes.reserve(UsedLDS.size()); | ||||
776 | |||||
777 | for (const GlobalVariable *GV : UsedLDS) { | ||||
778 | Align Alignment = | ||||
779 | DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()); | ||||
780 | uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType()); | ||||
781 | AllocatedSizes.emplace_back(AllocSize, Alignment); | ||||
782 | } | ||||
783 | |||||
784 | // Sort to try to estimate the worst case alignment padding | ||||
785 | // | ||||
786 | // FIXME: We should really do something to fix the addresses to a more optimal | ||||
787 | // value instead | ||||
788 | llvm::sort(AllocatedSizes, [](std::pair<uint64_t, Align> LHS, | ||||
789 | std::pair<uint64_t, Align> RHS) { | ||||
790 | return LHS.second < RHS.second; | ||||
791 | }); | ||||
792 | |||||
793 | // Check how much local memory is being used by global objects | ||||
794 | CurrentLocalMemUsage = 0; | ||||
795 | |||||
796 | // FIXME: Try to account for padding here. The real padding and address is | ||||
797 | // currently determined from the inverse order of uses in the function when | ||||
798 | // legalizing, which could also potentially change. We try to estimate the | ||||
799 | // worst case here, but we probably should fix the addresses earlier. | ||||
800 | for (auto Alloc : AllocatedSizes) { | ||||
801 | CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second); | ||||
802 | CurrentLocalMemUsage += Alloc.first; | ||||
803 | } | ||||
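  | // Illustrative estimate (assumed sizes, not from the original source): | ||||
  | // for allocations of (4 bytes, align 4) and (32 bytes, align 16), visited | ||||
  | // in increasing alignment order, the first occupies bytes 0..4, padding | ||||
  | // advances to 16, and the second occupies 16..48, so CurrentLocalMemUsage | ||||
  | // becomes 48. | ||||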
804 | |||||
805 | unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, | ||||
806 | F); | ||||
807 | |||||
808 | // Restrict local memory usage so that we don't drastically reduce occupancy, | ||||
809 | // unless it is already significantly reduced. | ||||
810 | |||||
811 | // TODO: Have some sort of hint or other heuristics to guess occupancy based | ||||
812 | // on other factors. | ||||
813 | unsigned OccupancyHint = ST.getWavesPerEU(F).second; | ||||
814 | if (OccupancyHint == 0) | ||||
815 | OccupancyHint = 7; | ||||
816 | |||||
817 | // Clamp to max value. | ||||
818 | OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); | ||||
819 | |||||
820 | // Check the hint but ignore it if it's obviously wrong from the existing LDS | ||||
821 | // usage. | ||||
822 | MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); | ||||
823 | |||||
824 | |||||
825 | // Round up to the next tier of usage. | ||||
826 | unsigned MaxSizeWithWaveCount | ||||
827 | = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); | ||||
828 | |||||
829 | // Program is possibly broken by using more local mem than available. | ||||
830 | if (CurrentLocalMemUsage > MaxSizeWithWaveCount) | ||||
831 | return false; | ||||
832 | |||||
833 | LocalMemLimit = MaxSizeWithWaveCount; | ||||
834 | |||||
835 | LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsagedo { } while (false) | ||||
836 | << " bytes of LDS\n"do { } while (false) | ||||
837 | << " Rounding size to " << MaxSizeWithWaveCountdo { } while (false) | ||||
838 | << " with a maximum occupancy of " << MaxOccupancy << '\n'do { } while (false) | ||||
839 | << " and " << (LocalMemLimit - CurrentLocalMemUsage)do { } while (false) | ||||
840 | << " available for promotion\n")do { } while (false); | ||||
841 | |||||
842 | return true; | ||||
843 | } | ||||
844 | |||||
845 | // FIXME: Should try to pick the most likely to be profitable allocas first. | ||||
846 | bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { | ||||
847 | // Array allocations are probably not worth handling, since an allocation of | ||||
848 | // the array type is the canonical form. | ||||
849 | if (!I.isStaticAlloca() || I.isArrayAllocation()) | ||||
850 | return false; | ||||
851 | |||||
852 | const DataLayout &DL = Mod->getDataLayout(); | ||||
853 | IRBuilder<> Builder(&I); | ||||
854 | |||||
855 | // First try to replace the alloca with a vector | ||||
856 | Type *AllocaTy = I.getAllocatedType(); | ||||
857 | |||||
858 | LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); | ||||
859 | |||||
860 | if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs)) | ||||
861 | return true; // Promoted to vector. | ||||
862 | |||||
863 | if (DisablePromoteAllocaToLDS) | ||||
864 | return false; | ||||
865 | |||||
866 | const Function &ContainingFunction = *I.getParent()->getParent(); | ||||
867 | CallingConv::ID CC = ContainingFunction.getCallingConv(); | ||||
868 | |||||
869 | // Don't promote the alloca to LDS for shader calling conventions as the work | ||||
870 | // item ID intrinsics are not supported for these calling conventions. | ||||
871 | // Furthermore not all LDS is available for some of the stages. | ||||
872 | switch (CC) { | ||||
873 | case CallingConv::AMDGPU_KERNEL: | ||||
874 | case CallingConv::SPIR_KERNEL: | ||||
875 | break; | ||||
876 | default: | ||||
877 | LLVM_DEBUG( | ||||
878 | dbgs() | ||||
879 | << " promote alloca to LDS not supported with calling convention.\n"); | ||||
880 | return false; | ||||
881 | } | ||||
882 | |||||
883 | // Not likely to have sufficient local memory for promotion. | ||||
884 | if (!SufficientLDS) | ||||
885 | return false; | ||||
886 | |||||
887 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction); | ||||
888 | unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; | ||||
889 | |||||
890 | Align Alignment = | ||||
891 | DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType()); | ||||
892 | |||||
893 | // FIXME: This computed padding is likely wrong since it depends on inverse | ||||
894 | // usage order. | ||||
895 | // | ||||
896 | // FIXME: It is also possible that if we're allowed to use all of the memory | ||||
897 | // we could end up using more than the maximum due to alignment padding. | ||||
898 | |||||
899 | uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); | ||||
900 | uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); | ||||
901 | NewSize += AllocSize; | ||||
902 | |||||
903 | if (NewSize > LocalMemLimit) { | ||||
904 | LLVM_DEBUG(dbgs() << " " << AllocSize | ||||
905 | << " bytes of local memory not available to promote\n"); | ||||
906 | return false; | ||||
907 | } | ||||
908 | |||||
909 | CurrentLocalMemUsage = NewSize; | ||||
910 | |||||
911 | std::vector<Value*> WorkList; | ||||
912 | |||||
913 | if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { | ||||
914 | LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); | ||||
915 | return false; | ||||
916 | } | ||||
917 | |||||
918 | LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); | ||||
919 | |||||
920 | Function *F = I.getParent()->getParent(); | ||||
921 | |||||
922 | Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); | ||||
923 | GlobalVariable *GV = new GlobalVariable( | ||||
924 | *Mod, GVTy, false, GlobalValue::InternalLinkage, | ||||
925 | UndefValue::get(GVTy), | ||||
926 | Twine(F->getName()) + Twine('.') + I.getName(), | ||||
927 | nullptr, | ||||
928 | GlobalVariable::NotThreadLocal, | ||||
929 | AMDGPUAS::LOCAL_ADDRESS); | ||||
930 | GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); | ||||
931 | GV->setAlignment(MaybeAlign(I.getAlignment())); | ||||
932 | |||||
933 | Value *TCntY, *TCntZ; | ||||
934 | |||||
935 | std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder); | ||||
936 | Value *TIdX = getWorkitemID(Builder, 0); | ||||
937 | Value *TIdY = getWorkitemID(Builder, 1); | ||||
938 | Value *TIdZ = getWorkitemID(Builder, 2); | ||||
939 | |||||
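  | // The code below flattens the 3D workitem id into a linear index: | ||||
  | //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ | ||||
  | // so each workitem addresses its own row of the LDS array created above. | ||||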
940 | Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true); | ||||
941 | Tmp0 = Builder.CreateMul(Tmp0, TIdX); | ||||
942 | Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true); | ||||
943 | Value *TID = Builder.CreateAdd(Tmp0, Tmp1); | ||||
944 | TID = Builder.CreateAdd(TID, TIdZ); | ||||
945 | |||||
946 | Value *Indices[] = { | ||||
947 | Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), | ||||
948 | TID | ||||
949 | }; | ||||
950 | |||||
951 | Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); | ||||
952 | I.mutateType(Offset->getType()); | ||||
953 | I.replaceAllUsesWith(Offset); | ||||
954 | I.eraseFromParent(); | ||||
955 | |||||
956 | SmallVector<IntrinsicInst *> DeferredIntrs; | ||||
957 | |||||
958 | for (Value *V : WorkList) { | ||||
959 | CallInst *Call = dyn_cast<CallInst>(V); | ||||
960 | if (!Call) { | ||||
961 | if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { | ||||
962 | Value *Src0 = CI->getOperand(0); | ||||
963 | PointerType *NewTy = PointerType::getWithSamePointeeType( | ||||
964 | cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS); | ||||
965 | |||||
966 | if (isa<ConstantPointerNull>(CI->getOperand(0))) | ||||
967 | CI->setOperand(0, ConstantPointerNull::get(NewTy)); | ||||
968 | |||||
969 | if (isa<ConstantPointerNull>(CI->getOperand(1))) | ||||
970 | CI->setOperand(1, ConstantPointerNull::get(NewTy)); | ||||
971 | |||||
972 | continue; | ||||
973 | } | ||||
974 | |||||
975 | // The operand's value should be corrected on its own and we don't want to | ||||
976 | // touch the users. | ||||
977 | if (isa<AddrSpaceCastInst>(V)) | ||||
978 | continue; | ||||
979 | |||||
980 | PointerType *NewTy = PointerType::getWithSamePointeeType( | ||||
981 | cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS); | ||||
982 | |||||
983 | // FIXME: It doesn't really make sense to try to do this for all | ||||
984 | // instructions. | ||||
985 | V->mutateType(NewTy); | ||||
986 | |||||
987 | // Adjust the types of any constant operands. | ||||
988 | if (SelectInst *SI = dyn_cast<SelectInst>(V)) { | ||||
989 | if (isa<ConstantPointerNull>(SI->getOperand(1))) | ||||
990 | SI->setOperand(1, ConstantPointerNull::get(NewTy)); | ||||
991 | |||||
992 | if (isa<ConstantPointerNull>(SI->getOperand(2))) | ||||
993 | SI->setOperand(2, ConstantPointerNull::get(NewTy)); | ||||
994 | } else if (PHINode *Phi = dyn_cast<PHINode>(V)) { | ||||
995 | for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) { | ||||
996 | if (isa<ConstantPointerNull>(Phi->getIncomingValue(I))) | ||||
997 | Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy)); | ||||
998 | } | ||||
999 | } | ||||
1000 | |||||
1001 | continue; | ||||
1002 | } | ||||
1003 | |||||
1004 | IntrinsicInst *Intr = cast<IntrinsicInst>(Call); | ||||
1005 | Builder.SetInsertPoint(Intr); | ||||
1006 | switch (Intr->getIntrinsicID()) { | ||||
1007 | case Intrinsic::lifetime_start: | ||||
1008 | case Intrinsic::lifetime_end: | ||||
1009 | // These intrinsics are for address space 0 only | ||||
1010 | Intr->eraseFromParent(); | ||||
1011 | continue; | ||||
1012 | case Intrinsic::memcpy: | ||||
1013 | case Intrinsic::memmove: | ||||
1014 | // These have 2 pointer operands. If the second pointer also needs to be | ||||
1015 | // replaced, we defer processing of these intrinsics until all other | ||||
1016 | // values are processed. | ||||
1017 | DeferredIntrs.push_back(Intr); | ||||
1018 | continue; | ||||
1019 | case Intrinsic::memset: { | ||||
1020 | MemSetInst *MemSet = cast<MemSetInst>(Intr); | ||||
1021 | Builder.CreateMemSet( | ||||
1022 | MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(), | ||||
1023 | MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile()); | ||||
1024 | Intr->eraseFromParent(); | ||||
1025 | continue; | ||||
1026 | } | ||||
1027 | case Intrinsic::invariant_start: | ||||
1028 | case Intrinsic::invariant_end: | ||||
1029 | case Intrinsic::launder_invariant_group: | ||||
1030 | case Intrinsic::strip_invariant_group: | ||||
1031 | Intr->eraseFromParent(); | ||||
1032 | // FIXME: I think the invariant marker should still theoretically apply, | ||||
1033 | // but the intrinsics need to be changed to accept pointers with any | ||||
1034 | // address space. | ||||
1035 | continue; | ||||
1036 | case Intrinsic::objectsize: { | ||||
1037 | Value *Src = Intr->getOperand(0); | ||||
1038 | Function *ObjectSize = Intrinsic::getDeclaration( | ||||
1039 | Mod, Intrinsic::objectsize, | ||||
1040 | {Intr->getType(), | ||||
1041 | PointerType::getWithSamePointeeType( | ||||
1042 | cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)}); | ||||
1043 | |||||
1044 | CallInst *NewCall = Builder.CreateCall( | ||||
1045 | ObjectSize, | ||||
1046 | {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)}); | ||||
1047 | Intr->replaceAllUsesWith(NewCall); | ||||
1048 | Intr->eraseFromParent(); | ||||
1049 | continue; | ||||
1050 | } | ||||
1051 | default: | ||||
1052 | Intr->print(errs()); | ||||
1053 | llvm_unreachable("Don't know how to promote alloca intrinsic use.")__builtin_unreachable(); | ||||
1054 | } | ||||
1055 | } | ||||
1056 | |||||
1057 | for (IntrinsicInst *Intr : DeferredIntrs) { | ||||
1058 | Builder.SetInsertPoint(Intr); | ||||
1059 | Intrinsic::ID ID = Intr->getIntrinsicID(); | ||||
1060 | assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); | ||||
1061 | |||||
1062 | MemTransferInst *MI = cast<MemTransferInst>(Intr); | ||||
1063 | auto *B = | ||||
1064 | Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(), | ||||
1065 | MI->getRawSource(), MI->getSourceAlign(), | ||||
1066 | MI->getLength(), MI->isVolatile()); | ||||
1067 | |||||
1068 | for (unsigned I = 1; I != 3; ++I) { | ||||
1069 | if (uint64_t Bytes = Intr->getDereferenceableBytes(I)) { | ||||
1070 | B->addDereferenceableAttr(I, Bytes); | ||||
1071 | } | ||||
1072 | } | ||||
1073 | |||||
1074 | Intr->eraseFromParent(); | ||||
1075 | } | ||||
1076 | |||||
1077 | return true; | ||||
1078 | } | ||||
1079 | |||||
1080 | bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) { | ||||
1081 | // Array allocations are probably not worth handling, since an allocation of | ||||
1082 | // the array type is the canonical form. | ||||
1083 | if (!I.isStaticAlloca() || I.isArrayAllocation()) | ||||
1084 | return false; | ||||
1085 | |||||
1086 | LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); | ||||
1087 | |||||
1088 | Module *Mod = I.getParent()->getParent()->getParent(); | ||||
1089 | return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs); | ||||
1090 | } | ||||
1091 | |||||
1092 | bool promoteAllocasToVector(Function &F, TargetMachine &TM) { | ||||
1093 | if (DisablePromoteAllocaToVector) | ||||
1094 | return false; | ||||
1095 | |||||
1096 | const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); | ||||
1097 | if (!ST.isPromoteAllocaEnabled()) | ||||
1098 | return false; | ||||
1099 | |||||
1100 | unsigned MaxVGPRs; | ||||
1101 | if (TM.getTargetTriple().getArch() == Triple::amdgcn) { | ||||
1102 | const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); | ||||
1103 | MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); | ||||
1104 | } else { | ||||
1105 | MaxVGPRs = 128; | ||||
1106 | } | ||||
1107 | |||||
1108 | bool Changed = false; | ||||
1109 | BasicBlock &EntryBB = *F.begin(); | ||||
1110 | |||||
1111 | SmallVector<AllocaInst *, 16> Allocas; | ||||
1112 | for (Instruction &I : EntryBB) { | ||||
1113 | if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) | ||||
1114 | Allocas.push_back(AI); | ||||
1115 | } | ||||
1116 | |||||
1117 | for (AllocaInst *AI : Allocas) { | ||||
1118 | if (handlePromoteAllocaToVector(*AI, MaxVGPRs)) | ||||
1119 | Changed = true; | ||||
1120 | } | ||||
1121 | |||||
1122 | return Changed; | ||||
1123 | } | ||||
1124 | |||||
1125 | bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) { | ||||
1126 | if (skipFunction(F)) | ||||
1127 | return false; | ||||
1128 | if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { | ||||
1129 | return promoteAllocasToVector(F, TPC->getTM<TargetMachine>()); | ||||
1130 | } | ||||
1131 | return false; | ||||
1132 | } | ||||
1133 | |||||
1134 | PreservedAnalyses | ||||
1135 | AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) { | ||||
1136 | bool Changed = promoteAllocasToVector(F, TM); | ||||
1137 | if (Changed) { | ||||
1138 | PreservedAnalyses PA; | ||||
1139 | PA.preserveSet<CFGAnalyses>(); | ||||
1140 | return PA; | ||||
1141 | } | ||||
1142 | return PreservedAnalyses::all(); | ||||
1143 | } | ||||
1144 | |||||
1145 | FunctionPass *llvm::createAMDGPUPromoteAlloca() { | ||||
1146 | return new AMDGPUPromoteAlloca(); | ||||
1147 | } | ||||
1148 | |||||
1149 | FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() { | ||||
1150 | return new AMDGPUPromoteAllocaToVector(); | ||||
1151 | } |
1 | //===-- llvm/Support/Alignment.h - Useful alignment functions ---*- C++ -*-===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | // This file contains types to represent alignments. | |||
10 | // They are instrumented to guarantee some invariants are preserved and prevent | |||
11 | // invalid manipulations. | |||
12 | // | |||
13 | // - Align represents an alignment in bytes; it is always set and always a valid | |||
14 | // power of two. Its minimum value is 1, which means no alignment requirement. | |||
15 | // | |||
16 | // - MaybeAlign is an optional type, it may be undefined or set. When it's set | |||
17 | // you can get the underlying Align type by using the getValue() method. | |||
18 | // | |||
19 | //===----------------------------------------------------------------------===// | |||
20 | ||||
21 | #ifndef LLVM_SUPPORT_ALIGNMENT_H_ | |||
22 | #define LLVM_SUPPORT_ALIGNMENT_H_ | |||
23 | ||||
24 | #include "llvm/ADT/Optional.h" | |||
25 | #include "llvm/Support/MathExtras.h" | |||
26 | #include <cassert> | |||
27 | #ifndef NDEBUG | |||
28 | #include <string> | |||
29 | #endif // NDEBUG | |||
30 | ||||
31 | namespace llvm { | |||
32 | ||||
33 | #define ALIGN_CHECK_ISPOSITIVE(decl) \ | |||
34 | assert(decl > 0 && (#decl " should be defined"))((void)0) | |||
35 | ||||
36 | /// This struct is a compact representation of a valid (non-zero power of two) | |||
37 | /// alignment. | |||
38 | /// It is suitable for use as static global constants. | |||
39 | struct Align { | |||
40 | private: | |||
41 | uint8_t ShiftValue = 0; /// The log2 of the required alignment. | |||
42 | /// ShiftValue is less than 64 by construction. | |||
43 | ||||
44 | friend struct MaybeAlign; | |||
45 | friend unsigned Log2(Align); | |||
46 | friend bool operator==(Align Lhs, Align Rhs); | |||
47 | friend bool operator!=(Align Lhs, Align Rhs); | |||
48 | friend bool operator<=(Align Lhs, Align Rhs); | |||
49 | friend bool operator>=(Align Lhs, Align Rhs); | |||
50 | friend bool operator<(Align Lhs, Align Rhs); | |||
51 | friend bool operator>(Align Lhs, Align Rhs); | |||
52 | friend unsigned encode(struct MaybeAlign A); | |||
53 | friend struct MaybeAlign decodeMaybeAlign(unsigned Value); | |||
54 | ||||
55 | /// A trivial type to allow construction of constexpr Align. | |||
56 | /// This is currently needed to workaround a bug in GCC 5.3 which prevents | |||
57 | /// definition of constexpr assign operators. | |||
58 | /// https://stackoverflow.com/questions/46756288/explicitly-defaulted-function-cannot-be-declared-as-constexpr-because-the-implic | |||
59 | /// FIXME: Remove this, make all assign operators constexpr and introduce user | |||
60 | /// defined literals when we don't have to support GCC 5.3 anymore. | |||
61 | /// https://llvm.org/docs/GettingStarted.html#getting-a-modern-host-c-toolchain | |||
62 | struct LogValue { | |||
63 | uint8_t Log; | |||
64 | }; | |||
65 | ||||
66 | public: | |||
67 | /// Default is byte-aligned. | |||
68 | constexpr Align() = default; | |||
69 | /// Do not perform checks in case of copy/move construct/assign, because the | |||
70 | /// checks have been performed when building `Other`. | |||
71 | constexpr Align(const Align &Other) = default; | |||
72 | constexpr Align(Align &&Other) = default; | |||
73 | Align &operator=(const Align &Other) = default; | |||
74 | Align &operator=(Align &&Other) = default; | |||
75 | ||||
76 | explicit Align(uint64_t Value) { | |||
77 | assert(Value > 0 && "Value must not be 0")((void)0); | |||
78 | assert(llvm::isPowerOf2_64(Value) && "Alignment is not a power of 2")((void)0); | |||
79 | ShiftValue = Log2_64(Value); | |||
80 | assert(ShiftValue < 64 && "Broken invariant")((void)0); | |||
81 | } | |||
82 | ||||
83 | /// This is a hole in the type system and should not be abused. | |||
84 | /// Needed to interact with C for instance. | |||
85 | uint64_t value() const { return uint64_t(1) << ShiftValue; } | |||
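  | // Note on the analyzer warning at the top of this report: the shift is | |||
  | // well defined only for ShiftValue < 64. The constructor asserts this, but | |||
  | // under NDEBUG the assert compiles away, so the analyzer assumes the full | |||
  | // uint8_t range (up to 255) along this path and flags the shift here. | |||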
86 | ||||
87 | /// Allow constructions of constexpr Align. | |||
88 | template <size_t kValue> constexpr static LogValue Constant() { | |||
89 | return LogValue{static_cast<uint8_t>(CTLog2<kValue>())}; | |||
90 | } | |||
91 | ||||
92 | /// Allow constructions of constexpr Align from types. | |||
93 | /// Compile time equivalent to Align(alignof(T)). | |||
94 | template <typename T> constexpr static LogValue Of() { | |||
95 | return Constant<std::alignment_of<T>::value>(); | |||
96 | } | |||
97 | ||||
98 | /// Constexpr constructor from LogValue type. | |||
99 | constexpr Align(LogValue CA) : ShiftValue(CA.Log) {} | |||
100 | }; | |||
101 | ||||
102 | /// Treats the value 0 as a 1, so Align is always at least 1. | |||
103 | inline Align assumeAligned(uint64_t Value) { | |||
104 | return Value ? Align(Value) : Align(); | |||
105 | } | |||
106 | ||||
107 | /// This struct is a compact representation of a valid (power of two) or | |||
108 | /// undefined (0) alignment. | |||
109 | struct MaybeAlign : public llvm::Optional<Align> { | |||
110 | private: | |||
111 | using UP = llvm::Optional<Align>; | |||
112 | ||||
113 | public: | |||
114 | /// Default is undefined. | |||
115 | MaybeAlign() = default; | |||
116 | /// Do not perform checks in case of copy/move construct/assign, because the | |||
117 | /// checks have been performed when building `Other`. | |||
118 | MaybeAlign(const MaybeAlign &Other) = default; | |||
119 | MaybeAlign &operator=(const MaybeAlign &Other) = default; | |||
120 | MaybeAlign(MaybeAlign &&Other) = default; | |||
121 | MaybeAlign &operator=(MaybeAlign &&Other) = default; | |||
122 | ||||
123 | /// Use llvm::Optional<Align> constructor. | |||
124 | using UP::UP; | |||
125 | ||||
126 | explicit MaybeAlign(uint64_t Value) { | |||
127 | assert((Value == 0 || llvm::isPowerOf2_64(Value)) && | |||
128 | "Alignment is neither 0 nor a power of 2"); | |||
129 | if (Value) | |||
130 | emplace(Value); | |||
131 | } | |||
132 | ||||
133 | /// For convenience, returns a valid alignment or 1 if undefined. | |||
134 | Align valueOrOne() const { return hasValue() ? getValue() : Align(); } | |||
135 | }; | |||
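A brief usage sketch (illustrative, not part of the header): 0 encodes "undefined", any other value must be a power of two, and `valueOrOne` collapses the undefined case to byte alignment:

    llvm::MaybeAlign Unknown(0);  // empty Optional
    llvm::MaybeAlign Known(8);    // holds Align(8)
    assert(Unknown.valueOrOne() == llvm::Align(1));
    assert(Known.valueOrOne() == llvm::Align(8));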
136 | ||||
137 | /// Checks that SizeInBytes is a multiple of the alignment. | |||
138 | inline bool isAligned(Align Lhs, uint64_t SizeInBytes) { | |||
139 | return SizeInBytes % Lhs.value() == 0; | |||
140 | } | |||
141 | ||||
142 | /// Checks that Addr is a multiple of the alignment. | |||
143 | inline bool isAddrAligned(Align Lhs, const void *Addr) { | |||
144 | return isAligned(Lhs, reinterpret_cast<uintptr_t>(Addr)); | |||
145 | } | |||
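Illustrative checks (a sketch assuming this header and <cassert> are included); `isAddrAligned` simply reinterprets the pointer as an integer and reuses `isAligned`:

    assert(llvm::isAligned(llvm::Align(4), 8));   // 8 % 4 == 0
    assert(!llvm::isAligned(llvm::Align(4), 6));  // 6 % 4 != 0
    alignas(16) static char Buffer[16];
    assert(llvm::isAddrAligned(llvm::Align(16), Buffer));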
146 | ||||
147 | /// Returns a multiple of A needed to store `Size` bytes. | |||
148 | inline uint64_t alignTo(uint64_t Size, Align A) { | |||
149 | const uint64_t Value = A.value(); | |||
150 | // The following line is equivalent to `(Size + Value - 1) / Value * Value`. | |||
151 | ||||
152 | // The division followed by a multiplication can be thought of as a right | |||
153 | // shift followed by a left shift which zeros out the extra bits produced in | |||
154 | // the bump; `~(Value - 1)` is a mask where all those bits being zeroed out | |||
155 | // are just zero. | |||
156 | ||||
157 | // Most compilers can generate this code but the pattern may be missed when | |||
158 | // multiple functions get inlined. | |||
159 | return (Size + Value - 1) & ~(Value - 1U); | |||
160 | } | |||
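Worked through on small numbers (a sketch, assuming <cassert> is included), showing the add-then-mask trick described in the comments above:

    // alignTo(5, Align(8)): (5 + 8 - 1) & ~7 == 12 & ~7 == 8.
    assert(llvm::alignTo(5, llvm::Align(8)) == 8);
    // Already-aligned sizes come back unchanged: (16 + 7) & ~7 == 16.
    assert(llvm::alignTo(16, llvm::Align(8)) == 16);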
161 | ||||
162 | /// If non-zero \p Skew is specified, the return value will be the smallest integer | |||
163 | /// that is greater than or equal to \p Size and equal to \p A * N + \p Skew for | |||
164 | /// some integer N. If \p Skew is larger than \p A, its value is adjusted to '\p | |||
165 | /// Skew mod \p A'. | |||
166 | /// | |||
167 | /// Examples: | |||
168 | /// \code | |||
169 | /// alignTo(5, Align(8), 7) = 7 | |||
170 | /// alignTo(17, Align(8), 1) = 17 | |||
171 | /// alignTo(~0LL, Align(8), 3) = 3 | |||
172 | /// \endcode | |||
173 | inline uint64_t alignTo(uint64_t Size, Align A, uint64_t Skew) { | |||
174 | const uint64_t Value = A.value(); | |||
175 | Skew %= Value; | |||
176 | return ((Size + Value - 1 - Skew) & ~(Value - 1U)) + Skew; | |||
177 | } | |||
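The last example in the doc comment relies on wraparound; worked through (a sketch, assuming <cassert> is included):

    // Size == ~0ULL, Value == 8, Skew == 3: (Size + 8 - 1 - 3) wraps mod 2**64
    // to 3; 3 & ~7 == 0; adding the skew back gives 0 + 3 == 3.
    assert(llvm::alignTo(~0ULL, llvm::Align(8), 3) == 3);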
178 | ||||
179 | /// Returns a multiple of A needed to store `Size` bytes. | |||
180 | /// Returns `Size` if current alignment is undefined. | |||
181 | inline uint64_t alignTo(uint64_t Size, MaybeAlign A) { | |||
182 | return A ? alignTo(Size, A.getValue()) : Size; | |||
183 | } | |||
184 | ||||
185 | /// Aligns `Addr` to `Alignment` bytes, rounding up. | |||
186 | inline uintptr_t alignAddr(const void *Addr, Align Alignment) { | |||
187 | uintptr_t ArithAddr = reinterpret_cast<uintptr_t>(Addr); | |||
188 | assert(static_cast<uintptr_t>(ArithAddr + Alignment.value() - 1) >= | |||
189 | ArithAddr && | |||
190 | "Overflow"); | |||
191 | return alignTo(ArithAddr, Alignment); | |||
192 | } | |||
193 | ||||
194 | /// Returns the offset to the next integer (mod 2**64) that is greater than | |||
195 | /// or equal to \p Value and is a multiple of \p Align. | |||
196 | inline uint64_t offsetToAlignment(uint64_t Value, Align Alignment) { | |||
197 | return alignTo(Value, Alignment) - Value; | |||
198 | } | |||
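For illustration (a sketch assuming <cassert> is included), the offset is exactly the padding needed to reach the next boundary:

    assert(llvm::offsetToAlignment(5, llvm::Align(8)) == 3);  // 8 - 5
    assert(llvm::offsetToAlignment(16, llvm::Align(8)) == 0); // already aligned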
199 | ||||
200 | /// Returns the necessary adjustment for aligning `Addr` to `Alignment` | |||
201 | /// bytes, rounding up. | |||
202 | inline uint64_t offsetToAlignedAddr(const void *Addr, Align Alignment) { | |||
203 | return offsetToAlignment(reinterpret_cast<uintptr_t>(Addr), Alignment); | |||
204 | } | |||
205 | ||||
206 | /// Returns the log2 of the alignment. | |||
207 | inline unsigned Log2(Align A) { return A.ShiftValue; } | |||
208 | ||||
209 | /// Returns the alignment that satisfies both alignments. | |||
210 | /// Same semantic as MinAlign. | |||
211 | inline Align commonAlignment(Align A, Align B) { return std::min(A, B); } | |||
212 | ||||
213 | /// Returns the alignment that satisfies both alignments. | |||
214 | /// Same semantic as MinAlign. | |||
215 | inline Align commonAlignment(Align A, uint64_t Offset) { | |||
216 | return Align(MinAlign(A.value(), Offset)); | |||
217 | } | |||
218 | ||||
219 | /// Returns the alignment that satisfies both alignments. | |||
220 | /// Same semantic as MinAlign. | |||
221 | inline MaybeAlign commonAlignment(MaybeAlign A, MaybeAlign B) { | |||
222 | return A && B ? commonAlignment(*A, *B) : A ? A : B; | |||
223 | } | |||
224 | ||||
225 | /// Returns the alignment that satisfies both alignments. | |||
226 | /// Same semantic as MinAlign. \p A must hold a defined alignment: it is dereferenced unchecked. | |||
227 | inline MaybeAlign commonAlignment(MaybeAlign A, uint64_t Offset) { | |||
228 | return MaybeAlign(MinAlign((*A).value(), Offset)); | |||
229 | } | |||
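A short sketch of the semantics (illustrative, assuming <cassert> is included): the common alignment is the weaker of the two guarantees, and with an offset only the lowest set bit can be relied upon:

    assert(llvm::commonAlignment(llvm::Align(8), llvm::Align(4)) == llvm::Align(4));
    // MinAlign(16, 4) == 4: a 16-byte-aligned base plus 4 is only 4-byte aligned.
    assert(llvm::commonAlignment(llvm::Align(16), 4) == llvm::Align(4));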
230 | ||||
231 | /// Returns a representation of the alignment that encodes undefined as 0. | |||
232 | inline unsigned encode(MaybeAlign A) { return A ? A->ShiftValue + 1 : 0; } | |||
233 | ||||
234 | /// Dual operation of the encode function above. | |||
235 | inline MaybeAlign decodeMaybeAlign(unsigned Value) { | |||
236 | if (Value == 0) | |||
237 | return MaybeAlign(); | |||
238 | Align Out; | |||
239 | Out.ShiftValue = Value - 1; | |||
240 | return Out; | |||
241 | } | |||
242 | ||||
243 | /// Returns a representation of the alignment; the encoded value is positive by | |||
244 | /// definition. | |||
245 | inline unsigned encode(Align A) { return encode(MaybeAlign(A)); } | |||
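A round-trip sketch (illustrative, assuming <cassert> is included): the encoding biases `ShiftValue` by one so that 0 is free to mean "undefined":

    assert(llvm::encode(llvm::MaybeAlign()) == 0); // undefined -> 0
    assert(llvm::encode(llvm::Align(1)) == 1);     // ShiftValue 0 -> 1
    assert(llvm::encode(llvm::Align(8)) == 4);     // ShiftValue 3 -> 4
    assert(llvm::decodeMaybeAlign(4).getValue() == llvm::Align(8));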
246 | ||||
247 | /// Comparisons between Align and scalars. Rhs must be positive. | |||
248 | inline bool operator==(Align Lhs, uint64_t Rhs) { | |||
249 | ALIGN_CHECK_ISPOSITIVE(Rhs); | |||
250 | return Lhs.value() == Rhs; | |||
251 | } | |||
252 | inline bool operator!=(Align Lhs, uint64_t Rhs) { | |||
253 | ALIGN_CHECK_ISPOSITIVE(Rhs); | |||
254 | return Lhs.value() != Rhs; | |||
255 | } | |||
256 | inline bool operator<=(Align Lhs, uint64_t Rhs) { | |||
257 | ALIGN_CHECK_ISPOSITIVE(Rhs); | |||
258 | return Lhs.value() <= Rhs; | |||
259 | } | |||
260 | inline bool operator>=(Align Lhs, uint64_t Rhs) { | |||
261 | ALIGN_CHECK_ISPOSITIVE(Rhs); | |||
262 | return Lhs.value() >= Rhs; | |||
263 | } | |||
264 | inline bool operator<(Align Lhs, uint64_t Rhs) { | |||
265 | ALIGN_CHECK_ISPOSITIVE(Rhs); | |||
266 | return Lhs.value() < Rhs; | |||
267 | } | |||
268 | inline bool operator>(Align Lhs, uint64_t Rhs) { | |||
269 | ALIGN_CHECK_ISPOSITIVE(Rhs); | |||
270 | return Lhs.value() > Rhs; | |||
271 | } | |||
272 | ||||
273 | /// Comparisons between MaybeAlign and scalars. | |||
274 | inline bool operator==(MaybeAlign Lhs, uint64_t Rhs) { | |||
275 | return Lhs ? (*Lhs).value() == Rhs : Rhs == 0; | |||
276 | } | |||
277 | inline bool operator!=(MaybeAlign Lhs, uint64_t Rhs) { | |||
278 | return Lhs ? (*Lhs).value() != Rhs : Rhs != 0; | |||
279 | } | |||
280 | ||||
281 | /// Comparison operators between Align. | |||
282 | inline bool operator==(Align Lhs, Align Rhs) { | |||
283 | return Lhs.ShiftValue == Rhs.ShiftValue; | |||
284 | } | |||
285 | inline bool operator!=(Align Lhs, Align Rhs) { | |||
286 | return Lhs.ShiftValue != Rhs.ShiftValue; | |||
287 | } | |||
288 | inline bool operator<=(Align Lhs, Align Rhs) { | |||
289 | return Lhs.ShiftValue <= Rhs.ShiftValue; | |||
290 | } | |||
291 | inline bool operator>=(Align Lhs, Align Rhs) { | |||
292 | return Lhs.ShiftValue >= Rhs.ShiftValue; | |||
293 | } | |||
294 | inline bool operator<(Align Lhs, Align Rhs) { | |||
295 | return Lhs.ShiftValue < Rhs.ShiftValue; | |||
296 | } | |||
297 | inline bool operator>(Align Lhs, Align Rhs) { | |||
298 | return Lhs.ShiftValue > Rhs.ShiftValue; | |||
299 | } | |||
300 | ||||
301 | // Don't allow relational comparisons with MaybeAlign. | |||
302 | bool operator<=(Align Lhs, MaybeAlign Rhs) = delete; | |||
303 | bool operator>=(Align Lhs, MaybeAlign Rhs) = delete; | |||
304 | bool operator<(Align Lhs, MaybeAlign Rhs) = delete; | |||
305 | bool operator>(Align Lhs, MaybeAlign Rhs) = delete; | |||
306 | ||||
307 | bool operator<=(MaybeAlign Lhs, Align Rhs) = delete; | |||
308 | bool operator>=(MaybeAlign Lhs, Align Rhs) = delete; | |||
309 | bool operator<(MaybeAlign Lhs, Align Rhs) = delete; | |||
310 | bool operator>(MaybeAlign Lhs, Align Rhs) = delete; | |||
311 | ||||
312 | bool operator<=(MaybeAlign Lhs, MaybeAlign Rhs) = delete; | |||
313 | bool operator>=(MaybeAlign Lhs, MaybeAlign Rhs) = delete; | |||
314 | bool operator<(MaybeAlign Lhs, MaybeAlign Rhs) = delete; | |||
315 | bool operator>(MaybeAlign Lhs, MaybeAlign Rhs) = delete; | |||
316 | ||||
317 | inline Align operator*(Align Lhs, uint64_t Rhs) { | |||
318 | assert(Rhs > 0 && "Rhs must be positive"); | |||
319 | return Align(Lhs.value() * Rhs); | |||
320 | } | |||
321 | ||||
322 | inline MaybeAlign operator*(MaybeAlign Lhs, uint64_t Rhs) { | |||
323 | assert(Rhs > 0 && "Rhs must be positive"); | |||
324 | return Lhs ? Lhs.getValue() * Rhs : MaybeAlign(); | |||
325 | } | |||
326 | ||||
327 | inline Align operator/(Align Lhs, uint64_t Divisor) { | |||
328 | assert(llvm::isPowerOf2_64(Divisor) && | |||
329 | "Divisor must be positive and a power of 2"); | |||
330 | assert(Lhs != 1 && "Can't halve byte alignment"); | |||
331 | return Align(Lhs.value() / Divisor); | |||
332 | } | |||
333 | ||||
334 | inline MaybeAlign operator/(MaybeAlign Lhs, uint64_t Divisor) { | |||
335 | assert(llvm::isPowerOf2_64(Divisor) && | |||
336 | "Divisor must be positive and a power of 2"); | |||
337 | return Lhs ? Lhs.getValue() / Divisor : MaybeAlign(); | |||
338 | } | |||
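Illustrative arithmetic (a sketch, assuming <cassert> is included); the factor must keep the result a power of two, which Align's constructor re-checks:

    assert(llvm::Align(8) * 2 == llvm::Align(16)); // shift up by one
    assert(llvm::Align(8) / 2 == llvm::Align(4));  // shift down by one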
339 | ||||
340 | inline Align max(MaybeAlign Lhs, Align Rhs) { | |||
341 | return Lhs && *Lhs > Rhs ? *Lhs : Rhs; | |||
342 | } | |||
343 | ||||
344 | inline Align max(Align Lhs, MaybeAlign Rhs) { | |||
345 | return Rhs && *Rhs > Lhs ? *Rhs : Lhs; | |||
346 | } | |||
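A small sketch of the semantics (assuming <cassert> is included): an undefined MaybeAlign is treated as weaker than any defined alignment:

    assert(llvm::max(llvm::MaybeAlign(), llvm::Align(4)) == llvm::Align(4));
    assert(llvm::max(llvm::MaybeAlign(16), llvm::Align(4)) == llvm::Align(16));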
347 | ||||
348 | #ifndef NDEBUG | |||
349 | // For usage in LLVM_DEBUG macros. | |||
350 | inline std::string DebugStr(const Align &A) { | |||
351 | return std::to_string(A.value()); | |||
352 | } | |||
353 | // For usage in LLVM_DEBUG macros. | |||
354 | inline std::string DebugStr(const MaybeAlign &MA) { | |||
355 | if (MA) | |||
356 | return std::to_string(MA->value()); | |||
357 | return "None"; | |||
358 | } | |||
359 | #endif // NDEBUG | |||
360 | ||||
361 | #undef ALIGN_CHECK_ISPOSITIVE | |||
362 | ||||
363 | } // namespace llvm | |||
364 | ||||
365 | #endif // LLVM_SUPPORT_ALIGNMENT_H_ |