clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model static -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -stack-protector 2 -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup 
-fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | #include "AMDGPULegalizerInfo.h" |
15 | |
16 | #include "AMDGPU.h" |
17 | #include "AMDGPUGlobalISelUtils.h" |
18 | #include "AMDGPUInstrInfo.h" |
19 | #include "AMDGPUTargetMachine.h" |
20 | #include "SIMachineFunctionInfo.h" |
21 | #include "Utils/AMDGPUBaseInfo.h" |
22 | #include "llvm/ADT/ScopeExit.h" |
23 | #include "llvm/BinaryFormat/ELF.h" |
24 | #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" |
25 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
26 | #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
27 | #include "llvm/IR/DiagnosticInfo.h" |
28 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
29 | |
30 | #define DEBUG_TYPE "amdgpu-legalinfo" |
31 | |
32 | using namespace llvm; |
33 | using namespace LegalizeActions; |
34 | using namespace LegalizeMutations; |
35 | using namespace LegalityPredicates; |
36 | using namespace MIPatternMatch; |
37 | |
38 | |
39 | static cl::opt<bool> EnableNewLegality( |
40 | "amdgpu-global-isel-new-legality", |
41 | cl::desc("Use GlobalISel desired legality, rather than try to use" |
42 | "rules compatible with selection patterns"), |
43 | cl::init(false), |
44 | cl::ReallyHidden); |
45 | |
// Widest register size (in bits) the legalizer will form; the largest vector
// types used below (32 x s32, 16 x s64) are exactly this wide.
static constexpr unsigned MaxRegisterSize = 1024;
47 | |
48 | |
49 | static LLT getPow2VectorType(LLT Ty) { |
50 | unsigned NElts = Ty.getNumElements(); |
51 | unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); |
52 | return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); |
53 | } |
54 | |
55 | |
56 | static LLT getPow2ScalarType(LLT Ty) { |
57 | unsigned Bits = Ty.getSizeInBits(); |
58 | unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); |
59 | return LLT::scalar(Pow2Bits); |
60 | } |
61 | |
62 | |
63 | |
64 | |
65 | static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { |
66 | return [=](const LegalityQuery &Query) { |
67 | const LLT Ty = Query.Types[TypeIdx]; |
68 | if (!Ty.isVector()) |
69 | return false; |
70 | |
71 | const LLT EltTy = Ty.getElementType(); |
72 | const unsigned EltSize = EltTy.getSizeInBits(); |
73 | return Ty.getNumElements() % 2 != 0 && |
74 | EltSize > 1 && EltSize < 32 && |
75 | Ty.getSizeInBits() % 32 != 0; |
76 | }; |
77 | } |
78 | |
79 | static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { |
80 | return [=](const LegalityQuery &Query) { |
81 | const LLT Ty = Query.Types[TypeIdx]; |
82 | return Ty.getSizeInBits() % 32 == 0; |
83 | }; |
84 | } |
85 | |
86 | static LegalityPredicate isWideVec16(unsigned TypeIdx) { |
87 | return [=](const LegalityQuery &Query) { |
88 | const LLT Ty = Query.Types[TypeIdx]; |
89 | const LLT EltTy = Ty.getScalarType(); |
90 | return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; |
91 | }; |
92 | } |
93 | |
94 | static LegalizeMutation oneMoreElement(unsigned TypeIdx) { |
95 | return [=](const LegalityQuery &Query) { |
96 | const LLT Ty = Query.Types[TypeIdx]; |
97 | const LLT EltTy = Ty.getElementType(); |
98 | return std::make_pair(TypeIdx, |
99 | LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); |
100 | }; |
101 | } |
102 | |
103 | static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { |
104 | return [=](const LegalityQuery &Query) { |
105 | const LLT Ty = Query.Types[TypeIdx]; |
106 | const LLT EltTy = Ty.getElementType(); |
107 | unsigned Size = Ty.getSizeInBits(); |
108 | unsigned Pieces = (Size + 63) / 64; |
109 | unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; |
110 | return std::make_pair( |
111 | TypeIdx, |
112 | LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); |
113 | }; |
114 | } |
115 | |
116 | |
117 | |
118 | static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { |
119 | return [=](const LegalityQuery &Query) { |
120 | const LLT Ty = Query.Types[TypeIdx]; |
121 | |
122 | const LLT EltTy = Ty.getElementType(); |
123 | const int Size = Ty.getSizeInBits(); |
124 | const int EltSize = EltTy.getSizeInBits(); |
125 | const int NextMul32 = (Size + 31) / 32; |
126 | |
127 | assert(EltSize < 32); |
128 | |
129 | const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; |
130 | return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); |
131 | }; |
132 | } |
133 | |
134 | static LLT getBitcastRegisterType(const LLT Ty) { |
135 | const unsigned Size = Ty.getSizeInBits(); |
136 | |
137 | LLT CoercedTy; |
138 | if (Size <= 32) { |
139 | |
140 | |
141 | return LLT::scalar(Size); |
142 | } |
143 | |
144 | return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); |
145 | } |
146 | |
147 | static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { |
148 | return [=](const LegalityQuery &Query) { |
149 | const LLT Ty = Query.Types[TypeIdx]; |
150 | return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); |
151 | }; |
152 | } |
153 | |
154 | static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { |
155 | return [=](const LegalityQuery &Query) { |
156 | const LLT Ty = Query.Types[TypeIdx]; |
157 | unsigned Size = Ty.getSizeInBits(); |
158 | assert(Size % 32 == 0); |
159 | return std::make_pair( |
160 | TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); |
161 | }; |
162 | } |
163 | |
164 | static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { |
165 | return [=](const LegalityQuery &Query) { |
166 | const LLT QueryTy = Query.Types[TypeIdx]; |
167 | return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; |
168 | }; |
169 | } |
170 | |
171 | static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { |
172 | return [=](const LegalityQuery &Query) { |
173 | const LLT QueryTy = Query.Types[TypeIdx]; |
174 | return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; |
175 | }; |
176 | } |
177 | |
178 | static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { |
179 | return [=](const LegalityQuery &Query) { |
180 | const LLT QueryTy = Query.Types[TypeIdx]; |
181 | return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; |
182 | }; |
183 | } |
184 | |
185 | static bool isRegisterSize(unsigned Size) { |
186 | return Size % 32 == 0 && Size <= MaxRegisterSize; |
187 | } |
188 | |
189 | static bool isRegisterVectorElementType(LLT EltTy) { |
190 | const int EltSize = EltTy.getSizeInBits(); |
191 | return EltSize == 16 || EltSize % 32 == 0; |
192 | } |
193 | |
194 | static bool isRegisterVectorType(LLT Ty) { |
195 | const int EltSize = Ty.getElementType().getSizeInBits(); |
196 | return EltSize == 32 || EltSize == 64 || |
197 | (EltSize == 16 && Ty.getNumElements() % 2 == 0) || |
198 | EltSize == 128 || EltSize == 256; |
199 | } |
200 | |
201 | static bool isRegisterType(LLT Ty) { |
202 | if (!isRegisterSize(Ty.getSizeInBits())) |
203 | return false; |
204 | |
205 | if (Ty.isVector()) |
206 | return isRegisterVectorType(Ty); |
207 | |
208 | return true; |
209 | } |
210 | |
211 | |
212 | |
213 | static LegalityPredicate isRegisterType(unsigned TypeIdx) { |
214 | return [=](const LegalityQuery &Query) { |
215 | return isRegisterType(Query.Types[TypeIdx]); |
216 | }; |
217 | } |
218 | |
219 | static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { |
220 | return [=](const LegalityQuery &Query) { |
221 | const LLT QueryTy = Query.Types[TypeIdx]; |
222 | if (!QueryTy.isVector()) |
223 | return false; |
224 | const LLT EltTy = QueryTy.getElementType(); |
225 | return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; |
226 | }; |
227 | } |
228 | |
229 | |
230 | |
231 | static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { |
232 | return [=](const LegalityQuery &Query) { |
233 | const LLT Ty = Query.Types[TypeIdx]; |
234 | return !Ty.isVector() && Ty.getSizeInBits() > 32 && |
235 | Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); |
236 | }; |
237 | } |
238 | |
239 | |
240 | |
241 | |
242 | static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, |
243 | bool IsLoad) { |
244 | switch (AS) { |
245 | case AMDGPUAS::PRIVATE_ADDRESS: |
246 | |
247 | return ST.enableFlatScratch() ? 128 : 32; |
248 | case AMDGPUAS::LOCAL_ADDRESS: |
249 | return ST.useDS128() ? 128 : 64; |
250 | case AMDGPUAS::GLOBAL_ADDRESS: |
251 | case AMDGPUAS::CONSTANT_ADDRESS: |
252 | case AMDGPUAS::CONSTANT_ADDRESS_32BIT: |
253 | |
254 | |
255 | |
256 | |
257 | |
258 | |
259 | return IsLoad ? 512 : 128; |
260 | default: |
261 | |
262 | |
263 | return 128; |
264 | } |
265 | } |
266 | |
// Return true if the load/store described by \p Query is size- and
// alignment-legal as-is (no legalization action needed). Type 0 is the
// register type, type 1 the pointer.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Anything that is not an explicit G_STORE is treated as a load here.
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // 32-bit constant-address accesses are never legal directly; they need the
  // pointer rewritten (handled elsewhere as custom lowering).
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Extending/truncating vector accesses (mem size != reg size) are not
  // handled.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // Disabled: widening the memory size based on alignment. Kept for
  // reference; would also require adjusting the access size downstream.
#if 0

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // For scalars, a mem/reg size mismatch is only allowed when the register
  // side is exactly 32 bits (sub-dword extload/truncstore).
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // The access must fit in the widest instruction for this address space.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  // Only these memory sizes are representable; 96-bit needs dwordx3 support.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // Accepted here; may still be split later depending on context.
    break;
  default:
    return false;
  }

  // By this point the register type is at least as wide as memory.
  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target allows the
  // misalignment for this size and address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
332 | |
333 | |
334 | |
335 | |
336 | |
337 | static bool loadStoreBitcastWorkaround(const LLT Ty) { |
338 | if (EnableNewLegality) |
339 | return false; |
340 | |
341 | const unsigned Size = Ty.getSizeInBits(); |
342 | if (Size <= 64) |
343 | return false; |
344 | if (!Ty.isVector()) |
345 | return true; |
346 | |
347 | LLT EltTy = Ty.getElementType(); |
348 | if (EltTy.isPointer()) |
349 | return true; |
350 | |
351 | unsigned EltSize = EltTy.getSizeInBits(); |
352 | return EltSize != 32 && EltSize != 64; |
353 | } |
354 | |
355 | static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { |
356 | const LLT Ty = Query.Types[0]; |
357 | return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && |
358 | !loadStoreBitcastWorkaround(Ty); |
359 | } |
360 | |
361 | |
362 | |
363 | static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, |
364 | const LLT MemTy) { |
365 | const unsigned MemSizeInBits = MemTy.getSizeInBits(); |
366 | const unsigned Size = Ty.getSizeInBits(); |
367 | if (Size != MemSizeInBits) |
368 | return Size <= 32 && Ty.isVector(); |
369 | |
370 | if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) |
371 | return true; |
372 | |
373 | |
374 | return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && |
375 | (Size <= 32 || isRegisterSize(Size)) && |
376 | !isRegisterVectorElementType(Ty.getElementType()); |
377 | } |
378 | |
379 | |
380 | |
381 | |
382 | static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, |
383 | unsigned AlignInBits, unsigned AddrSpace, |
384 | unsigned Opcode) { |
385 | unsigned SizeInBits = MemoryTy.getSizeInBits(); |
386 | |
387 | if (isPowerOf2_32(SizeInBits)) |
388 | return false; |
389 | |
390 | |
391 | |
392 | |
393 | if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) |
394 | return false; |
395 | |
396 | if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) |
397 | return false; |
398 | |
399 | |
400 | |
401 | |
402 | |
403 | unsigned RoundedSize = NextPowerOf2(SizeInBits); |
404 | if (AlignInBits < RoundedSize) |
405 | return false; |
406 | |
407 | |
408 | const SITargetLowering *TLI = ST.getTargetLowering(); |
409 | bool Fast = false; |
410 | return TLI->allowsMisalignedMemoryAccessesImpl( |
411 | RoundedSize, AddrSpace, Align(AlignInBits / 8), |
412 | MachineMemOperand::MOLoad, &Fast) && |
413 | Fast; |
414 | } |
415 | |
416 | static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, |
417 | unsigned Opcode) { |
418 | if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) |
419 | return false; |
420 | |
421 | return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, |
422 | Query.MMODescrs[0].AlignInBits, |
423 | Query.Types[1].getAddressSpace(), Opcode); |
424 | } |
425 | |
426 | AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, |
427 | const GCNTargetMachine &TM) |
428 | : ST(ST_) { |
429 | using namespace TargetOpcode; |
430 | |
431 | auto GetAddrSpacePtr = [&TM](unsigned AS) { |
432 | return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); |
433 | }; |
434 | |
435 | const LLT S1 = LLT::scalar(1); |
436 | const LLT S8 = LLT::scalar(8); |
437 | const LLT S16 = LLT::scalar(16); |
438 | const LLT S32 = LLT::scalar(32); |
439 | const LLT S64 = LLT::scalar(64); |
440 | const LLT S128 = LLT::scalar(128); |
441 | const LLT S256 = LLT::scalar(256); |
442 | const LLT S512 = LLT::scalar(512); |
443 | const LLT MaxScalar = LLT::scalar(MaxRegisterSize); |
444 | |
445 | const LLT V2S8 = LLT::fixed_vector(2, 8); |
446 | const LLT V2S16 = LLT::fixed_vector(2, 16); |
447 | const LLT V4S16 = LLT::fixed_vector(4, 16); |
448 | |
449 | const LLT V2S32 = LLT::fixed_vector(2, 32); |
450 | const LLT V3S32 = LLT::fixed_vector(3, 32); |
451 | const LLT V4S32 = LLT::fixed_vector(4, 32); |
452 | const LLT V5S32 = LLT::fixed_vector(5, 32); |
453 | const LLT V6S32 = LLT::fixed_vector(6, 32); |
454 | const LLT V7S32 = LLT::fixed_vector(7, 32); |
455 | const LLT V8S32 = LLT::fixed_vector(8, 32); |
456 | const LLT V9S32 = LLT::fixed_vector(9, 32); |
457 | const LLT V10S32 = LLT::fixed_vector(10, 32); |
458 | const LLT V11S32 = LLT::fixed_vector(11, 32); |
459 | const LLT V12S32 = LLT::fixed_vector(12, 32); |
460 | const LLT V13S32 = LLT::fixed_vector(13, 32); |
461 | const LLT V14S32 = LLT::fixed_vector(14, 32); |
462 | const LLT V15S32 = LLT::fixed_vector(15, 32); |
463 | const LLT V16S32 = LLT::fixed_vector(16, 32); |
464 | const LLT V32S32 = LLT::fixed_vector(32, 32); |
465 | |
466 | const LLT V2S64 = LLT::fixed_vector(2, 64); |
467 | const LLT V3S64 = LLT::fixed_vector(3, 64); |
468 | const LLT V4S64 = LLT::fixed_vector(4, 64); |
469 | const LLT V5S64 = LLT::fixed_vector(5, 64); |
470 | const LLT V6S64 = LLT::fixed_vector(6, 64); |
471 | const LLT V7S64 = LLT::fixed_vector(7, 64); |
472 | const LLT V8S64 = LLT::fixed_vector(8, 64); |
473 | const LLT V16S64 = LLT::fixed_vector(16, 64); |
474 | |
475 | std::initializer_list<LLT> AllS32Vectors = |
476 | {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, |
477 | V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; |
478 | std::initializer_list<LLT> AllS64Vectors = |
479 | {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; |
480 | |
481 | const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); |
482 | const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); |
483 | const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); |
484 | const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); |
485 | const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); |
486 | const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); |
487 | const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); |
488 | |
489 | const LLT CodePtr = FlatPtr; |
490 | |
491 | const std::initializer_list<LLT> AddrSpaces64 = { |
492 | GlobalPtr, ConstantPtr, FlatPtr |
493 | }; |
494 | |
495 | const std::initializer_list<LLT> AddrSpaces32 = { |
496 | LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr |
497 | }; |
498 | |
499 | const std::initializer_list<LLT> FPTypesBase = { |
500 | S32, S64 |
501 | }; |
502 | |
503 | const std::initializer_list<LLT> FPTypes16 = { |
504 | S32, S64, S16 |
505 | }; |
506 | |
507 | const std::initializer_list<LLT> FPTypesPK16 = { |
508 | S32, S64, S16, V2S16 |
509 | }; |
510 | |
511 | const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; |
512 | |
513 | |
514 | getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); |
515 | |
516 | |
517 | |
518 | getActionDefinitionsBuilder(G_PHI) |
519 | .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) |
520 | .legalFor(AllS32Vectors) |
521 | .legalFor(AllS64Vectors) |
522 | .legalFor(AddrSpaces64) |
523 | .legalFor(AddrSpaces32) |
524 | .legalIf(isPointer(0)) |
525 | .clampScalar(0, S16, S256) |
526 | .widenScalarToNextPow2(0, 32) |
527 | .clampMaxNumElements(0, S32, 16) |
528 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
529 | .scalarize(0); |
530 | |
531 | if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { |
532 | |
533 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) |
534 | .legalFor({S32, S16, V2S16}) |
535 | .clampScalar(0, S16, S32) |
536 | .clampMaxNumElements(0, S16, 2) |
537 | .scalarize(0) |
538 | .widenScalarToNextPow2(0, 32); |
539 | |
540 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) |
541 | .legalFor({S32, S16, V2S16}) |
542 | .minScalarOrElt(0, S16) |
543 | .clampMaxNumElements(0, S16, 2) |
544 | .scalarize(0) |
545 | .widenScalarToNextPow2(0, 32) |
546 | .lower(); |
547 | } else if (ST.has16BitInsts()) { |
548 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) |
549 | .legalFor({S32, S16}) |
550 | .clampScalar(0, S16, S32) |
551 | .scalarize(0) |
552 | .widenScalarToNextPow2(0, 32); |
553 | |
554 | |
555 | |
556 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) |
557 | .legalFor({S32, S16}) |
558 | .minScalar(0, S16) |
559 | .scalarize(0) |
560 | .widenScalarToNextPow2(0, 16) |
561 | .lower(); |
562 | |
563 | |
564 | |
565 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) |
566 | .minScalar(0, S16) |
567 | .scalarize(0) |
568 | .lower(); |
569 | } else { |
570 | getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) |
571 | .legalFor({S32}) |
572 | .clampScalar(0, S32, S32) |
573 | .scalarize(0); |
574 | |
575 | if (ST.hasIntClamp()) { |
576 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) |
577 | .legalFor({S32}) |
578 | .scalarize(0) |
579 | .minScalarOrElt(0, S32) |
580 | .lower(); |
581 | } else { |
582 | |
583 | getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) |
584 | .minScalar(0, S32) |
585 | .scalarize(0) |
586 | .lower(); |
587 | } |
588 | |
589 | |
590 | |
591 | getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) |
592 | .minScalar(0, S32) |
593 | .scalarize(0) |
594 | .lower(); |
595 | } |
596 | |
597 | getActionDefinitionsBuilder( |
598 | {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) |
599 | .customFor({S32, S64}) |
600 | .clampScalar(0, S32, S64) |
601 | .widenScalarToNextPow2(0, 32) |
602 | .scalarize(0); |
603 | |
604 | auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) |
605 | .legalFor({S32}) |
606 | .maxScalarOrElt(0, S32); |
607 | |
608 | if (ST.hasVOP3PInsts()) { |
609 | Mulh |
610 | .clampMaxNumElements(0, S8, 2) |
611 | .lowerFor({V2S8}); |
612 | } |
613 | |
614 | Mulh |
615 | .scalarize(0) |
616 | .lower(); |
617 | |
618 | |
619 | |
620 | getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) |
621 | .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) |
622 | .clampScalar(0, S32, S64) |
623 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
624 | .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) |
625 | .widenScalarToNextPow2(0) |
626 | .scalarize(0); |
627 | |
628 | getActionDefinitionsBuilder({G_UADDO, G_USUBO, |
629 | G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) |
630 | .legalFor({{S32, S1}, {S32, S32}}) |
631 | .minScalar(0, S32) |
632 | |
633 | .lower(); |
634 | |
635 | getActionDefinitionsBuilder(G_BITCAST) |
636 | |
637 | .legalIf(all(isRegisterType(0), isRegisterType(1))) |
638 | .lower(); |
639 | |
640 | |
641 | getActionDefinitionsBuilder(G_CONSTANT) |
642 | .legalFor({S1, S32, S64, S16, GlobalPtr, |
643 | LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) |
644 | .legalIf(isPointer(0)) |
645 | .clampScalar(0, S32, S64) |
646 | .widenScalarToNextPow2(0); |
647 | |
648 | getActionDefinitionsBuilder(G_FCONSTANT) |
649 | .legalFor({S32, S64, S16}) |
650 | .clampScalar(0, S16, S64); |
651 | |
652 | getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) |
653 | .legalIf(isRegisterType(0)) |
654 | |
655 | |
656 | .legalFor({S1, S16}) |
657 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
658 | .clampScalarOrElt(0, S32, MaxScalar) |
659 | .widenScalarToNextPow2(0, 32) |
660 | .clampMaxNumElements(0, S32, 16); |
661 | |
662 | getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); |
663 | |
664 | |
665 | |
666 | getActionDefinitionsBuilder(G_DYN_STACKALLOC) |
667 | .legalFor({{PrivatePtr, S32}}); |
668 | |
669 | getActionDefinitionsBuilder(G_GLOBAL_VALUE) |
670 | .customIf(typeIsNot(0, PrivatePtr)); |
671 | |
672 | getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); |
673 | |
674 | auto &FPOpActions = getActionDefinitionsBuilder( |
675 | { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) |
676 | .legalFor({S32, S64}); |
677 | auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) |
678 | .customFor({S32, S64}); |
679 | auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) |
680 | .customFor({S32, S64}); |
681 | |
682 | if (ST.has16BitInsts()) { |
683 | if (ST.hasVOP3PInsts()) |
684 | FPOpActions.legalFor({S16, V2S16}); |
685 | else |
686 | FPOpActions.legalFor({S16}); |
687 | |
688 | TrigActions.customFor({S16}); |
689 | FDIVActions.customFor({S16}); |
690 | } |
691 | |
692 | auto &MinNumMaxNum = getActionDefinitionsBuilder({ |
693 | G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); |
694 | |
695 | if (ST.hasVOP3PInsts()) { |
696 | MinNumMaxNum.customFor(FPTypesPK16) |
697 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
698 | .clampMaxNumElements(0, S16, 2) |
699 | .clampScalar(0, S16, S64) |
700 | .scalarize(0); |
701 | } else if (ST.has16BitInsts()) { |
702 | MinNumMaxNum.customFor(FPTypes16) |
703 | .clampScalar(0, S16, S64) |
704 | .scalarize(0); |
705 | } else { |
706 | MinNumMaxNum.customFor(FPTypesBase) |
707 | .clampScalar(0, S32, S64) |
708 | .scalarize(0); |
709 | } |
710 | |
711 | if (ST.hasVOP3PInsts()) |
712 | FPOpActions.clampMaxNumElements(0, S16, 2); |
713 | |
714 | FPOpActions |
715 | .scalarize(0) |
716 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); |
717 | |
718 | TrigActions |
719 | .scalarize(0) |
720 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); |
721 | |
722 | FDIVActions |
723 | .scalarize(0) |
724 | .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); |
725 | |
726 | getActionDefinitionsBuilder({G_FNEG, G_FABS}) |
727 | .legalFor(FPTypesPK16) |
728 | .clampMaxNumElements(0, S16, 2) |
729 | .scalarize(0) |
730 | .clampScalar(0, S16, S64); |
731 | |
732 | if (ST.has16BitInsts()) { |
733 | getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) |
734 | .legalFor({S32, S64, S16}) |
735 | .scalarize(0) |
736 | .clampScalar(0, S16, S64); |
737 | } else { |
738 | getActionDefinitionsBuilder(G_FSQRT) |
739 | .legalFor({S32, S64}) |
740 | .scalarize(0) |
741 | .clampScalar(0, S32, S64); |
742 | |
743 | if (ST.hasFractBug()) { |
744 | getActionDefinitionsBuilder(G_FFLOOR) |
745 | .customFor({S64}) |
746 | .legalFor({S32, S64}) |
747 | .scalarize(0) |
748 | .clampScalar(0, S32, S64); |
749 | } else { |
750 | getActionDefinitionsBuilder(G_FFLOOR) |
751 | .legalFor({S32, S64}) |
752 | .scalarize(0) |
753 | .clampScalar(0, S32, S64); |
754 | } |
755 | } |
756 | |
757 | getActionDefinitionsBuilder(G_FPTRUNC) |
758 | .legalFor({{S32, S64}, {S16, S32}}) |
759 | .scalarize(0) |
760 | .lower(); |
761 | |
762 | getActionDefinitionsBuilder(G_FPEXT) |
763 | .legalFor({{S64, S32}, {S32, S16}}) |
764 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) |
765 | .scalarize(0); |
766 | |
767 | getActionDefinitionsBuilder(G_FSUB) |
768 | |
769 | .legalFor({S32}) |
770 | |
771 | .lowerFor({S64, S16, V2S16}) |
772 | .scalarize(0) |
773 | .clampScalar(0, S32, S64); |
774 | |
775 | |
776 | auto &FMad = getActionDefinitionsBuilder(G_FMAD); |
777 | if (ST.hasMadF16() && ST.hasMadMacF32Insts()) |
778 | FMad.customFor({S32, S16}); |
779 | else if (ST.hasMadMacF32Insts()) |
780 | FMad.customFor({S32}); |
781 | else if (ST.hasMadF16()) |
782 | FMad.customFor({S16}); |
783 | FMad.scalarize(0) |
784 | .lower(); |
785 | |
786 | auto &FRem = getActionDefinitionsBuilder(G_FREM); |
787 | if (ST.has16BitInsts()) { |
788 | FRem.customFor({S16, S32, S64}); |
789 | } else { |
790 | FRem.minScalar(0, S32) |
791 | .customFor({S32, S64}); |
792 | } |
793 | FRem.scalarize(0); |
794 | |
795 | |
796 | getActionDefinitionsBuilder(G_TRUNC) |
797 | .legalIf(isScalar(0)) |
798 | .legalFor({{V2S16, V2S32}}) |
799 | .clampMaxNumElements(0, S16, 2) |
800 | |
801 | |
802 | |
803 | .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) |
804 | .alwaysLegal(); |
805 | |
806 | getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) |
807 | .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, |
808 | {S32, S1}, {S64, S1}, {S16, S1}}) |
809 | .scalarize(0) |
810 | .clampScalar(0, S32, S64) |
811 | .widenScalarToNextPow2(1, 32); |
812 | |
813 | |
814 | auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) |
815 | .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) |
816 | .lowerFor({{S32, S64}}) |
817 | .lowerIf(typeIs(1, S1)) |
818 | .customFor({{S64, S64}}); |
819 | if (ST.has16BitInsts()) |
820 | IToFP.legalFor({{S16, S16}}); |
821 | IToFP.clampScalar(1, S32, S64) |
822 | .minScalar(0, S32) |
823 | .scalarize(0) |
824 | .widenScalarToNextPow2(1); |
825 | |
826 | auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) |
827 | .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) |
828 | .customFor({{S64, S32}, {S64, S64}}) |
829 | .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); |
830 | if (ST.has16BitInsts()) |
831 | FPToI.legalFor({{S16, S16}}); |
832 | else |
833 | FPToI.minScalar(1, S32); |
834 | |
835 | FPToI.minScalar(0, S32) |
836 | .widenScalarToNextPow2(0, 32) |
837 | .scalarize(0) |
838 | .lower(); |
839 | |
840 | |
841 | getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) |
842 | .scalarize(0) |
843 | .lower(); |
844 | |
845 | if (ST.has16BitInsts()) { |
846 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) |
847 | .legalFor({S16, S32, S64}) |
848 | .clampScalar(0, S16, S64) |
849 | .scalarize(0); |
850 | } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { |
851 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) |
852 | .legalFor({S32, S64}) |
853 | .clampScalar(0, S32, S64) |
854 | .scalarize(0); |
855 | } else { |
856 | getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) |
857 | .legalFor({S32}) |
858 | .customFor({S64}) |
859 | .clampScalar(0, S32, S64) |
860 | .scalarize(0); |
861 | } |
862 | |
863 | getActionDefinitionsBuilder(G_PTR_ADD) |
864 | .legalIf(all(isPointer(0), sameSize(0, 1))) |
865 | .scalarize(0) |
866 | .scalarSameSizeAs(1, 0); |
867 | |
868 | getActionDefinitionsBuilder(G_PTRMASK) |
869 | .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) |
870 | .scalarSameSizeAs(1, 0) |
871 | .scalarize(0); |
872 | |
873 | auto &CmpBuilder = |
874 | getActionDefinitionsBuilder(G_ICMP) |
875 | |
876 | |
877 | |
878 | |
879 | |
880 | |
881 | |
882 | |
883 | |
884 | |
885 | .legalForCartesianProduct( |
886 | {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) |
887 | .legalForCartesianProduct( |
888 | {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); |
889 | if (ST.has16BitInsts()) { |
890 | CmpBuilder.legalFor({{S1, S16}}); |
891 | } |
892 | |
893 | CmpBuilder |
894 | .widenScalarToNextPow2(1) |
895 | .clampScalar(1, S32, S64) |
896 | .scalarize(0) |
897 | .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); |
898 | |
899 | getActionDefinitionsBuilder(G_FCMP) |
900 | .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) |
901 | .widenScalarToNextPow2(1) |
902 | .clampScalar(1, S32, S64) |
903 | .scalarize(0); |
904 | |
905 | |
906 | auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); |
907 | if (ST.has16BitInsts()) |
908 | Exp2Ops.legalFor({S32, S16}); |
909 | else |
910 | Exp2Ops.legalFor({S32}); |
911 | Exp2Ops.clampScalar(0, MinScalarFPTy, S32); |
912 | Exp2Ops.scalarize(0); |
913 | |
914 | auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); |
915 | if (ST.has16BitInsts()) |
916 | ExpOps.customFor({{S32}, {S16}}); |
917 | else |
918 | ExpOps.customFor({S32}); |
919 | ExpOps.clampScalar(0, MinScalarFPTy, S32) |
920 | .scalarize(0); |
921 | |
922 | getActionDefinitionsBuilder(G_FPOWI) |
923 | .clampScalar(0, MinScalarFPTy, S32) |
924 | .lower(); |
925 | |
926 | |
927 | getActionDefinitionsBuilder(G_CTPOP) |
928 | .legalFor({{S32, S32}, {S32, S64}}) |
929 | .clampScalar(0, S32, S32) |
930 | .clampScalar(1, S32, S64) |
931 | .scalarize(0) |
932 | .widenScalarToNextPow2(0, 32) |
933 | .widenScalarToNextPow2(1, 32); |
934 | |
935 | |
936 | |
937 | |
938 | getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) |
939 | .scalarize(0) |
940 | .clampScalar(0, S32, S32) |
941 | .clampScalar(1, S32, S64) |
942 | .widenScalarToNextPow2(0, 32) |
943 | .widenScalarToNextPow2(1, 32) |
944 | .lower(); |
945 | |
946 | |
947 | getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) |
948 | .legalFor({{S32, S32}, {S32, S64}}) |
949 | .clampScalar(0, S32, S32) |
950 | .clampScalar(1, S32, S64) |
951 | .scalarize(0) |
952 | .widenScalarToNextPow2(0, 32) |
953 | .widenScalarToNextPow2(1, 32); |
954 | |
955 | |
956 | |
957 | getActionDefinitionsBuilder(G_BITREVERSE) |
958 | .legalFor({S32, S64}) |
959 | .clampScalar(0, S32, S64) |
960 | .scalarize(0) |
961 | .widenScalarToNextPow2(0); |
962 | |
963 | if (ST.has16BitInsts()) { |
964 | getActionDefinitionsBuilder(G_BSWAP) |
965 | .legalFor({S16, S32, V2S16}) |
966 | .clampMaxNumElements(0, S16, 2) |
967 | |
968 | |
969 | .widenScalarToNextPow2(0) |
970 | .clampScalar(0, S16, S32) |
971 | .scalarize(0); |
972 | |
973 | if (ST.hasVOP3PInsts()) { |
974 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) |
975 | .legalFor({S32, S16, V2S16}) |
976 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
977 | .clampMaxNumElements(0, S16, 2) |
978 | .minScalar(0, S16) |
979 | .widenScalarToNextPow2(0) |
980 | .scalarize(0) |
981 | .lower(); |
982 | } else { |
983 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) |
984 | .legalFor({S32, S16}) |
985 | .widenScalarToNextPow2(0) |
986 | .minScalar(0, S16) |
987 | .scalarize(0) |
988 | .lower(); |
989 | } |
990 | } else { |
991 | |
992 | getActionDefinitionsBuilder(G_BSWAP) |
993 | .legalFor({S32}) |
994 | .lowerIf(scalarNarrowerThan(0, 32)) |
995 | |
996 | |
997 | .widenScalarToNextPow2(0) |
998 | .maxScalar(0, S32) |
999 | .scalarize(0) |
1000 | .lower(); |
1001 | |
1002 | getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) |
1003 | .legalFor({S32}) |
1004 | .minScalar(0, S32) |
1005 | .widenScalarToNextPow2(0) |
1006 | .scalarize(0) |
1007 | .lower(); |
1008 | } |
1009 | |
1010 | getActionDefinitionsBuilder(G_INTTOPTR) |
1011 | |
1012 | .legalForCartesianProduct(AddrSpaces64, {S64}) |
1013 | .legalForCartesianProduct(AddrSpaces32, {S32}) |
1014 | .scalarize(0) |
1015 | |
1016 | .legalIf(sameSize(0, 1)) |
1017 | .widenScalarIf(smallerThan(1, 0), |
1018 | [](const LegalityQuery &Query) { |
1019 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); |
1020 | }) |
1021 | .narrowScalarIf(largerThan(1, 0), |
1022 | [](const LegalityQuery &Query) { |
1023 | return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); |
1024 | }); |
1025 | |
1026 | getActionDefinitionsBuilder(G_PTRTOINT) |
1027 | |
1028 | .legalForCartesianProduct(AddrSpaces64, {S64}) |
1029 | .legalForCartesianProduct(AddrSpaces32, {S32}) |
1030 | .scalarize(0) |
1031 | |
1032 | .legalIf(sameSize(0, 1)) |
1033 | .widenScalarIf(smallerThan(0, 1), |
1034 | [](const LegalityQuery &Query) { |
1035 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); |
1036 | }) |
1037 | .narrowScalarIf( |
1038 | largerThan(0, 1), |
1039 | [](const LegalityQuery &Query) { |
1040 | return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); |
1041 | }); |
1042 | |
1043 | getActionDefinitionsBuilder(G_ADDRSPACE_CAST) |
1044 | .scalarize(0) |
1045 | .custom(); |
1046 | |
1047 | const auto needToSplitMemOp = [=](const LegalityQuery &Query, |
1048 | bool IsLoad) -> bool { |
1049 | const LLT DstTy = Query.Types[0]; |
1050 | |
1051 | |
1052 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); |
1053 | unsigned AlignBits = Query.MMODescrs[0].AlignInBits; |
1054 | |
1055 | if (MemSize < DstTy.getSizeInBits()) |
1056 | MemSize = std::max(MemSize, AlignBits); |
1057 | |
1058 | if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) |
1059 | return true; |
1060 | |
1061 | const LLT PtrTy = Query.Types[1]; |
1062 | unsigned AS = PtrTy.getAddressSpace(); |
1063 | if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) |
1064 | return true; |
1065 | |
1066 | |
1067 | |
1068 | unsigned NumRegs = (MemSize + 31) / 32; |
1069 | if (NumRegs == 3) { |
1070 | if (!ST.hasDwordx3LoadStores()) |
1071 | return true; |
1072 | } else { |
1073 | |
1074 | if (!isPowerOf2_32(NumRegs)) |
1075 | return true; |
1076 | } |
1077 | |
1078 | if (AlignBits < MemSize) { |
1079 | const SITargetLowering *TLI = ST.getTargetLowering(); |
1080 | return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, |
1081 | Align(AlignBits / 8)); |
1082 | } |
1083 | |
1084 | return false; |
1085 | }; |
1086 | |
1087 | unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; |
1088 | unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; |
1089 | unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; |
1090 | |
1091 | |
1092 | |
1093 | |
1094 | |
1095 | for (unsigned Op : {G_LOAD, G_STORE}) { |
1096 | const bool IsStore = Op == G_STORE; |
1097 | |
1098 | auto &Actions = getActionDefinitionsBuilder(Op); |
1099 | |
1100 | |
1101 | Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, |
1102 | {V2S32, GlobalPtr, V2S32, GlobalAlign32}, |
1103 | {V4S32, GlobalPtr, V4S32, GlobalAlign32}, |
1104 | {S64, GlobalPtr, S64, GlobalAlign32}, |
1105 | {V2S64, GlobalPtr, V2S64, GlobalAlign32}, |
1106 | {V2S16, GlobalPtr, V2S16, GlobalAlign32}, |
1107 | {S32, GlobalPtr, S8, GlobalAlign8}, |
1108 | {S32, GlobalPtr, S16, GlobalAlign16}, |
1109 | |
1110 | {S32, LocalPtr, S32, 32}, |
1111 | {S64, LocalPtr, S64, 32}, |
1112 | {V2S32, LocalPtr, V2S32, 32}, |
1113 | {S32, LocalPtr, S8, 8}, |
1114 | {S32, LocalPtr, S16, 16}, |
1115 | {V2S16, LocalPtr, S32, 32}, |
1116 | |
1117 | {S32, PrivatePtr, S32, 32}, |
1118 | {S32, PrivatePtr, S8, 8}, |
1119 | {S32, PrivatePtr, S16, 16}, |
1120 | {V2S16, PrivatePtr, S32, 32}, |
1121 | |
1122 | {S32, ConstantPtr, S32, GlobalAlign32}, |
1123 | {V2S32, ConstantPtr, V2S32, GlobalAlign32}, |
1124 | {V4S32, ConstantPtr, V4S32, GlobalAlign32}, |
1125 | {S64, ConstantPtr, S64, GlobalAlign32}, |
1126 | {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); |
1127 | Actions.legalIf( |
1128 | [=](const LegalityQuery &Query) -> bool { |
1129 | return isLoadStoreLegal(ST, Query); |
1130 | }); |
1131 | |
1132 | |
1133 | |
1134 | |
1135 | |
1136 | |
1137 | Actions.customIf(typeIs(1, Constant32Ptr)); |
1138 | |
1139 | |
1140 | |
1141 | |
1142 | |
1143 | |
1144 | |
1145 | Actions.bitcastIf( |
1146 | [=](const LegalityQuery &Query) -> bool { |
1147 | return shouldBitcastLoadStoreType(ST, Query.Types[0], |
1148 | Query.MMODescrs[0].MemoryTy); |
1149 | }, bitcastToRegisterType(0)); |
1150 | |
1151 | if (!IsStore) { |
1152 | |
1153 | |
1154 | Actions.customIf([=](const LegalityQuery &Query) -> bool { |
1155 | return shouldWidenLoad(ST, Query, G_LOAD); |
1156 | }); |
1157 | } |
1158 | |
1159 | |
1160 | Actions |
1161 | .narrowScalarIf( |
1162 | [=](const LegalityQuery &Query) -> bool { |
1163 | return !Query.Types[0].isVector() && |
1164 | needToSplitMemOp(Query, Op == G_LOAD); |
1165 | }, |
1166 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { |
1167 | const LLT DstTy = Query.Types[0]; |
1168 | const LLT PtrTy = Query.Types[1]; |
1169 | |
1170 | const unsigned DstSize = DstTy.getSizeInBits(); |
1171 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); |
1172 | |
1173 | |
1174 | if (DstSize > MemSize) |
1175 | return std::make_pair(0, LLT::scalar(MemSize)); |
1176 | |
1177 | if (!isPowerOf2_32(DstSize)) { |
1178 | |
1179 | |
1180 | |
1181 | unsigned FloorSize = PowerOf2Floor(DstSize); |
1182 | return std::make_pair(0, LLT::scalar(FloorSize)); |
1183 | } |
1184 | |
1185 | if (DstSize > 32 && (DstSize % 32 != 0)) { |
1186 | |
1187 | |
1188 | return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); |
1189 | } |
1190 | |
1191 | unsigned MaxSize = maxSizeForAddrSpace(ST, |
1192 | PtrTy.getAddressSpace(), |
1193 | Op == G_LOAD); |
1194 | if (MemSize > MaxSize) |
1195 | return std::make_pair(0, LLT::scalar(MaxSize)); |
1196 | |
1197 | unsigned Align = Query.MMODescrs[0].AlignInBits; |
1198 | return std::make_pair(0, LLT::scalar(Align)); |
1199 | }) |
1200 | .fewerElementsIf( |
1201 | [=](const LegalityQuery &Query) -> bool { |
1202 | return Query.Types[0].isVector() && |
1203 | needToSplitMemOp(Query, Op == G_LOAD); |
1204 | }, |
1205 | [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { |
1206 | const LLT DstTy = Query.Types[0]; |
1207 | const LLT PtrTy = Query.Types[1]; |
1208 | |
1209 | LLT EltTy = DstTy.getElementType(); |
1210 | unsigned MaxSize = maxSizeForAddrSpace(ST, |
1211 | PtrTy.getAddressSpace(), |
1212 | Op == G_LOAD); |
1213 | |
1214 | |
1215 | |
1216 | |
1217 | |
1218 | |
1219 | unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); |
1220 | if (MemSize > MaxSize) { |
1221 | unsigned NumElts = DstTy.getNumElements(); |
1222 | unsigned EltSize = EltTy.getSizeInBits(); |
1223 | |
1224 | if (MaxSize % EltSize == 0) { |
1225 | return std::make_pair( |
1226 | 0, LLT::scalarOrVector( |
1227 | ElementCount::getFixed(MaxSize / EltSize), EltTy)); |
1228 | } |
1229 | |
1230 | unsigned NumPieces = MemSize / MaxSize; |
1231 | |
1232 | |
1233 | |
1234 | if (NumPieces == 1 || NumPieces >= NumElts || |
1235 | NumElts % NumPieces != 0) |
1236 | return std::make_pair(0, EltTy); |
1237 | |
1238 | return std::make_pair( |
1239 | 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); |
1240 | } |
1241 | |
1242 | |
1243 | if (DstTy.getSizeInBits() > MemSize) |
1244 | return std::make_pair(0, EltTy); |
1245 | |
1246 | unsigned EltSize = EltTy.getSizeInBits(); |
1247 | unsigned DstSize = DstTy.getSizeInBits(); |
1248 | if (!isPowerOf2_32(DstSize)) { |
1249 | |
1250 | |
1251 | |
1252 | unsigned FloorSize = PowerOf2Floor(DstSize); |
1253 | return std::make_pair( |
1254 | 0, LLT::scalarOrVector( |
1255 | ElementCount::getFixed(FloorSize / EltSize), EltTy)); |
1256 | } |
1257 | |
1258 | |
1259 | unsigned Align = Query.MMODescrs[0].AlignInBits; |
1260 | if (EltSize > Align && |
1261 | (EltSize / Align < DstTy.getNumElements())) { |
1262 | return std::make_pair( |
1263 | 0, LLT::fixed_vector(EltSize / Align, EltTy)); |
1264 | } |
1265 | |
1266 | |
1267 | return std::make_pair(0, EltTy); |
1268 | }) |
1269 | .lowerIfMemSizeNotPow2() |
1270 | .minScalar(0, S32) |
1271 | .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) |
1272 | .widenScalarToNextPow2(0) |
1273 | .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) |
1274 | .lower(); |
1275 | } |
1276 | |
1277 | |
1278 | auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) |
1279 | .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, |
1280 | {S32, GlobalPtr, S16, 2 * 8}, |
1281 | {S32, LocalPtr, S8, 8}, |
1282 | {S32, LocalPtr, S16, 16}, |
1283 | {S32, PrivatePtr, S8, 8}, |
1284 | {S32, PrivatePtr, S16, 16}, |
1285 | {S32, ConstantPtr, S8, 8}, |
1286 | {S32, ConstantPtr, S16, 2 * 8}}) |
1287 | .legalIf( |
1288 | [=](const LegalityQuery &Query) -> bool { |
1289 | return isLoadStoreLegal(ST, Query); |
1290 | }); |
1291 | |
1292 | if (ST.hasFlatAddressSpace()) { |
1293 | ExtLoads.legalForTypesWithMemDesc( |
1294 | {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); |
1295 | } |
1296 | |
1297 | |
1298 | |
1299 | |
1300 | |
1301 | |
1302 | ExtLoads.customIf(typeIs(1, Constant32Ptr)); |
1303 | |
1304 | ExtLoads.clampScalar(0, S32, S32) |
1305 | .widenScalarToNextPow2(0) |
1306 | .lower(); |
1307 | |
1308 | auto &Atomics = getActionDefinitionsBuilder( |
1309 | {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, |
1310 | G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, |
1311 | G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, |
1312 | G_ATOMICRMW_UMIN}) |
1313 | .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, |
1314 | {S64, GlobalPtr}, {S64, LocalPtr}, |
1315 | {S32, RegionPtr}, {S64, RegionPtr}}); |
1316 | if (ST.hasFlatAddressSpace()) { |
1317 | Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); |
1318 | } |
1319 | |
1320 | auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); |
1321 | if (ST.hasLDSFPAtomics()) { |
1322 | Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); |
1323 | if (ST.hasGFX90AInsts()) |
1324 | Atomic.legalFor({{S64, LocalPtr}}); |
1325 | } |
1326 | if (ST.hasAtomicFaddInsts()) |
1327 | Atomic.legalFor({{S32, GlobalPtr}}); |
1328 | |
1329 | |
1330 | |
1331 | getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) |
1332 | .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, |
1333 | {S32, FlatPtr}, {S64, FlatPtr}}) |
1334 | .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, |
1335 | {S32, RegionPtr}, {S64, RegionPtr}}); |
1336 | |
1337 | |
1338 | |
1339 | getActionDefinitionsBuilder(G_SELECT) |
1340 | .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, |
1341 | LocalPtr, FlatPtr, PrivatePtr, |
1342 | LLT::fixed_vector(2, LocalPtr), |
1343 | LLT::fixed_vector(2, PrivatePtr)}, |
1344 | {S1, S32}) |
1345 | .clampScalar(0, S16, S64) |
1346 | .scalarize(1) |
1347 | .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) |
1348 | .fewerElementsIf(numElementsNotEven(0), scalarize(0)) |
1349 | .clampMaxNumElements(0, S32, 2) |
1350 | .clampMaxNumElements(0, LocalPtr, 2) |
1351 | .clampMaxNumElements(0, PrivatePtr, 2) |
1352 | .scalarize(0) |
1353 | .widenScalarToNextPow2(0) |
1354 | .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); |
1355 | |
1356 | |
1357 | |
1358 | auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) |
1359 | .legalFor({{S32, S32}, {S64, S32}}); |
1360 | if (ST.has16BitInsts()) { |
1361 | if (ST.hasVOP3PInsts()) { |
1362 | Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) |
1363 | .clampMaxNumElements(0, S16, 2); |
1364 | } else |
1365 | Shifts.legalFor({{S16, S16}}); |
1366 | |
1367 | |
1368 | Shifts.widenScalarIf( |
1369 | [=](const LegalityQuery &Query) { |
1370 | |
1371 | |
1372 | const LLT ValTy = Query.Types[0]; |
1373 | const LLT AmountTy = Query.Types[1]; |
1374 | return ValTy.getSizeInBits() <= 16 && |
1375 | AmountTy.getSizeInBits() < 16; |
1376 | }, changeTo(1, S16)); |
1377 | Shifts.maxScalarIf(typeIs(0, S16), 1, S16); |
1378 | Shifts.clampScalar(1, S32, S32); |
1379 | Shifts.clampScalar(0, S16, S64); |
1380 | Shifts.widenScalarToNextPow2(0, 16); |
1381 | |
1382 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) |
1383 | .minScalar(0, S16) |
1384 | .scalarize(0) |
1385 | .lower(); |
1386 | } else { |
1387 | |
1388 | |
1389 | |
1390 | Shifts.clampScalar(1, S32, S32); |
1391 | Shifts.clampScalar(0, S32, S64); |
1392 | Shifts.widenScalarToNextPow2(0, 32); |
1393 | |
1394 | getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) |
1395 | .minScalar(0, S32) |
1396 | .scalarize(0) |
1397 | .lower(); |
1398 | } |
1399 | Shifts.scalarize(0); |
1400 | |
1401 | for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { |
1402 | unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; |
1403 | unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; |
1404 | unsigned IdxTypeIdx = 2; |
1405 | |
1406 | getActionDefinitionsBuilder(Op) |
1407 | .customIf([=](const LegalityQuery &Query) { |
1408 | const LLT EltTy = Query.Types[EltTypeIdx]; |
1409 | const LLT VecTy = Query.Types[VecTypeIdx]; |
1410 | const LLT IdxTy = Query.Types[IdxTypeIdx]; |
1411 | const unsigned EltSize = EltTy.getSizeInBits(); |
1412 | return (EltSize == 32 || EltSize == 64) && |
1413 | VecTy.getSizeInBits() % 32 == 0 && |
1414 | VecTy.getSizeInBits() <= MaxRegisterSize && |
1415 | IdxTy.getSizeInBits() == 32; |
1416 | }) |
1417 | .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), |
1418 | bitcastToVectorElement32(VecTypeIdx)) |
1419 | |
1420 | .bitcastIf( |
1421 | all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), |
1422 | [=](const LegalityQuery &Query) { |
1423 | |
1424 | |
1425 | |
1426 | const LLT EltTy = Query.Types[EltTypeIdx]; |
1427 | const LLT VecTy = Query.Types[VecTypeIdx]; |
1428 | const unsigned DstEltSize = EltTy.getSizeInBits(); |
1429 | const unsigned VecSize = VecTy.getSizeInBits(); |
1430 | |
1431 | const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; |
1432 | return std::make_pair( |
1433 | VecTypeIdx, |
1434 | LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); |
1435 | }) |
1436 | .clampScalar(EltTypeIdx, S32, S64) |
1437 | .clampScalar(VecTypeIdx, S32, S64) |
1438 | .clampScalar(IdxTypeIdx, S32, S32) |
1439 | .clampMaxNumElements(VecTypeIdx, S32, 32) |
1440 | |
1441 | |
1442 | |
1443 | .lower(); |
1444 | } |
1445 | |
1446 | getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) |
1447 | .unsupportedIf([=](const LegalityQuery &Query) { |
1448 | const LLT &EltTy = Query.Types[1].getElementType(); |
1449 | return Query.Types[0] != EltTy; |
1450 | }); |
1451 | |
1452 | for (unsigned Op : {G_EXTRACT, G_INSERT}) { |
1453 | unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; |
1454 | unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; |
1455 | |
1456 | |
1457 | getActionDefinitionsBuilder(Op) |
1458 | .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) |
1459 | |
1460 | .legalIf([=](const LegalityQuery &Query) { |
1461 | const LLT BigTy = Query.Types[BigTyIdx]; |
1462 | const LLT LitTy = Query.Types[LitTyIdx]; |
1463 | return (BigTy.getSizeInBits() % 32 == 0) && |
1464 | (LitTy.getSizeInBits() % 16 == 0); |
1465 | }) |
1466 | .widenScalarIf( |
1467 | [=](const LegalityQuery &Query) { |
1468 | const LLT BigTy = Query.Types[BigTyIdx]; |
1469 | return (BigTy.getScalarSizeInBits() < 16); |
1470 | }, |
1471 | LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) |
1472 | .widenScalarIf( |
1473 | [=](const LegalityQuery &Query) { |
1474 | const LLT LitTy = Query.Types[LitTyIdx]; |
1475 | return (LitTy.getScalarSizeInBits() < 16); |
1476 | }, |
1477 | LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) |
1478 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) |
1479 | .widenScalarToNextPow2(BigTyIdx, 32); |
1480 | |
1481 | } |
1482 | |
1483 | auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) |
1484 | .legalForCartesianProduct(AllS32Vectors, {S32}) |
1485 | .legalForCartesianProduct(AllS64Vectors, {S64}) |
1486 | .clampNumElements(0, V16S32, V32S32) |
1487 | .clampNumElements(0, V2S64, V16S64) |
1488 | .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); |
1489 | |
1490 | if (ST.hasScalarPackInsts()) { |
1491 | BuildVector |
1492 | |
1493 | .minScalarOrElt(0, S16) |
1494 | |
1495 | .minScalar(1, S32); |
1496 | |
1497 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) |
1498 | .legalFor({V2S16, S32}) |
1499 | .lower(); |
1500 | BuildVector.minScalarOrElt(0, S32); |
1501 | } else { |
1502 | BuildVector.customFor({V2S16, S16}); |
1503 | BuildVector.minScalarOrElt(0, S32); |
1504 | |
1505 | getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) |
1506 | .customFor({V2S16, S32}) |
1507 | .lower(); |
1508 | } |
1509 | |
1510 | BuildVector.legalIf(isRegisterType(0)); |
1511 | |
1512 | |
1513 | getActionDefinitionsBuilder(G_CONCAT_VECTORS) |
1514 | .legalIf(all(isRegisterType(0), isRegisterType(1))) |
1515 | .clampMaxNumElements(0, S32, 32) |
1516 | .clampMaxNumElements(1, S16, 2) |
1517 | .clampMaxNumElements(0, S16, 64); |
1518 | |
1519 | |
1520 | |
1521 | if (ST.hasVOP3PInsts()) { |
1522 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) |
1523 | .customFor({V2S16, V2S16}) |
1524 | .lower(); |
1525 | } else |
1526 | getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); |
1527 | |
1528 | |
1529 | for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { |
1530 | unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; |
1531 | unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; |
1532 | |
1533 | auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { |
1534 | const LLT Ty = Query.Types[TypeIdx]; |
1535 | if (Ty.isVector()) { |
1536 | const LLT &EltTy = Ty.getElementType(); |
1537 | if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) |
1538 | return true; |
1539 | if (!isPowerOf2_32(EltTy.getSizeInBits())) |
1540 | return true; |
1541 | } |
1542 | return false; |
1543 | }; |
1544 | |
1545 | auto &Builder = getActionDefinitionsBuilder(Op) |
1546 | .legalIf(all(isRegisterType(0), isRegisterType(1))) |
1547 | .lowerFor({{S16, V2S16}}) |
1548 | .lowerIf([=](const LegalityQuery &Query) { |
1549 | const LLT BigTy = Query.Types[BigTyIdx]; |
1550 | return BigTy.getSizeInBits() == 32; |
1551 | }) |
1552 | |
1553 | |
1554 | .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) |
1555 | .widenScalarToNextPow2(LitTyIdx, 16) |
1556 | .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) |
1557 | .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), |
1558 | elementTypeIs(1, S16)), |
1559 | changeTo(1, V2S16)) |
1560 | |
1561 | |
1562 | |
1563 | .clampScalar(LitTyIdx, S32, S512) |
1564 | .widenScalarToNextPow2(LitTyIdx, 32) |
1565 | |
1566 | .fewerElementsIf( |
1567 | [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, |
1568 | scalarize(0)) |
1569 | .fewerElementsIf( |
1570 | [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, |
1571 | scalarize(1)) |
1572 | .clampScalar(BigTyIdx, S32, MaxScalar); |
1573 | |
1574 | if (Op == G_MERGE_VALUES) { |
1575 | Builder.widenScalarIf( |
1576 | |
1577 | [=](const LegalityQuery &Query) { |
1578 | const LLT Ty = Query.Types[LitTyIdx]; |
1579 | return Ty.getSizeInBits() < 32; |
1580 | }, |
1581 | changeTo(LitTyIdx, S32)); |
1582 | } |
1583 | |
1584 | Builder.widenScalarIf( |
1585 | [=](const LegalityQuery &Query) { |
1586 | const LLT Ty = Query.Types[BigTyIdx]; |
1587 | return !isPowerOf2_32(Ty.getSizeInBits()) && |
1588 | Ty.getSizeInBits() % 16 != 0; |
1589 | }, |
1590 | [=](const LegalityQuery &Query) { |
1591 | |
1592 | |
1593 | const LLT &Ty = Query.Types[BigTyIdx]; |
1594 | unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); |
1595 | if (NewSizeInBits >= 256) { |
1596 | unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); |
1597 | if (RoundedTo < NewSizeInBits) |
1598 | NewSizeInBits = RoundedTo; |
1599 | } |
1600 | return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); |
1601 | }) |
1602 | |
1603 | .scalarize(0) |
1604 | .scalarize(1); |
1605 | } |
1606 | |
1607 | |
1608 | |
1609 | auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) |
1610 | .legalFor({{S32}, {S64}}); |
1611 | |
1612 | if (ST.hasVOP3PInsts()) { |
1613 | SextInReg.lowerFor({{V2S16}}) |
1614 | |
1615 | |
1616 | |
1617 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); |
1618 | } else if (ST.has16BitInsts()) { |
1619 | SextInReg.lowerFor({{S32}, {S64}, {S16}}); |
1620 | } else { |
1621 | |
1622 | |
1623 | SextInReg.lowerFor({{S32}, {S64}}); |
1624 | } |
1625 | |
1626 | SextInReg |
1627 | .scalarize(0) |
1628 | .clampScalar(0, S32, S64) |
1629 | .lower(); |
1630 | |
1631 | |
1632 | getActionDefinitionsBuilder(G_FSHR) |
1633 | .legalFor({{S32, S32}}) |
1634 | .lowerFor({{V2S16, V2S16}}) |
1635 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) |
1636 | .scalarize(0) |
1637 | .lower(); |
1638 | |
1639 | if (ST.hasVOP3PInsts()) { |
1640 | getActionDefinitionsBuilder(G_FSHL) |
1641 | .lowerFor({{V2S16, V2S16}}) |
1642 | .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) |
1643 | .scalarize(0) |
1644 | .lower(); |
1645 | } else { |
1646 | getActionDefinitionsBuilder(G_FSHL) |
1647 | .scalarize(0) |
1648 | .lower(); |
1649 | } |
1650 | |
1651 | getActionDefinitionsBuilder(G_READCYCLECOUNTER) |
1652 | .legalFor({S64}); |
1653 | |
1654 | getActionDefinitionsBuilder(G_FENCE) |
1655 | .alwaysLegal(); |
1656 | |
1657 | getActionDefinitionsBuilder({G_SMULO, G_UMULO}) |
1658 | .scalarize(0) |
1659 | .minScalar(0, S32) |
1660 | .lower(); |
1661 | |
1662 | getActionDefinitionsBuilder({G_SBFX, G_UBFX}) |
1663 | .legalFor({{S32, S32}, {S64, S32}}) |
1664 | .clampScalar(1, S32, S32) |
1665 | .clampScalar(0, S32, S64) |
1666 | .widenScalarToNextPow2(0) |
1667 | .scalarize(0); |
1668 | |
1669 | getActionDefinitionsBuilder({ |
1670 | |
1671 | G_FCOPYSIGN, |
1672 | |
1673 | G_ATOMIC_CMPXCHG_WITH_SUCCESS, |
1674 | G_ATOMICRMW_NAND, |
1675 | G_ATOMICRMW_FSUB, |
1676 | G_READ_REGISTER, |
1677 | G_WRITE_REGISTER, |
1678 | |
1679 | G_SADDO, G_SSUBO, |
1680 | |
1681 | |
1682 | G_FMINIMUM, G_FMAXIMUM}).lower(); |
1683 | |
1684 | getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, |
1685 | G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, |
1686 | G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) |
1687 | .unsupported(); |
1688 | |
1689 | getLegacyLegalizerInfo().computeTables(); |
1690 | verify(*ST.getInstrInfo()); |
1691 | } |
1692 | |
/// Dispatch entry point for all operations the legalizer ruleset marked as
/// Custom for this target.
///
/// \param Helper  LegalizerHelper supplying the MachineIRBuilder (and MRI)
///                positioned at \p MI; a few cases (min/max, loads) take the
///                whole helper because they need more of its services.
/// \param MI      The instruction to legalize; it is mutated/erased by the
///                per-opcode legalize* member routines.
/// \return true if the instruction was successfully legalized (or replaced),
///         false for any opcode this hook does not handle — the default case
///         below — which the caller treats as a legalization failure.
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  // Builder/MRI are extracted once here; every per-opcode routine below
  // receives the same (MI, MRI, B) triple unless noted otherwise.
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // Signed and unsigned int<->fp conversions share one implementation each;
  // the trailing bool selects the signed variant.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  // All four min/max flavors funnel into one routine, which takes the full
  // Helper rather than the (MRI, B) pair.
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  // Sin and cos share one legalization path.
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  // Plain and extending loads all go through legalizeLoad (Helper-based).
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  // Unsigned division-family opcodes (including the fused divrem) share one
  // routine; likewise for the signed family below.
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // log/log10 are both lowered via log2 with a constant scale factor:
  // log(x) = log2(x) * ln(2), log10(x) = log2(x) * (ln(2)/ln(10)).
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    // Not an opcode this target customizes; report failure to the caller.
    return false;
  }

  // Every case above returns, so falling out of the switch is a logic error.
  llvm_unreachable("expected switch to return");
}
1769 | |
/// Materialize an s32 value holding the aperture (high half of the flat
/// address) for the given segment address space.
///
/// \p AS must be LOCAL_ADDRESS or PRIVATE_ADDRESS (asserted below).
/// Returns an invalid Register on failure (when the queue pointer argument
/// cannot be loaded).
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Read the aperture field directly from the MEM_BASES hardware register
    // with S_GETREG_B32, selecting the shared or private base field.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack register id, bit offset and width-minus-one into the s_getreg
    // immediate encoding.
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // Shift the extracted field up so the base occupies the high bits of the
    // 32-bit aperture value.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // No aperture registers on this subtarget: read the aperture out of the
  // queue descriptor instead, via the QUEUE_PTR function argument.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Byte offset of the aperture word inside the queue structure: 0x40 for
  // the shared (LOCAL) aperture, 0x44 for the private one.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // The queue descriptor is constant and dereferenceable, so the load can be
  // marked invariant.
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
1828 | |
/// Custom-legalize G_ADDRSPACE_CAST between the AMDGPU address spaces.
/// Handles no-op casts, 32-bit-constant casts, flat->segment truncation and
/// segment->flat extension (using the segment aperture); returns false for
/// combinations it cannot lower.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // Vector-of-pointer casts are expected to have been scalarized already.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts the target considers no-ops become plain bitcasts in place.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  // To a 32-bit constant pointer: keep only the low 32 bits of the source.
  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  // From a 32-bit constant pointer: widen by merging in the known high bits
  // of the 32-bit constant address window.
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  // Flat -> local/private: truncate to the low 32 bits, but map the flat
  // null pointer to the segment's null value.
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  // Remaining supported case is segment -> flat; anything else fails.
  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  // High half of the flat address comes from the segment aperture.
  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // Build {low = segment offset, high = aperture}; null maps to flat null.
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
1924 | |
1925 | bool AMDGPULegalizerInfo::legalizeFrint( |
1926 | MachineInstr &MI, MachineRegisterInfo &MRI, |
1927 | MachineIRBuilder &B) const { |
1928 | Register Src = MI.getOperand(1).getReg(); |
1929 | LLT Ty = MRI.getType(Src); |
1930 | assert(Ty.isScalar() && Ty.getSizeInBits() == 64); |
1931 | |
1932 | APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); |
1933 | APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); |
1934 | |
1935 | auto C1 = B.buildFConstant(Ty, C1Val); |
1936 | auto CopySign = B.buildFCopysign(Ty, C1, Src); |
1937 | |
1938 | |
1939 | auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); |
1940 | auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); |
1941 | |
1942 | auto C2 = B.buildFConstant(Ty, C2Val); |
1943 | auto Fabs = B.buildFAbs(Ty, Src); |
1944 | |
1945 | auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); |
1946 | B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); |
1947 | MI.eraseFromParent(); |
1948 | return true; |
1949 | } |
1950 | |
1951 | bool AMDGPULegalizerInfo::legalizeFceil( |
1952 | MachineInstr &MI, MachineRegisterInfo &MRI, |
1953 | MachineIRBuilder &B) const { |
1954 | |
1955 | const LLT S1 = LLT::scalar(1); |
1956 | const LLT S64 = LLT::scalar(64); |
1957 | |
1958 | Register Src = MI.getOperand(1).getReg(); |
1959 | assert(MRI.getType(Src) == S64); |
1960 | |
1961 | |
1962 | |
1963 | |
1964 | |
1965 | auto Trunc = B.buildIntrinsicTrunc(S64, Src); |
1966 | |
1967 | const auto Zero = B.buildFConstant(S64, 0.0); |
1968 | const auto One = B.buildFConstant(S64, 1.0); |
1969 | auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); |
1970 | auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); |
1971 | auto And = B.buildAnd(S1, Lt0, NeTrunc); |
1972 | auto Add = B.buildSelect(S64, And, One, Zero); |
1973 | |
1974 | |
1975 | B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); |
1976 | return true; |
1977 | } |
1978 | |
1979 | bool AMDGPULegalizerInfo::legalizeFrem( |
1980 | MachineInstr &MI, MachineRegisterInfo &MRI, |
1981 | MachineIRBuilder &B) const { |
1982 | Register DstReg = MI.getOperand(0).getReg(); |
1983 | Register Src0Reg = MI.getOperand(1).getReg(); |
1984 | Register Src1Reg = MI.getOperand(2).getReg(); |
1985 | auto Flags = MI.getFlags(); |
1986 | LLT Ty = MRI.getType(DstReg); |
1987 | |
1988 | auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); |
1989 | auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); |
1990 | auto Neg = B.buildFNeg(Ty, Trunc, Flags); |
1991 | B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); |
1992 | MI.eraseFromParent(); |
1993 | return true; |
1994 | } |
1995 | |
1996 | static MachineInstrBuilder extractF64Exponent(Register Hi, |
1997 | MachineIRBuilder &B) { |
1998 | const unsigned FractBits = 52; |
1999 | const unsigned ExpBits = 11; |
2000 | LLT S32 = LLT::scalar(32); |
2001 | |
2002 | auto Const0 = B.buildConstant(S32, FractBits - 32); |
2003 | auto Const1 = B.buildConstant(S32, ExpBits); |
2004 | |
2005 | auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) |
2006 | .addUse(Hi) |
2007 | .addUse(Const0.getReg(0)) |
2008 | .addUse(Const1.getReg(0)); |
2009 | |
2010 | return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); |
2011 | } |
2012 | |
/// Custom-lower s64 G_INTRINSIC_TRUNC (round toward zero) using integer
/// bit manipulation on the f64 representation: mask off the fractional
/// mantissa bits selected by the exponent.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // Split the double into 32-bit halves; the exponent and sign live in Hi.
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Isolate the sign bit from the high word.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  // Mask covering all 52 mantissa bits.
  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // 64-bit value holding only the sign bit (result for |x| < 1: +/-0.0).
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  // Arithmetic shift of the mantissa mask by the exponent selects the bits
  // that are fractional for this exponent; clear them from the source.
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  // Exp < 0: |x| < 1, result is signed zero. Exp > 51: no fractional bits
  // remain, the input is already integral.
  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
2057 | |
/// Custom-lower s64 G_SITOFP/G_UITOFP producing f64: convert each 32-bit
/// half separately, then combine as ldexp(cvt(hi), 32) + cvt(lo).
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  // Only the high half carries the sign; the low half is always treated as
  // an unsigned 32-bit quantity.
  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  // Scale the converted high half by 2^32 before adding in the low half.
  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}
2088 | |
2089 | |
2090 | |
/// Custom-lower G_FPTOSI/G_FPTOUI from f32/f64 to s64. The 64-bit result is
/// assembled from two 32-bit conversions: the source is split into a high
/// part (floor(|x| * 2^-32)) and a low remainder recovered with an FMA
/// against -2^32. For signed s32 sources the sign is stripped first and
/// re-applied at the end with an xor/sub pair.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // f32 signed path: remember the sign (as an all-ones/all-zeros mask
    // from an arithmetic shift) and work on the absolute value.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  // K0 = 2^-32 and K1 = -2^32 in the source's precision (bit patterns).
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(0xcf800000)));
  }

  // Hi = floor(trunc * 2^-32); Lo = trunc - Hi * 2^32, via fma(Hi, -2^32, trunc).
  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  // For signed f64 the high word conversion must itself be signed.
  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Re-apply the saved sign: (magnitude ^ sign) - sign negates when the
    // sign mask is all ones and is a no-op when it is zero.
    Sign = B.buildMerge(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
2157 | |
2158 | bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, |
2159 | MachineInstr &MI) const { |
2160 | MachineFunction &MF = Helper.MIRBuilder.getMF(); |
2161 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
2162 | |
2163 | const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || |
2164 | MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; |
2165 | |
2166 | |
2167 | |
2168 | if (!MFI->getMode().IEEE) |
2169 | return !IsIEEEOp; |
2170 | |
2171 | if (IsIEEEOp) |
2172 | return true; |
2173 | |
2174 | return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; |
2175 | } |
2176 | |
/// Custom-legalize G_EXTRACT_VECTOR_ELT when the index is a known constant:
/// rewrite it as a fixed-offset G_EXTRACT, or G_IMPLICIT_DEF when the index
/// is out of range. Non-constant indices are reported as already legal and
/// left in place (handled later — presumably by selection; TODO confirm).
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  // Look through copies/truncs for a constant index.
  Optional<ValueAndVReg> MaybeIdxVal =
      getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal)
    return true;
  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // In-bounds index: extract at the element's bit offset; otherwise the
  // result is undefined.
  if (IdxVal < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
2208 | |
/// Custom-legalize G_INSERT_VECTOR_ELT when the index is a known constant:
/// rewrite it as a fixed-offset G_INSERT, or G_IMPLICIT_DEF when the index
/// is out of range. Non-constant indices are reported as already legal and
/// left in place (handled later — presumably by selection; TODO confirm).
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  // Look through copies/truncs for a constant index.
  Optional<ValueAndVReg> MaybeIdxVal =
      getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal)
    return true;

  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // In-bounds index: insert at the element's bit offset; otherwise the
  // whole result is undefined.
  if (IdxVal < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
2241 | |
2242 | bool AMDGPULegalizerInfo::legalizeShuffleVector( |
2243 | MachineInstr &MI, MachineRegisterInfo &MRI, |
2244 | MachineIRBuilder &B) const { |
2245 | const LLT V2S16 = LLT::fixed_vector(2, 16); |
2246 | |
2247 | Register Dst = MI.getOperand(0).getReg(); |
2248 | Register Src0 = MI.getOperand(1).getReg(); |
2249 | LLT DstTy = MRI.getType(Dst); |
2250 | LLT SrcTy = MRI.getType(Src0); |
2251 | |
2252 | if (SrcTy == V2S16 && DstTy == V2S16 && |
2253 | AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) |
2254 | return true; |
2255 | |
2256 | MachineIRBuilder HelperBuilder(MI); |
2257 | GISelObserverWrapper DummyObserver; |
2258 | LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); |
2259 | return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; |
2260 | } |
2261 | |
/// Lower G_FSIN/G_FCOS to the amdgcn.sin/amdgcn.cos intrinsics. The operand
/// is pre-scaled by 1/(2*pi), and on subtargets with reduced trig input
/// range it is additionally wrapped into [0, 1) with amdgcn.fract.
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    // Hardware only accepts a reduced input range: take the fractional part
    // of the scaled operand first.
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}
2289 | |
/// Build a PC-relative address for \p GV into \p DstReg using the
/// SI_PC_ADD_REL_OFFSET pseudo. The pseudo takes lo/hi relocation operands;
/// the +4 / +12 immediates bias the offsets for the instruction bytes
/// between the PC sample and each relocation site (assumption based on the
/// 32-bit offset assert and the fixed operand layout — TODO confirm against
/// the pseudo's expansion). For 32-bit pointer types the full 64-bit address
/// is built in a temporary and its low half extracted into DstReg.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // 64-bit destination: compute directly into DstReg. 32-bit destination:
  // compute into a fresh 64-bit temporary and extract below.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    // GAFlags + 1 selects the matching *_HI relocation variant.
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
2349 | |
/// Custom-legalize G_GLOBAL_VALUE. LDS/region globals become either an
/// absolute-address operand or a frame-allocated constant offset; other
/// address spaces get a PC-relative address directly, or indirectly through
/// a GOT load, depending on what the target lowering requests.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS globals referenced from non-kernel, non-module-entry functions are
    // unsupported: warn, then substitute a trap plus an undef value.
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        // Leave the G_GLOBAL_VALUE but mark the operand for an absolute
        // low-32-bit relocation.
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true;
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();

        // Zero-sized external LDS ("dynamic LDS"): its address is the total
        // static group size, from the amdgcn.groupstaticsize intrinsic.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      // Ordinary LDS global: its address is the constant offset assigned by
      // the function's LDS allocation.
      B.buildConstant(
        DstReg,
        MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    // LDS globals cannot carry initializers.
    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    // Absolute fixup: plain PC-relative pseudo with no relocation flags.
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    // Direct PC-relative relocation to the symbol.
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise go through the GOT: compute the GOT slot address PC-relatively
  // and load the real address from it.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // 32-bit pointer result: load the full 64-bit GOT entry and keep the
    // low half.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2458 | |
2459 | static LLT widenToNextPowerOf2(LLT Ty) { |
2460 | if (Ty.isVector()) |
2461 | return Ty.changeElementCount( |
2462 | ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); |
2463 | return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); |
2464 | } |
2465 | |
/// Custom-legalize loads. Two transformations are applied here:
/// 32-bit-constant-address loads are redirected through an addrspacecast to
/// the 64-bit constant space, and under-aligned/odd-sized G_LOADs that
/// shouldWidenLoad approves are widened to the next power-of-two memory
/// size, with the extra bits discarded afterwards.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Rewrite the pointer operand as a cast into the 64-bit constant
    // address space, notifying the observer of the in-place change.
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  // Widening below only applies to plain loads, not sext/zext loads.
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const unsigned AlignInBits = 8 * MemAlign.value();

  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // Register size already matches the widened memory size: only the
    // memory operand needs to grow.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // A register wider than the widened memory access is not handled here.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar: load wide, then truncate down to the requested width.
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      if (isRegisterType(ValTy)) {
        // Legal register type: load wide and extract the low part.
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // Awkward type: let widenWithUnmerge build the repacking after the
        // load's position, then emit the wide load in the load's place.
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2549 | |
2550 | bool AMDGPULegalizerInfo::legalizeFMad( |
2551 | MachineInstr &MI, MachineRegisterInfo &MRI, |
2552 | MachineIRBuilder &B) const { |
2553 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); |
2554 | assert(Ty.isScalar()); |
2555 | |
2556 | MachineFunction &MF = B.getMF(); |
2557 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
2558 | |
2559 | |
2560 | |
2561 | if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) |
2562 | return true; |
2563 | if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) |
2564 | return true; |
2565 | |
2566 | MachineIRBuilder HelperBuilder(MI); |
2567 | GISelObserverWrapper DummyObserver; |
2568 | LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); |
2569 | return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; |
2570 | } |
2571 | |
/// Custom-lower G_ATOMIC_CMPXCHG on flat/global pointers to the target's
/// G_AMDGPU_ATOMIC_CMPXCHG, which takes the {new, cmp} pair packed into a
/// single two-element vector operand.
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);

  // Element order is {new value, compare value}.
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  // Carry over the original memory operands unchanged.
  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}
2596 | |
2597 | bool AMDGPULegalizerInfo::legalizeFlog( |
2598 | MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { |
2599 | Register Dst = MI.getOperand(0).getReg(); |
2600 | Register Src = MI.getOperand(1).getReg(); |
2601 | LLT Ty = B.getMRI()->getType(Dst); |
2602 | unsigned Flags = MI.getFlags(); |
2603 | |
2604 | auto Log2Operand = B.buildFLog2(Ty, Src, Flags); |
2605 | auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); |
2606 | |
2607 | B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); |
2608 | MI.eraseFromParent(); |
2609 | return true; |
2610 | } |
2611 | |
2612 | bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, |
2613 | MachineIRBuilder &B) const { |
2614 | Register Dst = MI.getOperand(0).getReg(); |
2615 | Register Src = MI.getOperand(1).getReg(); |
2616 | unsigned Flags = MI.getFlags(); |
2617 | LLT Ty = B.getMRI()->getType(Dst); |
2618 | |
2619 | auto K = B.buildFConstant(Ty, numbers::log2e); |
2620 | auto Mul = B.buildFMul(Ty, Src, K, Flags); |
2621 | B.buildFExp2(Dst, Mul, Flags); |
2622 | MI.eraseFromParent(); |
2623 | return true; |
2624 | } |
2625 | |
/// Lower G_FPOW as exp2(log2(x) * y), with the multiply done by the
/// amdgcn.fmul.legacy intrinsic. The f16 variant performs the multiply in
/// f32 and truncates back. Other types are rejected.
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Log.getReg(0))
      .addUse(Src1)
      .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // Extend both multiply operands to f32 for the legacy multiply, then
    // truncate the product back to f16 before the exp2.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
      .setMIFlags(Flags);

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}
2660 | |
2661 | |
2662 | static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { |
2663 | Register ModSrc = OrigSrc; |
2664 | if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { |
2665 | ModSrc = SrcFNeg->getOperand(1).getReg(); |
2666 | if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) |
2667 | ModSrc = SrcFAbs->getOperand(1).getReg(); |
2668 | } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) |
2669 | ModSrc = SrcFAbs->getOperand(1).getReg(); |
2670 | return ModSrc; |
2671 | } |
2672 | |
/// Custom-lower f64 G_FFLOOR on subtargets with the V_FRACT bug:
/// floor(x) is computed as x - fract(x), where fract(x) is clamped with the
/// largest double strictly below 1.0 to work around the hardware issue, and
/// NaN inputs are handled by a separate compare/select.
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // fract(x) from the hardware intrinsic; on buggy subtargets its result can
  // reach 1.0, hence the clamp below.
  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
    .addUse(OrigSrc)
    .setMIFlags(Flags);

  // Source modifiers (fneg/fabs) fold into the fract operand during
  // selection, so the NaN check below is done on the unmodified value.
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  // 0x3fefffffffffffff is the largest double strictly less than 1.0.
  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // Pick the min flavor matching the function's IEEE mode so NaN/sNaN
  // handling agrees with the mode register.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // NOTE(review): FCMP_ORD(x, x) is true when x is *not* NaN, despite the
    // 'IsNan' name. Confirm the select operand order (ModSrc vs. Min) against
    // upstream — the apparent intent is to propagate NaN inputs unchanged.
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  // floor(x) = x - fract(x)
  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}
2728 | |
2729 | |
2730 | |
2731 | bool AMDGPULegalizerInfo::legalizeBuildVector( |
2732 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { |
2733 | Register Dst = MI.getOperand(0).getReg(); |
2734 | const LLT S32 = LLT::scalar(32); |
2735 | assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); |
2736 | |
2737 | Register Src0 = MI.getOperand(1).getReg(); |
2738 | Register Src1 = MI.getOperand(2).getReg(); |
2739 | assert(MRI.getType(Src0) == LLT::scalar(16)); |
2740 | |
2741 | auto Merge = B.buildMerge(S32, {Src0, Src1}); |
2742 | B.buildBitcast(Dst, Merge); |
2743 | |
2744 | MI.eraseFromParent(); |
2745 | return true; |
2746 | } |
2747 | |
2748 | |
2749 | static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { |
2750 | if (MI.getOpcode() != TargetOpcode::G_XOR) |
2751 | return false; |
2752 | auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); |
2753 | return ConstVal && *ConstVal == -1; |
2754 | } |
2755 | |
2756 | |
/// Match a control-flow intrinsic whose condition result feeds (possibly
/// through a single "not") a G_BRCOND in the same block. On success, returns
/// the G_BRCOND user, sets \p Br to the following unconditional G_BR (if
/// any), \p UncondBrTarget to the not-taken destination, and \p Negated if a
/// "not" was folded away (and erased). Returns nullptr if the pattern does
/// not match.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  // The condition must have exactly one (non-debug) user.
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // The xor is folded into the branch condition, so erase it and walk
    // through to the user of its (negated) result instead.
    UseMI->eraseFromParent();

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Determine the not-taken target: either the explicit G_BR that follows
  // the conditional branch, or the fall-through successor block.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // No fall-through exists.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}
2798 | |
2799 | bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, |
2800 | const ArgDescriptor *Arg, |
2801 | const TargetRegisterClass *ArgRC, |
2802 | LLT ArgTy) const { |
2803 | MCRegister SrcReg = Arg->getRegister(); |
2804 | assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); |
2805 | assert(DstReg.isVirtual() && "Virtual register expected"); |
2806 | |
2807 | Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, |
2808 | ArgTy); |
2809 | if (Arg->isMasked()) { |
| 6 | | Calling 'ArgDescriptor::isMasked' | |
|
| 9 | | Returning from 'ArgDescriptor::isMasked' | |
|
| |
2810 | |
2811 | const LLT S32 = LLT::scalar(32); |
2812 | const unsigned Mask = Arg->getMask(); |
2813 | const unsigned Shift = countTrailingZeros<unsigned>(Mask); |
| 11 | | Calling 'countTrailingZeros<unsigned int>' | |
|
| 18 | | Returning from 'countTrailingZeros<unsigned int>' | |
|
| 19 | | 'Shift' initialized to 32 | |
|
2814 | |
2815 | Register AndMaskSrc = LiveIn; |
2816 | |
2817 | if (Shift != 0) { |
| |
2818 | auto ShiftAmt = B.buildConstant(S32, Shift); |
2819 | AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); |
2820 | } |
2821 | |
2822 | B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); |
| 21 | | The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int' |
|
2823 | } else { |
2824 | B.buildCopy(DstReg, LiveIn); |
2825 | } |
2826 | |
2827 | return true; |
2828 | } |
2829 | |
2830 | bool AMDGPULegalizerInfo::loadInputValue( |
2831 | Register DstReg, MachineIRBuilder &B, |
2832 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { |
2833 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); |
2834 | const ArgDescriptor *Arg; |
2835 | const TargetRegisterClass *ArgRC; |
2836 | LLT ArgTy; |
2837 | std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); |
2838 | |
2839 | if (!Arg->isRegister() || !Arg->getRegister().isValid()) |
| |
2840 | return false; |
2841 | return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); |
| 5 | | Calling 'AMDGPULegalizerInfo::loadInputValue' | |
|
2842 | } |
2843 | |
2844 | bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( |
2845 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, |
2846 | AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { |
2847 | if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) |
| 3 | | Calling 'AMDGPULegalizerInfo::loadInputValue' | |
|
2848 | return false; |
2849 | |
2850 | MI.eraseFromParent(); |
2851 | return true; |
2852 | } |
2853 | |
2854 | bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, |
2855 | MachineRegisterInfo &MRI, |
2856 | MachineIRBuilder &B) const { |
2857 | Register Dst = MI.getOperand(0).getReg(); |
2858 | LLT DstTy = MRI.getType(Dst); |
2859 | LLT S16 = LLT::scalar(16); |
2860 | LLT S32 = LLT::scalar(32); |
2861 | LLT S64 = LLT::scalar(64); |
2862 | |
2863 | if (DstTy == S16) |
2864 | return legalizeFDIV16(MI, MRI, B); |
2865 | if (DstTy == S32) |
2866 | return legalizeFDIV32(MI, MRI, B); |
2867 | if (DstTy == S64) |
2868 | return legalizeFDIV64(MI, MRI, B); |
2869 | |
2870 | return false; |
2871 | } |
2872 | |
/// Expand 32-bit unsigned division/remainder without a hardware divider:
/// start from a float-reciprocal estimate of 1/Y, refine it once in integer
/// arithmetic, form a candidate quotient/remainder, then apply up to two
/// conditional +1 / -Y corrections. Either destination register may be null
/// when only the quotient or only the remainder is needed.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // Initial fixed-point estimate Z ~= 2^32 / Y via the f32 reciprocal.
  // 0x4f7ffffe is a float just below 2^32, used to scale rcp(Y) up.
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One integer refinement step: Z += umulh(Z, -Y * Z).
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Candidate quotient and remainder: Q = umulh(X, Z), R = X - Q*Y.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First correction: if R >= Y then Q += 1, R -= Y.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second correction, writing directly into the final destinations.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
2915 | |
2916 | |
2917 | |
2918 | |
2919 | |
2920 | |
2921 | |
2922 | |
2923 | |
2924 | |
2925 | |
2926 | |
2927 | |
2928 | |
/// Build an approximate reciprocal of the 64-bit value \p Val using f32
/// arithmetic, returned as its {low, high} 32-bit halves. The magic float
/// constants encode powers of two: 0x4f800000 = 2^32, 0x5f7ffffc is just
/// below 2^64, 0x2f800000 = 2^-32, 0xcf800000 = -2^32; together they split
/// the scaled f32 reciprocal of (hi * 2^32 + lo) into two u32 words.
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  // Recombine the halves as a float: hi * 2^32 + lo.
  auto Mad = B.buildFMAD(S32, CvtHi,
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  // Scale the reciprocal up by ~2^64 so it fits a 64-bit fixed point.
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // Extract the high word: multiply by 2^-32 and truncate.
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // Remove the high word's contribution to recover the low word.
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
2958 | |
/// Expand 64-bit unsigned division/remainder: compute a fixed-point
/// reciprocal of the denominator, refine it with two rounds of
/// multiply/add correction over explicit 32-bit carry chains, form a
/// candidate quotient and remainder, then apply up to two conditional
/// corrections. Either destination register may be null.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  // Initial reciprocal estimate from f32 math.
  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement round: Rcp += umulh(Rcp, -Denom * Rcp), with the
  // 64-bit add done as an explicit 32-bit carry chain.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  // High half without the carry in; reused by the second round below.
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement round.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Candidate quotient MulHi3 = umulh(Numer, Rcp) and remainder
  // Sub1 = Numer - MulHi3 * Denom, with explicit 32-bit borrow chains.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // Borrowless middle term reused when forming Sub2 below.
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 != 0 iff remainder >= denominator (64-bit compare built from the
  // 32-bit halves: compare highs, fall back to lows on equality).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // First correction: Sub2 = Sub1 - Denom, Add3 = quotient + 1.
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 != 0 iff the once-corrected remainder is still >= denominator.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // Second correction: Sub3 = Sub2 - Denom, Add4 = quotient + 2.
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // Select the final quotient/remainder based on how many corrections
  // were actually needed.
  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}
3073 | |
3074 | bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, |
3075 | MachineRegisterInfo &MRI, |
3076 | MachineIRBuilder &B) const { |
3077 | Register DstDivReg, DstRemReg; |
3078 | switch (MI.getOpcode()) { |
3079 | default: |
3080 | llvm_unreachable("Unexpected opcode!"); |
3081 | case AMDGPU::G_UDIV: { |
3082 | DstDivReg = MI.getOperand(0).getReg(); |
3083 | break; |
3084 | } |
3085 | case AMDGPU::G_UREM: { |
3086 | DstRemReg = MI.getOperand(0).getReg(); |
3087 | break; |
3088 | } |
3089 | case AMDGPU::G_UDIVREM: { |
3090 | DstDivReg = MI.getOperand(0).getReg(); |
3091 | DstRemReg = MI.getOperand(1).getReg(); |
3092 | break; |
3093 | } |
3094 | } |
3095 | |
3096 | const LLT S64 = LLT::scalar(64); |
3097 | const LLT S32 = LLT::scalar(32); |
3098 | const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); |
3099 | Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); |
3100 | Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); |
3101 | LLT Ty = MRI.getType(MI.getOperand(0).getReg()); |
3102 | |
3103 | if (Ty == S32) |
3104 | legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); |
3105 | else if (Ty == S64) |
3106 | legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); |
3107 | else |
3108 | return false; |
3109 | |
3110 | MI.eraseFromParent(); |
3111 | return true; |
3112 | } |
3113 | |
/// Legalize G_SDIV/G_SREM/G_SDIVREM in terms of the unsigned expansion:
/// take absolute values via the (x + sign) ^ sign identity, run the unsigned
/// div/rem, then restore the result signs (quotient sign = sign(LHS) ^
/// sign(RHS), remainder sign = sign(LHS)).
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  // Arithmetic shift by width-1 gives an all-ones/all-zeros sign mask.
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  // abs(x) = (x + sign) ^ sign for a sign mask of 0 or -1.
  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  // Allocate temporaries only for the results this opcode produces.
  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  // Re-apply the sign: negate-if-negative via (x ^ sign) - sign.
  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS.
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}
3181 | |
/// Lower fdiv using the fast (reduced-precision) reciprocal when unsafe
/// FP math or the 'afn' flag permits it. Returns false (leaving the
/// instruction for the precise expansion) otherwise.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> rcp(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> rcp(fneg(x))
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // General case: x / y -> x * rcp(y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(RHS)
    .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}
3230 | |
/// Fast 64-bit fdiv: rcp(y) followed by two Newton-Raphson refinement steps
/// and one residual correction. Only used when unsafe FP math or 'afn'
/// permits the precision loss; returns false otherwise.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(Y)
    .setMIFlags(Flags);

  // Two Newton-Raphson iterations: R = R + R * (1 - Y*R).
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  // Candidate quotient plus one residual correction step.
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}
3267 | |
3268 | bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, |
3269 | MachineRegisterInfo &MRI, |
3270 | MachineIRBuilder &B) const { |
3271 | if (legalizeFastUnsafeFDIV(MI, MRI, B)) |
3272 | return true; |
3273 | |
3274 | Register Res = MI.getOperand(0).getReg(); |
3275 | Register LHS = MI.getOperand(1).getReg(); |
3276 | Register RHS = MI.getOperand(2).getReg(); |
3277 | |
3278 | uint16_t Flags = MI.getFlags(); |
3279 | |
3280 | LLT S16 = LLT::scalar(16); |
3281 | LLT S32 = LLT::scalar(32); |
3282 | |
3283 | auto LHSExt = B.buildFPExt(S32, LHS, Flags); |
3284 | auto RHSExt = B.buildFPExt(S32, RHS, Flags); |
3285 | |
3286 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) |
3287 | .addUse(RHSExt.getReg(0)) |
3288 | .setMIFlags(Flags); |
3289 | |
3290 | auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); |
3291 | auto RDst = B.buildFPTrunc(S16, QUOT, Flags); |
3292 | |
3293 | B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) |
3294 | .addUse(RDst.getReg(0)) |
3295 | .addUse(RHS) |
3296 | .addUse(LHS) |
3297 | .setMIFlags(Flags); |
3298 | |
3299 | MI.eraseFromParent(); |
3300 | return true; |
3301 | } |
3302 | |
3303 | |
3304 | |
/// Enable or disable single-precision denormal handling around the f32 fdiv
/// expansion. Uses S_DENORM_MODE when the subtarget has it; otherwise writes
/// the FP32 denorm field of the MODE hardware register via S_SETREG.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Target SP denorm value: fully enabled, or the function's default.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // S_DENORM_MODE sets both fields at once: keep the function's DP/half
    // default (bits [3:2]) while updating the SP field (bits [1:0]).
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Address only the 2-bit SP denorm field of the MODE register:
    // offset 4, width 2 (WIDTH_M1 = 1).
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
3332 | |
/// Precise f32 fdiv expansion using div_scale / rcp / an FMA refinement
/// chain / div_fmas / div_fixup. The FMA chain is executed with FP32
/// denormals enabled: when the function's mode has them off, they are
/// toggled on before the chain and restored afterwards.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-based form when fast math allows it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale pre-scales numerator/denominator into a range where the
  // refinement converges; the extra S1 result flags the scaled operand.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // Temporarily enable FP32 denormals for the refinement chain.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // Restore the function's default denormal mode.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // div_fixup patches the scaled result back using the original operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3401 | |
/// Precise f64 fdiv expansion: div_scale both operands, refine rcp with an
/// FMA chain, then combine with div_fmas / div_fixup. On subtargets where
/// the div_scale condition output is unusable, the scale flag is
/// reconstructed by comparing the exponent-carrying high words.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-based form when fast math allows it.
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  // Newton-Raphson style refinement of the reciprocal.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround: recompute the div_scale condition by checking whether the
    // high (exponent) words changed during scaling; xor of the two
    // comparisons yields the "was scaled" flag.
    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // div_fixup patches the scaled result back using the original operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3482 | |
3483 | bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, |
3484 | MachineRegisterInfo &MRI, |
3485 | MachineIRBuilder &B) const { |
3486 | Register Res = MI.getOperand(0).getReg(); |
3487 | Register LHS = MI.getOperand(2).getReg(); |
3488 | Register RHS = MI.getOperand(3).getReg(); |
3489 | uint16_t Flags = MI.getFlags(); |
3490 | |
3491 | LLT S32 = LLT::scalar(32); |
3492 | LLT S1 = LLT::scalar(1); |
3493 | |
3494 | auto Abs = B.buildFAbs(S32, RHS, Flags); |
3495 | const APFloat C0Val(1.0f); |
3496 | |
3497 | auto C0 = B.buildConstant(S32, 0x6f800000); |
3498 | auto C1 = B.buildConstant(S32, 0x2f800000); |
3499 | auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); |
3500 | |
3501 | auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); |
3502 | auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); |
3503 | |
3504 | auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); |
3505 | |
3506 | auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) |
3507 | .addUse(Mul0.getReg(0)) |
3508 | .setMIFlags(Flags); |
3509 | |
3510 | auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); |
3511 | |
3512 | B.buildFMul(Res, Sel, Mul1, Flags); |
3513 | |
3514 | MI.eraseFromParent(); |
3515 | return true; |
3516 | } |
3517 | |
3518 | |
3519 | |
3520 | |
3521 | |
3522 | |
3523 | |
/// Legalize llvm.amdgcn.rsq.clamp: on VOLCANIC_ISLANDS and newer, expand it
/// to a plain rsq whose result is clamped to [-max_float, +max_float].
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  // Pre-VI subtargets keep the intrinsic untouched (presumably selectable
  // directly — no expansion is emitted here).
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  // Only f32 and f64 are supported; pick the matching float semantics for
  // building the clamp bounds.
  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
    .addUse(Src)
    .setMIFlags(Flags);

  // Clamp with min/max against +/- the largest finite value, using the
  // IEEE-mode variants when the function runs in IEEE mode.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
3566 | |
3567 | static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { |
3568 | switch (IID) { |
3569 | case Intrinsic::amdgcn_ds_fadd: |
3570 | return AMDGPU::G_ATOMICRMW_FADD; |
3571 | case Intrinsic::amdgcn_ds_fmin: |
3572 | return AMDGPU::G_AMDGPU_ATOMIC_FMIN; |
3573 | case Intrinsic::amdgcn_ds_fmax: |
3574 | return AMDGPU::G_AMDGPU_ATOMIC_FMAX; |
3575 | default: |
3576 | llvm_unreachable("not a DS FP intrinsic"); |
3577 | } |
3578 | } |
3579 | |
// Lower a DS FP atomic intrinsic (ds_fadd/fmin/fmax) in place: retarget
// the instruction to the matching generic atomic opcode and strip the
// operands that the generic form does not carry.
bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // Drop the trailing intrinsic-only operands (indices 6, 5, 4).
  // Removing back-to-front keeps the remaining operand indices stable.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  // Drop the intrinsic ID operand as well.
  MI.RemoveOperand(1);
  Observer.changedInstr(MI);
  return true;
}
3597 | |
3598 | bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, |
3599 | MachineRegisterInfo &MRI, |
3600 | MachineIRBuilder &B) const { |
3601 | uint64_t Offset = |
3602 | ST.getTargetLowering()->getImplicitParameterOffset( |
3603 | B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); |
3604 | LLT DstTy = MRI.getType(DstReg); |
3605 | LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); |
3606 | |
3607 | Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); |
3608 | if (!loadInputValue(KernargPtrReg, B, |
3609 | AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) |
3610 | return false; |
3611 | |
3612 | |
3613 | B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); |
3614 | return true; |
3615 | } |
3616 | |
3617 | bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, |
3618 | MachineRegisterInfo &MRI, |
3619 | MachineIRBuilder &B) const { |
3620 | const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); |
3621 | if (!MFI->isEntryFunction()) { |
3622 | return legalizePreloadedArgIntrin(MI, MRI, B, |
3623 | AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); |
3624 | } |
3625 | |
3626 | Register DstReg = MI.getOperand(0).getReg(); |
3627 | if (!getImplicitArgPtr(DstReg, MRI, B)) |
3628 | return false; |
3629 | |
3630 | MI.eraseFromParent(); |
3631 | return true; |
3632 | } |
3633 | |
3634 | bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, |
3635 | MachineRegisterInfo &MRI, |
3636 | MachineIRBuilder &B, |
3637 | unsigned AddrSpace) const { |
3638 | Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); |
3639 | auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); |
3640 | Register Hi32 = Unmerge.getReg(1); |
3641 | |
3642 | B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); |
3643 | MI.eraseFromParent(); |
3644 | return true; |
3645 | } |
3646 | |
3647 | |
3648 | |
3649 | |
3650 | |
3651 | |
3652 | |
// Split a buffer offset into a variable (register) part and a constant
// immediate part that fits in the instruction's 12-bit offset field.
// Returns {base register, immediate offset}.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = 4095; // Largest encodable immediate offset.
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // The look-through may surface a pointer-typed base; convert it back
  // to an integer of the original offset's type.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // Keep only the part that fits in the immediate field; anything above
  // MaxImm is "overflow" and must go into the register part.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // A negative constant cannot be folded into the (unsigned)
    // immediate field at all; push the entire constant into the
    // register part.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The caller always needs a register operand, so materialize zero if
  // nothing is left in the variable part.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_pair(BaseReg, ImmOffset);
}
3697 | |
3698 | |
3699 | void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, |
3700 | Register VOffset, Register SOffset, |
3701 | unsigned ImmOffset, Register VIndex, |
3702 | MachineRegisterInfo &MRI) const { |
3703 | Optional<ValueAndVReg> MaybeVOffsetVal = |
3704 | getConstantVRegValWithLookThrough(VOffset, MRI); |
3705 | Optional<ValueAndVReg> MaybeSOffsetVal = |
3706 | getConstantVRegValWithLookThrough(SOffset, MRI); |
3707 | Optional<ValueAndVReg> MaybeVIndexVal = |
3708 | getConstantVRegValWithLookThrough(VIndex, MRI); |
3709 | |
3710 | |
3711 | |
3712 | if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && |
3713 | MaybeVIndexVal->Value == 0) { |
3714 | uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + |
3715 | MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; |
3716 | MMO->setOffset(TotalOffset); |
3717 | } else { |
3718 | |
3719 | MMO->setValue((Value *)nullptr); |
3720 | } |
3721 | } |
3722 | |
3723 | |
// Repack a <N x s16> store source into the register layout a D16
// buffer/image store expects on this subtarget. Returns the register
// holding the (possibly rewritten) data.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  // Unpacked-D16 targets keep each 16-bit element in its own dword, so
  // any-extend every element to s32 and rebuild the vector.
  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    // getNumOperands() - 1: all results of the unmerge, minus the source.
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  // Subtargets with the image-store D16 bug need the data widened to
  // full dwords, padding the unused tail with undef.
  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      // <2 x s16> fits in one dword; add one undef dword of padding.
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      // <3 x s16>: pad out to <6 x s16> and reinterpret as three dwords.
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      // <4 x s16>: two data dwords plus two undef padding dwords.
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  // Packed targets can store the value unchanged.
  return Reg;
}
3782 | |
3783 | Register AMDGPULegalizerInfo::fixStoreSourceType( |
3784 | MachineIRBuilder &B, Register VData, bool IsFormat) const { |
3785 | MachineRegisterInfo *MRI = B.getMRI(); |
3786 | LLT Ty = MRI->getType(VData); |
3787 | |
3788 | const LLT S16 = LLT::scalar(16); |
3789 | |
3790 | |
3791 | if (Ty == LLT::scalar(8) || Ty == S16) { |
3792 | Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); |
3793 | return AnyExt; |
3794 | } |
3795 | |
3796 | if (Ty.isVector()) { |
3797 | if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { |
3798 | if (IsFormat) |
3799 | return handleD16VData(B, *MRI, VData); |
3800 | } |
3801 | } |
3802 | |
3803 | return VData; |
3804 | } |
3805 | |
// Legalize a raw/struct buffer store intrinsic into the corresponding
// G_AMDGPU_*BUFFER_STORE* pseudo: normalize the source data type,
// split the voffset into register + immediate parts, and rebuild the
// operand list in the pseudo's fixed order.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  // D16 only applies to format stores of 16-bit elements.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;

  // Struct (vindex) forms carry one extra operand; typed forms also
  // carry a format immediate, hence 8 vs 7 total operands.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // Presence of the vindex operand is inferred from the operand count;
  // OpOffset shifts all later operand indices accordingly.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Fold as much of the offset as possible into the immediate field,
  // then refine the MMO with whatever is statically known.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    // Plain stores select on the access size in bytes.
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)
    .addUse(RSrc)
    .addUse(VIndex)
    .addUse(VOffset)
    .addUse(SOffset)
    .addImm(ImmOffset);

  if (IsTyped)
    MIB.addImm(Format);

  // Trailing immediates: auxiliary data and a flag recording whether a
  // real vindex operand was present.
  MIB.addImm(AuxiliaryData)
     .addImm(HasVIndex ? -1 : 0)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3892 | |
// Legalize a raw/struct buffer load intrinsic into the corresponding
// G_AMDGPU_*BUFFER_LOAD* pseudo, widening the destination where the
// hardware result layout differs from the IR type (extending loads and
// unpacked D16), and fixing up the result afterwards.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // Struct (vindex) forms carry one extra operand; typed forms also
  // carry a format immediate, hence 8 vs 7 total operands.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // Presence of the vindex operand is inferred from the operand count;
  // OpOffset shifts all later operand indices accordingly.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  // D16 only applies to format loads of 16-bit elements.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Fold as much of the offset as possible into the immediate field,
  // then refine the MMO with whatever is statically known.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    // Plain loads select on the memory size in bits.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-dword scalar results come back widened; scalar D16 results also
  // need a trunc afterwards.
  bool IsExtLoad =
      (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)
    .addUse(RSrc)
    .addUse(VIndex)
    .addUse(VOffset)
    .addUse(SOffset)
    .addImm(ImmOffset);

  if (IsTyped)
    MIB.addImm(Format);

  // Trailing immediates: auxiliary data and a flag recording whether a
  // real vindex operand was present.
  MIB.addImm(AuxiliaryData)
     .addImm(HasVIndex ? -1 : 0)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    // Insert the fixup after the load we just emitted.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack an unpacked D16 result: truncate each 32-bit lane back
      // to the element type and merge into the destination vector.
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
4010 | |
4011 | bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, |
4012 | MachineIRBuilder &B, |
4013 | bool IsInc) const { |
4014 | unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : |
4015 | AMDGPU::G_AMDGPU_ATOMIC_DEC; |
4016 | B.buildInstr(Opc) |
4017 | .addDef(MI.getOperand(0).getReg()) |
4018 | .addUse(MI.getOperand(2).getReg()) |
4019 | .addUse(MI.getOperand(3).getReg()) |
4020 | .cloneMemRefs(MI); |
4021 | MI.eraseFromParent(); |
4022 | return true; |
4023 | } |
4024 | |
// Map a raw/struct buffer atomic intrinsic ID to the generic buffer
// atomic pseudo-instruction used for selection. Raw and struct
// variants of each operation share one pseudo.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  // The legacy (non-raw/struct) fadd intrinsic also maps here.
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
4080 | |
// Legalize a raw/struct buffer atomic intrinsic into its generic
// buffer-atomic pseudo. Operand indices shift depending on whether the
// intrinsic returns a value, is a cmpswap (extra compare operand), and
// carries a vindex; OpOffset tracks the accumulated shift.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    Dst = MI.getOperand(0).getReg();
  } else {
    // No result def: every source operand index shifts down by one.
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  // Expected operand count when a vindex operand is present.
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // Presence of vindex is inferred from the operand count.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Fold as much of the offset as possible into the immediate field,
  // then refine the MMO with whatever is statically known.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData);

  // cmpswap carries the compare value right after the data operand.
  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)
     .addUse(VIndex)
     .addUse(VOffset)
     .addUse(SOffset)
     .addImm(ImmOffset)
     .addImm(AuxiliaryData)
     .addImm(HasVIndex ? -1 : 0)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
4151 | |
4152 | |
4153 | |
// Pack the address components of an image instruction into dword-sized
// <2 x s16> registers, collected in PackedAddrs. Components not
// narrowed by A16/G16 keep their 32-bit register (reinterpreted as a
// v2s16 dword); genuinely 16-bit components are paired two-per-dword,
// padding with undef where no partner element exists.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    // Operands already folded to immediates (e.g. a dropped LOD/mip)
    // are skipped.
    if (!SrcOp.isReg())
      continue;

    Register AddrReg = SrcOp.getReg();

    // Non-gradient leading operands, gradients without G16, and
    // coordinates without A16 stay 32-bit: each already fills a dword.
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {

      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // 16-bit component: pair it with the next operand, unless it is
      // the last operand overall, the odd tail of either gradient
      // group, or the next operand is not a register — in those cases
      // pad the upper half with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||

          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        // The partner operand was consumed into this dword.
        ++I;
      }
    }
  }
}
4200 | |
4201 | |
4202 | |
4203 | static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, |
4204 | int DimIdx, int NumVAddrs) { |
4205 | const LLT S32 = LLT::scalar(32); |
4206 | |
4207 | SmallVector<Register, 8> AddrRegs; |
4208 | for (int I = 0; I != NumVAddrs; ++I) { |
4209 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); |
4210 | if (SrcOp.isReg()) { |
4211 | AddrRegs.push_back(SrcOp.getReg()); |
4212 | assert(B.getMRI()->getType(SrcOp.getReg()) == S32); |
4213 | } |
4214 | } |
4215 | |
4216 | int NumAddrRegs = AddrRegs.size(); |
4217 | if (NumAddrRegs != 1) { |
4218 | |
4219 | if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { |
4220 | const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); |
4221 | auto Undef = B.buildUndef(S32); |
4222 | AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); |
4223 | NumAddrRegs = RoundedNumRegs; |
4224 | } |
4225 | |
4226 | auto VAddr = |
4227 | B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); |
4228 | MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); |
4229 | } |
4230 | |
4231 | for (int I = 1; I != NumVAddrs; ++I) { |
4232 | MachineOperand &SrcOp = MI.getOperand(DimIdx + I); |
4233 | if (SrcOp.isReg()) |
4234 | MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); |
4235 | } |
4236 | } |
4237 | |
4238 | |
4239 | |
4240 | |
4241 | |
4242 | |
4243 | |
4244 | |
4245 | |
4246 | |
4247 | |
4248 | |
4249 | |
4250 | |
4251 | bool AMDGPULegalizerInfo::legalizeImageIntrinsic( |
4252 | MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, |
4253 | const AMDGPU::ImageDimIntrinsicInfo *Intr) const { |
4254 | |
4255 | const unsigned NumDefs = MI.getNumExplicitDefs(); |
4256 | const unsigned ArgOffset = NumDefs + 1; |
4257 | bool IsTFE = NumDefs == 2; |
4258 | |
4259 | |
4260 | |
4261 | |
4262 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = |
4263 | AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); |
4264 | |
4265 | MachineRegisterInfo *MRI = B.getMRI(); |
4266 | const LLT S32 = LLT::scalar(32); |
4267 | const LLT S16 = LLT::scalar(16); |
4268 | const LLT V2S16 = LLT::fixed_vector(2, 16); |
4269 | |
4270 | unsigned DMask = 0; |
4271 | |
4272 | |
4273 | LLT GradTy = |
4274 | MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); |
4275 | LLT AddrTy = |
4276 | MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); |
4277 | const bool IsG16 = GradTy == S16; |
4278 | const bool IsA16 = AddrTy == S16; |
4279 | |
4280 | int DMaskLanes = 0; |
4281 | if (!BaseOpcode->Atomic) { |
4282 | DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); |
4283 | if (BaseOpcode->Gather4) { |
4284 | DMaskLanes = 4; |
4285 | } else if (DMask != 0) { |
4286 | DMaskLanes = countPopulation(DMask); |
4287 | } else if (!IsTFE && !BaseOpcode->Store) { |
4288 | |
4289 | B.buildUndef(MI.getOperand(0)); |
4290 | MI.eraseFromParent(); |
4291 | return true; |
4292 | } |
4293 | } |
4294 | |
4295 | Observer.changingInstr(MI); |
4296 | auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); |
4297 | |
4298 | unsigned NewOpcode = NumDefs == 0 ? |
4299 | AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; |
4300 | |
4301 | |
4302 | MI.setDesc(B.getTII().get(NewOpcode)); |
4303 | |
4304 | |
4305 | |
4306 | if (IsTFE && DMask == 0) { |
4307 | DMask = 0x1; |
4308 | DMaskLanes = 1; |
4309 | MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); |
4310 | } |
4311 | |
4312 | if (BaseOpcode->Atomic) { |
4313 | Register VData0 = MI.getOperand(2).getReg(); |
4314 | LLT Ty = MRI->getType(VData0); |
4315 | |
4316 | |
4317 | if (Ty.isVector()) |
4318 | return false; |
4319 | |
4320 | if (BaseOpcode->AtomicX2) { |
4321 | Register VData1 = MI.getOperand(3).getReg(); |
4322 | |
4323 | LLT PackedTy = LLT::fixed_vector(2, Ty); |
4324 | auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); |
4325 | MI.getOperand(2).setReg(Concat.getReg(0)); |
4326 | MI.getOperand(3).setReg(AMDGPU::NoRegister); |
4327 | } |
4328 | } |
4329 | |
4330 | unsigned CorrectedNumVAddrs = Intr->NumVAddrs; |
4331 | |
4332 | |
4333 | if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = |
4334 | AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { |
4335 | const ConstantFP *ConstantLod; |
4336 | |
4337 | if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, |
4338 | m_GFCst(ConstantLod))) { |
4339 | if (ConstantLod->isZero() || ConstantLod->isNegative()) { |
4340 | |
4341 | const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = |
4342 | AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, |
4343 | Intr->Dim); |
4344 | |
4345 | |
4346 | --CorrectedNumVAddrs; |
4347 | |
4348 | MI.getOperand(MI.getNumExplicitDefs()) |
4349 | .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); |
4350 | MI.RemoveOperand(ArgOffset + Intr->LodIndex); |
4351 | Intr = NewImageDimIntr; |
4352 | } |
4353 | } |
4354 | } |
4355 | |
4356 | |
4357 | if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { |
4358 | int64_t ConstantLod; |
4359 | if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, |
4360 | m_ICst(ConstantLod))) { |
4361 | if (ConstantLod == 0) { |
4362 | |
4363 | |
4364 | MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); |
4365 | --CorrectedNumVAddrs; |
4366 | } |
4367 | } |
4368 | } |
4369 | |
4370 | |
4371 | if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { |
4372 | |
4373 | |
4374 | return false; |
4375 | } |
4376 | |
4377 | if (IsA16 && !ST.hasA16()) { |
4378 | |
4379 | return false; |
4380 | } |
4381 | |
4382 | if (IsA16 || IsG16) { |
4383 | if (Intr->NumVAddrs > 1) { |
4384 | SmallVector<Register, 4> PackedRegs; |
4385 | |
4386 | packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, |
4387 | IsG16); |
4388 | |
4389 | |
4390 | const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && |
4391 | PackedRegs.size() <= ST.getNSAMaxSize(); |
4392 | |
4393 | if (!UseNSA && PackedRegs.size() > 1) { |
4394 | LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); |
4395 | auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); |
4396 | PackedRegs[0] = Concat.getReg(0); |
4397 | PackedRegs.resize(1); |
4398 | } |
4399 | |
4400 | const unsigned NumPacked = PackedRegs.size(); |
4401 | for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { |
4402 | MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); |
4403 | if (!SrcOp.isReg()) { |
4404 | assert(SrcOp.isImm() && SrcOp.getImm() == 0); |
4405 | continue; |
4406 | } |
4407 | |
4408 | assert(SrcOp.getReg() != AMDGPU::NoRegister); |
4409 | |
4410 | if (I - Intr->VAddrStart < NumPacked) |
4411 | SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); |
4412 | else |
4413 | SrcOp.setReg(AMDGPU::NoRegister); |
4414 | } |
4415 | } |
4416 | } else { |
4417 | |
4418 | |
4419 | |
4420 | |
4421 | |
4422 | |
4423 | |
4424 | |
4425 | |
4426 | |
4427 | |
4428 | const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && |
4429 | CorrectedNumVAddrs <= ST.getNSAMaxSize(); |
4430 | |
4431 | if (!UseNSA && Intr->NumVAddrs > 1) |
4432 | convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, |
4433 | Intr->NumVAddrs); |
4434 | } |
4435 | |
4436 | int Flags = 0; |
4437 | if (IsA16) |
4438 | Flags |= 1; |
4439 | if (IsG16) |
4440 | Flags |= 2; |
4441 | MI.addOperand(MachineOperand::CreateImm(Flags)); |
4442 | |
4443 | if (BaseOpcode->Store) { |
4444 | |
4445 | Register VData = MI.getOperand(1).getReg(); |
4446 | LLT Ty = MRI->getType(VData); |
4447 | if (!Ty.isVector() || Ty.getElementType() != S16) |
4448 | return true; |
4449 | |
4450 | Register RepackedReg = handleD16VData(B, *MRI, VData, true); |
4451 | if (RepackedReg != VData) { |
4452 | MI.getOperand(1).setReg(RepackedReg); |
4453 | } |
4454 | |
4455 | return true; |
4456 | } |
4457 | |
4458 | Register DstReg = MI.getOperand(0).getReg(); |
4459 | LLT Ty = MRI->getType(DstReg); |
4460 | const LLT EltTy = Ty.getScalarType(); |
4461 | const bool IsD16 = Ty.getScalarType() == S16; |
4462 | const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; |
4463 | |
4464 | |
4465 | if (NumElts < DMaskLanes) |
4466 | return false; |
4467 | |
4468 | if (NumElts > 4 || DMaskLanes > 4) |
4469 | return false; |
4470 | |
4471 | const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; |
4472 | const LLT AdjustedTy = |
4473 | Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); |
4474 | |
4475 | |
4476 | |
4477 | |
4478 | LLT RoundedTy; |
4479 | |
4480 | |
4481 | LLT TFETy; |
4482 | |
4483 | |
4484 | LLT RegTy; |
4485 | |
4486 | if (IsD16 && ST.hasUnpackedD16VMem()) { |
4487 | RoundedTy = |
4488 | LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); |
4489 | TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); |
4490 | RegTy = S32; |
4491 | } else { |
4492 | unsigned EltSize = EltTy.getSizeInBits(); |
4493 | unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; |
4494 | unsigned RoundedSize = 32 * RoundedElts; |
4495 | RoundedTy = LLT::scalarOrVector( |
4496 | ElementCount::getFixed(RoundedSize / EltSize), EltSize); |
4497 | TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); |
4498 | RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; |
4499 | } |
4500 | |
4501 | |
4502 | |
4503 | if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) |
4504 | return true; |
4505 | |
4506 | Register Dst1Reg; |
4507 | |
4508 | |
4509 | B.setInsertPt(*MI.getParent(), ++MI.getIterator()); |
4510 | |
4511 | |
4512 | |
4513 | const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; |
4514 | const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; |
4515 | |
4516 | Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); |
4517 | |
4518 | MI.getOperand(0).setReg(NewResultReg); |
4519 | |
4520 | |
4521 | |
4522 | |
4523 | |
4524 | |
4525 | if (IsTFE) { |
4526 | Dst1Reg = MI.getOperand(1).getReg(); |
4527 | if (MRI->getType(Dst1Reg) != S32) |
4528 | return false; |
4529 | |
4530 | |
4531 | MI.RemoveOperand(1); |
4532 | |
4533 | |
4534 | if (Ty == S32) { |
4535 | B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); |
4536 | return true; |
4537 | } |
4538 | } |
4539 | |
4540 | |
4541 | |
4542 | SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); |
4543 | |
4544 | const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; |
4545 | |
4546 | if (ResultNumRegs == 1) { |
4547 | assert(!IsTFE); |
4548 | ResultRegs[0] = NewResultReg; |
4549 | } else { |
4550 | |
4551 | for (int I = 0; I != NumDataRegs; ++I) |
4552 | ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); |
4553 | B.buildUnmerge(ResultRegs, NewResultReg); |
4554 | |
4555 | |
4556 | |
4557 | if (IsTFE) |
4558 | ResultRegs.resize(NumDataRegs); |
4559 | } |
4560 | |
4561 | |
4562 | |
4563 | if (IsD16 && !Ty.isVector()) { |
4564 | B.buildTrunc(DstReg, ResultRegs[0]); |
4565 | return true; |
4566 | } |
4567 | |
4568 | |
4569 | if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { |
4570 | B.buildBitcast(DstReg, ResultRegs[0]); |
4571 | return true; |
4572 | } |
4573 | |
4574 | assert(Ty.isVector()); |
4575 | |
4576 | if (IsD16) { |
4577 | |
4578 | |
4579 | |
4580 | |
4581 | |
4582 | if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { |
4583 | for (Register &Reg : ResultRegs) |
4584 | Reg = B.buildBitcast(V2S16, Reg).getReg(0); |
4585 | } else if (ST.hasUnpackedD16VMem()) { |
4586 | for (Register &Reg : ResultRegs) |
4587 | Reg = B.buildTrunc(S16, Reg).getReg(0); |
4588 | } |
4589 | } |
4590 | |
4591 | auto padWithUndef = [&](LLT Ty, int NumElts) { |
4592 | if (NumElts == 0) |
4593 | return; |
4594 | Register Undef = B.buildUndef(Ty).getReg(0); |
4595 | for (int I = 0; I != NumElts; ++I) |
4596 | ResultRegs.push_back(Undef); |
4597 | }; |
4598 | |
4599 | |
4600 | LLT ResTy = MRI->getType(ResultRegs[0]); |
4601 | if (!ResTy.isVector()) { |
4602 | padWithUndef(ResTy, NumElts - ResultRegs.size()); |
4603 | B.buildBuildVector(DstReg, ResultRegs); |
4604 | return true; |
4605 | } |
4606 | |
4607 | assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); |
4608 | const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; |
4609 | |
4610 | |
4611 | const LLT V3S16 = LLT::fixed_vector(3, 16); |
4612 | if (Ty == V3S16) { |
4613 | padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); |
4614 | auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); |
4615 | B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); |
4616 | return true; |
4617 | } |
4618 | |
4619 | padWithUndef(ResTy, RegsToCover - ResultRegs.size()); |
4620 | B.buildConcatVectors(DstReg, ResultRegs); |
4621 | return true; |
4622 | } |
4623 | |
/// Lower an amdgcn.s.buffer.load intrinsic into the target
/// G_AMDGPU_S_BUFFER_LOAD pseudo, attaching a memory operand and
/// legalizing the result type in place.
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  // Everything below mutates MI in place, so bracket the rewrites with
  // changingInstr/changedInstr notifications for the observer.
  Observer.changingInstr(MI);

  // If this type should be loaded as an equivalently-sized bitcast type
  // instead, rewrite the destination first. The insert point is restored to
  // MI afterwards so the memory-operand/widening changes below still apply
  // at the load itself.
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // Turn the generic intrinsic into the target pseudo and drop the
  // intrinsic-ID operand (operand 1).
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1);

  // Attach a memory operand describing an invariant, dereferenceable load
  // of the whole result, with 4-byte alignment.
  const unsigned MemSize = (Size + 7) / 8; // Round bits up to whole bytes.
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // Widen non-power-of-2 result types (e.g. v3s32) so later stages only
  // see power-of-2 sizes.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}
4673 | |
4674 | |
4675 | bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, |
4676 | MachineRegisterInfo &MRI, |
4677 | MachineIRBuilder &B) const { |
4678 | if (!ST.isTrapHandlerEnabled() || |
4679 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) |
4680 | return legalizeTrapEndpgm(MI, MRI, B); |
4681 | |
4682 | if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { |
4683 | switch (*HsaAbiVer) { |
4684 | case ELF::ELFABIVERSION_AMDGPU_HSA_V2: |
4685 | case ELF::ELFABIVERSION_AMDGPU_HSA_V3: |
4686 | return legalizeTrapHsaQueuePtr(MI, MRI, B); |
4687 | case ELF::ELFABIVERSION_AMDGPU_HSA_V4: |
4688 | return ST.supportsGetDoorbellID() ? |
4689 | legalizeTrapHsa(MI, MRI, B) : |
4690 | legalizeTrapHsaQueuePtr(MI, MRI, B); |
4691 | } |
4692 | } |
4693 | |
4694 | llvm_unreachable("Unknown trap handler"); |
4695 | } |
4696 | |
4697 | bool AMDGPULegalizerInfo::legalizeTrapEndpgm( |
4698 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { |
4699 | B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); |
4700 | MI.eraseFromParent(); |
4701 | return true; |
4702 | } |
4703 | |
4704 | bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( |
4705 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { |
4706 | |
4707 | |
4708 | Register LiveIn = |
4709 | MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); |
4710 | if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) |
4711 | return false; |
4712 | |
4713 | Register SGPR01(AMDGPU::SGPR0_SGPR1); |
4714 | B.buildCopy(SGPR01, LiveIn); |
4715 | B.buildInstr(AMDGPU::S_TRAP) |
4716 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) |
4717 | .addReg(SGPR01, RegState::Implicit); |
4718 | |
4719 | MI.eraseFromParent(); |
4720 | return true; |
4721 | } |
4722 | |
4723 | bool AMDGPULegalizerInfo::legalizeTrapHsa( |
4724 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { |
4725 | B.buildInstr(AMDGPU::S_TRAP) |
4726 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); |
4727 | MI.eraseFromParent(); |
4728 | return true; |
4729 | } |
4730 | |
4731 | bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( |
4732 | MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { |
4733 | |
4734 | |
4735 | if (!ST.isTrapHandlerEnabled() || |
4736 | ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { |
4737 | DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), |
4738 | "debugtrap handler not supported", |
4739 | MI.getDebugLoc(), DS_Warning); |
4740 | LLVMContext &Ctx = B.getMF().getFunction().getContext(); |
4741 | Ctx.diagnose(NoTrap); |
4742 | } else { |
4743 | |
4744 | B.buildInstr(AMDGPU::S_TRAP) |
4745 | .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); |
4746 | } |
4747 | |
4748 | MI.eraseFromParent(); |
4749 | return true; |
4750 | } |
4751 | |
/// Lower amdgcn.image.bvh.intersect.ray into the
/// G_AMDGPU_INTRIN_BVH_INTERSECT_RAY pseudo, flattening all ray operands
/// into 32-bit registers in the order the instruction expects.
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  // Intrinsic operand layout: 0 = dst, 1 = intrinsic ID, then node pointer,
  // ray extent, ray origin, ray direction, inverse direction, and the
  // texture descriptor.
  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  // The BVH intersect-ray instructions require the GFX10 "A" encoding.
  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  // A16: direction components are 16-bit. Is64: 64-bit node pointer.
  // Choose the matching NSA opcode variant.
  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
                          : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
                                 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;

  SmallVector<Register, 12> Ops;
  if (Is64) {
    // Split the 64-bit node pointer into two dwords.
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
  } else {
    Ops.push_back(NodePtr);
  }
  Ops.push_back(RayExtent);

  // Push the first three 32-bit lanes of a 4-element vector; the fourth
  // lane is not pushed.
  auto packLanes = [&Ops, &S32, &B] (Register Src) {
    auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
  };

  packLanes(RayOrigin);
  if (IsA16) {
    // Pack the six 16-bit dir/inv_dir components pairwise into three
    // 32-bit registers: [dir0,dir1], [dir2,inv0], [inv1,inv2].
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  // Build the pseudo: dst, selected opcode, the flattened operands, then
  // the descriptor, an A16 flag, and the original memory operands.
  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
4831 | |
/// Top-level intrinsic legalization dispatch: route each AMDGPU intrinsic
/// to its dedicated legalizer. Returning true means the intrinsic is legal
/// (possibly after rewriting); false fails legalization.
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  // Structurize-control-flow intrinsics: rewrite the brcond consuming the
  // intrinsic result into the SI_IF/SI_ELSE pseudo branch pattern.
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      // A negated condition swaps which successor is the conditional target.
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      // Retarget the existing unconditional branch, or synthesize one if
      // the block relied on fallthrough.
      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        B.buildBr(*CondBrTarget);
      }

      // The pseudo's operands live in the wave-mask register class.
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  // Same rewrite for loop back-edges, using the SI_LOOP pseudo.
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // Non-kernel functions have no kernarg segment; fold to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  // Preloaded-argument intrinsics: each maps to the corresponding ABI
  // argument slot.
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Wavefront size is a subtarget constant; fold it directly.
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  // Buffer stores/loads: the two bool arguments select typed/format
  // handling in the shared legalizers.
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  // All buffer atomics share one legalizer keyed on the intrinsic ID.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    // Image-dimension intrinsics are table-driven; anything else is
    // already legal.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}