Bug Summary

File: src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Warning: line 2822, column 62
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
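
Note on the diagnostic: in C++, shifting a value by an amount greater than or equal to the bit width of its (promoted) type is undefined behaviour, so any shift count of 32 or more on a 32-bit unsigned int is undefined. The snippet below is a minimal, hypothetical sketch of that pattern and of a guarded alternative; it is not taken from AMDGPULegalizerInfo.cpp, the function names are made up for illustration, and the guard assumes unsigned int is 32 bits wide.

#include <cstdint>

// Hypothetical illustration only; not code from the file analyzed above.
uint32_t shiftRightUndefined(uint32_t Value, unsigned Amount) {
  return Value >> Amount; // undefined behaviour if Amount >= 32
}

uint32_t shiftRightGuarded(uint32_t Value, unsigned Amount) {
  // Guarded form: a shift count of 32 or more is treated as shifting
  // everything out, yielding 0, instead of invoking undefined behaviour.
  return Amount < 32 ? (Value >> Amount) : 0;
}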

Annotated Source Code


clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPULegalizerInfo.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model static -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -stack-protector 2 -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17#include "AMDGPUGlobalISelUtils.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/ADT/ScopeExit.h"
23#include "llvm/BinaryFormat/ELF.h"
24#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27#include "llvm/IR/DiagnosticInfo.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30#define DEBUG_TYPE "amdgpu-legalinfo"
31
32using namespace llvm;
33using namespace LegalizeActions;
34using namespace LegalizeMutations;
35using namespace LegalityPredicates;
36using namespace MIPatternMatch;
37
38// Hack until load/store selection patterns support any tuple of legal types.
39static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use "
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
45
46static constexpr unsigned MaxRegisterSize = 1024;
47
48// Round the number of elements to the next power of two elements
49static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53}
54
55// Round the number of bits to the next power of two bits
56static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
60}
61
62/// \returns true if this is an odd sized vector which should be widened by adding an
63/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64/// excludes s1 vectors, which should always be scalarized.
65static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
70
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
76 };
77}
78
79static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
83 };
84}
85
86static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91 };
92}
93
94static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx,
99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100 };
101}
102
103static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104 return [=](const LegalityQuery &Query) {
105 const LLT Ty = Query.Types[TypeIdx];
106 const LLT EltTy = Ty.getElementType();
107 unsigned Size = Ty.getSizeInBits();
108 unsigned Pieces = (Size + 63) / 64;
109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110 return std::make_pair(
111 TypeIdx,
112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113 };
114}
115
116// Increase the number of vector elements to reach the next multiple of 32-bit
117// type.
118static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
121
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
126
127 assert(EltSize < 32);
128
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131 };
132}
133
134static LLT getBitcastRegisterType(const LLT Ty) {
135 const unsigned Size = Ty.getSizeInBits();
136
137 LLT CoercedTy;
138 if (Size <= 32) {
139 // <2 x s8> -> s16
140 // <4 x s8> -> s32
141 return LLT::scalar(Size);
142 }
143
144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145}
146
147static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148 return [=](const LegalityQuery &Query) {
149 const LLT Ty = Query.Types[TypeIdx];
150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151 };
152}
153
154static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155 return [=](const LegalityQuery &Query) {
156 const LLT Ty = Query.Types[TypeIdx];
157 unsigned Size = Ty.getSizeInBits();
158 assert(Size % 32 == 0);
159 return std::make_pair(
160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161 };
162}
163
164static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165 return [=](const LegalityQuery &Query) {
166 const LLT QueryTy = Query.Types[TypeIdx];
167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168 };
169}
170
171static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172 return [=](const LegalityQuery &Query) {
173 const LLT QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175 };
176}
177
178static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182 };
183}
184
185static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
187}
188
189static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
192}
193
194static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
199}
200
201static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
204
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
207
208 return true;
209}
210
211// Any combination of 32 or 64-bit elements up to the maximum register size, and
212// multiples of v2s16.
213static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
216 };
217}
218
219static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226 };
227}
228
229// If we have a truncating store or an extending load with a data size larger
230// than 32-bits, we need to reduce to a 32-bit type.
231static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236 };
237}
238
239// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240// handle some operations by just promoting the register during
241// selection. There are also d16 loads on GFX9+ which preserve the high bits.
242static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
264 }
265}
266
267static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
270
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273
274 unsigned RegSize = Ty.getSizeInBits();
275 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
278
279 // All of these need to be custom lowered to cast the pointer operand.
280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281 return false;
282
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
286
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289#if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < Size)
292 MemSize = std::max(MemSize, Align);
293#endif
294
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
298
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300 return false;
301
302 switch (MemSize) {
303 case 8:
304 case 16:
305 case 32:
306 case 64:
307 case 128:
308 break;
309 case 96:
310 if (!ST.hasDwordx3LoadStores())
311 return false;
312 break;
313 case 256:
314 case 512:
315 // These may contextually need to be broken down.
316 break;
317 default:
318 return false;
319 }
320
321 assert(RegSize >= MemSize);
322
323 if (AlignBits < MemSize) {
324 const SITargetLowering *TLI = ST.getTargetLowering();
325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326 Align(AlignBits / 8)))
327 return false;
328 }
329
330 return true;
331}
332
333// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
334// work around this. Eventually it should ignore the type for loads and only care
335// about the size. Return true in cases where we will work around this for now by
336// bitcasting.
337static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
340
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
346
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
350
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
353}
354
355static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
359}
360
361/// Return true if a load or store of the type should be lowered with a bitcast
362/// to a different type.
363static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364 const LLT MemTy) {
365 const unsigned MemSizeInBits = MemTy.getSizeInBits();
366 const unsigned Size = Ty.getSizeInBits();
367 if (Size != MemSizeInBits)
368 return Size <= 32 && Ty.isVector();
369
370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371 return true;
372
373 // Don't try to handle bitcasting vector ext loads for now.
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375 (Size <= 32 || isRegisterSize(Size)) &&
376 !isRegisterVectorElementType(Ty.getElementType());
377}
378
379/// Return true if we should legalize a load by widening an odd sized memory
380/// access up to the alignment. Note this is the case where the memory access itself
381/// changes, not the size of the result register.
382static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383 unsigned AlignInBits, unsigned AddrSpace,
384 unsigned Opcode) {
385 unsigned SizeInBits = MemoryTy.getSizeInBits();
386 // We don't want to widen cases that are naturally legal.
387 if (isPowerOf2_32(SizeInBits))
388 return false;
389
390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391 // end up widening these for a scalar load during RegBankSelect, since there
392 // aren't 96-bit scalar loads.
393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394 return false;
395
396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397 return false;
398
399 // A load is known dereferenceable up to the alignment, so it's legal to widen
400 // to it.
401 //
402 // TODO: Could check dereferenceable for less aligned cases.
403 unsigned RoundedSize = NextPowerOf2(SizeInBits);
404 if (AlignInBits < RoundedSize)
405 return false;
406
407 // Do not widen if it would introduce a slow unaligned load.
408 const SITargetLowering *TLI = ST.getTargetLowering();
409 bool Fast = false;
410 return TLI->allowsMisalignedMemoryAccessesImpl(
411 RoundedSize, AddrSpace, Align(AlignInBits / 8),
412 MachineMemOperand::MOLoad, &Fast) &&
413 Fast;
414}
415
416static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
420
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
424}
425
426AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
430
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433 };
434
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
448
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 = LLT::fixed_vector(32, 32);
465
466 const LLT V2S64 = LLT::fixed_vector(2, 64);
467 const LLT V3S64 = LLT::fixed_vector(3, 64);
468 const LLT V4S64 = LLT::fixed_vector(4, 64);
469 const LLT V5S64 = LLT::fixed_vector(5, 64);
470 const LLT V6S64 = LLT::fixed_vector(6, 64);
471 const LLT V7S64 = LLT::fixed_vector(7, 64);
472 const LLT V8S64 = LLT::fixed_vector(8, 64);
473 const LLT V16S64 = LLT::fixed_vector(16, 64);
474
475 std::initializer_list<LLT> AllS32Vectors =
476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478 std::initializer_list<LLT> AllS64Vectors =
479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480
481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488
489 const LLT CodePtr = FlatPtr;
490
491 const std::initializer_list<LLT> AddrSpaces64 = {
492 GlobalPtr, ConstantPtr, FlatPtr
493 };
494
495 const std::initializer_list<LLT> AddrSpaces32 = {
496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497 };
498
499 const std::initializer_list<LLT> FPTypesBase = {
500 S32, S64
501 };
502
503 const std::initializer_list<LLT> FPTypes16 = {
504 S32, S64, S16
505 };
506
507 const std::initializer_list<LLT> FPTypesPK16 = {
508 S32, S64, S16, V2S16
509 };
510
511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512
513 // s1 for VCC branches, s32 for SCC branches.
514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515
516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518 getActionDefinitionsBuilder(G_PHI)
519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520 .legalFor(AllS32Vectors)
521 .legalFor(AllS64Vectors)
522 .legalFor(AddrSpaces64)
523 .legalFor(AddrSpaces32)
524 .legalIf(isPointer(0))
525 .clampScalar(0, S16, S256)
526 .widenScalarToNextPow2(0, 32)
527 .clampMaxNumElements(0, S32, 16)
528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529 .scalarize(0);
530
531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532 // Full set of gfx9 features.
533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534 .legalFor({S32, S16, V2S16})
535 .clampScalar(0, S16, S32)
536 .clampMaxNumElements(0, S16, 2)
537 .scalarize(0)
538 .widenScalarToNextPow2(0, 32);
539
540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541 .legalFor({S32, S16, V2S16}) // Clamp modifier
542 .minScalarOrElt(0, S16)
543 .clampMaxNumElements(0, S16, 2)
544 .scalarize(0)
545 .widenScalarToNextPow2(0, 32)
546 .lower();
547 } else if (ST.has16BitInsts()) {
548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549 .legalFor({S32, S16})
550 .clampScalar(0, S16, S32)
551 .scalarize(0)
552 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553
554 // Technically the saturating operations require clamp bit support, but this
555 // was introduced at the same time as 16-bit operations.
556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557 .legalFor({S32, S16}) // Clamp modifier
558 .minScalar(0, S16)
559 .scalarize(0)
560 .widenScalarToNextPow2(0, 16)
561 .lower();
562
563 // We're just lowering this, but it helps get a better result to try to
564 // coerce to the desired type first.
565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566 .minScalar(0, S16)
567 .scalarize(0)
568 .lower();
569 } else {
570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571 .legalFor({S32})
572 .clampScalar(0, S32, S32)
573 .scalarize(0);
574
575 if (ST.hasIntClamp()) {
576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577 .legalFor({S32}) // Clamp modifier.
578 .scalarize(0)
579 .minScalarOrElt(0, S32)
580 .lower();
581 } else {
582 // Clamp bit support was added in VI, along with 16-bit operations.
583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584 .minScalar(0, S32)
585 .scalarize(0)
586 .lower();
587 }
588
589 // FIXME: DAG expansion gets better results. The widening uses the smaller
590 // range values and goes for the min/max lowering directly.
591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592 .minScalar(0, S32)
593 .scalarize(0)
594 .lower();
595 }
596
597 getActionDefinitionsBuilder(
598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599 .customFor({S32, S64})
600 .clampScalar(0, S32, S64)
601 .widenScalarToNextPow2(0, 32)
602 .scalarize(0);
603
604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605 .legalFor({S32})
606 .maxScalarOrElt(0, S32);
607
608 if (ST.hasVOP3PInsts()) {
609 Mulh
610 .clampMaxNumElements(0, S8, 2)
611 .lowerFor({V2S8});
612 }
613
614 Mulh
615 .scalarize(0)
616 .lower();
617
618 // Report legal for any types we can handle anywhere. For the cases only legal
619 // on the SALU, RegBankSelect will be able to re-legalize.
620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622 .clampScalar(0, S32, S64)
623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625 .widenScalarToNextPow2(0)
626 .scalarize(0);
627
628 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630 .legalFor({{S32, S1}, {S32, S32}})
631 .minScalar(0, S32)
632 // TODO: .scalarize(0)
633 .lower();
634
635 getActionDefinitionsBuilder(G_BITCAST)
636 // Don't worry about the size constraint.
637 .legalIf(all(isRegisterType(0), isRegisterType(1)))
638 .lower();
639
640
641 getActionDefinitionsBuilder(G_CONSTANT)
642 .legalFor({S1, S32, S64, S16, GlobalPtr,
643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644 .legalIf(isPointer(0))
645 .clampScalar(0, S32, S64)
646 .widenScalarToNextPow2(0);
647
648 getActionDefinitionsBuilder(G_FCONSTANT)
649 .legalFor({S32, S64, S16})
650 .clampScalar(0, S16, S64);
651
652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653 .legalIf(isRegisterType(0))
654 // s1 and s16 are special cases because they have legal operations on
655 // them, but don't really occupy registers in the normal way.
656 .legalFor({S1, S16})
657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658 .clampScalarOrElt(0, S32, MaxScalar)
659 .widenScalarToNextPow2(0, 32)
660 .clampMaxNumElements(0, S32, 16);
661
662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
663
664 // If the amount is divergent, we have to do a wave reduction to get the
665 // maximum value, so this is expanded during RegBankSelect.
666 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667 .legalFor({{PrivatePtr, S32}});
668
669 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670 .customIf(typeIsNot(0, PrivatePtr));
671
672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
673
674 auto &FPOpActions = getActionDefinitionsBuilder(
675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676 .legalFor({S32, S64});
677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678 .customFor({S32, S64});
679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680 .customFor({S32, S64});
681
682 if (ST.has16BitInsts()) {
683 if (ST.hasVOP3PInsts())
684 FPOpActions.legalFor({S16, V2S16});
685 else
686 FPOpActions.legalFor({S16});
687
688 TrigActions.customFor({S16});
689 FDIVActions.customFor({S16});
690 }
691
692 auto &MinNumMaxNum = getActionDefinitionsBuilder({
693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
694
695 if (ST.hasVOP3PInsts()) {
696 MinNumMaxNum.customFor(FPTypesPK16)
697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698 .clampMaxNumElements(0, S16, 2)
699 .clampScalar(0, S16, S64)
700 .scalarize(0);
701 } else if (ST.has16BitInsts()) {
702 MinNumMaxNum.customFor(FPTypes16)
703 .clampScalar(0, S16, S64)
704 .scalarize(0);
705 } else {
706 MinNumMaxNum.customFor(FPTypesBase)
707 .clampScalar(0, S32, S64)
708 .scalarize(0);
709 }
710
711 if (ST.hasVOP3PInsts())
712 FPOpActions.clampMaxNumElements(0, S16, 2);
713
714 FPOpActions
715 .scalarize(0)
716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
717
718 TrigActions
719 .scalarize(0)
720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
721
722 FDIVActions
723 .scalarize(0)
724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
725
726 getActionDefinitionsBuilder({G_FNEG, G_FABS})
727 .legalFor(FPTypesPK16)
728 .clampMaxNumElements(0, S16, 2)
729 .scalarize(0)
730 .clampScalar(0, S16, S64);
731
732 if (ST.has16BitInsts()) {
733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734 .legalFor({S32, S64, S16})
735 .scalarize(0)
736 .clampScalar(0, S16, S64);
737 } else {
738 getActionDefinitionsBuilder(G_FSQRT)
739 .legalFor({S32, S64})
740 .scalarize(0)
741 .clampScalar(0, S32, S64);
742
743 if (ST.hasFractBug()) {
744 getActionDefinitionsBuilder(G_FFLOOR)
745 .customFor({S64})
746 .legalFor({S32, S64})
747 .scalarize(0)
748 .clampScalar(0, S32, S64);
749 } else {
750 getActionDefinitionsBuilder(G_FFLOOR)
751 .legalFor({S32, S64})
752 .scalarize(0)
753 .clampScalar(0, S32, S64);
754 }
755 }
756
757 getActionDefinitionsBuilder(G_FPTRUNC)
758 .legalFor({{S32, S64}, {S16, S32}})
759 .scalarize(0)
760 .lower();
761
762 getActionDefinitionsBuilder(G_FPEXT)
763 .legalFor({{S64, S32}, {S32, S16}})
764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765 .scalarize(0);
766
767 getActionDefinitionsBuilder(G_FSUB)
768 // Use actual fsub instruction
769 .legalFor({S32})
770 // Must use fadd + fneg
771 .lowerFor({S64, S16, V2S16})
772 .scalarize(0)
773 .clampScalar(0, S32, S64);
774
775 // Whether this is legal depends on the floating point mode for the function.
776 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778 FMad.customFor({S32, S16});
779 else if (ST.hasMadMacF32Insts())
780 FMad.customFor({S32});
781 else if (ST.hasMadF16())
782 FMad.customFor({S16});
783 FMad.scalarize(0)
784 .lower();
785
786 auto &FRem = getActionDefinitionsBuilder(G_FREM);
787 if (ST.has16BitInsts()) {
788 FRem.customFor({S16, S32, S64});
789 } else {
790 FRem.minScalar(0, S32)
791 .customFor({S32, S64});
792 }
793 FRem.scalarize(0);
794
795 // TODO: Do we need to clamp maximum bitwidth?
796 getActionDefinitionsBuilder(G_TRUNC)
797 .legalIf(isScalar(0))
798 .legalFor({{V2S16, V2S32}})
799 .clampMaxNumElements(0, S16, 2)
800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801 // situations (like an invalid implicit use), we don't want to infinite loop
802 // in the legalizer.
803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804 .alwaysLegal();
805
806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808 {S32, S1}, {S64, S1}, {S16, S1}})
809 .scalarize(0)
810 .clampScalar(0, S32, S64)
811 .widenScalarToNextPow2(1, 32);
812
813 // TODO: Split s1->s64 during regbankselect for VALU.
814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816 .lowerFor({{S32, S64}})
817 .lowerIf(typeIs(1, S1))
818 .customFor({{S64, S64}});
819 if (ST.has16BitInsts())
820 IToFP.legalFor({{S16, S16}});
821 IToFP.clampScalar(1, S32, S64)
822 .minScalar(0, S32)
823 .scalarize(0)
824 .widenScalarToNextPow2(1);
825
826 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
827 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
828 .customFor({{S64, S32}, {S64, S64}})
829 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
830 if (ST.has16BitInsts())
831 FPToI.legalFor({{S16, S16}});
832 else
833 FPToI.minScalar(1, S32);
834
835 FPToI.minScalar(0, S32)
836 .widenScalarToNextPow2(0, 32)
837 .scalarize(0)
838 .lower();
839
840 // Lower roundeven into G_FRINT
841 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
842 .scalarize(0)
843 .lower();
844
845 if (ST.has16BitInsts()) {
846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847 .legalFor({S16, S32, S64})
848 .clampScalar(0, S16, S64)
849 .scalarize(0);
850 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852 .legalFor({S32, S64})
853 .clampScalar(0, S32, S64)
854 .scalarize(0);
855 } else {
856 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
857 .legalFor({S32})
858 .customFor({S64})
859 .clampScalar(0, S32, S64)
860 .scalarize(0);
861 }
862
863 getActionDefinitionsBuilder(G_PTR_ADD)
864 .legalIf(all(isPointer(0), sameSize(0, 1)))
865 .scalarize(0)
866 .scalarSameSizeAs(1, 0);
867
868 getActionDefinitionsBuilder(G_PTRMASK)
869 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
870 .scalarSameSizeAs(1, 0)
871 .scalarize(0);
872
873 auto &CmpBuilder =
874 getActionDefinitionsBuilder(G_ICMP)
875 // The compare output type differs based on the register bank of the output,
876 // so make both s1 and s32 legal.
877 //
878 // Scalar compares producing output in scc will be promoted to s32, as that
879 // is the allocatable register type that will be needed for the copy from
880 // scc. This will be promoted during RegBankSelect, and we assume something
881 // before that won't try to use s32 result types.
882 //
883 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
884 // bank.
885 .legalForCartesianProduct(
886 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
887 .legalForCartesianProduct(
888 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
889 if (ST.has16BitInsts()) {
890 CmpBuilder.legalFor({{S1, S16}});
891 }
892
893 CmpBuilder
894 .widenScalarToNextPow2(1)
895 .clampScalar(1, S32, S64)
896 .scalarize(0)
897 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
898
899 getActionDefinitionsBuilder(G_FCMP)
900 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
901 .widenScalarToNextPow2(1)
902 .clampScalar(1, S32, S64)
903 .scalarize(0);
904
905 // FIXME: fpow has a selection pattern that should move to custom lowering.
906 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
907 if (ST.has16BitInsts())
908 Exp2Ops.legalFor({S32, S16});
909 else
910 Exp2Ops.legalFor({S32});
911 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
912 Exp2Ops.scalarize(0);
913
914 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
915 if (ST.has16BitInsts())
916 ExpOps.customFor({{S32}, {S16}});
917 else
918 ExpOps.customFor({S32});
919 ExpOps.clampScalar(0, MinScalarFPTy, S32)
920 .scalarize(0);
921
922 getActionDefinitionsBuilder(G_FPOWI)
923 .clampScalar(0, MinScalarFPTy, S32)
924 .lower();
925
926 // The 64-bit versions produce 32-bit results, but only on the SALU.
927 getActionDefinitionsBuilder(G_CTPOP)
928 .legalFor({{S32, S32}, {S32, S64}})
929 .clampScalar(0, S32, S32)
930 .clampScalar(1, S32, S64)
931 .scalarize(0)
932 .widenScalarToNextPow2(0, 32)
933 .widenScalarToNextPow2(1, 32);
934
935 // The hardware instructions return a different result on 0 than the generic
936 // instructions expect. The hardware produces -1, but these produce the
937 // bitwidth.
938 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
939 .scalarize(0)
940 .clampScalar(0, S32, S32)
941 .clampScalar(1, S32, S64)
942 .widenScalarToNextPow2(0, 32)
943 .widenScalarToNextPow2(1, 32)
944 .lower();
945
946 // The 64-bit versions produce 32-bit results, but only on the SALU.
947 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
948 .legalFor({{S32, S32}, {S32, S64}})
949 .clampScalar(0, S32, S32)
950 .clampScalar(1, S32, S64)
951 .scalarize(0)
952 .widenScalarToNextPow2(0, 32)
953 .widenScalarToNextPow2(1, 32);
954
955 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
956 // RegBankSelect.
957 getActionDefinitionsBuilder(G_BITREVERSE)
958 .legalFor({S32, S64})
959 .clampScalar(0, S32, S64)
960 .scalarize(0)
961 .widenScalarToNextPow2(0);
962
963 if (ST.has16BitInsts()) {
964 getActionDefinitionsBuilder(G_BSWAP)
965 .legalFor({S16, S32, V2S16})
966 .clampMaxNumElements(0, S16, 2)
967 // FIXME: Fixing non-power-of-2 before clamp is workaround for
968 // narrowScalar limitation.
969 .widenScalarToNextPow2(0)
970 .clampScalar(0, S16, S32)
971 .scalarize(0);
972
973 if (ST.hasVOP3PInsts()) {
974 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
975 .legalFor({S32, S16, V2S16})
976 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
977 .clampMaxNumElements(0, S16, 2)
978 .minScalar(0, S16)
979 .widenScalarToNextPow2(0)
980 .scalarize(0)
981 .lower();
982 } else {
983 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
984 .legalFor({S32, S16})
985 .widenScalarToNextPow2(0)
986 .minScalar(0, S16)
987 .scalarize(0)
988 .lower();
989 }
990 } else {
991 // TODO: Should have same legality without v_perm_b32
992 getActionDefinitionsBuilder(G_BSWAP)
993 .legalFor({S32})
994 .lowerIf(scalarNarrowerThan(0, 32))
995 // FIXME: Fixing non-power-of-2 before clamp is workaround for
996 // narrowScalar limitation.
997 .widenScalarToNextPow2(0)
998 .maxScalar(0, S32)
999 .scalarize(0)
1000 .lower();
1001
1002 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1003 .legalFor({S32})
1004 .minScalar(0, S32)
1005 .widenScalarToNextPow2(0)
1006 .scalarize(0)
1007 .lower();
1008 }
1009
1010 getActionDefinitionsBuilder(G_INTTOPTR)
1011 // List the common cases
1012 .legalForCartesianProduct(AddrSpaces64, {S64})
1013 .legalForCartesianProduct(AddrSpaces32, {S32})
1014 .scalarize(0)
1015 // Accept any address space as long as the size matches
1016 .legalIf(sameSize(0, 1))
1017 .widenScalarIf(smallerThan(1, 0),
1018 [](const LegalityQuery &Query) {
1019 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1020 })
1021 .narrowScalarIf(largerThan(1, 0),
1022 [](const LegalityQuery &Query) {
1023 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1024 });
1025
1026 getActionDefinitionsBuilder(G_PTRTOINT)
1027 // List the common cases
1028 .legalForCartesianProduct(AddrSpaces64, {S64})
1029 .legalForCartesianProduct(AddrSpaces32, {S32})
1030 .scalarize(0)
1031 // Accept any address space as long as the size matches
1032 .legalIf(sameSize(0, 1))
1033 .widenScalarIf(smallerThan(0, 1),
1034 [](const LegalityQuery &Query) {
1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036 })
1037 .narrowScalarIf(
1038 largerThan(0, 1),
1039 [](const LegalityQuery &Query) {
1040 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1041 });
1042
1043 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1044 .scalarize(0)
1045 .custom();
1046
1047 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1048 bool IsLoad) -> bool {
1049 const LLT DstTy = Query.Types[0];
1050
1051 // Split vector extloads.
1052 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1053 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1054
1055 if (MemSize < DstTy.getSizeInBits())
1056 MemSize = std::max(MemSize, AlignBits);
1057
1058 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1059 return true;
1060
1061 const LLT PtrTy = Query.Types[1];
1062 unsigned AS = PtrTy.getAddressSpace();
1063 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1064 return true;
1065
1066 // Catch weird sized loads that don't evenly divide into the access sizes
1067 // TODO: May be able to widen depending on alignment etc.
1068 unsigned NumRegs = (MemSize + 31) / 32;
1069 if (NumRegs == 3) {
1070 if (!ST.hasDwordx3LoadStores())
1071 return true;
1072 } else {
1073 // If the alignment allows, these should have been widened.
1074 if (!isPowerOf2_32(NumRegs))
1075 return true;
1076 }
1077
1078 if (AlignBits < MemSize) {
1079 const SITargetLowering *TLI = ST.getTargetLowering();
1080 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1081 Align(AlignBits / 8));
1082 }
1083
1084 return false;
1085 };
1086
1087 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1088 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1089 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1090
1091 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1092 // LDS
1093 // TODO: Unsupported flat for SI.
1094
1095 for (unsigned Op : {G_LOAD, G_STORE}) {
1096 const bool IsStore = Op == G_STORE;
1097
1098 auto &Actions = getActionDefinitionsBuilder(Op);
1099 // Explicitly list some common cases.
1100 // TODO: Does this help compile time at all?
1101 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1102 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1103 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1104 {S64, GlobalPtr, S64, GlobalAlign32},
1105 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1106 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1107 {S32, GlobalPtr, S8, GlobalAlign8},
1108 {S32, GlobalPtr, S16, GlobalAlign16},
1109
1110 {S32, LocalPtr, S32, 32},
1111 {S64, LocalPtr, S64, 32},
1112 {V2S32, LocalPtr, V2S32, 32},
1113 {S32, LocalPtr, S8, 8},
1114 {S32, LocalPtr, S16, 16},
1115 {V2S16, LocalPtr, S32, 32},
1116
1117 {S32, PrivatePtr, S32, 32},
1118 {S32, PrivatePtr, S8, 8},
1119 {S32, PrivatePtr, S16, 16},
1120 {V2S16, PrivatePtr, S32, 32},
1121
1122 {S32, ConstantPtr, S32, GlobalAlign32},
1123 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1124 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1125 {S64, ConstantPtr, S64, GlobalAlign32},
1126 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1127 Actions.legalIf(
1128 [=](const LegalityQuery &Query) -> bool {
1129 return isLoadStoreLegal(ST, Query);
1130 });
1131
1132 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1133 // 64-bits.
1134 //
1135 // TODO: Should generalize bitcast action into coerce, which will also cover
1136 // inserting addrspacecasts.
1137 Actions.customIf(typeIs(1, Constant32Ptr));
1138
1139 // Turn any illegal element vectors into something easier to deal
1140 // with. These will ultimately produce 32-bit scalar shifts to extract the
1141 // parts anyway.
1142 //
1143 // For odd 16-bit element vectors, prefer to split those into pieces with
1144 // 16-bit vector parts.
1145 Actions.bitcastIf(
1146 [=](const LegalityQuery &Query) -> bool {
1147 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1148 Query.MMODescrs[0].MemoryTy);
1149 }, bitcastToRegisterType(0));
1150
1151 if (!IsStore) {
1152 // Widen suitably aligned loads by loading extra bytes. The standard
1153 // legalization actions can't properly express widening memory operands.
1154 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1155 return shouldWidenLoad(ST, Query, G_LOAD);
1156 });
1157 }
1158
1159 // FIXME: load/store narrowing should be moved to lower action
1160 Actions
1161 .narrowScalarIf(
1162 [=](const LegalityQuery &Query) -> bool {
1163 return !Query.Types[0].isVector() &&
1164 needToSplitMemOp(Query, Op == G_LOAD);
1165 },
1166 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1167 const LLT DstTy = Query.Types[0];
1168 const LLT PtrTy = Query.Types[1];
1169
1170 const unsigned DstSize = DstTy.getSizeInBits();
1171 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1172
1173 // Split extloads.
1174 if (DstSize > MemSize)
1175 return std::make_pair(0, LLT::scalar(MemSize));
1176
1177 if (!isPowerOf2_32(DstSize)) {
1178 // We're probably decomposing an odd sized store. Try to split
1179 // to the widest type. TODO: Account for alignment. As-is it
1180 // should be OK, since the new parts will be further legalized.
1181 unsigned FloorSize = PowerOf2Floor(DstSize);
1182 return std::make_pair(0, LLT::scalar(FloorSize));
1183 }
1184
1185 if (DstSize > 32 && (DstSize % 32 != 0)) {
1186 // FIXME: Need a way to specify non-extload of larger size if
1187 // suitably aligned.
1188 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1189 }
1190
1191 unsigned MaxSize = maxSizeForAddrSpace(ST,
1192 PtrTy.getAddressSpace(),
1193 Op == G_LOAD);
1194 if (MemSize > MaxSize)
1195 return std::make_pair(0, LLT::scalar(MaxSize));
1196
1197 unsigned Align = Query.MMODescrs[0].AlignInBits;
1198 return std::make_pair(0, LLT::scalar(Align));
1199 })
1200 .fewerElementsIf(
1201 [=](const LegalityQuery &Query) -> bool {
1202 return Query.Types[0].isVector() &&
1203 needToSplitMemOp(Query, Op == G_LOAD);
1204 },
1205 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1206 const LLT DstTy = Query.Types[0];
1207 const LLT PtrTy = Query.Types[1];
1208
1209 LLT EltTy = DstTy.getElementType();
1210 unsigned MaxSize = maxSizeForAddrSpace(ST,
1211 PtrTy.getAddressSpace(),
1212 Op == G_LOAD);
1213
1214 // FIXME: Handle widened to power of 2 results better. This ends
1215 // up scalarizing.
1216 // FIXME: 3 element stores scalarized on SI
1217
1218 // Split if it's too large for the address space.
1219 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1220 if (MemSize > MaxSize) {
1221 unsigned NumElts = DstTy.getNumElements();
1222 unsigned EltSize = EltTy.getSizeInBits();
1223
1224 if (MaxSize % EltSize == 0) {
1225 return std::make_pair(
1226 0, LLT::scalarOrVector(
1227 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1228 }
1229
1230 unsigned NumPieces = MemSize / MaxSize;
1231
1232 // FIXME: Refine when odd breakdowns handled
1233 // The scalars will need to be re-legalized.
1234 if (NumPieces == 1 || NumPieces >= NumElts ||
1235 NumElts % NumPieces != 0)
1236 return std::make_pair(0, EltTy);
1237
1238 return std::make_pair(
1239 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1240 }
1241
1242 // FIXME: We could probably handle weird extending loads better.
1243 if (DstTy.getSizeInBits() > MemSize)
1244 return std::make_pair(0, EltTy);
1245
1246 unsigned EltSize = EltTy.getSizeInBits();
1247 unsigned DstSize = DstTy.getSizeInBits();
1248 if (!isPowerOf2_32(DstSize)) {
1249 // We're probably decomposing an odd sized store. Try to split
1250 // to the widest type. TODO: Account for alignment. As-is it
1251 // should be OK, since the new parts will be further legalized.
1252 unsigned FloorSize = PowerOf2Floor(DstSize);
1253 return std::make_pair(
1254 0, LLT::scalarOrVector(
1255 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1256 }
1257
1258 // Need to split because of alignment.
1259 unsigned Align = Query.MMODescrs[0].AlignInBits;
1260 if (EltSize > Align &&
1261 (EltSize / Align < DstTy.getNumElements())) {
1262 return std::make_pair(
1263 0, LLT::fixed_vector(EltSize / Align, EltTy));
1264 }
1265
1266 // May need relegalization for the scalars.
1267 return std::make_pair(0, EltTy);
1268 })
1269 .lowerIfMemSizeNotPow2()
1270 .minScalar(0, S32)
1271 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1272 .widenScalarToNextPow2(0)
1273 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1274 .lower();
1275 }
1276
1277 // FIXME: Unaligned accesses not lowered.
1278 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1279 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1280 {S32, GlobalPtr, S16, 2 * 8},
1281 {S32, LocalPtr, S8, 8},
1282 {S32, LocalPtr, S16, 16},
1283 {S32, PrivatePtr, S8, 8},
1284 {S32, PrivatePtr, S16, 16},
1285 {S32, ConstantPtr, S8, 8},
1286 {S32, ConstantPtr, S16, 2 * 8}})
1287 .legalIf(
1288 [=](const LegalityQuery &Query) -> bool {
1289 return isLoadStoreLegal(ST, Query);
1290 });
1291
1292 if (ST.hasFlatAddressSpace()) {
1293 ExtLoads.legalForTypesWithMemDesc(
1294 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1295 }
1296
1297 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1298 // 64-bits.
1299 //
1300 // TODO: Should generalize bitcast action into coerce, which will also cover
1301 // inserting addrspacecasts.
1302 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1303
1304 ExtLoads.clampScalar(0, S32, S32)
1305 .widenScalarToNextPow2(0)
1306 .lower();
1307
1308 auto &Atomics = getActionDefinitionsBuilder(
1309 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1310 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1311 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1312 G_ATOMICRMW_UMIN})
1313 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1314 {S64, GlobalPtr}, {S64, LocalPtr},
1315 {S32, RegionPtr}, {S64, RegionPtr}});
1316 if (ST.hasFlatAddressSpace()) {
1317 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1318 }
1319
1320 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1321 if (ST.hasLDSFPAtomics()) {
1322 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1323 if (ST.hasGFX90AInsts())
1324 Atomic.legalFor({{S64, LocalPtr}});
1325 }
1326 if (ST.hasAtomicFaddInsts())
1327 Atomic.legalFor({{S32, GlobalPtr}});
1328
1329 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1330 // demarshalling
1331 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1332 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1333 {S32, FlatPtr}, {S64, FlatPtr}})
1334 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1335 {S32, RegionPtr}, {S64, RegionPtr}});
1336 // TODO: Pointer types, any 32-bit or 64-bit vector
1337
1338 // Condition should be s32 for scalar, s1 for vector.
1339 getActionDefinitionsBuilder(G_SELECT)
1340 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1341 LocalPtr, FlatPtr, PrivatePtr,
1342 LLT::fixed_vector(2, LocalPtr),
1343 LLT::fixed_vector(2, PrivatePtr)},
1344 {S1, S32})
1345 .clampScalar(0, S16, S64)
1346 .scalarize(1)
1347 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1348 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1349 .clampMaxNumElements(0, S32, 2)
1350 .clampMaxNumElements(0, LocalPtr, 2)
1351 .clampMaxNumElements(0, PrivatePtr, 2)
1352 .scalarize(0)
1353 .widenScalarToNextPow2(0)
1354 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1355
1356 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1357 // be more flexible with the shift amount type.
1358 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1359 .legalFor({{S32, S32}, {S64, S32}});
1360 if (ST.has16BitInsts()) {
1361 if (ST.hasVOP3PInsts()) {
1362 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1363 .clampMaxNumElements(0, S16, 2);
1364 } else
1365 Shifts.legalFor({{S16, S16}});
1366
1367 // TODO: Support 16-bit shift amounts for all types
1368 Shifts.widenScalarIf(
1369 [=](const LegalityQuery &Query) {
1370 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1371 // 32-bit amount.
1372 const LLT ValTy = Query.Types[0];
1373 const LLT AmountTy = Query.Types[1];
1374 return ValTy.getSizeInBits() <= 16 &&
1375 AmountTy.getSizeInBits() < 16;
1376 }, changeTo(1, S16));
1377 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1378 Shifts.clampScalar(1, S32, S32);
1379 Shifts.clampScalar(0, S16, S64);
1380 Shifts.widenScalarToNextPow2(0, 16);
1381
1382 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1383 .minScalar(0, S16)
1384 .scalarize(0)
1385 .lower();
1386 } else {
1387 // Make sure we legalize the shift amount type first, as the general
1388 // expansion for the shifted type will produce much worse code if it hasn't
1389 // been truncated already.
1390 Shifts.clampScalar(1, S32, S32);
1391 Shifts.clampScalar(0, S32, S64);
1392 Shifts.widenScalarToNextPow2(0, 32);
1393
1394 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1395 .minScalar(0, S32)
1396 .scalarize(0)
1397 .lower();
1398 }
1399 Shifts.scalarize(0);
1400
1401 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1402 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1403 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1404 unsigned IdxTypeIdx = 2;
1405
1406 getActionDefinitionsBuilder(Op)
1407 .customIf([=](const LegalityQuery &Query) {
1408 const LLT EltTy = Query.Types[EltTypeIdx];
1409 const LLT VecTy = Query.Types[VecTypeIdx];
1410 const LLT IdxTy = Query.Types[IdxTypeIdx];
1411 const unsigned EltSize = EltTy.getSizeInBits();
1412 return (EltSize == 32 || EltSize == 64) &&
1413 VecTy.getSizeInBits() % 32 == 0 &&
1414 VecTy.getSizeInBits() <= MaxRegisterSize &&
1415 IdxTy.getSizeInBits() == 32;
1416 })
1417 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1418 bitcastToVectorElement32(VecTypeIdx))
1419 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1420 .bitcastIf(
1421 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1422 [=](const LegalityQuery &Query) {
1423 // For > 64-bit element types, try to turn this into a 64-bit
1424 // element vector since we may be able to do better indexing
1425 // if this is scalar. If not, fall back to 32.
1426 const LLT EltTy = Query.Types[EltTypeIdx];
1427 const LLT VecTy = Query.Types[VecTypeIdx];
1428 const unsigned DstEltSize = EltTy.getSizeInBits();
1429 const unsigned VecSize = VecTy.getSizeInBits();
1430
1431 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1432 return std::make_pair(
1433 VecTypeIdx,
1434 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1435 })
1436 .clampScalar(EltTypeIdx, S32, S64)
1437 .clampScalar(VecTypeIdx, S32, S64)
1438 .clampScalar(IdxTypeIdx, S32, S32)
1439 .clampMaxNumElements(VecTypeIdx, S32, 32)
1440 // TODO: Clamp elements for 64-bit vectors?
1441 // It should only be necessary with variable indexes.
1442 // As a last resort, lower to the stack
1443 .lower();
1444 }
1445
1446 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1447 .unsupportedIf([=](const LegalityQuery &Query) {
1448 const LLT &EltTy = Query.Types[1].getElementType();
1449 return Query.Types[0] != EltTy;
1450 });
1451
1452 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1453 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1454 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1455
1456 // FIXME: Doesn't handle extract of illegal sizes.
1457 getActionDefinitionsBuilder(Op)
1458 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1459 // FIXME: Multiples of 16 should not be legal.
1460 .legalIf([=](const LegalityQuery &Query) {
1461 const LLT BigTy = Query.Types[BigTyIdx];
1462 const LLT LitTy = Query.Types[LitTyIdx];
1463 return (BigTy.getSizeInBits() % 32 == 0) &&
1464 (LitTy.getSizeInBits() % 16 == 0);
1465 })
1466 .widenScalarIf(
1467 [=](const LegalityQuery &Query) {
1468 const LLT BigTy = Query.Types[BigTyIdx];
1469 return (BigTy.getScalarSizeInBits() < 16);
1470 },
1471 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1472 .widenScalarIf(
1473 [=](const LegalityQuery &Query) {
1474 const LLT LitTy = Query.Types[LitTyIdx];
1475 return (LitTy.getScalarSizeInBits() < 16);
1476 },
1477 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1478 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1479 .widenScalarToNextPow2(BigTyIdx, 32);
1480
1481 }
1482
1483 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1484 .legalForCartesianProduct(AllS32Vectors, {S32})
1485 .legalForCartesianProduct(AllS64Vectors, {S64})
1486 .clampNumElements(0, V16S32, V32S32)
1487 .clampNumElements(0, V2S64, V16S64)
1488 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1489
1490 if (ST.hasScalarPackInsts()) {
1491 BuildVector
1492 // FIXME: Should probably widen s1 vectors straight to s32
1493 .minScalarOrElt(0, S16)
1494 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1495 .minScalar(1, S32);
1496
1497 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1498 .legalFor({V2S16, S32})
1499 .lower();
1500 BuildVector.minScalarOrElt(0, S32);
1501 } else {
1502 BuildVector.customFor({V2S16, S16});
1503 BuildVector.minScalarOrElt(0, S32);
1504
1505 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1506 .customFor({V2S16, S32})
1507 .lower();
1508 }
1509
1510 BuildVector.legalIf(isRegisterType(0));
1511
1512 // FIXME: Clamp maximum size
1513 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1514 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1515 .clampMaxNumElements(0, S32, 32)
1516 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1517 .clampMaxNumElements(0, S16, 64);
1518
1519 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1520 // pre-legalize.
1521 if (ST.hasVOP3PInsts()) {
1522 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1523 .customFor({V2S16, V2S16})
1524 .lower();
1525 } else
1526 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1527
1528 // Merge/Unmerge
1529 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1530 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1531 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1532
1533 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1534 const LLT Ty = Query.Types[TypeIdx];
1535 if (Ty.isVector()) {
1536 const LLT &EltTy = Ty.getElementType();
1537 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1538 return true;
1539 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1540 return true;
1541 }
1542 return false;
1543 };
1544
1545 auto &Builder = getActionDefinitionsBuilder(Op)
1546 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1547 .lowerFor({{S16, V2S16}})
1548 .lowerIf([=](const LegalityQuery &Query) {
1549 const LLT BigTy = Query.Types[BigTyIdx];
1550 return BigTy.getSizeInBits() == 32;
1551 })
1552 // Try to widen to s16 first for small types.
1553 // TODO: Only do this on targets with legal s16 shifts
1554 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1555 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1556 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1557 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1558 elementTypeIs(1, S16)),
1559 changeTo(1, V2S16))
1560 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1561 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1562 // valid.
1563 .clampScalar(LitTyIdx, S32, S512)
1564 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1565 // Break up vectors with weird elements into scalars
1566 .fewerElementsIf(
1567 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1568 scalarize(0))
1569 .fewerElementsIf(
1570 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1571 scalarize(1))
1572 .clampScalar(BigTyIdx, S32, MaxScalar);
1573
1574 if (Op == G_MERGE_VALUES) {
1575 Builder.widenScalarIf(
1576 // TODO: Use 16-bit shifts if legal for 8-bit values?
1577 [=](const LegalityQuery &Query) {
1578 const LLT Ty = Query.Types[LitTyIdx];
1579 return Ty.getSizeInBits() < 32;
1580 },
1581 changeTo(LitTyIdx, S32));
1582 }
1583
1584 Builder.widenScalarIf(
1585 [=](const LegalityQuery &Query) {
1586 const LLT Ty = Query.Types[BigTyIdx];
1587 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1588 Ty.getSizeInBits() % 16 != 0;
1589 },
1590 [=](const LegalityQuery &Query) {
1591 // Pick the next power of 2, or a multiple of 64 over 128.
1592 // Whichever is smaller.
1593 const LLT &Ty = Query.Types[BigTyIdx];
1594 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1595 if (NewSizeInBits >= 256) {
1596 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1597 if (RoundedTo < NewSizeInBits)
1598 NewSizeInBits = RoundedTo;
1599 }
1600 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1601 })
1602 // Any vectors left are the wrong size. Scalarize them.
1603 .scalarize(0)
1604 .scalarize(1);
1605 }
1606
1607 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1608 // RegBankSelect.
1609 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1610 .legalFor({{S32}, {S64}});
1611
1612 if (ST.hasVOP3PInsts()) {
1613 SextInReg.lowerFor({{V2S16}})
1614 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1615 // get more vector shift opportunities, since we'll get those when
1616 // expanded.
1617 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1618 } else if (ST.has16BitInsts()) {
1619 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1620 } else {
1621 // Prefer to promote to s32 before lowering if we don't have 16-bit
1622 // shifts. This avoids a lot of intermediate truncate and extend operations.
1623 SextInReg.lowerFor({{S32}, {S64}});
1624 }
1625
1626 SextInReg
1627 .scalarize(0)
1628 .clampScalar(0, S32, S64)
1629 .lower();
1630
1631 // TODO: Only try to form v2s16 with legal packed instructions.
1632 getActionDefinitionsBuilder(G_FSHR)
1633 .legalFor({{S32, S32}})
1634 .lowerFor({{V2S16, V2S16}})
1635 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1636 .scalarize(0)
1637 .lower();
1638
1639 if (ST.hasVOP3PInsts()) {
1640 getActionDefinitionsBuilder(G_FSHL)
1641 .lowerFor({{V2S16, V2S16}})
1642 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1643 .scalarize(0)
1644 .lower();
1645 } else {
1646 getActionDefinitionsBuilder(G_FSHL)
1647 .scalarize(0)
1648 .lower();
1649 }
1650
1651 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1652 .legalFor({S64});
1653
1654 getActionDefinitionsBuilder(G_FENCE)
1655 .alwaysLegal();
1656
1657 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1658 .scalarize(0)
1659 .minScalar(0, S32)
1660 .lower();
1661
1662 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1663 .legalFor({{S32, S32}, {S64, S32}})
1664 .clampScalar(1, S32, S32)
1665 .clampScalar(0, S32, S64)
1666 .widenScalarToNextPow2(0)
1667 .scalarize(0);
1668
1669 getActionDefinitionsBuilder({
1670 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1671 G_FCOPYSIGN,
1672
1673 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1674 G_ATOMICRMW_NAND,
1675 G_ATOMICRMW_FSUB,
1676 G_READ_REGISTER,
1677 G_WRITE_REGISTER,
1678
1679 G_SADDO, G_SSUBO,
1680
1681 // TODO: Implement
1682 G_FMINIMUM, G_FMAXIMUM}).lower();
1683
1684 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1685 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1686 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1687 .unsupported();
1688
1689 getLegacyLegalizerInfo().computeTables();
1690 verify(*ST.getInstrInfo());
1691}
1692
1693bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1694 MachineInstr &MI) const {
1695 MachineIRBuilder &B = Helper.MIRBuilder;
1696 MachineRegisterInfo &MRI = *B.getMRI();
1697
1698 switch (MI.getOpcode()) {
1699 case TargetOpcode::G_ADDRSPACE_CAST:
1700 return legalizeAddrSpaceCast(MI, MRI, B);
1701 case TargetOpcode::G_FRINT:
1702 return legalizeFrint(MI, MRI, B);
1703 case TargetOpcode::G_FCEIL:
1704 return legalizeFceil(MI, MRI, B);
1705 case TargetOpcode::G_FREM:
1706 return legalizeFrem(MI, MRI, B);
1707 case TargetOpcode::G_INTRINSIC_TRUNC:
1708 return legalizeIntrinsicTrunc(MI, MRI, B);
1709 case TargetOpcode::G_SITOFP:
1710 return legalizeITOFP(MI, MRI, B, true);
1711 case TargetOpcode::G_UITOFP:
1712 return legalizeITOFP(MI, MRI, B, false);
1713 case TargetOpcode::G_FPTOSI:
1714 return legalizeFPTOI(MI, MRI, B, true);
1715 case TargetOpcode::G_FPTOUI:
1716 return legalizeFPTOI(MI, MRI, B, false);
1717 case TargetOpcode::G_FMINNUM:
1718 case TargetOpcode::G_FMAXNUM:
1719 case TargetOpcode::G_FMINNUM_IEEE:
1720 case TargetOpcode::G_FMAXNUM_IEEE:
1721 return legalizeMinNumMaxNum(Helper, MI);
1722 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1723 return legalizeExtractVectorElt(MI, MRI, B);
1724 case TargetOpcode::G_INSERT_VECTOR_ELT:
1725 return legalizeInsertVectorElt(MI, MRI, B);
1726 case TargetOpcode::G_SHUFFLE_VECTOR:
1727 return legalizeShuffleVector(MI, MRI, B);
1728 case TargetOpcode::G_FSIN:
1729 case TargetOpcode::G_FCOS:
1730 return legalizeSinCos(MI, MRI, B);
1731 case TargetOpcode::G_GLOBAL_VALUE:
1732 return legalizeGlobalValue(MI, MRI, B);
1733 case TargetOpcode::G_LOAD:
1734 case TargetOpcode::G_SEXTLOAD:
1735 case TargetOpcode::G_ZEXTLOAD:
1736 return legalizeLoad(Helper, MI);
1737 case TargetOpcode::G_FMAD:
1738 return legalizeFMad(MI, MRI, B);
1739 case TargetOpcode::G_FDIV:
1740 return legalizeFDIV(MI, MRI, B);
1741 case TargetOpcode::G_UDIV:
1742 case TargetOpcode::G_UREM:
1743 case TargetOpcode::G_UDIVREM:
1744 return legalizeUnsignedDIV_REM(MI, MRI, B);
1745 case TargetOpcode::G_SDIV:
1746 case TargetOpcode::G_SREM:
1747 case TargetOpcode::G_SDIVREM:
1748 return legalizeSignedDIV_REM(MI, MRI, B);
1749 case TargetOpcode::G_ATOMIC_CMPXCHG:
1750 return legalizeAtomicCmpXChg(MI, MRI, B);
1751 case TargetOpcode::G_FLOG:
1752 return legalizeFlog(MI, B, numbers::ln2f);
1753 case TargetOpcode::G_FLOG10:
1754 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1755 case TargetOpcode::G_FEXP:
1756 return legalizeFExp(MI, B);
1757 case TargetOpcode::G_FPOW:
1758 return legalizeFPow(MI, B);
1759 case TargetOpcode::G_FFLOOR:
1760 return legalizeFFloor(MI, MRI, B);
1761 case TargetOpcode::G_BUILD_VECTOR:
1762 return legalizeBuildVector(MI, MRI, B);
1763 default:
1764 return false;
1765 }
1766
1767 llvm_unreachable("expected switch to return");
1768}
1769
1770Register AMDGPULegalizerInfo::getSegmentAperture(
1771 unsigned AS,
1772 MachineRegisterInfo &MRI,
1773 MachineIRBuilder &B) const {
1774 MachineFunction &MF = B.getMF();
1775 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1776 const LLT S32 = LLT::scalar(32);
1777
1778 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1779
1780 if (ST.hasApertureRegs()) {
1781 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1782 // getreg.
1783 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1784 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1785 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1786 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1787 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1788 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1789 unsigned Encoding =
1790 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1791 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1792 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1793
1794 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1795
1796 B.buildInstr(AMDGPU::S_GETREG_B32)
1797 .addDef(GetReg)
1798 .addImm(Encoding);
1799 MRI.setType(GetReg, S32);
1800
1801 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1802 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1803 }
1804
1805 Register QueuePtr = MRI.createGenericVirtualRegister(
1806 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1807
1808 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1809 return Register();
1810
1811 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1812 // private_segment_aperture_base_hi.
1813 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1814
1815 // TODO: can we be smarter about machine pointer info?
1816 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1817 MachineMemOperand *MMO = MF.getMachineMemOperand(
1818 PtrInfo,
1819 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1820 MachineMemOperand::MOInvariant,
1821 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1822
1823 Register LoadAddr;
1824
1825 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1826 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1827}
1828
1829bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1830 MachineInstr &MI, MachineRegisterInfo &MRI,
1831 MachineIRBuilder &B) const {
1832 MachineFunction &MF = B.getMF();
1833
1834 const LLT S32 = LLT::scalar(32);
1835 Register Dst = MI.getOperand(0).getReg();
1836 Register Src = MI.getOperand(1).getReg();
1837
1838 LLT DstTy = MRI.getType(Dst);
1839 LLT SrcTy = MRI.getType(Src);
1840 unsigned DestAS = DstTy.getAddressSpace();
1841 unsigned SrcAS = SrcTy.getAddressSpace();
1842
1843 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1844 // vector element.
1845 assert(!DstTy.isVector());
1846
1847 const AMDGPUTargetMachine &TM
1848 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1849
1850 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1851 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1852 return true;
1853 }
1854
1855 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1856 // Truncate.
1857 B.buildExtract(Dst, Src, 0);
1858 MI.eraseFromParent();
1859 return true;
1860 }
1861
1862 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1863 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1864 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1865
1866 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1867 // another. Merge operands are required to be the same type, but creating an
1868 // extra ptrtoint would be kind of pointless.
1869 auto HighAddr = B.buildConstant(
1870 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1871 B.buildMerge(Dst, {Src, HighAddr});
1872 MI.eraseFromParent();
1873 return true;
1874 }
1875
1876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1877 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1878 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1879 unsigned NullVal = TM.getNullPointerValue(DestAS);
1880
1881 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1882 auto FlatNull = B.buildConstant(SrcTy, 0);
1883
1884 // Extract low 32-bits of the pointer.
1885 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1886
1887 auto CmpRes =
1888 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1889 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1890
1891 MI.eraseFromParent();
1892 return true;
1893 }
1894
1895 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1896 return false;
1897
1898 if (!ST.hasFlatAddressSpace())
1899 return false;
1900
1901 auto SegmentNull =
1902 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1903 auto FlatNull =
1904 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1905
1906 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1907 if (!ApertureReg.isValid())
1908 return false;
1909
1910 auto CmpRes =
1911 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1912
1913 // Coerce the type of the low half of the result so we can use merge_values.
1914 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1915
1916 // TODO: Should we allow mismatched types but matching sizes in merges to
1917 // avoid the ptrtoint?
1918 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1919 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1920
1921 MI.eraseFromParent();
1922 return true;
1923}
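
For orientation, a host-side sketch of the segment-to-flat case just built: a null segment pointer maps to the flat null value, and any other pointer gets the 32-bit aperture base as its high half. The helper and parameter names below are illustrative, not from this file.

#include <cstdint>
// Sketch only: models buildMerge(DstTy, {SrcAsInt, ApertureReg}) plus the
// null-pointer select above.
static uint64_t segmentToFlatModel(uint32_t Seg, uint32_t SegNull,
                                   uint32_t ApertureHi, uint64_t FlatNull) {
  if (Seg == SegNull)
    return FlatNull;                          // null stays null across the cast
  return (uint64_t(ApertureHi) << 32) | Seg;  // {low = Seg, high = aperture}
}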
1924
1925bool AMDGPULegalizerInfo::legalizeFrint(
1926 MachineInstr &MI, MachineRegisterInfo &MRI,
1927 MachineIRBuilder &B) const {
1928 Register Src = MI.getOperand(1).getReg();
1929 LLT Ty = MRI.getType(Src);
1930 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1931
1932 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1933 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1934
1935 auto C1 = B.buildFConstant(Ty, C1Val);
1936 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1937
1938 // TODO: Should this propagate fast-math-flags?
1939 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1940 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1941
1942 auto C2 = B.buildFConstant(Ty, C2Val);
1943 auto Fabs = B.buildFAbs(Ty, Src);
1944
1945 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1946 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1947 MI.eraseFromParent();
1948 return true;
1949}
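
As a reference for the sequence above, a scalar sketch of the same rounding trick (assumes round-to-nearest-even addition; frint_model is an illustrative name, not part of this file):

#include <cmath>
static double frint_model(double x) {
  const double C1 = 0x1.0p+52;              // adding 2^52 forces rounding of the fraction
  const double C2 = 0x1.fffffffffffffp+51;  // anything larger is already integral
  double T = std::copysign(C1, x);
  double Rounded = (x + T) - T;             // the Tmp1/Tmp2 pair above
  return std::fabs(x) > C2 ? x : Rounded;   // the final select
}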
1950
1951bool AMDGPULegalizerInfo::legalizeFceil(
1952 MachineInstr &MI, MachineRegisterInfo &MRI,
1953 MachineIRBuilder &B) const {
1954
1955 const LLT S1 = LLT::scalar(1);
1956 const LLT S64 = LLT::scalar(64);
1957
1958 Register Src = MI.getOperand(1).getReg();
1959 assert(MRI.getType(Src) == S64);
1960
1961 // result = trunc(src)
1962 // if (src > 0.0 && src != result)
1963 // result += 1.0
1964
1965 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1966
1967 const auto Zero = B.buildFConstant(S64, 0.0);
1968 const auto One = B.buildFConstant(S64, 1.0);
1969 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1970 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1971 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1972 auto Add = B.buildSelect(S64, And, One, Zero);
1973
1974 // TODO: Should this propagate fast-math-flags?
1975 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1976 return true;
1977}
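
The pseudocode in the comment above, written out as a plain scalar sketch (illustrative only):

#include <cmath>
static double fceil_model(double x) {
  double T = std::trunc(x);
  double Add = (x > 0.0 && x != T) ? 1.0 : 0.0;  // the And/Select pair above
  return T + Add;
}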
1978
1979bool AMDGPULegalizerInfo::legalizeFrem(
1980 MachineInstr &MI, MachineRegisterInfo &MRI,
1981 MachineIRBuilder &B) const {
1982 Register DstReg = MI.getOperand(0).getReg();
1983 Register Src0Reg = MI.getOperand(1).getReg();
1984 Register Src1Reg = MI.getOperand(2).getReg();
1985 auto Flags = MI.getFlags();
1986 LLT Ty = MRI.getType(DstReg);
1987
1988 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1989 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1990 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1991 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1992 MI.eraseFromParent();
1993 return true;
1994}
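
A minimal sketch of the expansion above: frem is rewritten as x - trunc(x / y) * y, with the subtraction folded into an FMA (frem_model is an illustrative name):

#include <cmath>
static double frem_model(double x, double y) {
  double Q = std::trunc(x / y);
  return std::fma(-Q, y, x);   // -Q * y + x, matching buildFMA(Neg, Src1, Src0)
}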
1995
1996static MachineInstrBuilder extractF64Exponent(Register Hi,
1997 MachineIRBuilder &B) {
1998 const unsigned FractBits = 52;
1999 const unsigned ExpBits = 11;
2000 LLT S32 = LLT::scalar(32);
2001
2002 auto Const0 = B.buildConstant(S32, FractBits - 32);
2003 auto Const1 = B.buildConstant(S32, ExpBits);
2004
2005 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2006 .addUse(Hi)
2007 .addUse(Const0.getReg(0))
2008 .addUse(Const1.getReg(0));
2009
2010 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2011}
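
What the ubfe-based sequence computes, as a host-side sketch: the 11 exponent bits start at bit 20 of the high word (FractBits - 32), and 1023 is the IEEE-754 double bias (exponentOfModel is an illustrative name):

#include <cstdint>
#include <cstring>
static int exponentOfModel(double V) {
  uint64_t Bits;
  std::memcpy(&Bits, &V, sizeof(Bits));
  uint32_t Hi = uint32_t(Bits >> 32);     // the Unmerge high half
  return int((Hi >> 20) & 0x7ff) - 1023;  // ubfe(Hi, 20, 11) - 1023
}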
2012
2013bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2014 MachineInstr &MI, MachineRegisterInfo &MRI,
2015 MachineIRBuilder &B) const {
2016 const LLT S1 = LLT::scalar(1);
2017 const LLT S32 = LLT::scalar(32);
2018 const LLT S64 = LLT::scalar(64);
2019
2020 Register Src = MI.getOperand(1).getReg();
2021 assert(MRI.getType(Src) == S64);
2022
2023 // TODO: Should this use extract since the low half is unused?
2024 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2025 Register Hi = Unmerge.getReg(1);
2026
2027 // Extract the upper half, since this is where we will find the sign and
2028 // exponent.
2029 auto Exp = extractF64Exponent(Hi, B);
2030
2031 const unsigned FractBits = 52;
2032
2033 // Extract the sign bit.
2034 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2035 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2036
2037 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2038
2039 const auto Zero32 = B.buildConstant(S32, 0);
2040
2041 // Extend back to 64-bits.
2042 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2043
2044 auto Shr = B.buildAShr(S64, FractMask, Exp);
2045 auto Not = B.buildNot(S64, Shr);
2046 auto Tmp0 = B.buildAnd(S64, Src, Not);
2047 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2048
2049 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2050 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2051
2052 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2053 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2054 MI.eraseFromParent();
2055 return true;
2056}
2057
2058bool AMDGPULegalizerInfo::legalizeITOFP(
2059 MachineInstr &MI, MachineRegisterInfo &MRI,
2060 MachineIRBuilder &B, bool Signed) const {
2061
2062 Register Dst = MI.getOperand(0).getReg();
2063 Register Src = MI.getOperand(1).getReg();
2064
2065 const LLT S64 = LLT::scalar(64);
2066 const LLT S32 = LLT::scalar(32);
2067
2068 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2069
2070 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2071
2072 auto CvtHi = Signed ?
2073 B.buildSITOFP(S64, Unmerge.getReg(1)) :
2074 B.buildUITOFP(S64, Unmerge.getReg(1));
2075
2076 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2077
2078 auto ThirtyTwo = B.buildConstant(S32, 32);
2079 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2080 .addUse(CvtHi.getReg(0))
2081 .addUse(ThirtyTwo.getReg(0));
2082
2083 // TODO: Should this propagate fast-math-flags?
2084 B.buildFAdd(Dst, LdExp, CvtLo);
2085 MI.eraseFromParent();
2086 return true;
2087}
2088
2089// TODO: Copied from DAG implementation. Verify logic and document how this
2090// actually works.
2091bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2092 MachineRegisterInfo &MRI,
2093 MachineIRBuilder &B,
2094 bool Signed) const {
2095
2096 Register Dst = MI.getOperand(0).getReg();
2097 Register Src = MI.getOperand(1).getReg();
2098
2099 const LLT S64 = LLT::scalar(64);
2100 const LLT S32 = LLT::scalar(32);
2101
2102 const LLT SrcLT = MRI.getType(Src);
2103 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2104
2105 unsigned Flags = MI.getFlags();
2106
2107 // The basic idea of converting a floating point number into a pair of 32-bit
2108 // integers is illustrated as follows:
2109 //
2110 // tf := trunc(val);
2111 // hif := floor(tf * 2^-32);
2112 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2113 // hi := fptoi(hif);
2114 // lo := fptoi(lof);
2115 //
2116 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2117 MachineInstrBuilder Sign;
2118 if (Signed && SrcLT == S32) {
2119 // However, a 32-bit floating point number has only 23 bits mantissa and
2120 // it's not enough to hold all the significant bits of `lof` if val is
2121 // negative. To avoid the loss of precision, we need to take the absolute
2122 // value after truncating and flip the result back based on the original
2123 // signedness.
2124 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2125 Trunc = B.buildFAbs(S32, Trunc, Flags);
2126 }
2127 MachineInstrBuilder K0, K1;
2128 if (SrcLT == S64) {
2129 K0 = B.buildFConstant(S64,
2130 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2131 K1 = B.buildFConstant(S64,
2132 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2133 } else {
2134 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2135 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2136 }
2137
2138 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2139 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2140 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2141
2142 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2143 : B.buildFPTOUI(S32, FloorMul);
2144 auto Lo = B.buildFPTOUI(S32, Fma);
2145
2146 if (Signed && SrcLT == S32) {
2147 // Flip the result based on the signedness, which is either all 0s or 1s.
2148 Sign = B.buildMerge(S64, {Sign, Sign});
2149 // r := xor({lo, hi}, sign) - sign;
2150 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2151 } else
2152 B.buildMerge(Dst, {Lo, Hi});
2153 MI.eraseFromParent();
2154
2155 return true;
2156}
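
A scalar sketch of the hi/lo split described in the comment above, for the unsigned f64 case only and assuming the value already fits in 64 bits (fptoui64_model is an illustrative name):

#include <cmath>
#include <cstdint>
static uint64_t fptoui64_model(double Val) {
  double Tf  = std::trunc(Val);
  double Hif = std::floor(Tf * 0x1.0p-32);     // K0: 2^-32
  double Lof = std::fma(Hif, -0x1.0p+32, Tf);  // K1: -(2^32); Lof >= 0 by construction
  return (uint64_t(uint32_t(Hif)) << 32) | uint64_t(uint32_t(Lof));
}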
2157
2158bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2159 MachineInstr &MI) const {
2160 MachineFunction &MF = Helper.MIRBuilder.getMF();
2161 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2162
2163 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2164 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2165
2166 // With ieee_mode disabled, the instructions have the correct behavior
2167 // already for G_FMINNUM/G_FMAXNUM
2168 if (!MFI->getMode().IEEE)
2169 return !IsIEEEOp;
2170
2171 if (IsIEEEOp)
2172 return true;
2173
2174 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2175}
2176
2177bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2178 MachineInstr &MI, MachineRegisterInfo &MRI,
2179 MachineIRBuilder &B) const {
2180 // TODO: Should move some of this into LegalizerHelper.
2181
2182 // TODO: Promote dynamic indexing of s16 to s32
2183
2184 // FIXME: Artifact combiner probably should have replaced the truncated
2185 // constant before this, so we shouldn't need
2186 // getConstantVRegValWithLookThrough.
2187 Optional<ValueAndVReg> MaybeIdxVal =
2188 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2189 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2190 return true;
2191 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2192
2193 Register Dst = MI.getOperand(0).getReg();
2194 Register Vec = MI.getOperand(1).getReg();
2195
2196 LLT VecTy = MRI.getType(Vec);
2197 LLT EltTy = VecTy.getElementType();
2198 assert(EltTy == MRI.getType(Dst));
2199
2200 if (IdxVal < VecTy.getNumElements())
2201 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2202 else
2203 B.buildUndef(Dst);
2204
2205 MI.eraseFromParent();
2206 return true;
2207}
2208
2209bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2210 MachineInstr &MI, MachineRegisterInfo &MRI,
2211 MachineIRBuilder &B) const {
2212 // TODO: Should move some of this into LegalizerHelper.
2213
2214 // TODO: Promote dynamic indexing of s16 to s32
2215
2216 // FIXME: Artifact combiner probably should have replaced the truncated
2217 // constant before this, so we shouldn't need
2218 // getConstantVRegValWithLookThrough.
2219 Optional<ValueAndVReg> MaybeIdxVal =
2220 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2221 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2222 return true;
2223
2224 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2225 Register Dst = MI.getOperand(0).getReg();
2226 Register Vec = MI.getOperand(1).getReg();
2227 Register Ins = MI.getOperand(2).getReg();
2228
2229 LLT VecTy = MRI.getType(Vec);
2230 LLT EltTy = VecTy.getElementType();
2231 assert(EltTy == MRI.getType(Ins));
2232
2233 if (IdxVal < VecTy.getNumElements())
2234 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2235 else
2236 B.buildUndef(Dst);
2237
2238 MI.eraseFromParent();
2239 return true;
2240}
2241
2242bool AMDGPULegalizerInfo::legalizeShuffleVector(
2243 MachineInstr &MI, MachineRegisterInfo &MRI,
2244 MachineIRBuilder &B) const {
2245 const LLT V2S16 = LLT::fixed_vector(2, 16);
2246
2247 Register Dst = MI.getOperand(0).getReg();
2248 Register Src0 = MI.getOperand(1).getReg();
2249 LLT DstTy = MRI.getType(Dst);
2250 LLT SrcTy = MRI.getType(Src0);
2251
2252 if (SrcTy == V2S16 && DstTy == V2S16 &&
2253 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2254 return true;
2255
2256 MachineIRBuilder HelperBuilder(MI);
2257 GISelObserverWrapper DummyObserver;
2258 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2259 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2260}
2261
2262bool AMDGPULegalizerInfo::legalizeSinCos(
2263 MachineInstr &MI, MachineRegisterInfo &MRI,
2264 MachineIRBuilder &B) const {
2265
2266 Register DstReg = MI.getOperand(0).getReg();
2267 Register SrcReg = MI.getOperand(1).getReg();
2268 LLT Ty = MRI.getType(DstReg);
2269 unsigned Flags = MI.getFlags();
2270
2271 Register TrigVal;
2272 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2273 if (ST.hasTrigReducedRange()) {
2274 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2275 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2276 .addUse(MulVal.getReg(0))
2277 .setMIFlags(Flags).getReg(0);
2278 } else
2279 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2280
2281 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2282 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2283 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2284 .addUse(TrigVal)
2285 .setMIFlags(Flags);
2286 MI.eraseFromParent();
2287 return true;
2288}
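
A sketch of the operand pre-scaling above, under the assumption that amdgcn_sin/amdgcn_cos expect an argument in units of full turns and that amdgcn_fract behaves like x - floor(x) (trigInputModel is an illustrative name):

#include <cmath>
static float trigInputModel(float X, bool HasTrigReducedRange) {
  float Scaled = X * 0.15915494309189535f;                  // 0.5 * 1/pi
  return HasTrigReducedRange ? Scaled - std::floor(Scaled)  // the fract step
                             : Scaled;
}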
2289
2290bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2291 MachineIRBuilder &B,
2292 const GlobalValue *GV,
2293 int64_t Offset,
2294 unsigned GAFlags) const {
2295 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2296 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2297 // to the following code sequence:
2298 //
2299 // For constant address space:
2300 // s_getpc_b64 s[0:1]
2301 // s_add_u32 s0, s0, $symbol
2302 // s_addc_u32 s1, s1, 0
2303 //
2304 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2305 // a fixup or relocation is emitted to replace $symbol with a literal
2306 // constant, which is a pc-relative offset from the encoding of the $symbol
2307 // operand to the global variable.
2308 //
2309 // For global address space:
2310 // s_getpc_b64 s[0:1]
2311 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2312 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2313 //
2314 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2315 // fixups or relocations are emitted to replace $symbol@*@lo and
2316 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2317 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2318 // operand to the global variable.
2319 //
2320 // What we want here is an offset from the value returned by s_getpc
2321 // (which is the address of the s_add_u32 instruction) to the global
2322 // variable, but since the encoding of $symbol starts 4 bytes after the start
2323 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2324 // small. This requires us to add 4 to the global variable offset in order to
2325 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2326 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2327 // instruction.
2328
2329 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2330
2331 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2332 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2333
2334 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2335 .addDef(PCReg);
2336
2337 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2338 if (GAFlags == SIInstrInfo::MO_NONE)
2339 MIB.addImm(0);
2340 else
2341 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2342
2343 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2344
2345 if (PtrTy.getSizeInBits() == 32)
2346 B.buildExtract(DstReg, PCReg, 0);
2347 return true;
2348 }
2349
2350bool AMDGPULegalizerInfo::legalizeGlobalValue(
2351 MachineInstr &MI, MachineRegisterInfo &MRI,
2352 MachineIRBuilder &B) const {
2353 Register DstReg = MI.getOperand(0).getReg();
2354 LLT Ty = MRI.getType(DstReg);
2355 unsigned AS = Ty.getAddressSpace();
2356
2357 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2358 MachineFunction &MF = B.getMF();
2359 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2360
2361 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2362 if (!MFI->isModuleEntryFunction() &&
2363 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2364 const Function &Fn = MF.getFunction();
2365 DiagnosticInfoUnsupported BadLDSDecl(
2366 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2367 DS_Warning);
2368 Fn.getContext().diagnose(BadLDSDecl);
2369
2370 // We currently don't have a way to correctly allocate LDS objects that
2371 // aren't directly associated with a kernel. We do force inlining of
2372 // functions that use local objects. However, if these dead functions are
2373 // not eliminated, we don't want a compile time error. Just emit a warning
2374 // and a trap, since there should be no callable path here.
2375 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2376 B.buildUndef(DstReg);
2377 MI.eraseFromParent();
2378 return true;
2379 }
2380
2381 // TODO: We could emit code to handle the initialization somewhere.
2382 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2383 const SITargetLowering *TLI = ST.getTargetLowering();
2384 if (!TLI->shouldUseLDSConstAddress(GV)) {
2385 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2386 return true; // Leave in place;
2387 }
2388
2389 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2390 Type *Ty = GV->getValueType();
2391 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2392 // zero-sized type in other languages to declare the dynamic shared
2393 // memory whose size is not known at compile time. They will be
2394 // allocated by the runtime and placed directly after the statically
2395 // allocated ones. They all share the same offset.
2396 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2397 // Adjust alignment for that dynamic shared memory array.
2398 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2399 LLT S32 = LLT::scalar(32);
2400 auto Sz =
2401 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2402 B.buildIntToPtr(DstReg, Sz);
2403 MI.eraseFromParent();
2404 return true;
2405 }
2406 }
2407
2408 B.buildConstant(
2409 DstReg,
2410 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2411 MI.eraseFromParent();
2412 return true;
2413 }
2414
2415 const Function &Fn = MF.getFunction();
2416 DiagnosticInfoUnsupported BadInit(
2417 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2418 Fn.getContext().diagnose(BadInit);
2419 return true;
2420 }
2421
2422 const SITargetLowering *TLI = ST.getTargetLowering();
2423
2424 if (TLI->shouldEmitFixup(GV)) {
2425 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2426 MI.eraseFromParent();
2427 return true;
2428 }
2429
2430 if (TLI->shouldEmitPCReloc(GV)) {
2431 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2432 MI.eraseFromParent();
2433 return true;
2434 }
2435
2436 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2437 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2438
2439 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2440 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2441 MachinePointerInfo::getGOT(MF),
2442 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2443 MachineMemOperand::MOInvariant,
2444 LoadTy, Align(8));
2445
2446 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2447
2448 if (Ty.getSizeInBits() == 32) {
2449 // Truncate if this is a 32-bit constant address.
2450 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2451 B.buildExtract(DstReg, Load, 0);
2452 } else
2453 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2454
2455 MI.eraseFromParent();
2456 return true;
2457}
2458
2459static LLT widenToNextPowerOf2(LLT Ty) {
2460 if (Ty.isVector())
2461 return Ty.changeElementCount(
2462 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2463 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2464}
2465
2466bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2467 MachineInstr &MI) const {
2468 MachineIRBuilder &B = Helper.MIRBuilder;
2469 MachineRegisterInfo &MRI = *B.getMRI();
2470 GISelChangeObserver &Observer = Helper.Observer;
2471
2472 Register PtrReg = MI.getOperand(1).getReg();
2473 LLT PtrTy = MRI.getType(PtrReg);
2474 unsigned AddrSpace = PtrTy.getAddressSpace();
2475
2476 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2477 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2478 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2479 Observer.changingInstr(MI);
2480 MI.getOperand(1).setReg(Cast.getReg(0));
2481 Observer.changedInstr(MI);
2482 return true;
2483 }
2484
2485 if (MI.getOpcode() != AMDGPU::G_LOAD)
2486 return false;
2487
2488 Register ValReg = MI.getOperand(0).getReg();
2489 LLT ValTy = MRI.getType(ValReg);
2490
2491 MachineMemOperand *MMO = *MI.memoperands_begin();
2492 const unsigned ValSize = ValTy.getSizeInBits();
2493 const LLT MemTy = MMO->getMemoryType();
2494 const Align MemAlign = MMO->getAlign();
2495 const unsigned MemSize = MemTy.getSizeInBits();
2496 const unsigned AlignInBits = 8 * MemAlign.value();
2497
2498 // Widen non-power-of-2 loads to the alignment if needed
2499 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2500 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2501
2502 // This was already the correct extending load result type, so just adjust
2503 // the memory type.
2504 if (WideMemSize == ValSize) {
2505 MachineFunction &MF = B.getMF();
2506
2507 MachineMemOperand *WideMMO =
2508 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2509 Observer.changingInstr(MI);
2510 MI.setMemRefs(MF, {WideMMO});
2511 Observer.changedInstr(MI);
2512 return true;
2513 }
2514
2515 // Don't bother handling edge case that should probably never be produced.
2516 if (ValSize > WideMemSize)
2517 return false;
2518
2519 LLT WideTy = widenToNextPowerOf2(ValTy);
2520
2521 Register WideLoad;
2522 if (!WideTy.isVector()) {
2523 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2524 B.buildTrunc(ValReg, WideLoad).getReg(0);
2525 } else {
2526 // Extract the subvector.
2527
2528 if (isRegisterType(ValTy)) {
2529 // If this a case where G_EXTRACT is legal, use it.
2530 // (e.g. <3 x s32> -> <4 x s32>)
2531 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2532 B.buildExtract(ValReg, WideLoad, 0);
2533 } else {
2534 // For cases where the widened type isn't a nice register value, unmerge
2535 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2536 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2537 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2538 B.setInsertPt(B.getMBB(), MI.getIterator());
2539 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2540 }
2541 }
2542
2543 MI.eraseFromParent();
2544 return true;
2545 }
2546
2547 return false;
2548}
2549
2550bool AMDGPULegalizerInfo::legalizeFMad(
2551 MachineInstr &MI, MachineRegisterInfo &MRI,
2552 MachineIRBuilder &B) const {
2553 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2554 assert(Ty.isScalar());
2555
2556 MachineFunction &MF = B.getMF();
2557 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2558
2559 // TODO: Always legal with future ftz flag.
2560 // FIXME: Do we need just output?
2561 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2562 return true;
2563 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2564 return true;
2565
2566 MachineIRBuilder HelperBuilder(MI);
2567 GISelObserverWrapper DummyObserver;
2568 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2569 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2570}
2571
2572bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2573 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2574 Register DstReg = MI.getOperand(0).getReg();
2575 Register PtrReg = MI.getOperand(1).getReg();
2576 Register CmpVal = MI.getOperand(2).getReg();
2577 Register NewVal = MI.getOperand(3).getReg();
2578
2579 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2580 "this should not have been custom lowered");
2581
2582 LLT ValTy = MRI.getType(CmpVal);
2583 LLT VecTy = LLT::fixed_vector(2, ValTy);
2584
2585 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2586
2587 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2588 .addDef(DstReg)
2589 .addUse(PtrReg)
2590 .addUse(PackedVal)
2591 .setMemRefs(MI.memoperands());
2592
2593 MI.eraseFromParent();
2594 return true;
2595}
2596
2597bool AMDGPULegalizerInfo::legalizeFlog(
2598 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2599 Register Dst = MI.getOperand(0).getReg();
2600 Register Src = MI.getOperand(1).getReg();
2601 LLT Ty = B.getMRI()->getType(Dst);
2602 unsigned Flags = MI.getFlags();
2603
2604 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2605 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2606
2607 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2608 MI.eraseFromParent();
2609 return true;
2610}
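
The rewrite above in one line: log_b(x) = log2(x) * Log2BaseInverted, where the caller passes ln(2) for G_FLOG and ln(2)/ln(10) for G_FLOG10 (flogModel is an illustrative name):

#include <cmath>
static float flogModel(float X, float Log2BaseInverted) {
  return std::log2(X) * Log2BaseInverted;
}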
2611
2612bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2613 MachineIRBuilder &B) const {
2614 Register Dst = MI.getOperand(0).getReg();
2615 Register Src = MI.getOperand(1).getReg();
2616 unsigned Flags = MI.getFlags();
2617 LLT Ty = B.getMRI()->getType(Dst);
2618
2619 auto K = B.buildFConstant(Ty, numbers::log2e);
2620 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2621 B.buildFExp2(Dst, Mul, Flags);
2622 MI.eraseFromParent();
2623 return true;
2624}
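
Equivalent scalar form of the expansion above (fexpModel is an illustrative name):

#include <cmath>
static float fexpModel(float X) {
  return std::exp2(X * 1.4426950408889634f);  // numbers::log2e
}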
2625
2626bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2627 MachineIRBuilder &B) const {
2628 Register Dst = MI.getOperand(0).getReg();
2629 Register Src0 = MI.getOperand(1).getReg();
2630 Register Src1 = MI.getOperand(2).getReg();
2631 unsigned Flags = MI.getFlags();
2632 LLT Ty = B.getMRI()->getType(Dst);
2633 const LLT S16 = LLT::scalar(16);
2634 const LLT S32 = LLT::scalar(32);
2635
2636 if (Ty == S32) {
2637 auto Log = B.buildFLog2(S32, Src0, Flags);
2638 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2639 .addUse(Log.getReg(0))
2640 .addUse(Src1)
2641 .setMIFlags(Flags);
2642 B.buildFExp2(Dst, Mul, Flags);
2643 } else if (Ty == S16) {
2644 // There's no f16 fmul_legacy, so we need to convert for it.
2645 auto Log = B.buildFLog2(S16, Src0, Flags);
2646 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2647 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2648 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2649 .addUse(Ext0.getReg(0))
2650 .addUse(Ext1.getReg(0))
2651 .setMIFlags(Flags);
2652
2653 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2654 } else
2655 return false;
2656
2657 MI.eraseFromParent();
2658 return true;
2659}
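
A sketch of the f32 path above: pow(x, y) = exp2(log2(x) * y). The real lowering multiplies with amdgcn_fmul_legacy, so zero-times-infinity cases differ from a plain multiply (fpowModel is an illustrative name):

#include <cmath>
static float fpowModel(float X, float Y) {
  return std::exp2(std::log2(X) * Y);
}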
2660
2661// Find a source register, ignoring any possible source modifiers.
2662static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2663 Register ModSrc = OrigSrc;
2664 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2665 ModSrc = SrcFNeg->getOperand(1).getReg();
2666 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2667 ModSrc = SrcFAbs->getOperand(1).getReg();
2668 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2669 ModSrc = SrcFAbs->getOperand(1).getReg();
2670 return ModSrc;
2671}
2672
2673bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2674 MachineRegisterInfo &MRI,
2675 MachineIRBuilder &B) const {
2676
2677 const LLT S1 = LLT::scalar(1);
2678 const LLT S64 = LLT::scalar(64);
2679 Register Dst = MI.getOperand(0).getReg();
2680 Register OrigSrc = MI.getOperand(1).getReg();
2681 unsigned Flags = MI.getFlags();
2682 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2683 "this should not have been custom lowered");
2684
2685 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2686 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2687 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2688 // V_FRACT bug is:
2689 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2690 //
2691 // Convert floor(x) to (x - fract(x))
2692
2693 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2694 .addUse(OrigSrc)
2695 .setMIFlags(Flags);
2696
2697 // Give source modifier matching some assistance before obscuring a foldable
2698 // pattern.
2699
2700 // TODO: We can avoid the neg on the fract? The input sign to fract
2701 // shouldn't matter?
2702 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2703
2704 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2705
2706 Register Min = MRI.createGenericVirtualRegister(S64);
2707
2708 // We don't need to concern ourselves with the snan handling difference, so
2709 // use the one which will directly select.
2710 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2711 if (MFI->getMode().IEEE)
2712 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2713 else
2714 B.buildFMinNum(Min, Fract, Const, Flags);
2715
2716 Register CorrectedFract = Min;
2717 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2718 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2719 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2720 }
2721
2722 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2723 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2724
2725 MI.eraseFromParent();
2726 return true;
2727}
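
The formula from the comment above as a sketch, with VFract standing in for the amdgcn_fract intrinsic (a hypothetical placeholder, not a real API):

#include <cmath>
static double ffloorModel(double X, double (*VFract)(double)) {
  double Fract = std::isnan(X)
                     ? X
                     : std::fmin(VFract(X), 0x1.fffffffffffffp-1);
  return X - Fract;  // floor(x) = x - fract(x)
}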
2728
2729// Turn an illegal packed v2s16 build vector into bit operations.
2730// TODO: This should probably be a bitcast action in LegalizerHelper.
2731bool AMDGPULegalizerInfo::legalizeBuildVector(
2732 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2733 Register Dst = MI.getOperand(0).getReg();
2734 const LLT S32 = LLT::scalar(32);
2735 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2736
2737 Register Src0 = MI.getOperand(1).getReg();
2738 Register Src1 = MI.getOperand(2).getReg();
2739 assert(MRI.getType(Src0) == LLT::scalar(16));
2740
2741 auto Merge = B.buildMerge(S32, {Src0, Src1});
2742 B.buildBitcast(Dst, Merge);
2743
2744 MI.eraseFromParent();
2745 return true;
2746}
2747
2748// Check that this is a G_XOR x, -1
2749static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2750 if (MI.getOpcode() != TargetOpcode::G_XOR)
2751 return false;
2752 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2753 return ConstVal && *ConstVal == -1;
2754}
2755
2756// Return the use branch instruction, otherwise null if the usage is invalid.
2757static MachineInstr *
2758verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2759 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2760 Register CondDef = MI.getOperand(0).getReg();
2761 if (!MRI.hasOneNonDBGUse(CondDef))
2762 return nullptr;
2763
2764 MachineBasicBlock *Parent = MI.getParent();
2765 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2766
2767 if (isNot(MRI, *UseMI)) {
2768 Register NegatedCond = UseMI->getOperand(0).getReg();
2769 if (!MRI.hasOneNonDBGUse(NegatedCond))
2770 return nullptr;
2771
2772 // We're deleting the def of this value, so we need to remove it.
2773 UseMI->eraseFromParent();
2774
2775 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2776 Negated = true;
2777 }
2778
2779 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2780 return nullptr;
2781
2782 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2783 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2784 if (Next == Parent->end()) {
2785 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2786 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2787 return nullptr;
2788 UncondBrTarget = &*NextMBB;
2789 } else {
2790 if (Next->getOpcode() != AMDGPU::G_BR)
2791 return nullptr;
2792 Br = &*Next;
2793 UncondBrTarget = Br->getOperand(0).getMBB();
2794 }
2795
2796 return UseMI;
2797}
2798
2799bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2800 const ArgDescriptor *Arg,
2801 const TargetRegisterClass *ArgRC,
2802 LLT ArgTy) const {
2803 MCRegister SrcReg = Arg->getRegister();
2804 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2805 assert(DstReg.isVirtual() && "Virtual register expected");
2806
2807 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2808 ArgTy);
2809 if (Arg->isMasked()) {
6. Calling 'ArgDescriptor::isMasked'
9. Returning from 'ArgDescriptor::isMasked'
10. Taking true branch
2810 // TODO: Should we try to emit this once in the entry block?
2811 const LLT S32 = LLT::scalar(32);
2812 const unsigned Mask = Arg->getMask();
2813 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
11. Calling 'countTrailingZeros<unsigned int>'
18. Returning from 'countTrailingZeros<unsigned int>'
19. 'Shift' initialized to 32
2814
2815 Register AndMaskSrc = LiveIn;
2816
2817 if (Shift != 0) {
19.1. 'Shift' is not equal to 0
20. Taking true branch
2818 auto ShiftAmt = B.buildConstant(S32, Shift);
2819 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2820 }
2821
2822 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
21. The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
2823 } else {
2824 B.buildCopy(DstReg, LiveIn);
2825 }
2826
2827 return true;
2828}
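
The path the analyzer reports assumes Mask == 0, in which case countTrailingZeros returns 32 and the host-side `Mask >> Shift` at line 2822 is undefined. Purely as an illustration (not a change taken from the source), a guarded form of the same computation would avoid that path; maskedArgValueModel is a made-up name:

#include <cstdint>
static uint32_t maskedArgValueModel(uint32_t LiveIn, uint32_t Mask) {
  if (Mask == 0)
    return 0;                            // avoids the shift-by-32 flagged above
  unsigned Shift = __builtin_ctz(Mask);  // clang/GCC builtin, defined for Mask != 0
  return (LiveIn >> Shift) & (Mask >> Shift);
}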
2829
2830bool AMDGPULegalizerInfo::loadInputValue(
2831 Register DstReg, MachineIRBuilder &B,
2832 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2833 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2834 const ArgDescriptor *Arg;
2835 const TargetRegisterClass *ArgRC;
2836 LLT ArgTy;
2837 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2838
2839 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4. Taking false branch
2840 return false; // TODO: Handle these
2841 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
5. Calling 'AMDGPULegalizerInfo::loadInputValue'
2842}
2843
2844bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2845 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2846 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2847 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3. Calling 'AMDGPULegalizerInfo::loadInputValue'
2848 return false;
2849
2850 MI.eraseFromParent();
2851 return true;
2852}
2853
2854bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2855 MachineRegisterInfo &MRI,
2856 MachineIRBuilder &B) const {
2857 Register Dst = MI.getOperand(0).getReg();
2858 LLT DstTy = MRI.getType(Dst);
2859 LLT S16 = LLT::scalar(16);
2860 LLT S32 = LLT::scalar(32);
2861 LLT S64 = LLT::scalar(64);
2862
2863 if (DstTy == S16)
2864 return legalizeFDIV16(MI, MRI, B);
2865 if (DstTy == S32)
2866 return legalizeFDIV32(MI, MRI, B);
2867 if (DstTy == S64)
2868 return legalizeFDIV64(MI, MRI, B);
2869
2870 return false;
2871}
2872
2873void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2874 Register DstDivReg,
2875 Register DstRemReg,
2876 Register X,
2877 Register Y) const {
2878 const LLT S1 = LLT::scalar(1);
2879 const LLT S32 = LLT::scalar(32);
2880
2881 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2882 // algorithm used here.
2883
2884 // Initial estimate of inv(y).
2885 auto FloatY = B.buildUITOFP(S32, Y);
2886 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2887 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2888 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2889 auto Z = B.buildFPTOUI(S32, ScaledY);
2890
2891 // One round of UNR.
2892 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2893 auto NegYZ = B.buildMul(S32, NegY, Z);
2894 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2895
2896 // Quotient/remainder estimate.
2897 auto Q = B.buildUMulH(S32, X, Z);
2898 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2899
2900 // First quotient/remainder refinement.
2901 auto One = B.buildConstant(S32, 1);
2902 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2903 if (DstDivReg)
2904 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2905 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2906
2907 // Second quotient/remainder refinement.
2908 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2909 if (DstDivReg)
2910 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2911
2912 if (DstRemReg)
2913 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2914}
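
A scalar sketch of the two refinement steps above, assuming Z is the Newton-Raphson-refined estimate of 2^32 / Y so the initial quotient is at most two too small (udivrem32Model is an illustrative name):

#include <cstdint>
static void udivrem32Model(uint32_t X, uint32_t Y, uint32_t Z,
                           uint32_t &Q, uint32_t &R) {
  Q = uint32_t((uint64_t(X) * Z) >> 32);  // G_UMULH X, Z
  R = X - Q * Y;
  if (R >= Y) { ++Q; R -= Y; }            // first refinement
  if (R >= Y) { ++Q; R -= Y; }            // second refinement
}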
2915
2916 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2917//
2918// Return lo, hi of result
2919//
2920// %cvt.lo = G_UITOFP Val.lo
2921// %cvt.hi = G_UITOFP Val.hi
2922// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2923// %rcp = G_AMDGPU_RCP_IFLAG %mad
2924// %mul1 = G_FMUL %rcp, 0x5f7ffffc
2925// %mul2 = G_FMUL %mul1, 2**(-32)
2926// %trunc = G_INTRINSIC_TRUNC %mul2
2927// %mad2 = G_FMAD %trunc, -(2**32), %mul1
2928// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2929static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2930 Register Val) {
2931 const LLT S32 = LLT::scalar(32);
2932 auto Unmerge = B.buildUnmerge(S32, Val);
2933
2934 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2935 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2936
2937 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2938 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2939
2940 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2941 auto Mul1 =
2942 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2943
2944 // 2**(-32)
2945 auto Mul2 =
2946 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2947 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2948
2949 // -(2**32)
2950 auto Mad2 = B.buildFMAD(S32, Trunc,
2951 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2952
2953 auto ResultLo = B.buildFPTOUI(S32, Mad2);
2954 auto ResultHi = B.buildFPTOUI(S32, Trunc);
2955
2956 return {ResultLo.getReg(0), ResultHi.getReg(0)};
2957}
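The bit-pattern constants in emitReciprocalU64 are IEEE-754 single-precision encodings of powers of two (plus one slightly reduced 2^64). The decodings below are my reading of the encodings, noted here for reference rather than stated anywhere in the source:

// BitsToFloat(0x4f800000) ==  0x1p+32f         // 2^32, scales the high half inside the FMAD
// BitsToFloat(0x5f7ffffc) ==  0x1.fffff8p+63f  // 2^64 - 2^42, scales the reciprocal up
// BitsToFloat(0x2f800000) ==  0x1p-32f         // 2^-32, brings the scaled value back down
// BitsToFloat(0xcf800000) == -0x1p+32f         // -(2^32), removes the truncated high part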
2958
2959void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2960 Register DstDivReg,
2961 Register DstRemReg,
2962 Register Numer,
2963 Register Denom) const {
2964 const LLT S32 = LLT::scalar(32);
2965 const LLT S64 = LLT::scalar(64);
2966 const LLT S1 = LLT::scalar(1);
2967 Register RcpLo, RcpHi;
2968
2969 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2970
2971 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2972
2973 auto Zero64 = B.buildConstant(S64, 0);
2974 auto NegDenom = B.buildSub(S64, Zero64, Denom);
2975
2976 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2977 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2978
2979 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2980 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2981 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2982
2983 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2984 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2985 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2986 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2987
2988 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2989 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2990 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2991 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2992 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2993
2994 auto Zero32 = B.buildConstant(S32, 0);
2995 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2996 auto Add2_HiC =
2997 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2998 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2999 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3000
3001 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3002 Register NumerLo = UnmergeNumer.getReg(0);
3003 Register NumerHi = UnmergeNumer.getReg(1);
3004
3005 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3006 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3007 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3008 Register Mul3_Lo = UnmergeMul3.getReg(0);
3009 Register Mul3_Hi = UnmergeMul3.getReg(1);
3010 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3011 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3012 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3013 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3014
3015 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3016 Register DenomLo = UnmergeDenom.getReg(0);
3017 Register DenomHi = UnmergeDenom.getReg(1);
3018
3019 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3020 auto C1 = B.buildSExt(S32, CmpHi);
3021
3022 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3023 auto C2 = B.buildSExt(S32, CmpLo);
3024
3025 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3026 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3027
3028 // TODO: Here and below portions of the code can be enclosed into if/endif.
3029 // Currently control flow is unconditional and we have 4 selects after
3030 // potential endif to substitute PHIs.
3031
3032 // if C3 != 0 ...
3033 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3034 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3035 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3036 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3037
3038 auto One64 = B.buildConstant(S64, 1);
3039 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3040
3041 auto C4 =
3042 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3043 auto C5 =
3044 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3045 auto C6 = B.buildSelect(
3046 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3047
3048 // if (C6 != 0)
3049 auto Add4 = B.buildAdd(S64, Add3, One64);
3050 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3051
3052 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3053 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3054 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3055
3056 // endif C6
3057 // endif C3
3058
3059 if (DstDivReg) {
3060 auto Sel1 = B.buildSelect(
3061 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3062 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3063 Sel1, MulHi3);
3064 }
3065
3066 if (DstRemReg) {
3067 auto Sel2 = B.buildSelect(
3068 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3069 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3070 Sel2, Sub1);
3071 }
3072}
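The C1/C2/C3 (and C4/C5/C6) triples above are the usual split 64-bit unsigned compare: decide on the high words unless they are equal, then fall back to the low words. As plain code (illustrative only; the helper name is made up):

#include <cstdint>

// remainder >= denominator, computed from 32-bit halves.
static bool uge64(uint32_t aLo, uint32_t aHi, uint32_t bLo, uint32_t bHi) {
  return (aHi == bHi) ? (aLo >= bLo) : (aHi >= bHi);
}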
3073
3074bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3075 MachineRegisterInfo &MRI,
3076 MachineIRBuilder &B) const {
3077 Register DstDivReg, DstRemReg;
3078 switch (MI.getOpcode()) {
3079 default:
3080 llvm_unreachable("Unexpected opcode!");
3081 case AMDGPU::G_UDIV: {
3082 DstDivReg = MI.getOperand(0).getReg();
3083 break;
3084 }
3085 case AMDGPU::G_UREM: {
3086 DstRemReg = MI.getOperand(0).getReg();
3087 break;
3088 }
3089 case AMDGPU::G_UDIVREM: {
3090 DstDivReg = MI.getOperand(0).getReg();
3091 DstRemReg = MI.getOperand(1).getReg();
3092 break;
3093 }
3094 }
3095
3096 const LLT S64 = LLT::scalar(64);
3097 const LLT S32 = LLT::scalar(32);
3098 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3099 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3100 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3101 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3102
3103 if (Ty == S32)
3104 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3105 else if (Ty == S64)
3106 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3107 else
3108 return false;
3109
3110 MI.eraseFromParent();
3111 return true;
3112}
3113
3114bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3115 MachineRegisterInfo &MRI,
3116 MachineIRBuilder &B) const {
3117 const LLT S64 = LLT::scalar(64);
3118 const LLT S32 = LLT::scalar(32);
3119
3120 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3121 if (Ty != S32 && Ty != S64)
3122 return false;
3123
3124 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3125 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3126 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3127
3128 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3129 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3130 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3131
3132 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3133 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3134
3135 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3136 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3137
3138 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3139 switch (MI.getOpcode()) {
3140 default:
3141 llvm_unreachable("Unexpected opcode!");
3142 case AMDGPU::G_SDIV: {
3143 DstDivReg = MI.getOperand(0).getReg();
3144 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3145 break;
3146 }
3147 case AMDGPU::G_SREM: {
3148 DstRemReg = MI.getOperand(0).getReg();
3149 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3150 break;
3151 }
3152 case AMDGPU::G_SDIVREM: {
3153 DstDivReg = MI.getOperand(0).getReg();
3154 DstRemReg = MI.getOperand(1).getReg();
3155 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3156 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3157 break;
3158 }
3159 }
3160
3161 if (Ty == S32)
3162 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3163 else
3164 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3165
3166 if (DstDivReg) {
3167 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3168 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3169 B.buildSub(DstDivReg, SignXor, Sign);
3170 }
3171
3172 if (DstRemReg) {
3173 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3174 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3175 B.buildSub(DstRemReg, SignXor, Sign);
3176 }
3177
3178 MI.eraseFromParent();
3179 return true;
3180}
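The add/xor pairs above are the branchless absolute-value and conditional-negate idiom. A scalar sketch of the whole signed path (32-bit case, illustrative only; it uses the built-in / and % for the unsigned core and assumes an arithmetic right shift):

#include <cstdint>

static void sdivrem32(int32_t lhs, int32_t rhs, int32_t &q, int32_t &r) {
  uint32_t ls = (uint32_t)(lhs >> 31);        // 0 or 0xffffffff (sign mask)
  uint32_t rs = (uint32_t)(rhs >> 31);
  uint32_t ul = ((uint32_t)lhs + ls) ^ ls;    // |lhs|  ((x + s) ^ s)
  uint32_t ur = ((uint32_t)rhs + rs) ^ rs;    // |rhs|
  uint32_t uq = ul / ur, urem = ul % ur;      // unsigned core (see the 32-bit sketch above)
  uint32_t qs = ls ^ rs;                      // quotient sign
  q = (int32_t)((uq ^ qs) - qs);              // conditional negate ((v ^ s) - s)
  r = (int32_t)((urem ^ ls) - ls);            // remainder takes the sign of lhs
}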
3181
3182bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3183 MachineRegisterInfo &MRI,
3184 MachineIRBuilder &B) const {
3185 Register Res = MI.getOperand(0).getReg();
3186 Register LHS = MI.getOperand(1).getReg();
3187 Register RHS = MI.getOperand(2).getReg();
3188 uint16_t Flags = MI.getFlags();
3189 LLT ResTy = MRI.getType(Res);
3190
3191 const MachineFunction &MF = B.getMF();
3192 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3193 MI.getFlag(MachineInstr::FmAfn);
3194
3195 if (!AllowInaccurateRcp)
3196 return false;
3197
3198 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3199 // 1 / x -> RCP(x)
3200 if (CLHS->isExactlyValue(1.0)) {
3201 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3202 .addUse(RHS)
3203 .setMIFlags(Flags);
3204
3205 MI.eraseFromParent();
3206 return true;
3207 }
3208
3209 // -1 / x -> RCP( FNEG(x) )
3210 if (CLHS->isExactlyValue(-1.0)) {
3211 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3212 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3213 .addUse(FNeg.getReg(0))
3214 .setMIFlags(Flags);
3215
3216 MI.eraseFromParent();
3217 return true;
3218 }
3219 }
3220
3221 // x / y -> x * (1.0 / y)
3222 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3223 .addUse(RHS)
3224 .setMIFlags(Flags);
3225 B.buildFMul(Res, LHS, RCP, Flags);
3226
3227 MI.eraseFromParent();
3228 return true;
3229}
3230
3231bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3232 MachineRegisterInfo &MRI,
3233 MachineIRBuilder &B) const {
3234 Register Res = MI.getOperand(0).getReg();
3235 Register X = MI.getOperand(1).getReg();
3236 Register Y = MI.getOperand(2).getReg();
3237 uint16_t Flags = MI.getFlags();
3238 LLT ResTy = MRI.getType(Res);
3239
3240 const MachineFunction &MF = B.getMF();
3241 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3242 MI.getFlag(MachineInstr::FmAfn);
3243
3244 if (!AllowInaccurateRcp)
3245 return false;
3246
3247 auto NegY = B.buildFNeg(ResTy, Y);
3248 auto One = B.buildFConstant(ResTy, 1.0);
3249
3250 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3251 .addUse(Y)
3252 .setMIFlags(Flags);
3253
3254 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3255 R = B.buildFMA(ResTy, Tmp0, R, R);
3256
3257 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3258 R = B.buildFMA(ResTy, Tmp1, R, R);
3259
3260 auto Ret = B.buildFMul(ResTy, X, R);
3261 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3262
3263 B.buildFMA(Res, Tmp2, R, Ret);
3264 MI.eraseFromParent();
3265 return true;
3266}
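The FMA chain above is two Newton-Raphson refinements of the reciprocal (r += r * (1 - y*r)) followed by one residual correction of the product. A double-precision sketch, illustrative only, with 1.0 / y standing in for the amdgcn.rcp approximation:

#include <cmath>

static double fastFDiv64(double x, double y) {
  double r = 1.0 / y;                              // approximate reciprocal
  r = std::fma(std::fma(-y, r, 1.0), r, r);        // first refinement
  r = std::fma(std::fma(-y, r, 1.0), r, r);        // second refinement
  double ret = x * r;
  return std::fma(std::fma(-y, ret, x), r, ret);   // correct the residual of x * r
}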
3267
3268bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3269 MachineRegisterInfo &MRI,
3270 MachineIRBuilder &B) const {
3271 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3272 return true;
3273
3274 Register Res = MI.getOperand(0).getReg();
3275 Register LHS = MI.getOperand(1).getReg();
3276 Register RHS = MI.getOperand(2).getReg();
3277
3278 uint16_t Flags = MI.getFlags();
3279
3280 LLT S16 = LLT::scalar(16);
3281 LLT S32 = LLT::scalar(32);
3282
3283 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3284 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3285
3286 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3287 .addUse(RHSExt.getReg(0))
3288 .setMIFlags(Flags);
3289
3290 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3291 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3292
3293 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3294 .addUse(RDst.getReg(0))
3295 .addUse(RHS)
3296 .addUse(LHS)
3297 .setMIFlags(Flags);
3298
3299 MI.eraseFromParent();
3300 return true;
3301}
3302
3303// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3304// to enable denorm mode. When 'Enable' is false, disable denorm mode.
3305static void toggleSPDenormMode(bool Enable,
3306 MachineIRBuilder &B,
3307 const GCNSubtarget &ST,
3308 AMDGPU::SIModeRegisterDefaults Mode) {
3309 // Set SP denorm mode to this value.
3310 unsigned SPDenormMode =
3311 Enable ? FP_DENORM_FLUSH_NONE3 : Mode.fpDenormModeSPValue();
3312
3313 if (ST.hasDenormModeInst()) {
3314 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3315 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3316
3317 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3318 B.buildInstr(AMDGPU::S_DENORM_MODE)
3319 .addImm(NewDenormModeValue);
3320
3321 } else {
3322 // Select FP32 bit field in mode register.
3323 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3324 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3325 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3326
3327 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3328 .addImm(SPDenormMode)
3329 .addImm(SPDenormModeBitField);
3330 }
3331}
3332
3333bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3334 MachineRegisterInfo &MRI,
3335 MachineIRBuilder &B) const {
3336 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3337 return true;
3338
3339 Register Res = MI.getOperand(0).getReg();
3340 Register LHS = MI.getOperand(1).getReg();
3341 Register RHS = MI.getOperand(2).getReg();
3342 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3343 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3344
3345 uint16_t Flags = MI.getFlags();
3346
3347 LLT S32 = LLT::scalar(32);
3348 LLT S1 = LLT::scalar(1);
3349
3350 auto One = B.buildFConstant(S32, 1.0f);
3351
3352 auto DenominatorScaled =
3353 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3354 .addUse(LHS)
3355 .addUse(RHS)
3356 .addImm(0)
3357 .setMIFlags(Flags);
3358 auto NumeratorScaled =
3359 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3360 .addUse(LHS)
3361 .addUse(RHS)
3362 .addImm(1)
3363 .setMIFlags(Flags);
3364
3365 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3366 .addUse(DenominatorScaled.getReg(0))
3367 .setMIFlags(Flags);
3368 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3369
3370 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3371 // aren't modeled as reading it.
3372 if (!Mode.allFP32Denormals())
3373 toggleSPDenormMode(true, B, ST, Mode);
3374
3375 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3376 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3377 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3378 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3379 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3380 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3381
3382 if (!Mode.allFP32Denormals())
3383 toggleSPDenormMode(false, B, ST, Mode);
3384
3385 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3386 .addUse(Fma4.getReg(0))
3387 .addUse(Fma1.getReg(0))
3388 .addUse(Fma3.getReg(0))
3389 .addUse(NumeratorScaled.getReg(1))
3390 .setMIFlags(Flags);
3391
3392 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3393 .addUse(Fmas.getReg(0))
3394 .addUse(RHS)
3395 .addUse(LHS)
3396 .setMIFlags(Flags);
3397
3398 MI.eraseFromParent();
3399 return true;
3400}
3401
3402bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3403 MachineRegisterInfo &MRI,
3404 MachineIRBuilder &B) const {
3405 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3406 return true;
3407
3408 Register Res = MI.getOperand(0).getReg();
3409 Register LHS = MI.getOperand(1).getReg();
3410 Register RHS = MI.getOperand(2).getReg();
3411
3412 uint16_t Flags = MI.getFlags();
3413
3414 LLT S64 = LLT::scalar(64);
3415 LLT S1 = LLT::scalar(1);
3416
3417 auto One = B.buildFConstant(S64, 1.0);
3418
3419 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3420 .addUse(LHS)
3421 .addUse(RHS)
3422 .addImm(0)
3423 .setMIFlags(Flags);
3424
3425 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3426
3427 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3428 .addUse(DivScale0.getReg(0))
3429 .setMIFlags(Flags);
3430
3431 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3432 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3433 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3434
3435 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3436 .addUse(LHS)
3437 .addUse(RHS)
3438 .addImm(1)
3439 .setMIFlags(Flags);
3440
3441 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3442 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3443 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3444
3445 Register Scale;
3446 if (!ST.hasUsableDivScaleConditionOutput()) {
3447 // Workaround a hardware bug on SI where the condition output from div_scale
3448 // is not usable.
3449
3450 LLT S32 = LLT::scalar(32);
3451
3452 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3453 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3454 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3455 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3456
3457 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3458 Scale1Unmerge.getReg(1));
3459 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3460 Scale0Unmerge.getReg(1));
3461 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3462 } else {
3463 Scale = DivScale1.getReg(1);
3464 }
3465
3466 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3467 .addUse(Fma4.getReg(0))
3468 .addUse(Fma3.getReg(0))
3469 .addUse(Mul.getReg(0))
3470 .addUse(Scale)
3471 .setMIFlags(Flags);
3472
3473 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3474 .addUse(Fmas.getReg(0))
3475 .addUse(RHS)
3476 .addUse(LHS)
3477 .setMIFlags(Flags);
3478
3479 MI.eraseFromParent();
3480 return true;
3481}
3482
3483bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3484 MachineRegisterInfo &MRI,
3485 MachineIRBuilder &B) const {
3486 Register Res = MI.getOperand(0).getReg();
3487 Register LHS = MI.getOperand(2).getReg();
3488 Register RHS = MI.getOperand(3).getReg();
3489 uint16_t Flags = MI.getFlags();
3490
3491 LLT S32 = LLT::scalar(32);
3492 LLT S1 = LLT::scalar(1);
3493
3494 auto Abs = B.buildFAbs(S32, RHS, Flags);
3495 const APFloat C0Val(1.0f);
3496
3497 auto C0 = B.buildConstant(S32, 0x6f800000);
3498 auto C1 = B.buildConstant(S32, 0x2f800000);
3499 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3500
3501 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3502 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3503
3504 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3505
3506 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3507 .addUse(Mul0.getReg(0))
3508 .setMIFlags(Flags);
3509
3510 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3511
3512 B.buildFMul(Res, Sel, Mul1, Flags);
3513
3514 MI.eraseFromParent();
3515 return true;
3516}
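Decoding the constants: 0x6f800000 is 2^96 and 0x2f800000 is 2^-32, so the select pre-scales a huge denominator before taking its reciprocal (which would otherwise flush to zero) and re-applies the same scale to the result. A scalar sketch, illustrative only:

#include <cmath>

static float fdivFastIntrin(float lhs, float rhs) {
  float scale = (std::fabs(rhs) > 0x1p+96f) ? 0x1p-32f : 1.0f;
  float rcp = 1.0f / (rhs * scale);       // stands in for amdgcn.rcp
  return scale * (lhs * rcp);
}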
3517
3518// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3519// FIXME: Why do we handle this one but not other removed instructions?
3520//
3521// Reciprocal square root. The clamp prevents infinite results, clamping
3522// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3523// +-max_float.
3524bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3525 MachineRegisterInfo &MRI,
3526 MachineIRBuilder &B) const {
3527 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3528 return true;
3529
3530 Register Dst = MI.getOperand(0).getReg();
3531 Register Src = MI.getOperand(2).getReg();
3532 auto Flags = MI.getFlags();
3533
3534 LLT Ty = MRI.getType(Dst);
3535
3536 const fltSemantics *FltSemantics;
3537 if (Ty == LLT::scalar(32))
3538 FltSemantics = &APFloat::IEEEsingle();
3539 else if (Ty == LLT::scalar(64))
3540 FltSemantics = &APFloat::IEEEdouble();
3541 else
3542 return false;
3543
3544 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3545 .addUse(Src)
3546 .setMIFlags(Flags);
3547
3548 // We don't need to concern ourselves with the snan handling difference, since
3549 // the rsq has already quieted (or not) any snan; use the one which will directly select.
3550 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3551 const bool UseIEEE = MFI->getMode().IEEE;
3552
3553 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3554 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3555 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3556
3557 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3558
3559 if (UseIEEE)
3560 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3561 else
3562 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3563 MI.eraseFromParent();
3564 return true;
3565}
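The min-then-max pair above clamps the rsq result into [-max_float, +max_float]. A scalar sketch of the same clamp, illustrative only: 1 / sqrt stands in for amdgcn.rsq, the type is fixed to double for brevity, and the IEEE vs. non-IEEE min/max distinction is ignored.

#include <cmath>
#include <limits>

static double rsqClamp(double x) {
  const double maxFlt = std::numeric_limits<double>::max();
  double rsq = 1.0 / std::sqrt(x);
  return std::fmax(std::fmin(rsq, maxFlt), -maxFlt);
}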
3566
3567static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3568 switch (IID) {
3569 case Intrinsic::amdgcn_ds_fadd:
3570 return AMDGPU::G_ATOMICRMW_FADD;
3571 case Intrinsic::amdgcn_ds_fmin:
3572 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3573 case Intrinsic::amdgcn_ds_fmax:
3574 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3575 default:
3576 llvm_unreachable("not a DS FP intrinsic");
3577 }
3578}
3579
3580bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3581 MachineInstr &MI,
3582 Intrinsic::ID IID) const {
3583 GISelChangeObserver &Observer = Helper.Observer;
3584 Observer.changingInstr(MI);
3585
3586 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3587
3588 // The remaining operands were used to set fields in the MemOperand on
3589 // construction.
3590 for (int I = 6; I > 3; --I)
3591 MI.RemoveOperand(I);
3592
3593 MI.RemoveOperand(1); // Remove the intrinsic ID.
3594 Observer.changedInstr(MI);
3595 return true;
3596}
3597
3598bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3599 MachineRegisterInfo &MRI,
3600 MachineIRBuilder &B) const {
3601 uint64_t Offset =
3602 ST.getTargetLowering()->getImplicitParameterOffset(
3603 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3604 LLT DstTy = MRI.getType(DstReg);
3605 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3606
3607 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3608 if (!loadInputValue(KernargPtrReg, B,
3609 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3610 return false;
3611
3612 // FIXME: This should be nuw
3613 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3614 return true;
3615}
3616
3617bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3618 MachineRegisterInfo &MRI,
3619 MachineIRBuilder &B) const {
3620 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3621 if (!MFI->isEntryFunction()) {
3622 return legalizePreloadedArgIntrin(MI, MRI, B,
3623 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3624 }
3625
3626 Register DstReg = MI.getOperand(0).getReg();
3627 if (!getImplicitArgPtr(DstReg, MRI, B))
3628 return false;
3629
3630 MI.eraseFromParent();
3631 return true;
3632}
3633
3634bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3635 MachineRegisterInfo &MRI,
3636 MachineIRBuilder &B,
3637 unsigned AddrSpace) const {
3638 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3639 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3640 Register Hi32 = Unmerge.getReg(1);
3641
3642 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3643 MI.eraseFromParent();
3644 return true;
3645}
3646
3647// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3648// offset (the offset that is included in bounds checking and swizzling, to be
3649// split between the instruction's voffset and immoffset fields) and soffset
3650// (the offset that is excluded from bounds checking and swizzling, to go in
3651// the instruction's soffset field). This function takes the first kind of
3652// offset and figures out how to split it between voffset and immoffset.
3653std::pair<Register, unsigned>
3654AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3655 Register OrigOffset) const {
3656 const unsigned MaxImm = 4095;
3657 Register BaseReg;
3658 unsigned ImmOffset;
3659 const LLT S32 = LLT::scalar(32);
3660 MachineRegisterInfo &MRI = *B.getMRI();
3661
3662 std::tie(BaseReg, ImmOffset) =
3663 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3664
3665 // If BaseReg is a pointer, convert it to int.
3666 if (MRI.getType(BaseReg).isPointer())
3667 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3668
3669 // If the immediate value is too big for the immoffset field, put the value
3670 // and -4096 into the immoffset field so that the value that is copied/added
3671 // for the voffset field is a multiple of 4096, and it stands more chance
3672 // of being CSEd with the copy/add for another similar load/store.
3673 // However, do not do that rounding down to a multiple of 4096 if that is a
3674 // negative number, as it appears to be illegal to have a negative offset
3675 // in the vgpr, even if adding the immediate offset makes it positive.
3676 unsigned Overflow = ImmOffset & ~MaxImm;
3677 ImmOffset -= Overflow;
3678 if ((int32_t)Overflow < 0) {
3679 Overflow += ImmOffset;
3680 ImmOffset = 0;
3681 }
3682
3683 if (Overflow != 0) {
3684 if (!BaseReg) {
3685 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3686 } else {
3687 auto OverflowVal = B.buildConstant(S32, Overflow);
3688 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3689 }
3690 }
3691
3692 if (!BaseReg)
3693 BaseReg = B.buildConstant(S32, 0).getReg(0);
3694
3695 return std::make_pair(BaseReg, ImmOffset);
3696}
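The constant part of the split can be followed with a small scalar sketch (illustrative only, helper name made up): the low 12 bits go to the immoffset field and the 4096-aligned remainder goes to the voffset add, unless that remainder would be negative, in which case everything is pushed into the voffset. For example, an incoming constant of 5000 becomes voffset += 4096 with immoffset = 904.

#include <cstdint>
#include <utility>

// Returns {amount added to voffset, immoffset field}.
static std::pair<uint32_t, uint32_t> splitConstOffset(uint32_t off) {
  const uint32_t MaxImm = 4095;
  uint32_t imm = off & MaxImm;
  uint32_t overflow = off & ~MaxImm;
  if ((int32_t)overflow < 0) {   // never leave a negative voffset component
    overflow += imm;
    imm = 0;
  }
  return {overflow, imm};
}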
3697
3698/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3699void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3700 Register VOffset, Register SOffset,
3701 unsigned ImmOffset, Register VIndex,
3702 MachineRegisterInfo &MRI) const {
3703 Optional<ValueAndVReg> MaybeVOffsetVal =
3704 getConstantVRegValWithLookThrough(VOffset, MRI);
3705 Optional<ValueAndVReg> MaybeSOffsetVal =
3706 getConstantVRegValWithLookThrough(SOffset, MRI);
3707 Optional<ValueAndVReg> MaybeVIndexVal =
3708 getConstantVRegValWithLookThrough(VIndex, MRI);
3709 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3710 // update the MMO with that offset. The stride is unknown so we can only do
3711 // this if VIndex is constant 0.
3712 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3713 MaybeVIndexVal->Value == 0) {
3714 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3715 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3716 MMO->setOffset(TotalOffset);
3717 } else {
3718 // We don't have a constant combined offset to use in the MMO. Give up.
3719 MMO->setValue((Value *)nullptr);
3720 }
3721}
3722
3723/// Handle register layout difference for f16 images for some subtargets.
3724Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3725 MachineRegisterInfo &MRI,
3726 Register Reg,
3727 bool ImageStore) const {
3728 const LLT S16 = LLT::scalar(16);
3729 const LLT S32 = LLT::scalar(32);
3730 LLT StoreVT = MRI.getType(Reg);
3731 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3732
3733 if (ST.hasUnpackedD16VMem()) {
3734 auto Unmerge = B.buildUnmerge(S16, Reg);
3735
3736 SmallVector<Register, 4> WideRegs;
3737 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3738 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3739
3740 int NumElts = StoreVT.getNumElements();
3741
3742 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3743 .getReg(0);
3744 }
3745
3746 if (ImageStore && ST.hasImageStoreD16Bug()) {
3747 if (StoreVT.getNumElements() == 2) {
3748 SmallVector<Register, 4> PackedRegs;
3749 Reg = B.buildBitcast(S32, Reg).getReg(0);
3750 PackedRegs.push_back(Reg);
3751 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3752 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3753 .getReg(0);
3754 }
3755
3756 if (StoreVT.getNumElements() == 3) {
3757 SmallVector<Register, 4> PackedRegs;
3758 auto Unmerge = B.buildUnmerge(S16, Reg);
3759 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3760 PackedRegs.push_back(Unmerge.getReg(I));
3761 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3762 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3763 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3764 }
3765
3766 if (StoreVT.getNumElements() == 4) {
3767 SmallVector<Register, 4> PackedRegs;
3768 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3769 auto Unmerge = B.buildUnmerge(S32, Reg);
3770 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3771 PackedRegs.push_back(Unmerge.getReg(I));
3772 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3773 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3774 .getReg(0);
3775 }
3776
3777 llvm_unreachable("invalid data type");
3778 }
3779
3780 return Reg;
3781}
3782
3783Register AMDGPULegalizerInfo::fixStoreSourceType(
3784 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3785 MachineRegisterInfo *MRI = B.getMRI();
3786 LLT Ty = MRI->getType(VData);
3787
3788 const LLT S16 = LLT::scalar(16);
3789
3790 // Fixup illegal register types for i8 stores.
3791 if (Ty == LLT::scalar(8) || Ty == S16) {
3792 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3793 return AnyExt;
3794 }
3795
3796 if (Ty.isVector()) {
3797 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3798 if (IsFormat)
3799 return handleD16VData(B, *MRI, VData);
3800 }
3801 }
3802
3803 return VData;
3804}
3805
3806bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3807 MachineRegisterInfo &MRI,
3808 MachineIRBuilder &B,
3809 bool IsTyped,
3810 bool IsFormat) const {
3811 Register VData = MI.getOperand(1).getReg();
3812 LLT Ty = MRI.getType(VData);
3813 LLT EltTy = Ty.getScalarType();
3814 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3815 const LLT S32 = LLT::scalar(32);
3816
3817 VData = fixStoreSourceType(B, VData, IsFormat);
3818 Register RSrc = MI.getOperand(2).getReg();
3819
3820 MachineMemOperand *MMO = *MI.memoperands_begin();
3821 const int MemSize = MMO->getSize();
3822
3823 unsigned ImmOffset;
3824
3825 // The typed intrinsics add an immediate after the registers.
3826 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3827
3828 // The struct intrinsic variants add one additional operand over raw.
3829 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3830 Register VIndex;
3831 int OpOffset = 0;
3832 if (HasVIndex) {
3833 VIndex = MI.getOperand(3).getReg();
3834 OpOffset = 1;
3835 } else {
3836 VIndex = B.buildConstant(S32, 0).getReg(0);
3837 }
3838
3839 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3840 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3841
3842 unsigned Format = 0;
3843 if (IsTyped) {
3844 Format = MI.getOperand(5 + OpOffset).getImm();
3845 ++OpOffset;
3846 }
3847
3848 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3849
3850 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3851 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3852
3853 unsigned Opc;
3854 if (IsTyped) {
3855 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3856 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3857 } else if (IsFormat) {
3858 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3859 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3860 } else {
3861 switch (MemSize) {
3862 case 1:
3863 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3864 break;
3865 case 2:
3866 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3867 break;
3868 default:
3869 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3870 break;
3871 }
3872 }
3873
3874 auto MIB = B.buildInstr(Opc)
3875 .addUse(VData) // vdata
3876 .addUse(RSrc) // rsrc
3877 .addUse(VIndex) // vindex
3878 .addUse(VOffset) // voffset
3879 .addUse(SOffset) // soffset
3880 .addImm(ImmOffset); // offset(imm)
3881
3882 if (IsTyped)
3883 MIB.addImm(Format);
3884
3885 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3886 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3887 .addMemOperand(MMO);
3888
3889 MI.eraseFromParent();
3890 return true;
3891}
3892
3893bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3894 MachineRegisterInfo &MRI,
3895 MachineIRBuilder &B,
3896 bool IsFormat,
3897 bool IsTyped) const {
3898 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3899 MachineMemOperand *MMO = *MI.memoperands_begin();
3900 const LLT MemTy = MMO->getMemoryType();
3901 const LLT S32 = LLT::scalar(32);
3902
3903 Register Dst = MI.getOperand(0).getReg();
3904 Register RSrc = MI.getOperand(2).getReg();
3905
3906 // The typed intrinsics add an immediate after the registers.
3907 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3908
3909 // The struct intrinsic variants add one additional operand over raw.
3910 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3911 Register VIndex;
3912 int OpOffset = 0;
3913 if (HasVIndex) {
3914 VIndex = MI.getOperand(3).getReg();
3915 OpOffset = 1;
3916 } else {
3917 VIndex = B.buildConstant(S32, 0).getReg(0);
3918 }
3919
3920 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3921 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3922
3923 unsigned Format = 0;
3924 if (IsTyped) {
3925 Format = MI.getOperand(5 + OpOffset).getImm();
3926 ++OpOffset;
3927 }
3928
3929 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3930 unsigned ImmOffset;
3931
3932 LLT Ty = MRI.getType(Dst);
3933 LLT EltTy = Ty.getScalarType();
3934 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3935 const bool Unpacked = ST.hasUnpackedD16VMem();
3936
3937 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3938 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3939
3940 unsigned Opc;
3941
3942 if (IsTyped) {
3943 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3944 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3945 } else if (IsFormat) {
3946 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3947 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3948 } else {
3949 switch (MemTy.getSizeInBits()) {
3950 case 8:
3951 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3952 break;
3953 case 16:
3954 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3955 break;
3956 default:
3957 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3958 break;
3959 }
3960 }
3961
3962 Register LoadDstReg;
3963
3964 bool IsExtLoad =
3965 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
3966 LLT UnpackedTy = Ty.changeElementSize(32);
3967
3968 if (IsExtLoad)
3969 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3970 else if (Unpacked && IsD16 && Ty.isVector())
3971 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3972 else
3973 LoadDstReg = Dst;
3974
3975 auto MIB = B.buildInstr(Opc)
3976 .addDef(LoadDstReg) // vdata
3977 .addUse(RSrc) // rsrc
3978 .addUse(VIndex) // vindex
3979 .addUse(VOffset) // voffset
3980 .addUse(SOffset) // soffset
3981 .addImm(ImmOffset); // offset(imm)
3982
3983 if (IsTyped)
3984 MIB.addImm(Format);
3985
3986 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3987 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3988 .addMemOperand(MMO);
3989
3990 if (LoadDstReg != Dst) {
3991 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3992
3993 // The result register was widened for an extending load; truncate it back down.
3994 if (IsExtLoad)
3995 B.buildTrunc(Dst, LoadDstReg);
3996 else {
3997 // Repack to original 16-bit vector result
3998 // FIXME: G_TRUNC should work, but legalization currently fails
3999 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4000 SmallVector<Register, 4> Repack;
4001 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4002 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4003 B.buildMerge(Dst, Repack);
4004 }
4005 }
4006
4007 MI.eraseFromParent();
4008 return true;
4009}
4010
4011bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4012 MachineIRBuilder &B,
4013 bool IsInc) const {
4014 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4015 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4016 B.buildInstr(Opc)
4017 .addDef(MI.getOperand(0).getReg())
4018 .addUse(MI.getOperand(2).getReg())
4019 .addUse(MI.getOperand(3).getReg())
4020 .cloneMemRefs(MI);
4021 MI.eraseFromParent();
4022 return true;
4023}
4024
4025static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4026 switch (IntrID) {
4027 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4028 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4029 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4030 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4031 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4032 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4033 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4034 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4035 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4036 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4037 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4038 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4039 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4040 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4041 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4042 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4043 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4044 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4045 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4046 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4047 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4048 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4049 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4050 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4051 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4052 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4053 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4054 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4055 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4056 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4057 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4058 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4059 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4060 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4061 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4062 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4063 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4064 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4065 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4066 case Intrinsic::amdgcn_buffer_atomic_fadd:
4067 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4068 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4070 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4071 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4072 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4073 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4074 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4075 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4076 default:
4077 llvm_unreachable("unhandled atomic opcode");
4078 }
4079}
4080
4081bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4082 MachineIRBuilder &B,
4083 Intrinsic::ID IID) const {
4084 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4085 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4086 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4087
4088 Register Dst;
4089
4090 int OpOffset = 0;
4091 if (HasReturn) {
4092 // A few FP atomics do not support return values.
4093 Dst = MI.getOperand(0).getReg();
4094 } else {
4095 OpOffset = -1;
4096 }
4097
4098 Register VData = MI.getOperand(2 + OpOffset).getReg();
4099 Register CmpVal;
4100
4101 if (IsCmpSwap) {
4102 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4103 ++OpOffset;
4104 }
4105
4106 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4107 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4108
4109 // The struct intrinsic variants add one additional operand over raw.
4110 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4111 Register VIndex;
4112 if (HasVIndex) {
4113 VIndex = MI.getOperand(4 + OpOffset).getReg();
4114 ++OpOffset;
4115 } else {
4116 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4117 }
4118
4119 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4120 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4121 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4122
4123 MachineMemOperand *MMO = *MI.memoperands_begin();
4124
4125 unsigned ImmOffset;
4126 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4127 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4128
4129 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4130
4131 if (HasReturn)
4132 MIB.addDef(Dst);
4133
4134 MIB.addUse(VData); // vdata
4135
4136 if (IsCmpSwap)
4137 MIB.addReg(CmpVal);
4138
4139 MIB.addUse(RSrc) // rsrc
4140 .addUse(VIndex) // vindex
4141 .addUse(VOffset) // voffset
4142 .addUse(SOffset) // soffset
4143 .addImm(ImmOffset) // offset(imm)
4144 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4145 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4146 .addMemOperand(MMO);
4147
4148 MI.eraseFromParent();
4149 return true;
4150}
4151
4152/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4153/// vector with s16 typed elements.
4154static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4155 SmallVectorImpl<Register> &PackedAddrs,
4156 unsigned ArgOffset,
4157 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4158 bool IsA16, bool IsG16) {
4159 const LLT S16 = LLT::scalar(16);
4160 const LLT V2S16 = LLT::fixed_vector(2, 16);
4161 auto EndIdx = Intr->VAddrEnd;
4162
4163 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4164 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4165 if (!SrcOp.isReg())
4166 continue; // _L to _LZ may have eliminated this.
4167
4168 Register AddrReg = SrcOp.getReg();
4169
4170 if ((I < Intr->GradientStart) ||
4171 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4172 (I >= Intr->CoordStart && !IsA16)) {
4173 // Handle any gradient or coordinate operands that should not be packed
4174 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4175 PackedAddrs.push_back(AddrReg);
4176 } else {
4177 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4178 // derivatives dx/dh and dx/dv are packed with undef.
4179 if (((I + 1) >= EndIdx) ||
4180 ((Intr->NumGradients / 2) % 2 == 1 &&
4181 (I == static_cast<unsigned>(Intr->GradientStart +
4182 (Intr->NumGradients / 2) - 1) ||
4183 I == static_cast<unsigned>(Intr->GradientStart +
4184 Intr->NumGradients - 1))) ||
4185 // Check for _L to _LZ optimization
4186 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4187 PackedAddrs.push_back(
4188 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4189 .getReg(0));
4190 } else {
4191 PackedAddrs.push_back(
4192 B.buildBuildVector(
4193 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4194 .getReg(0));
4195 ++I;
4196 }
4197 }
4198 }
4199}
4200
4201/// Convert from separate vaddr components to a single vector address register,
4202/// and replace the remaining operands with $noreg.
4203static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4204 int DimIdx, int NumVAddrs) {
4205 const LLT S32 = LLT::scalar(32);
4206
4207 SmallVector<Register, 8> AddrRegs;
4208 for (int I = 0; I != NumVAddrs; ++I) {
4209 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4210 if (SrcOp.isReg()) {
4211 AddrRegs.push_back(SrcOp.getReg());
4212 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4213 }
4214 }
4215
4216 int NumAddrRegs = AddrRegs.size();
4217 if (NumAddrRegs != 1) {
4218 // Above 8 elements round up to next power of 2 (i.e. 16).
4219 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4220 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4221 auto Undef = B.buildUndef(S32);
4222 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4223 NumAddrRegs = RoundedNumRegs;
4224 }
4225
4226 auto VAddr =
4227 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4228 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4229 }
4230
4231 for (int I = 1; I != NumVAddrs; ++I) {
4232 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4233 if (SrcOp.isReg())
4234 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4235 }
4236}
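The rounding above only kicks in for 9 to 15 address registers, which are padded with undefs up to 16; counts of 1 to 8 and exact powers of two are used as-is. As plain code (illustrative only; for these small counts the loop below behaves like NextPowerOf2/isPowerOf2_32):

static int paddedAddrRegCount(int n) {
  if (n <= 8)
    return n;                 // small vectors are used as-is
  int p = 1;
  while (p < n)
    p <<= 1;                  // round 9-15 up to 16; 16 stays 16
  return p;
}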
4237
4238/// Rewrite image intrinsics to use register layouts expected by the subtarget.
4239///
4240/// Depending on the subtarget, load/store with 16-bit element data need to be
4241/// rewritten to use the low half of 32-bit registers, or directly use a packed
4242/// layout. 16-bit addresses should also sometimes be packed into 32-bit
4243/// registers.
4244///
4245/// We don't want to directly select image instructions just yet, but also want
4246/// to expose all register repacking to the legalizer/combiners. We also don't
4247/// want a selected instruction entering RegBankSelect. In order to avoid
4248/// defining a multitude of intermediate image instructions, directly hack on
4249/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4250/// now unnecessary arguments with $noreg.
4251bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4252 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4253 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4254
4255 const unsigned NumDefs = MI.getNumExplicitDefs();
4256 const unsigned ArgOffset = NumDefs + 1;
4257 bool IsTFE = NumDefs == 2;
4258 // We are only processing the operands of d16 image operations on subtargets
4259 // that use the unpacked register layout, or need to repack the TFE result.
4260
4261 // TODO: Do we need to guard against already legalized intrinsics?
4262 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4263 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4264
4265 MachineRegisterInfo *MRI = B.getMRI();
4266 const LLT S32 = LLT::scalar(32);
4267 const LLT S16 = LLT::scalar(16);
4268 const LLT V2S16 = LLT::fixed_vector(2, 16);
4269
4270 unsigned DMask = 0;
4271
4272 // Check for 16 bit addresses and pack if true.
4273 LLT GradTy =
4274 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4275 LLT AddrTy =
4276 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4277 const bool IsG16 = GradTy == S16;
4278 const bool IsA16 = AddrTy == S16;
4279
4280 int DMaskLanes = 0;
4281 if (!BaseOpcode->Atomic) {
4282 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4283 if (BaseOpcode->Gather4) {
4284 DMaskLanes = 4;
4285 } else if (DMask != 0) {
4286 DMaskLanes = countPopulation(DMask);
4287 } else if (!IsTFE && !BaseOpcode->Store) {
4288 // If dmask is 0, this is a no-op load. This can be eliminated.
4289 B.buildUndef(MI.getOperand(0));
4290 MI.eraseFromParent();
4291 return true;
4292 }
4293 }
4294
4295 Observer.changingInstr(MI);
4296 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4297
4298 unsigned NewOpcode = NumDefs == 0 ?
4299 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4300
4301 // Track that we legalized this
4302 MI.setDesc(B.getTII().get(NewOpcode));
4303
4304 // Expecting to get an error flag since TFC is on and dmask is 0. Force
4305 // dmask to be at least 1, otherwise the instruction will fail.
4306 if (IsTFE && DMask == 0) {
4307 DMask = 0x1;
4308 DMaskLanes = 1;
4309 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4310 }
4311
4312 if (BaseOpcode->Atomic) {
4313 Register VData0 = MI.getOperand(2).getReg();
4314 LLT Ty = MRI->getType(VData0);
4315
4316 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4317 if (Ty.isVector())
4318 return false;
4319
4320 if (BaseOpcode->AtomicX2) {
4321 Register VData1 = MI.getOperand(3).getReg();
4322 // The two values are packed in one register.
4323 LLT PackedTy = LLT::fixed_vector(2, Ty);
4324 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4325 MI.getOperand(2).setReg(Concat.getReg(0));
4326 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4327 }
4328 }
4329
4330 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4331
4332 // Optimize _L to _LZ when _L is zero
4333 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4334 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4335 const ConstantFP *ConstantLod;
4336
4337 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4338 m_GFCst(ConstantLod))) {
4339 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4340 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4341 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4342 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4343 Intr->Dim);
4344
4345 // The starting indexes should remain in the same place.
4346 --CorrectedNumVAddrs;
4347
4348 MI.getOperand(MI.getNumExplicitDefs())
4349 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4350 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4351 Intr = NewImageDimIntr;
4352 }
4353 }
4354 }
4355
4356 // Optimize _mip away, when 'lod' is zero
4357 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4358 int64_t ConstantLod;
4359 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4360 m_ICst(ConstantLod))) {
4361 if (ConstantLod == 0) {
4362 // TODO: Change intrinsic opcode and remove operand instead of replacing
4363 // it with 0, as the _L to _LZ handling is done above.
4364 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4365 --CorrectedNumVAddrs;
4366 }
4367 }
4368 }
4369
4370 // Rewrite the addressing register layout before doing anything else.
4371 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4372 // 16 bit gradients are supported, but are tied to the A16 control
4373 // so both gradients and addresses must be 16 bit
4374 return false;
4375 }
4376
4377 if (IsA16 && !ST.hasA16()) {
4378 // A16 not supported
4379 return false;
4380 }
4381
4382 if (IsA16 || IsG16) {
4383 if (Intr->NumVAddrs > 1) {
4384 SmallVector<Register, 4> PackedRegs;
4385
4386 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4387 IsG16);
4388
4389 // See also below in the non-a16 branch
4390 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4391 PackedRegs.size() <= ST.getNSAMaxSize();
4392
4393 if (!UseNSA && PackedRegs.size() > 1) {
4394 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4395 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4396 PackedRegs[0] = Concat.getReg(0);
4397 PackedRegs.resize(1);
4398 }
4399
4400 const unsigned NumPacked = PackedRegs.size();
4401 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4402 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4403 if (!SrcOp.isReg()) {
4404 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4405 continue;
4406 }
4407
4408 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4409
4410 if (I - Intr->VAddrStart < NumPacked)
4411 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4412 else
4413 SrcOp.setReg(AMDGPU::NoRegister);
4414 }
4415 }
4416 } else {
4417 // If the register allocator cannot place the address registers contiguously
4418 // without introducing moves, then using the non-sequential address encoding
4419 // is always preferable, since it saves VALU instructions and is usually a
4420 // wash in terms of code size or even better.
4421 //
4422 // However, we currently have no way of hinting to the register allocator
4423 // that MIMG addresses should be placed contiguously when it is possible to
4424 // do so, so force non-NSA for the common 2-address case as a heuristic.
4425 //
4426 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4427 // allocation when possible.
4428 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4429 CorrectedNumVAddrs <= ST.getNSAMaxSize();
4430
4431 if (!UseNSA && Intr->NumVAddrs > 1)
4432 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4433 Intr->NumVAddrs);
4434 }
4435
4436 int Flags = 0;
4437 if (IsA16)
4438 Flags |= 1;
4439 if (IsG16)
4440 Flags |= 2;
4441 MI.addOperand(MachineOperand::CreateImm(Flags));
4442
4443 if (BaseOpcode->Store) { // No TFE for stores?
4444 // TODO: Handle dmask trim
4445 Register VData = MI.getOperand(1).getReg();
4446 LLT Ty = MRI->getType(VData);
4447 if (!Ty.isVector() || Ty.getElementType() != S16)
4448 return true;
4449
4450 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4451 if (RepackedReg != VData) {
4452 MI.getOperand(1).setReg(RepackedReg);
4453 }
4454
4455 return true;
4456 }
4457
4458 Register DstReg = MI.getOperand(0).getReg();
4459 LLT Ty = MRI->getType(DstReg);
4460 const LLT EltTy = Ty.getScalarType();
4461 const bool IsD16 = Ty.getScalarType() == S16;
4462 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4463
4464 // Confirm that the return type is large enough for the dmask specified
4465 if (NumElts < DMaskLanes)
4466 return false;
4467
4468 if (NumElts > 4 || DMaskLanes > 4)
4469 return false;
4470
4471 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4472 const LLT AdjustedTy =
4473 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4474
4475 // The raw dword aligned data component of the load. The only legal cases
4476 // where this matters should be when using the packed D16 format, for
4477 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4478 LLT RoundedTy;
4479
4480 // S32 vector to cover all data, plus TFE result element.
4481 LLT TFETy;
4482
4483 // Register type to use for each loaded component. Will be S32 or V2S16.
4484 LLT RegTy;
4485
4486 if (IsD16 && ST.hasUnpackedD16VMem()) {
4487 RoundedTy =
4488 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4489 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4490 RegTy = S32;
4491 } else {
4492 unsigned EltSize = EltTy.getSizeInBits();
4493 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4494 unsigned RoundedSize = 32 * RoundedElts;
4495 RoundedTy = LLT::scalarOrVector(
4496 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4497 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4498 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4499 }
4500
4501 // The return type does not need adjustment.
4502 // TODO: Should we change s16 case to s32 or <2 x s16>?
4503 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4504 return true;
4505
4506 Register Dst1Reg;
4507
4508 // Insert after the instruction.
4509 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4510
4511 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4512 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4513 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4514 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4515
4516 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4517
4518 MI.getOperand(0).setReg(NewResultReg);
4519
4520 // In the IR, TFE is supposed to be used with a 2 element struct return
4521 // type. The instruction really returns these two values in one contiguous
4522 // register, with one additional dword beyond the loaded data. Rewrite the
4523 // return type to use a single register result.
4524
4525 if (IsTFE) {
4526 Dst1Reg = MI.getOperand(1).getReg();
4527 if (MRI->getType(Dst1Reg) != S32)
4528 return false;
4529
4530 // TODO: Make sure the TFE operand bit is set.
4531 MI.RemoveOperand(1);
4532
4533 // Handle the easy case that requires no repack instructions.
4534 if (Ty == S32) {
4535 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4536 return true;
4537 }
4538 }
4539
4540 // Now figure out how to copy the new result register back into the old
4541 // result.
4542 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4543
4544 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4545
4546 if (ResultNumRegs == 1) {
4547 assert(!IsTFE);
4548 ResultRegs[0] = NewResultReg;
4549 } else {
4550 // We have to repack into a new vector of some kind.
4551 for (int I = 0; I != NumDataRegs; ++I)
4552 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4553 B.buildUnmerge(ResultRegs, NewResultReg);
4554
4555 // Drop the final TFE element to get the data part. The TFE result is
4556 // directly written to the right place already.
4557 if (IsTFE)
4558 ResultRegs.resize(NumDataRegs);
4559 }
4560
4561 // For an s16 scalar result, we form an s32 result with a truncate regardless
4562 // of packed vs. unpacked.
4563 if (IsD16 && !Ty.isVector()) {
4564 B.buildTrunc(DstReg, ResultRegs[0]);
4565 return true;
4566 }
4567
4568 // Avoid a build/concat_vector of 1 entry.
4569 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4570 B.buildBitcast(DstReg, ResultRegs[0]);
4571 return true;
4572 }
4573
4574 assert(Ty.isVector())((void)0);
4575
4576 if (IsD16) {
4577 // For packed D16 results with TFE enabled, all the data components are
4578 // S32. Cast back to the expected type.
4579 //
4580 // TODO: We don't really need to load s32 elements. We would only need one
4581 // cast for the TFE result if a multiple of v2s16 was used.
4582 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4583 for (Register &Reg : ResultRegs)
4584 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4585 } else if (ST.hasUnpackedD16VMem()) {
4586 for (Register &Reg : ResultRegs)
4587 Reg = B.buildTrunc(S16, Reg).getReg(0);
4588 }
4589 }
4590
4591 auto padWithUndef = [&](LLT Ty, int NumElts) {
4592 if (NumElts == 0)
4593 return;
4594 Register Undef = B.buildUndef(Ty).getReg(0);
4595 for (int I = 0; I != NumElts; ++I)
4596 ResultRegs.push_back(Undef);
4597 };
4598
4599 // Pad out any elements eliminated due to the dmask.
4600 LLT ResTy = MRI->getType(ResultRegs[0]);
4601 if (!ResTy.isVector()) {
4602 padWithUndef(ResTy, NumElts - ResultRegs.size());
4603 B.buildBuildVector(DstReg, ResultRegs);
4604 return true;
4605 }
4606
4607 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16)((void)0);
4608 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4609
4610 // Deal with the one annoying legal case.
4611 const LLT V3S16 = LLT::fixed_vector(3, 16);
4612 if (Ty == V3S16) {
4613 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4614 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4615 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4616 return true;
4617 }
4618
4619 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4620 B.buildConcatVectors(DstReg, ResultRegs);
4621 return true;
4622}
4623
4624bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4625 LegalizerHelper &Helper, MachineInstr &MI) const {
4626 MachineIRBuilder &B = Helper.MIRBuilder;
4627 GISelChangeObserver &Observer = Helper.Observer;
4628
4629 Register Dst = MI.getOperand(0).getReg();
4630 LLT Ty = B.getMRI()->getType(Dst);
4631 unsigned Size = Ty.getSizeInBits();
4632 MachineFunction &MF = B.getMF();
4633
4634 Observer.changingInstr(MI);
4635
4636 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4637 Ty = getBitcastRegisterType(Ty);
4638 Helper.bitcastDst(MI, Ty, 0);
4639 Dst = MI.getOperand(0).getReg();
4640 B.setInsertPt(B.getMBB(), MI);
4641 }
4642
4643 // FIXME: We don't really need this intermediate instruction. The intrinsic
4644 // should be fixed to have a memory operand. Since it's readnone, we're not
4645 // allowed to add one.
4646 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4647 MI.RemoveOperand(1); // Remove intrinsic ID
4648
4649 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4650 // TODO: Should this use datalayout alignment?
4651 const unsigned MemSize = (Size + 7) / 8;
4652 const Align MemAlign(4);
4653 MachineMemOperand *MMO = MF.getMachineMemOperand(
4654 MachinePointerInfo(),
4655 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4656 MachineMemOperand::MOInvariant,
4657 MemSize, MemAlign);
4658 MI.addMemOperand(MF, MMO);
4659
4660 // There are no 96-bit result scalar loads, but widening to 128-bit should
4661 // always be legal. We may need to restore this to a 96-bit result if it turns
4662 // out this needs to be converted to a vector load during RegBankSelect.
4663 if (!isPowerOf2_32(Size)) {
4664 if (Ty.isVector())
4665 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4666 else
4667 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4668 }
4669
4670 Observer.changedInstr(MI);
4671 return true;
4672}
4673
4674// TODO: Move to selection
4675bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4676 MachineRegisterInfo &MRI,
4677 MachineIRBuilder &B) const {
4678 if (!ST.isTrapHandlerEnabled() ||
4679 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4680 return legalizeTrapEndpgm(MI, MRI, B);
4681
4682 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4683 switch (*HsaAbiVer) {
4684 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4685 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4686 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4687 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4688 return ST.supportsGetDoorbellID() ?
4689 legalizeTrapHsa(MI, MRI, B) :
4690 legalizeTrapHsaQueuePtr(MI, MRI, B);
4691 }
4692 }
4693
4694 llvm_unreachable("Unknown trap handler")__builtin_unreachable();
4695}
4696
4697bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4698 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4699 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4700 MI.eraseFromParent();
4701 return true;
4702}
4703
4704bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4705 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4706 // Pass queue pointer to trap handler as input, and insert trap instruction
4707 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4708 Register LiveIn =
4709 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4710 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4711 return false;
4712
4713 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4714 B.buildCopy(SGPR01, LiveIn);
4715 B.buildInstr(AMDGPU::S_TRAP)
4716 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4717 .addReg(SGPR01, RegState::Implicit);
4718
4719 MI.eraseFromParent();
4720 return true;
4721}
4722
4723bool AMDGPULegalizerInfo::legalizeTrapHsa(
4724 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4725 B.buildInstr(AMDGPU::S_TRAP)
4726 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4727 MI.eraseFromParent();
4728 return true;
4729}
4730
4731bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4732 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4733 // If this is a non-HSA path or the trap handler is disabled, report a
4734 // warning accordingly.
4735 if (!ST.isTrapHandlerEnabled() ||
4736 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4737 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4738 "debugtrap handler not supported",
4739 MI.getDebugLoc(), DS_Warning);
4740 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4741 Ctx.diagnose(NoTrap);
4742 } else {
4743 // Insert debug-trap instruction
4744 B.buildInstr(AMDGPU::S_TRAP)
4745 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4746 }
4747
4748 MI.eraseFromParent();
4749 return true;
4750}
4751
4752bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4753 MachineIRBuilder &B) const {
4754 MachineRegisterInfo &MRI = *B.getMRI();
4755 const LLT S16 = LLT::scalar(16);
4756 const LLT S32 = LLT::scalar(32);
4757
4758 Register DstReg = MI.getOperand(0).getReg();
4759 Register NodePtr = MI.getOperand(2).getReg();
4760 Register RayExtent = MI.getOperand(3).getReg();
4761 Register RayOrigin = MI.getOperand(4).getReg();
4762 Register RayDir = MI.getOperand(5).getReg();
4763 Register RayInvDir = MI.getOperand(6).getReg();
4764 Register TDescr = MI.getOperand(7).getReg();
4765
4766 if (!ST.hasGFX10_AEncoding()) {
4767 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4768 "intrinsic not supported on subtarget",
4769 MI.getDebugLoc());
4770 B.getMF().getFunction().getContext().diagnose(BadIntrin);
4771 return false;
4772 }
4773
4774 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4775 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4776 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4777 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4778 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4779 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4780
4781 SmallVector<Register, 12> Ops;
4782 if (Is64) {
4783 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4784 Ops.push_back(Unmerge.getReg(0));
4785 Ops.push_back(Unmerge.getReg(1));
4786 } else {
4787 Ops.push_back(NodePtr);
4788 }
4789 Ops.push_back(RayExtent);
4790
4791 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4792 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4793 Ops.push_back(Unmerge.getReg(0));
4794 Ops.push_back(Unmerge.getReg(1));
4795 Ops.push_back(Unmerge.getReg(2));
4796 };
4797
4798 packLanes(RayOrigin);
4799 if (IsA16) {
4800 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4801 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4802 Register R1 = MRI.createGenericVirtualRegister(S32);
4803 Register R2 = MRI.createGenericVirtualRegister(S32);
4804 Register R3 = MRI.createGenericVirtualRegister(S32);
4805 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4806 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4807 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4808 Ops.push_back(R1);
4809 Ops.push_back(R2);
4810 Ops.push_back(R3);
4811 } else {
4812 packLanes(RayDir);
4813 packLanes(RayInvDir);
4814 }
4815
4816 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4817 .addDef(DstReg)
4818 .addImm(Opcode);
4819
4820 for (Register R : Ops) {
4821 MIB.addUse(R);
4822 }
4823
4824 MIB.addUse(TDescr)
4825 .addImm(IsA16 ? 1 : 0)
4826 .cloneMemRefs(MI);
4827
4828 MI.eraseFromParent();
4829 return true;
4830}
4831
4832bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4833 MachineInstr &MI) const {
4834 MachineIRBuilder &B = Helper.MIRBuilder;
4835 MachineRegisterInfo &MRI = *B.getMRI();
4836
4837 // Replace the use of G_BRCOND with the exec-manipulation and branch pseudos.
4838 auto IntrID = MI.getIntrinsicID();
4839 switch (IntrID) {
Step 1: Control jumps to 'case amdgcn_implicit_buffer_ptr:' at line 4958
4840 case Intrinsic::amdgcn_if:
4841 case Intrinsic::amdgcn_else: {
4842 MachineInstr *Br = nullptr;
4843 MachineBasicBlock *UncondBrTarget = nullptr;
4844 bool Negated = false;
4845 if (MachineInstr *BrCond =
4846 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4847 const SIRegisterInfo *TRI
4848 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4849
4850 Register Def = MI.getOperand(1).getReg();
4851 Register Use = MI.getOperand(3).getReg();
4852
4853 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4854
4855 if (Negated)
4856 std::swap(CondBrTarget, UncondBrTarget);
4857
4858 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4859 if (IntrID == Intrinsic::amdgcn_if) {
4860 B.buildInstr(AMDGPU::SI_IF)
4861 .addDef(Def)
4862 .addUse(Use)
4863 .addMBB(UncondBrTarget);
4864 } else {
4865 B.buildInstr(AMDGPU::SI_ELSE)
4866 .addDef(Def)
4867 .addUse(Use)
4868 .addMBB(UncondBrTarget);
4869 }
4870
4871 if (Br) {
4872 Br->getOperand(0).setMBB(CondBrTarget);
4873 } else {
4874 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4875 // since we're swapping branch targets it needs to be reinserted.
4876 // FIXME: IRTranslator should probably not do this
4877 B.buildBr(*CondBrTarget);
4878 }
4879
4880 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4881 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4882 MI.eraseFromParent();
4883 BrCond->eraseFromParent();
4884 return true;
4885 }
4886
4887 return false;
4888 }
4889 case Intrinsic::amdgcn_loop: {
4890 MachineInstr *Br = nullptr;
4891 MachineBasicBlock *UncondBrTarget = nullptr;
4892 bool Negated = false;
4893 if (MachineInstr *BrCond =
4894 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4895 const SIRegisterInfo *TRI
4896 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4897
4898 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4899 Register Reg = MI.getOperand(2).getReg();
4900
4901 if (Negated)
4902 std::swap(CondBrTarget, UncondBrTarget);
4903
4904 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4905 B.buildInstr(AMDGPU::SI_LOOP)
4906 .addUse(Reg)
4907 .addMBB(UncondBrTarget);
4908
4909 if (Br)
4910 Br->getOperand(0).setMBB(CondBrTarget);
4911 else
4912 B.buildBr(*CondBrTarget);
4913
4914 MI.eraseFromParent();
4915 BrCond->eraseFromParent();
4916 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4917 return true;
4918 }
4919
4920 return false;
4921 }
4922 case Intrinsic::amdgcn_kernarg_segment_ptr:
4923 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4924 // This only makes sense to call in a kernel, so just lower to null.
4925 B.buildConstant(MI.getOperand(0).getReg(), 0);
4926 MI.eraseFromParent();
4927 return true;
4928 }
4929
4930 return legalizePreloadedArgIntrin(
4931 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4932 case Intrinsic::amdgcn_implicitarg_ptr:
4933 return legalizeImplicitArgPtr(MI, MRI, B);
4934 case Intrinsic::amdgcn_workitem_id_x:
4935 return legalizePreloadedArgIntrin(MI, MRI, B,
4936 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4937 case Intrinsic::amdgcn_workitem_id_y:
4938 return legalizePreloadedArgIntrin(MI, MRI, B,
4939 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4940 case Intrinsic::amdgcn_workitem_id_z:
4941 return legalizePreloadedArgIntrin(MI, MRI, B,
4942 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4943 case Intrinsic::amdgcn_workgroup_id_x:
4944 return legalizePreloadedArgIntrin(MI, MRI, B,
4945 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4946 case Intrinsic::amdgcn_workgroup_id_y:
4947 return legalizePreloadedArgIntrin(MI, MRI, B,
4948 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4949 case Intrinsic::amdgcn_workgroup_id_z:
4950 return legalizePreloadedArgIntrin(MI, MRI, B,
4951 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4952 case Intrinsic::amdgcn_dispatch_ptr:
4953 return legalizePreloadedArgIntrin(MI, MRI, B,
4954 AMDGPUFunctionArgInfo::DISPATCH_PTR);
4955 case Intrinsic::amdgcn_queue_ptr:
4956 return legalizePreloadedArgIntrin(MI, MRI, B,
4957 AMDGPUFunctionArgInfo::QUEUE_PTR);
4958 case Intrinsic::amdgcn_implicit_buffer_ptr:
4959 return legalizePreloadedArgIntrin(
Step 2: Calling 'AMDGPULegalizerInfo::legalizePreloadedArgIntrin'
4960 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4961 case Intrinsic::amdgcn_dispatch_id:
4962 return legalizePreloadedArgIntrin(MI, MRI, B,
4963 AMDGPUFunctionArgInfo::DISPATCH_ID);
4964 case Intrinsic::amdgcn_fdiv_fast:
4965 return legalizeFDIVFastIntrin(MI, MRI, B);
4966 case Intrinsic::amdgcn_is_shared:
4967 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4968 case Intrinsic::amdgcn_is_private:
4969 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4970 case Intrinsic::amdgcn_wavefrontsize: {
4971 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4972 MI.eraseFromParent();
4973 return true;
4974 }
4975 case Intrinsic::amdgcn_s_buffer_load:
4976 return legalizeSBufferLoad(Helper, MI);
4977 case Intrinsic::amdgcn_raw_buffer_store:
4978 case Intrinsic::amdgcn_struct_buffer_store:
4979 return legalizeBufferStore(MI, MRI, B, false, false);
4980 case Intrinsic::amdgcn_raw_buffer_store_format:
4981 case Intrinsic::amdgcn_struct_buffer_store_format:
4982 return legalizeBufferStore(MI, MRI, B, false, true);
4983 case Intrinsic::amdgcn_raw_tbuffer_store:
4984 case Intrinsic::amdgcn_struct_tbuffer_store:
4985 return legalizeBufferStore(MI, MRI, B, true, true);
4986 case Intrinsic::amdgcn_raw_buffer_load:
4987 case Intrinsic::amdgcn_struct_buffer_load:
4988 return legalizeBufferLoad(MI, MRI, B, false, false);
4989 case Intrinsic::amdgcn_raw_buffer_load_format:
4990 case Intrinsic::amdgcn_struct_buffer_load_format:
4991 return legalizeBufferLoad(MI, MRI, B, true, false);
4992 case Intrinsic::amdgcn_raw_tbuffer_load:
4993 case Intrinsic::amdgcn_struct_tbuffer_load:
4994 return legalizeBufferLoad(MI, MRI, B, true, true);
4995 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4996 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4997 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4998 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4999 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5000 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5001 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5002 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5003 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5004 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5005 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5006 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5007 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5008 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5009 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5010 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5011 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5012 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5013 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5014 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5015 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5016 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5017 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5018 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5019 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5020 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5021 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5022 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5023 case Intrinsic::amdgcn_buffer_atomic_fadd:
5024 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5025 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5026 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5027 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5028 return legalizeBufferAtomic(MI, B, IntrID);
5029 case Intrinsic::amdgcn_atomic_inc:
5030 return legalizeAtomicIncDec(MI, B, true);
5031 case Intrinsic::amdgcn_atomic_dec:
5032 return legalizeAtomicIncDec(MI, B, false);
5033 case Intrinsic::trap:
5034 return legalizeTrapIntrinsic(MI, MRI, B);
5035 case Intrinsic::debugtrap:
5036 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5037 case Intrinsic::amdgcn_rsq_clamp:
5038 return legalizeRsqClampIntrinsic(MI, MRI, B);
5039 case Intrinsic::amdgcn_ds_fadd:
5040 case Intrinsic::amdgcn_ds_fmin:
5041 case Intrinsic::amdgcn_ds_fmax:
5042 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5043 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5044 return legalizeBVHIntrinsic(MI, B);
5045 default: {
5046 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5047 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5048 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5049 return true;
5050 }
5051 }
5052
5053 return true;
5054}
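The interleaved path notes in this report (Step 2 above, and Steps 7 through 17 in the headers that follow) walk from the amdgcn_implicit_buffer_ptr case of this switch into the argument-descriptor and bit-counting helpers, ending with a trailing-zero count of 32 being returned for a zero mask. The standalone sketch below is a hypothetical reduction of that state, not the LLVM code itself; the helper name ctzWidth and the guard are illustrative only. It shows why a count that can equal the full width of unsigned int cannot be used directly as a shift amount:

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for a countTrailingZeros-style helper with the
// default ZB_Width behavior: a zero input yields the full bit width (32).
static unsigned ctzWidth(uint32_t Val) {
  if (Val == 0)
    return 32;
  unsigned N = 0;
  while ((Val & 1u) == 0) {
    Val >>= 1;
    ++N;
  }
  return N;
}

int main() {
  uint32_t Mask = 0;               // the degenerate mask the path notes assume
  unsigned Shift = ctzWidth(Mask); // 32 == bit width of uint32_t
  // Shifting a 32-bit value by 32 is undefined behavior in C++, so the
  // boundary case has to be handled before the shift is performed.
  uint32_t Field = (Shift >= 32) ? 0u : (Mask >> Shift);
  std::cout << "Shift=" << Shift << " Field=" << Field << '\n';
  return 0;
}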

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/CodeGen/Register.h"
13#include "llvm/Pass.h"
14
15namespace llvm {
16
17class Function;
18class LLT;
19class raw_ostream;
20class TargetRegisterClass;
21class TargetRegisterInfo;
22
23struct ArgDescriptor {
24private:
25 friend struct AMDGPUFunctionArgInfo;
26 friend class AMDGPUArgumentUsageInfo;
27
28 union {
29 MCRegister Reg;
30 unsigned StackOffset;
31 };
32
33 // Bitmask to locate argument within the register.
34 unsigned Mask;
35
36 bool IsStack : 1;
37 bool IsSet : 1;
38
39public:
40 constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41 bool IsStack = false, bool IsSet = false)
42 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
43
44 static constexpr ArgDescriptor createRegister(Register Reg,
45 unsigned Mask = ~0u) {
46 return ArgDescriptor(Reg, Mask, false, true);
47 }
48
49 static constexpr ArgDescriptor createStack(unsigned Offset,
50 unsigned Mask = ~0u) {
51 return ArgDescriptor(Offset, Mask, true, true);
52 }
53
54 static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55 unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 MCRegister getRegister() const {
72 assert(!IsStack)((void)0);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack)((void)0);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
Step 7: Assuming the condition is true
Step 8: Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr;
145
146 // VGPR inputs. For entry functions these are either v0, v1 and v2 or packed
147 // into v0, 10 bits per dimension if packed-tid is set.
148 ArgDescriptor WorkItemIDX;
149 ArgDescriptor WorkItemIDY;
150 ArgDescriptor WorkItemIDZ;
151
152 std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
153 getPreloadedValue(PreloadedValue Value) const;
154
155 static constexpr AMDGPUFunctionArgInfo fixedABILayout();
156};
157
158class AMDGPUArgumentUsageInfo : public ImmutablePass {
159private:
160 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
161
162public:
163 static char ID;
164
165 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
166 static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
167
168 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
169
170 void getAnalysisUsage(AnalysisUsage &AU) const override {
171 AU.setPreservesAll();
172 }
173
174 bool doInitialization(Module &M) override;
175 bool doFinalization(Module &M) override;
176
177 void print(raw_ostream &OS, const Module *M = nullptr) const override;
178
179 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
180 ArgInfoMap[&F] = ArgInfo;
181 }
182
183 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
184};
185
186} // end namespace llvm
187
188#endif
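One detail of the header above that the path notes rely on (Steps 7 and 8): isMasked() is true for any mask other than ~0u, which includes a mask of zero even though such a mask selects no bits. The sketch below is a simplified, hypothetical model of that predicate, not the LLVM type; the struct name and the example mask values are made up for illustration:

#include <cassert>
#include <cstdint>
#include <iostream>

// Hypothetical, simplified model of the ArgDescriptor mask predicate.
struct MiniArgDescriptor {
  uint32_t Reg;
  uint32_t Mask;
  bool isMasked() const { return Mask != ~0u; }
};

int main() {
  // A plausible packed-argument mask: 10 bits starting at bit 10.
  MiniArgDescriptor Packed{0, 0x3FFu << 10};
  assert(Packed.isMasked());

  // The corner case: a zero mask is also "masked" by this definition, and a
  // zero mask is exactly the input for which trailing-zero counting returns
  // the full bit width.
  MiniArgDescriptor Degenerate{0, 0};
  std::cout << std::boolalpha << Degenerate.isMasked() << '\n'; // prints true
  return 0;
}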

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather than including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__4) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
Step 12.1: 'ZB' is not equal to ZB_Undefined
Step 13: Assuming 'Val' is equal to 0
Step 14: Taking true branch
117 return 32;
Step 15: Returning the value 32
118
119#if __has_builtin(__builtin_ctz)1 || defined(__GNUC__4)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll)1 || defined(__GNUC__4)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
Step 12: Calling 'TrailingZerosCounter::count'
Step 16: Returning from 'TrailingZerosCounter::count'
Step 17: Returning the value 32
161}
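As an aside on Steps 12 through 17 above (a hypothetical check, not part of this header): with Val equal to 0 and the default ZB_Width behavior, the 32 returned here equals the number of value bits in unsigned int on common 32-bit-int targets, which is why that return value cannot be used as a shift count on the same type without a guard:

#include <limits>

// Illustrative only: on targets where 'unsigned int' has 32 value bits, the
// ZB_Width result for a zero input equals the type's full width.
static_assert(std::numeric_limits<unsigned>::digits == 32,
              "example assumes a 32-bit 'unsigned int'");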
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__4) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz)1 || defined(__GNUC__4)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll)1 || defined(__GNUC__4)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT8 * sizeof(T);
251 assert(N <= Bits && "Invalid bit index")((void)0);
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)1
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)1
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)1
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)1
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)1LL<<(N-1)) <= x && x < (INT64_C(1)1LL<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1)1ULL << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1)1ULL << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1)1ULL << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range")((void)0);
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX0xffffffffffffffffULL >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range")((void)0);
441
442 return UINT64_C(1)1ULL + ~(UINT64_C(1)1ULL << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range")((void)0);
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1)1ULL << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__4)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__4)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.")((void)0);
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t less than or equal to \p Value and is
752/// \p Skew mod \p Align. \p Align must be non-zero
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.")((void)0);
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.")((void)0);
771 assert(B <= 32 && "Bit width out of range.")((void)0);
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.")((void)0);
787 assert(B <= 64 && "Bit width out of range.")((void)0);
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occurred.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)1
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow occurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
911#if __has_builtin(__builtin_sub_overflow)1
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned addition.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow occurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
959
960} // End llvm namespace
961
962#endif
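For completeness, the maskTrailingOnes helper above shows the usual way to cope with a count that may reach the type's width: the boundary value is special-cased instead of being fed to the shift. The sketch below is a hypothetical variant of that pattern (the name lowBits is made up), not a replacement for the LLVM helper:

#include <cstdint>
#include <iostream>
#include <limits>

// Hypothetical helper for unsigned T: build a mask of the N lowest bits while
// never shifting by the full width of the type.
template <typename T> T lowBits(unsigned N) {
  const unsigned Bits = std::numeric_limits<T>::digits;
  if (N == 0)
    return T(0);   // T(-1) >> Bits would shift by the full width, so special-case
  if (N >= Bits)
    return T(-1);  // clamp; the LLVM helper asserts N <= Bits instead
  return T(T(-1) >> (Bits - N));
}

int main() {
  std::cout << std::hex
            << lowBits<uint32_t>(0) << ' '    // 0
            << lowBits<uint32_t>(12) << ' '   // fff
            << lowBits<uint32_t>(32) << '\n'; // ffffffff
  return 0;
}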