Bug Summary

File: src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support/MathExtras.h
Warning: line 252, column 30
The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
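For context, the flagged shift sits in maskTrailingOnes<unsigned int>(), which maskLeadingOnes<unsigned int>() reaches from mostAlignedValueInRange() in the annotated file below. The standalone sketch that follows traces how the shift count becomes 33; the helper bodies are simplified paraphrases of the LLVM 13 MathExtras.h definitions and should be read as assumptions, not the authoritative code.

    // ub_shift_sketch.cpp -- how a shift count of 33 can reach the flagged line.
    // The helpers below are simplified stand-ins for maskTrailingOnes<uint32_t>
    // and maskLeadingOnes<uint32_t> (assumed shapes, not the MathExtras.h source).
    #include <cstdint>
    #include <cstdio>

    static const unsigned Bits = 32; // width of uint32_t / unsigned int here

    static uint32_t maskTrailingOnes(unsigned N) {
      return N == 0 ? 0 : (uint32_t(-1) >> (Bits - N)); // UB once Bits - N >= 32
    }

    static uint32_t maskLeadingOnes(unsigned N) {
      return ~maskTrailingOnes(Bits - N); // for N > Bits, Bits - N wraps around
    }

    int main() {
      // mostAlignedValueInRange() passes countLeadingZeros((Lo - 1) ^ Hi) + 1.
      // When (Lo - 1) ^ Hi == 0 (e.g. Lo == Hi + 1), countLeadingZeros(0) is 32,
      // so maskLeadingOnes(33) is reached. Trace the shift count without
      // executing the undefined shift itself:
      unsigned Inner = Bits - 33u;        // wraps to 4294967295
      unsigned ShiftCount = Bits - Inner; // wraps again, to 33
      std::printf("would shift a 32-bit value right by %u bits\n", ShiftCount);
      (void)&maskLeadingOnes;             // helpers shown only for the call shape
      return 0;
    }

With -D NDEBUG on the command line below, the assert that guards N inside the real helper compiles away (the listing shows it expanded to ((void)0)), so the analyzer can reach the shift on this path.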

Annotated Source Code


clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SILoadStoreOptimizer.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -D PIC -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -D_RET_PROTECTOR -ret-protector -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote constant offset to the immediate by
23// adjusting the base. It tries to use a base from the nearby instructions that
24// allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25// to the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offset field, but are close enough together to fit once rebased, we can add
56// to the base pointer and use the new reduced offsets.
57//
58//===----------------------------------------------------------------------===//
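A quick worked version of the scaling in the ds_read2 example above: the merged instruction encodes its two offsets in element units rather than bytes, mirroring the "EltOffset0 = CI.Offset / CI.EltSize" computation later in this file. A minimal sketch, with the 4-byte element size taken from the b32 case handled in setMI():

    // offset_units_sketch.cpp -- why offset:16 / offset:32 become offset0:4 / offset1:8.
    #include <cstdio>

    int main() {
      unsigned EltSize = 4;                  // bytes per element for ds_read_b32
      unsigned ByteOff0 = 16, ByteOff1 = 32; // byte offsets of the two ds_read_b32
      // ds_read2_b32 encodes its two offsets in element units:
      std::printf("offset0:%u offset1:%u\n", ByteOff0 / EltSize, ByteOff1 / EltSize);
      return 0;
    }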
59
60#include "AMDGPU.h"
61#include "GCNSubtarget.h"
62#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63#include "llvm/Analysis/AliasAnalysis.h"
64#include "llvm/CodeGen/MachineFunctionPass.h"
65#include "llvm/InitializePasses.h"
66
67using namespace llvm;
68
69#define DEBUG_TYPE "si-load-store-opt"
70
71namespace {
72enum InstClassEnum {
73 UNKNOWN,
74 DS_READ,
75 DS_WRITE,
76 S_BUFFER_LOAD_IMM,
77 BUFFER_LOAD,
78 BUFFER_STORE,
79 MIMG,
80 TBUFFER_LOAD,
81 TBUFFER_STORE,
82};
83
84struct AddressRegs {
85 unsigned char NumVAddrs = 0;
86 bool SBase = false;
87 bool SRsrc = false;
88 bool SOffset = false;
89 bool VAddr = false;
90 bool Addr = false;
91 bool SSamp = false;
92};
93
94// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
95const unsigned MaxAddressRegs = 12 + 1 + 1;
96
97class SILoadStoreOptimizer : public MachineFunctionPass {
98 struct CombineInfo {
99 MachineBasicBlock::iterator I;
100 unsigned EltSize;
101 unsigned Offset;
102 unsigned Width;
103 unsigned Format;
104 unsigned BaseOff;
105 unsigned DMask;
106 InstClassEnum InstClass;
107 unsigned CPol = 0;
108 bool UseST64;
109 int AddrIdx[MaxAddressRegs];
110 const MachineOperand *AddrReg[MaxAddressRegs];
111 unsigned NumAddresses;
112 unsigned Order;
113
114 bool hasSameBaseAddress(const MachineInstr &MI) {
115 for (unsigned i = 0; i < NumAddresses; i++) {
116 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
117
118 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
119 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
120 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
121 return false;
122 }
123 continue;
124 }
125
126 // Check same base pointer. Be careful of subregisters, which can occur
127 // with vectors of pointers.
128 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
129 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
130 return false;
131 }
132 }
133 return true;
134 }
135
136 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
137 for (unsigned i = 0; i < NumAddresses; ++i) {
138 const MachineOperand *AddrOp = AddrReg[i];
139 // Immediates are always OK.
140 if (AddrOp->isImm())
141 continue;
142
143 // Don't try to merge addresses that aren't either immediates or registers.
144 // TODO: Should be possible to merge FrameIndexes and maybe some other
145 // non-register
146 if (!AddrOp->isReg())
147 return false;
148
149 // TODO: We should be able to merge physical reg addresses.
150 if (AddrOp->getReg().isPhysical())
151 return false;
152
153 // If an address has only one use then there will be no other
154 // instructions with the same address, so we can't merge this one.
155 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
156 return false;
157 }
158 return true;
159 }
160
161 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
162 const GCNSubtarget &STM);
163 };
164
165 struct BaseRegisters {
166 Register LoReg;
167 Register HiReg;
168
169 unsigned LoSubReg = 0;
170 unsigned HiSubReg = 0;
171 };
172
173 struct MemAddress {
174 BaseRegisters Base;
175 int64_t Offset = 0;
176 };
177
178 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
179
180private:
181 const GCNSubtarget *STM = nullptr;
182 const SIInstrInfo *TII = nullptr;
183 const SIRegisterInfo *TRI = nullptr;
184 MachineRegisterInfo *MRI = nullptr;
185 AliasAnalysis *AA = nullptr;
186 bool OptimizeAgain;
187
188 static bool dmasksCanBeCombined(const CombineInfo &CI,
189 const SIInstrInfo &TII,
190 const CombineInfo &Paired);
191 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
192 CombineInfo &Paired, bool Modify = false);
193 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
194 const CombineInfo &Paired);
195 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
196 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
197 const CombineInfo &Paired);
198 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
199 const CombineInfo &Paired);
200 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
201
202 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
203 SmallVectorImpl<MachineInstr *> &InstsToMove);
204
205 unsigned read2Opcode(unsigned EltSize) const;
206 unsigned read2ST64Opcode(unsigned EltSize) const;
207 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
208 CombineInfo &Paired,
209 const SmallVectorImpl<MachineInstr *> &InstsToMove);
210
211 unsigned write2Opcode(unsigned EltSize) const;
212 unsigned write2ST64Opcode(unsigned EltSize) const;
213 MachineBasicBlock::iterator
214 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
215 const SmallVectorImpl<MachineInstr *> &InstsToMove);
216 MachineBasicBlock::iterator
217 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
218 const SmallVectorImpl<MachineInstr *> &InstsToMove);
219 MachineBasicBlock::iterator
220 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
221 const SmallVectorImpl<MachineInstr *> &InstsToMove);
222 MachineBasicBlock::iterator
223 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
224 const SmallVectorImpl<MachineInstr *> &InstsToMove);
225 MachineBasicBlock::iterator
226 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
227 const SmallVectorImpl<MachineInstr *> &InstsToMove);
228 MachineBasicBlock::iterator
229 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
230 const SmallVectorImpl<MachineInstr *> &InstsToMove);
231 MachineBasicBlock::iterator
232 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
233 const SmallVectorImpl<MachineInstr *> &InstsToMove);
234
235 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
236 int32_t NewOffset) const;
237 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
238 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
239 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
240 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
241 /// Promotes constant offset to the immediate by adjusting the base. It
242 /// tries to use a base from the nearby instructions that allows it to have
243 /// a 13-bit constant offset which gets promoted to the immediate.
244 bool promoteConstantOffsetToImm(MachineInstr &CI,
245 MemInfoMap &Visited,
246 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
247 void addInstToMergeableList(const CombineInfo &CI,
248 std::list<std::list<CombineInfo> > &MergeableInsts) const;
249
250 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
251 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
252 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
253 std::list<std::list<CombineInfo>> &MergeableInsts) const;
254
255public:
256 static char ID;
257
258 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
259 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
260 }
261
262 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
263 bool &OptimizeListAgain);
264 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
265
266 bool runOnMachineFunction(MachineFunction &MF) override;
267
268 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
269
270 void getAnalysisUsage(AnalysisUsage &AU) const override {
271 AU.setPreservesCFG();
272 AU.addRequired<AAResultsWrapperPass>();
273
274 MachineFunctionPass::getAnalysisUsage(AU);
275 }
276
277 MachineFunctionProperties getRequiredProperties() const override {
278 return MachineFunctionProperties()
279 .set(MachineFunctionProperties::Property::IsSSA);
280 }
281};
282
283static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
284 const unsigned Opc = MI.getOpcode();
285
286 if (TII.isMUBUF(Opc)) {
287 // FIXME: Handle d16 correctly
288 return AMDGPU::getMUBUFElements(Opc);
289 }
290 if (TII.isMIMG(MI)) {
291 uint64_t DMaskImm =
292 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
293 return countPopulation(DMaskImm);
294 }
295 if (TII.isMTBUF(Opc)) {
296 return AMDGPU::getMTBUFElements(Opc);
297 }
298
299 switch (Opc) {
300 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
301 return 1;
302 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
303 return 2;
304 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
305 return 4;
306 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
307 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
308 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
309 case AMDGPU::DS_WRITE_B32_gfx9:
310 return 1;
311 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
312 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
313 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
314 case AMDGPU::DS_WRITE_B64_gfx9:
315 return 2;
316 default:
317 return 0;
318 }
319}
320
321/// Maps instruction opcode to enum InstClassEnum.
322static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
323 switch (Opc) {
324 default:
325 if (TII.isMUBUF(Opc)) {
326 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
327 default:
328 return UNKNOWN;
329 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
330 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
331 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
332 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
333 return BUFFER_LOAD;
334 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
335 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
336 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
337 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
338 return BUFFER_STORE;
339 }
340 }
341 if (TII.isMIMG(Opc)) {
342 // Ignore instructions encoded without vaddr.
343 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
344 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
345 return UNKNOWN;
346 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
347 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
348 TII.isGather4(Opc))
349 return UNKNOWN;
350 return MIMG;
351 }
352 if (TII.isMTBUF(Opc)) {
353 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
354 default:
355 return UNKNOWN;
356 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
357 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
358 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
359 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
360 return TBUFFER_LOAD;
361 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
362 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
363 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
364 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
365 return TBUFFER_STORE;
366 }
367 }
368 return UNKNOWN;
369 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
370 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
371 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
372 return S_BUFFER_LOAD_IMM;
373 case AMDGPU::DS_READ_B32:
374 case AMDGPU::DS_READ_B32_gfx9:
375 case AMDGPU::DS_READ_B64:
376 case AMDGPU::DS_READ_B64_gfx9:
377 return DS_READ;
378 case AMDGPU::DS_WRITE_B32:
379 case AMDGPU::DS_WRITE_B32_gfx9:
380 case AMDGPU::DS_WRITE_B64:
381 case AMDGPU::DS_WRITE_B64_gfx9:
382 return DS_WRITE;
383 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
384 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
385 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
386 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
387 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
388 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
389 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
390 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
391 return UNKNOWN;
392 }
393}
394
395/// Determines instruction subclass from opcode. Only instructions
396/// of the same subclass can be merged together.
397static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
398 switch (Opc) {
399 default:
400 if (TII.isMUBUF(Opc))
401 return AMDGPU::getMUBUFBaseOpcode(Opc);
402 if (TII.isMIMG(Opc)) {
403 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
404 assert(Info);
405 return Info->BaseOpcode;
406 }
407 if (TII.isMTBUF(Opc))
408 return AMDGPU::getMTBUFBaseOpcode(Opc);
409 return -1;
410 case AMDGPU::DS_READ_B32:
411 case AMDGPU::DS_READ_B32_gfx9:
412 case AMDGPU::DS_READ_B64:
413 case AMDGPU::DS_READ_B64_gfx9:
414 case AMDGPU::DS_WRITE_B32:
415 case AMDGPU::DS_WRITE_B32_gfx9:
416 case AMDGPU::DS_WRITE_B64:
417 case AMDGPU::DS_WRITE_B64_gfx9:
418 return Opc;
419 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
420 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
421 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
422 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
423 }
424}
425
426static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
427 AddressRegs Result;
428
429 if (TII.isMUBUF(Opc)) {
430 if (AMDGPU::getMUBUFHasVAddr(Opc))
431 Result.VAddr = true;
432 if (AMDGPU::getMUBUFHasSrsrc(Opc))
433 Result.SRsrc = true;
434 if (AMDGPU::getMUBUFHasSoffset(Opc))
435 Result.SOffset = true;
436
437 return Result;
438 }
439
440 if (TII.isMIMG(Opc)) {
441 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
442 if (VAddr0Idx >= 0) {
443 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
444 Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
445 } else {
446 Result.VAddr = true;
447 }
448 Result.SRsrc = true;
449 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
450 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
451 Result.SSamp = true;
452
453 return Result;
454 }
455 if (TII.isMTBUF(Opc)) {
456 if (AMDGPU::getMTBUFHasVAddr(Opc))
457 Result.VAddr = true;
458 if (AMDGPU::getMTBUFHasSrsrc(Opc))
459 Result.SRsrc = true;
460 if (AMDGPU::getMTBUFHasSoffset(Opc))
461 Result.SOffset = true;
462
463 return Result;
464 }
465
466 switch (Opc) {
467 default:
468 return Result;
469 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
470 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
471 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
472 Result.SBase = true;
473 return Result;
474 case AMDGPU::DS_READ_B32:
475 case AMDGPU::DS_READ_B64:
476 case AMDGPU::DS_READ_B32_gfx9:
477 case AMDGPU::DS_READ_B64_gfx9:
478 case AMDGPU::DS_WRITE_B32:
479 case AMDGPU::DS_WRITE_B64:
480 case AMDGPU::DS_WRITE_B32_gfx9:
481 case AMDGPU::DS_WRITE_B64_gfx9:
482 Result.Addr = true;
483 return Result;
484 }
485}
486
487void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
488 const SIInstrInfo &TII,
489 const GCNSubtarget &STM) {
490 I = MI;
491 unsigned Opc = MI->getOpcode();
492 InstClass = getInstClass(Opc, TII);
493
494 if (InstClass == UNKNOWN)
495 return;
496
497 switch (InstClass) {
498 case DS_READ:
499 EltSize =
500 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
501 : 4;
502 break;
503 case DS_WRITE:
504 EltSize =
505 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
506 : 4;
507 break;
508 case S_BUFFER_LOAD_IMM:
509 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
510 break;
511 default:
512 EltSize = 4;
513 break;
514 }
515
516 if (InstClass == MIMG) {
517 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
518 // Offset is not considered for MIMG instructions.
519 Offset = 0;
520 } else {
521 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
522 Offset = I->getOperand(OffsetIdx).getImm();
523 }
524
525 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
526 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
527
528 Width = getOpcodeWidth(*I, TII);
529
530 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
531 Offset &= 0xffff;
532 } else if (InstClass != MIMG) {
533 CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
534 }
535
536 AddressRegs Regs = getRegs(Opc, TII);
537
538 NumAddresses = 0;
539 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
540 AddrIdx[NumAddresses++] =
541 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
542 if (Regs.Addr)
543 AddrIdx[NumAddresses++] =
544 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
545 if (Regs.SBase)
546 AddrIdx[NumAddresses++] =
547 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
548 if (Regs.SRsrc)
549 AddrIdx[NumAddresses++] =
550 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
551 if (Regs.SOffset)
552 AddrIdx[NumAddresses++] =
553 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
554 if (Regs.VAddr)
555 AddrIdx[NumAddresses++] =
556 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
557 if (Regs.SSamp)
558 AddrIdx[NumAddresses++] =
559 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
560 assert(NumAddresses <= MaxAddressRegs);
561
562 for (unsigned J = 0; J < NumAddresses; J++)
563 AddrReg[J] = &I->getOperand(AddrIdx[J]);
564}
565
566} // end anonymous namespace.
567
568INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
569 "SI Load Store Optimizer", false, false)
570INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
571INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
572 false, false)
573
574char SILoadStoreOptimizer::ID = 0;
575
576char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
577
578FunctionPass *llvm::createSILoadStoreOptimizerPass() {
579 return new SILoadStoreOptimizer();
580}
581
582static void moveInstsAfter(MachineBasicBlock::iterator I,
583 ArrayRef<MachineInstr *> InstsToMove) {
584 MachineBasicBlock *MBB = I->getParent();
585 ++I;
586 for (MachineInstr *MI : InstsToMove) {
587 MI->removeFromParent();
588 MBB->insert(I, MI);
589 }
590}
591
592static void addDefsUsesToList(const MachineInstr &MI,
593 DenseSet<Register> &RegDefs,
594 DenseSet<Register> &PhysRegUses) {
595 for (const MachineOperand &Op : MI.operands()) {
596 if (Op.isReg()) {
597 if (Op.isDef())
598 RegDefs.insert(Op.getReg());
599 else if (Op.readsReg() && Op.getReg().isPhysical())
600 PhysRegUses.insert(Op.getReg());
601 }
602 }
603}
604
605static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
606 MachineBasicBlock::iterator B,
607 AliasAnalysis *AA) {
608 // RAW or WAR - cannot reorder
609 // WAW - cannot reorder
610 // RAR - safe to reorder
611 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
612}
613
614// Add MI and its defs to the lists if MI reads one of the defs that are
615// already in the list. Returns true in that case.
616static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
617 DenseSet<Register> &PhysRegUses,
618 SmallVectorImpl<MachineInstr *> &Insts) {
619 for (MachineOperand &Use : MI.operands()) {
620 // If one of the defs is read, then there is a use of Def between I and the
621 // instruction that I will potentially be merged with. We will need to move
622 // this instruction after the merged instructions.
623 //
624 // Similarly, if there is a def which is read by an instruction that is to
625 // be moved for merging, then we need to move the def-instruction as well.
626 // This can only happen for physical registers such as M0; virtual
627 // registers are in SSA form.
628 if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
629 (Use.isDef() && RegDefs.count(Use.getReg())) ||
630 (Use.isDef() && Use.getReg().isPhysical() &&
631 PhysRegUses.count(Use.getReg())))) {
632 Insts.push_back(&MI);
633 addDefsUsesToList(MI, RegDefs, PhysRegUses);
634 return true;
635 }
636 }
637
638 return false;
639}
640
641static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
642 ArrayRef<MachineInstr *> InstsToMove,
643 AliasAnalysis *AA) {
644 assert(MemOp.mayLoadOrStore());
645
646 for (MachineInstr *InstToMove : InstsToMove) {
647 if (!InstToMove->mayLoadOrStore())
648 continue;
649 if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
650 return false;
651 }
652 return true;
653}
654
655// This function assumes that \p A and \p B are identical except for
656// size and offset, and that they reference adjacent memory.
657static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
658 const MachineMemOperand *A,
659 const MachineMemOperand *B) {
660 unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
661 unsigned Size = A->getSize() + B->getSize();
662 // This function adds the offset parameter to the existing offset for A,
663 // so we pass 0 here as the offset and then manually set it to the correct
664 // value after the call.
665 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
666 MMO->setOffset(MinOffset);
667 return MMO;
668}
669
670bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
671 const SIInstrInfo &TII,
672 const CombineInfo &Paired) {
673 assert(CI.InstClass == MIMG);
674
675 // Ignore instructions with tfe/lwe set.
676 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
677 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
678
679 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
680 return false;
681
682 // Check other optional immediate operands for equality.
683 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
684 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
685 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
686
687 for (auto op : OperandsToMatch) {
688 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
689 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
690 return false;
691 if (Idx != -1 &&
692 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
693 return false;
694 }
695
696 // Check DMask for overlaps.
697 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
698 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
699
700 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
701 if ((1u << AllowedBitsForMin) <= MinMask)
702 return false;
703
704 return true;
705}
706
707static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
708 unsigned ComponentCount,
709 const GCNSubtarget &STI) {
710 if (ComponentCount > 4)
711 return 0;
712
713 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
714 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
715 if (!OldFormatInfo)
716 return 0;
717
718 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
719 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
720 ComponentCount,
721 OldFormatInfo->NumFormat, STI);
722
723 if (!NewFormatInfo)
724 return 0;
725
726 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
727 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
728
729 return NewFormatInfo->Format;
730}
731
732// Return the value in the inclusive range [Lo,Hi] that is aligned to the
733// highest power of two. Note that the result is well defined for all inputs
734// including corner cases like:
735// - if Lo == Hi, return that value
736// - if Lo == 0, return 0 (even though the "- 1" below underflows)
737// - if Lo > Hi, return 0 (as if the range wrapped around)
738static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
739 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
1. Calling 'maskLeadingOnes<unsigned int>'
740}
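The path event above marks the call that can hand maskLeadingOnes an argument of 32 + 1: when (Lo - 1) ^ Hi is zero (exactly the Lo == 0 and Lo > Hi corner cases the comment lists as intentionally well defined), countLeadingZeros(0) yields 32. For ordinary inputs, the brute-force reference below illustrates what the bit trick is meant to compute; it is a sketch for illustration, not the MathExtras.h implementation.

    // most_aligned_sketch.cpp -- reference model for mostAlignedValueInRange().
    #include <cstdint>
    #include <cstdio>

    static uint32_t alignmentOf(uint32_t V) {   // largest power of two dividing V
      return V == 0 ? ~0u : (V & (0u - V));
    }

    static uint32_t mostAlignedReference(uint32_t Lo, uint32_t Hi) {
      uint32_t Best = Lo;                       // walk [Lo, Hi], keep the most aligned
      for (uint32_t V = Lo; V <= Hi; ++V)
        if (alignmentOf(V) > alignmentOf(Best))
          Best = V;
      return Best;
    }

    int main() {
      // For [0x12F0, 0x1320] the bit trick above yields 0x1300: the only multiple
      // of 0x100 in the range, with no multiple of 0x200 available.
      std::printf("reference = 0x%X\n", mostAlignedReference(0x12F0, 0x1320));
      return 0;
    }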
741
742bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
743 const GCNSubtarget &STI,
744 CombineInfo &Paired,
745 bool Modify) {
746 assert(CI.InstClass != MIMG);
747
748 // XXX - Would the same offset be OK? Is there any reason this would happen or
749 // be useful?
750 if (CI.Offset == Paired.Offset)
751 return false;
752
753 // This won't be valid if the offset isn't aligned.
754 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
755 return false;
756
757 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
758
759 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
760 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
761 if (!Info0)
762 return false;
763 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
764 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
765 if (!Info1)
766 return false;
767
768 if (Info0->BitsPerComp != Info1->BitsPerComp ||
769 Info0->NumFormat != Info1->NumFormat)
770 return false;
771
772 // TODO: Should be possible to support more formats, but if format loads
773 // are not dword-aligned, the merged load might not be valid.
774 if (Info0->BitsPerComp != 32)
775 return false;
776
777 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
778 return false;
779 }
780
781 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
782 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
783 CI.UseST64 = false;
784 CI.BaseOff = 0;
785
786 // Handle all non-DS instructions.
787 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
788 return (EltOffset0 + CI.Width == EltOffset1 ||
789 EltOffset1 + Paired.Width == EltOffset0) &&
790 CI.CPol == Paired.CPol &&
791 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
792 }
793
794 // If the offset in elements doesn't fit in 8-bits, we might be able to use
795 // the stride 64 versions.
796 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
797 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
798 if (Modify) {
799 CI.Offset = EltOffset0 / 64;
800 Paired.Offset = EltOffset1 / 64;
801 CI.UseST64 = true;
802 }
803 return true;
804 }
805
806 // Check if the new offsets fit in the reduced 8-bit range.
807 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
808 if (Modify) {
809 CI.Offset = EltOffset0;
810 Paired.Offset = EltOffset1;
811 }
812 return true;
813 }
814
815 // Try to shift base address to decrease offsets.
816 uint32_t Min = std::min(EltOffset0, EltOffset1);
817 uint32_t Max = std::max(EltOffset0, EltOffset1);
818
819 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
820 if (((Max - Min) & ~Mask) == 0) {
821 if (Modify) {
822 // From the range of values we could use for BaseOff, choose the one that
823 // is aligned to the highest power of two, to maximise the chance that
824 // the same offset can be reused for other load/store pairs.
825 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
826 // Copy the low bits of the offsets, so that when we adjust them by
827 // subtracting BaseOff they will be multiples of 64.
828 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
829 CI.BaseOff = BaseOff * CI.EltSize;
830 CI.Offset = (EltOffset0 - BaseOff) / 64;
831 Paired.Offset = (EltOffset1 - BaseOff) / 64;
832 CI.UseST64 = true;
833 }
834 return true;
835 }
836
837 if (isUInt<8>(Max - Min)) {
838 if (Modify) {
839 // From the range of values we could use for BaseOff, choose the one that
840 // is aligned to the highest power of two, to maximise the chance that
841 // the same offset can be reused for other load/store pairs.
842 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
843 CI.BaseOff = BaseOff * CI.EltSize;
844 CI.Offset = EltOffset0 - BaseOff;
845 Paired.Offset = EltOffset1 - BaseOff;
846 }
847 return true;
848 }
849
850 return false;
851}
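The "shift base address" branch above is easiest to see with numbers. In the sketch below the two element offsets are hand-picked, and BaseOff is the value the rule above would select (the most aligned value in the candidate range, with the low six bits of Min copied in); both final offsets land in the 8-bit, stride-64 form.

    // base_shift_sketch.cpp -- numeric walk-through of the base-shift branch
    // (example values chosen by hand; BaseOff follows the rule in the code above).
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t EltOffset0 = 0x12345, EltOffset1 = 0x12385; // neither fits in 8 bits
      uint32_t Min = EltOffset0, Max = EltOffset1;

      uint32_t Mask = 0xFFu * 64;          // maskTrailingOnes<uint32_t>(8) * 64
      std::printf("branch taken: %d\n", ((Max - Min) & ~Mask) == 0);

      uint32_t BaseOff = 0x10000;          // most aligned value in [Max - 0xff*64, Min]
      BaseOff |= Min & 0x3F;               // copy low 6 bits of Min -> 0x10005
      uint32_t Off0 = (EltOffset0 - BaseOff) / 64;  // 141, fits in 8 bits
      uint32_t Off1 = (EltOffset1 - BaseOff) / 64;  // 142, fits in 8 bits
      std::printf("BaseOff=0x%X Off0=%u Off1=%u\n", BaseOff, Off0, Off1);
      return 0;
    }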
852
853bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
854 const CombineInfo &CI,
855 const CombineInfo &Paired) {
856 const unsigned Width = (CI.Width + Paired.Width);
857 switch (CI.InstClass) {
858 default:
859 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
860 case S_BUFFER_LOAD_IMM:
861 switch (Width) {
862 default:
863 return false;
864 case 2:
865 case 4:
866 return true;
867 }
868 }
869}
870
871const TargetRegisterClass *
872SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
873 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
874 return TRI->getRegClassForReg(*MRI, Dst->getReg());
875 }
876 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
877 return TRI->getRegClassForReg(*MRI, Src->getReg());
878 }
879 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
880 return TRI->getRegClassForReg(*MRI, Src->getReg());
881 }
882 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
883 return TRI->getRegClassForReg(*MRI, Dst->getReg());
884 }
885 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
886 return TRI->getRegClassForReg(*MRI, Src->getReg());
887 }
888 return nullptr;
889}
890
891/// This function assumes that CI comes before Paired in a basic block.
892bool SILoadStoreOptimizer::checkAndPrepareMerge(
893 CombineInfo &CI, CombineInfo &Paired,
894 SmallVectorImpl<MachineInstr *> &InstsToMove) {
895
896 // Check both offsets (or masks for MIMG) can be combined and fit in the
897 // reduced range.
898 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
899 return false;
900
901 if (CI.InstClass != MIMG &&
902 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
903 return false;
904
905 const unsigned Opc = CI.I->getOpcode();
906 const InstClassEnum InstClass = getInstClass(Opc, *TII);
907
908 if (InstClass == UNKNOWN) {
909 return false;
910 }
911 const unsigned InstSubclass = getInstSubclass(Opc, *TII);
912
913 // Do not merge VMEM buffer instructions with "swizzled" bit set.
914 int Swizzled =
915 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
916 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
917 return false;
918
919 DenseSet<Register> RegDefsToMove;
920 DenseSet<Register> PhysRegUsesToMove;
921 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
922
923 const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
924 bool IsAGPR = TRI->hasAGPRs(DataRC);
925
926 MachineBasicBlock::iterator E = std::next(Paired.I);
927 MachineBasicBlock::iterator MBBI = std::next(CI.I);
928 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
929 for (; MBBI != E; ++MBBI) {
930
931 if (MBBI == MBBE) {
932 // CombineInfo::Order is a hint on the instruction ordering within the
933 // basic block. This hint suggests that CI precedes Paired, which is
934 // true most of the time. However, moveInstsAfter() processing a
935 // previous list may have changed this order in a situation when it
936 // moves an instruction which exists in some other merge list.
937 // In this case it must be dependent.
938 return false;
939 }
940
941 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
942 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
943 // This is not a matching instruction, but we can keep looking as
944 // long as one of these conditions is met:
945 // 1. It is safe to move I down past MBBI.
946 // 2. It is safe to move MBBI down past the instruction that I will
947 // be merged into.
948
949 if (MBBI->hasUnmodeledSideEffects()) {
950 // We can't re-order this instruction with respect to other memory
951 // operations, so we fail both conditions mentioned above.
952 return false;
953 }
954
955 if (MBBI->mayLoadOrStore() &&
956 (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
957 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
958 // We fail condition #1, but we may still be able to satisfy condition
959 // #2. Add this instruction to the move list and then we will check
960 // if condition #2 holds once we have selected the matching instruction.
961 InstsToMove.push_back(&*MBBI);
962 addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
963 continue;
964 }
965
966 // When we match I with another DS instruction we will be moving I down
967 // to the location of the matched instruction, so any uses of I will need
968 // to be moved down as well.
969 addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
970 InstsToMove);
971 continue;
972 }
973
974 // Don't merge volatiles.
975 if (MBBI->hasOrderedMemoryRef())
976 return false;
977
978 int Swizzled =
979 AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
980 if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
981 return false;
982
983 // Handle a case like
984 // DS_WRITE_B32 addr, v, idx0
985 // w = DS_READ_B32 addr, idx0
986 // DS_WRITE_B32 addr, f(w), idx1
987 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
988 // merging of the two writes.
989 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
990 InstsToMove))
991 continue;
992
993 if (&*MBBI == &*Paired.I) {
994 if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
995 return false;
996 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
997 // operands. However we are reporting that ds_write2 shall have
998 // only VGPR data so that machine copy propagation does not
999 // create an illegal instruction with VGPR and AGPR sources.
1000 // Consequently, if we create such an instruction the verifier
1001 // will complain.
1002 if (IsAGPR && CI.InstClass == DS_WRITE)
1003 return false;
1004
1005 // We need to go through the list of instructions that we plan to
1006 // move and make sure they are all safe to move down past the merged
1007 // instruction.
1008 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
1009
1010 // Call offsetsCanBeCombined with modify = true so that the offsets are
1011 // correct for the new instruction. This should return true, because
1012 // this function should only be called on CombineInfo objects that
1013 // have already been confirmed to be mergeable.
1014 if (CI.InstClass != MIMG)
1015 offsetsCanBeCombined(CI, *STM, Paired, true);
1016 return true;
1017 }
1018 return false;
1019 }
1020
1021 // We've found a load/store that we couldn't merge for some reason.
1022 // We could potentially keep looking, but we'd need to make sure that
1023 // it was safe to move I and also all the instructions in InstsToMove
1024 // down past this instruction.
1025 // Check if we can move I across MBBI and if we can move all I's users.
1026 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
1027 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
1028 break;
1029 }
1030 return false;
1031}
1032
1033unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1034 if (STM->ldsRequiresM0Init())
1035 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1036 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1037}
1038
1039unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1040 if (STM->ldsRequiresM0Init())
1041 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1042
1043 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1044 : AMDGPU::DS_READ2ST64_B64_gfx9;
1045}
1046
1047MachineBasicBlock::iterator
1048SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1049 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1050 MachineBasicBlock *MBB = CI.I->getParent();
1051
1052 // Be careful, since the addresses could be subregisters themselves in weird
1053 // cases, like vectors of pointers.
1054 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1055
1056 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1057 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1058
1059 unsigned NewOffset0 = CI.Offset;
1060 unsigned NewOffset1 = Paired.Offset;
1061 unsigned Opc =
1062 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1063
1064 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1065 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1066
1067 if (NewOffset0 > NewOffset1) {
1068 // Canonicalize the merged instruction so the smaller offset comes first.
1069 std::swap(NewOffset0, NewOffset1);
1070 std::swap(SubRegIdx0, SubRegIdx1);
1071 }
1072
1073 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1074 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1075
1076 const MCInstrDesc &Read2Desc = TII->get(Opc);
1077
1078 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1079 Register DestReg = MRI->createVirtualRegister(SuperRC);
1080
1081 DebugLoc DL = CI.I->getDebugLoc();
1082
1083 Register BaseReg = AddrReg->getReg();
1084 unsigned BaseSubReg = AddrReg->getSubReg();
1085 unsigned BaseRegFlags = 0;
1086 if (CI.BaseOff) {
1087 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1088 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1089 .addImm(CI.BaseOff);
1090
1091 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1092 BaseRegFlags = RegState::Kill;
1093
1094 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1095 .addReg(ImmReg)
1096 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1097 .addImm(0); // clamp bit
1098 BaseSubReg = 0;
1099 }
1100
1101 MachineInstrBuilder Read2 =
1102 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
1103 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1104 .addImm(NewOffset0) // offset0
1105 .addImm(NewOffset1) // offset1
1106 .addImm(0) // gds
1107 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1108
1109 (void)Read2;
1110
1111 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1112
1113 // Copy to the old destination registers.
1114 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1115 .add(*Dest0) // Copy to same destination including flags and sub reg.
1116 .addReg(DestReg, 0, SubRegIdx0);
1117 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1118 .add(*Dest1)
1119 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1120
1121 moveInstsAfter(Copy1, InstsToMove);
1122
1123 CI.I->eraseFromParent();
1124 Paired.I->eraseFromParent();
1125
1126 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1127 return Read2;
1128}
1129
1130unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1131 if (STM->ldsRequiresM0Init())
1132 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1133 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1134 : AMDGPU::DS_WRITE2_B64_gfx9;
1135}
1136
1137unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1138 if (STM->ldsRequiresM0Init())
1139 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1140 : AMDGPU::DS_WRITE2ST64_B64;
1141
1142 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1143 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1144}
1145
1146MachineBasicBlock::iterator
1147SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
1148 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1149 MachineBasicBlock *MBB = CI.I->getParent();
1150
1151 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1152 // sure we preserve the subregister index and any register flags set on them.
1153 const MachineOperand *AddrReg =
1154 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1155 const MachineOperand *Data0 =
1156 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1157 const MachineOperand *Data1 =
1158 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1159
1160 unsigned NewOffset0 = CI.Offset;
1161 unsigned NewOffset1 = Paired.Offset;
1162 unsigned Opc =
1163 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1164
1165 if (NewOffset0 > NewOffset1) {
1166 // Canonicalize the merged instruction so the smaller offset comes first.
1167 std::swap(NewOffset0, NewOffset1);
1168 std::swap(Data0, Data1);
1169 }
1170
1171 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1172 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1173
1174 const MCInstrDesc &Write2Desc = TII->get(Opc);
1175 DebugLoc DL = CI.I->getDebugLoc();
1176
1177 Register BaseReg = AddrReg->getReg();
1178 unsigned BaseSubReg = AddrReg->getSubReg();
1179 unsigned BaseRegFlags = 0;
1180 if (CI.BaseOff) {
1181 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1182 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1183 .addImm(CI.BaseOff);
1184
1185 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1186 BaseRegFlags = RegState::Kill;
1187
1188 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1189 .addReg(ImmReg)
1190 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1191 .addImm(0); // clamp bit
1192 BaseSubReg = 0;
1193 }
1194
1195 MachineInstrBuilder Write2 =
1196 BuildMI(*MBB, Paired.I, DL, Write2Desc)
1197 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1198 .add(*Data0) // data0
1199 .add(*Data1) // data1
1200 .addImm(NewOffset0) // offset0
1201 .addImm(NewOffset1) // offset1
1202 .addImm(0) // gds
1203 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1204
1205 moveInstsAfter(Write2, InstsToMove);
1206
1207 CI.I->eraseFromParent();
1208 Paired.I->eraseFromParent();
1209
1210 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1211 return Write2;
1212}
1213
1214MachineBasicBlock::iterator
1215SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1216 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1217 MachineBasicBlock *MBB = CI.I->getParent();
1218 DebugLoc DL = CI.I->getDebugLoc();
1219 const unsigned Opcode = getNewOpcode(CI, Paired);
1220
1221 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1222
1223 Register DestReg = MRI->createVirtualRegister(SuperRC);
1224 unsigned MergedDMask = CI.DMask | Paired.DMask;
1225 unsigned DMaskIdx =
1226 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1227
1228 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1229 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1230 if (I == DMaskIdx)
1231 MIB.addImm(MergedDMask);
1232 else
1233 MIB.add((*CI.I).getOperand(I));
1234 }
1235
1236 // It shouldn't be possible to get this far if the two instructions
1237 // don't have a single memoperand, because MachineInstr::mayAlias()
1238 // will return true if this is the case.
1239 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1240
1241 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1242 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1243
1244 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1245
1246 unsigned SubRegIdx0, SubRegIdx1;
1247 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1248
1249 // Copy to the old destination registers.
1250 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1251 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1252 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1253
1254 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1255 .add(*Dest0) // Copy to same destination including flags and sub reg.
1256 .addReg(DestReg, 0, SubRegIdx0);
1257 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1258 .add(*Dest1)
1259 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1260
1261 moveInstsAfter(Copy1, InstsToMove);
1262
1263 CI.I->eraseFromParent();
1264 Paired.I->eraseFromParent();
1265 return New;
1266}
1267
1268MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1269 CombineInfo &CI, CombineInfo &Paired,
1270 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1271 MachineBasicBlock *MBB = CI.I->getParent();
1272 DebugLoc DL = CI.I->getDebugLoc();
1273 const unsigned Opcode = getNewOpcode(CI, Paired);
1274
1275 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1276
1277 Register DestReg = MRI->createVirtualRegister(SuperRC);
1278 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1279
1280 // It shouldn't be possible to get this far if the two instructions
1281 // don't have a single memoperand, because MachineInstr::mayAlias()
1282 // will return true if this is the case.
1283 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1284
1285 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1286 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1287
1288 MachineInstr *New =
1289 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
1290 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1291 .addImm(MergedOffset) // offset
1292 .addImm(CI.CPol) // cpol
1293 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1294
1295 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1296 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1297 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1298
1299 // Copy to the old destination registers.
1300 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1301 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1302 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1303
1304 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1305 .add(*Dest0) // Copy to same destination including flags and sub reg.
1306 .addReg(DestReg, 0, SubRegIdx0);
1307 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1308 .add(*Dest1)
1309 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1310
1311 moveInstsAfter(Copy1, InstsToMove);
1312
1313 CI.I->eraseFromParent();
1314 Paired.I->eraseFromParent();
1315 return New;
1316}
1317
1318MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1319 CombineInfo &CI, CombineInfo &Paired,
1320 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1321 MachineBasicBlock *MBB = CI.I->getParent();
1322 DebugLoc DL = CI.I->getDebugLoc();
1323
1324 const unsigned Opcode = getNewOpcode(CI, Paired);
1325
1326 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1327
1328 // Copy to the new source register.
1329 Register DestReg = MRI->createVirtualRegister(SuperRC);
1330 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1331
1332 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1333
1334 AddressRegs Regs = getRegs(Opcode, *TII);
1335
1336 if (Regs.VAddr)
1337 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1338
1339 // It shouldn't be possible to get this far if the two instructions
1340 // don't have a single memoperand, because MachineInstr::mayAlias()
1341 // will return true if this is the case.
1342 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1343
1344 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1345 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1346
1347 MachineInstr *New =
1348 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1349 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1350 .addImm(MergedOffset) // offset
1351 .addImm(CI.CPol) // cpol
1352 .addImm(0) // tfe
1353 .addImm(0) // swz
1354 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1355
1356 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1357 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1358 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1359
1360 // Copy to the old destination registers.
1361 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1362 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1363 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1364
1365 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1366 .add(*Dest0) // Copy to same destination including flags and sub reg.
1367 .addReg(DestReg, 0, SubRegIdx0);
1368 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1369 .add(*Dest1)
1370 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1371
1372 moveInstsAfter(Copy1, InstsToMove);
1373
1374 CI.I->eraseFromParent();
1375 Paired.I->eraseFromParent();
1376 return New;
1377}
1378
1379MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1380 CombineInfo &CI, CombineInfo &Paired,
1381 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1382 MachineBasicBlock *MBB = CI.I->getParent();
1383 DebugLoc DL = CI.I->getDebugLoc();
1384
1385 const unsigned Opcode = getNewOpcode(CI, Paired);
1386
1387 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1388
1389 // Copy to the new source register.
1390 Register DestReg = MRI->createVirtualRegister(SuperRC);
1391 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1392
1393 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1394
1395 AddressRegs Regs = getRegs(Opcode, *TII);
1396
1397 if (Regs.VAddr)
1398 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1399
1400 unsigned JoinedFormat =
1401 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1402
1403 // It shouldn't be possible to get this far if the two instructions
1404 // don't have a single memoperand, because MachineInstr::mayAlias()
1405 // will return true if this is the case.
1406 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1407
1408 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1409 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1410
1411 MachineInstr *New =
1412 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1413 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1414 .addImm(MergedOffset) // offset
1415 .addImm(JoinedFormat) // format
1416 .addImm(CI.CPol) // cpol
1417 .addImm(0) // tfe
1418 .addImm(0) // swz
1419 .addMemOperand(
1420 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1421
1422 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1423 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1424 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1425
1426 // Copy to the old destination registers.
1427 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1428 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1429 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1430
1431 BuildMI(*MBB, Paired.I, DL, CopyDesc)
1432 .add(*Dest0) // Copy to same destination including flags and sub reg.
1433 .addReg(DestReg, 0, SubRegIdx0);
1434 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1435 .add(*Dest1)
1436 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1437
1438 moveInstsAfter(Copy1, InstsToMove);
1439
1440 CI.I->eraseFromParent();
1441 Paired.I->eraseFromParent();
1442 return New;
1443}
1444
1445MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1446 CombineInfo &CI, CombineInfo &Paired,
1447 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1448 MachineBasicBlock *MBB = CI.I->getParent();
1449 DebugLoc DL = CI.I->getDebugLoc();
1450
1451 const unsigned Opcode = getNewOpcode(CI, Paired);
1452
1453 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1454 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1455 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1456
1457 // Copy to the new source register.
1458 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1459 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1460
1461 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1462 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1463
1464 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1465 .add(*Src0)
1466 .addImm(SubRegIdx0)
1467 .add(*Src1)
1468 .addImm(SubRegIdx1);
1469
1470 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1471 .addReg(SrcReg, RegState::Kill);
1472
1473 AddressRegs Regs = getRegs(Opcode, *TII);
1474
1475 if (Regs.VAddr)
1476 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1477
1478 unsigned JoinedFormat =
1479 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1480
1481 // It shouldn't be possible to get this far if the two instructions
1482 // don't have a single memoperand, because MachineInstr::mayAlias()
1483 // will return true if this is the case.
1484 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1485
1486 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1487 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1488
1489 MachineInstr *New =
1490 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1491 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1492 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1493 .addImm(JoinedFormat) // format
1494 .addImm(CI.CPol) // cpol
1495 .addImm(0) // tfe
1496 .addImm(0) // swz
1497 .addMemOperand(
1498 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1499
1500 moveInstsAfter(MIB, InstsToMove);
1501
1502 CI.I->eraseFromParent();
1503 Paired.I->eraseFromParent();
1504 return New;
1505}
1506
1507unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1508 const CombineInfo &Paired) {
1509 const unsigned Width = CI.Width + Paired.Width;
1510
1511 switch (CI.InstClass) {
1512 default:
1513 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1514 // FIXME: Handle d16 correctly
1515 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1516 Width);
1517 case TBUFFER_LOAD:
1518 case TBUFFER_STORE:
1519 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1520 Width);
1521
1522 case UNKNOWN:
1523 llvm_unreachable("Unknown instruction class");
1524 case S_BUFFER_LOAD_IMM:
1525 switch (Width) {
1526 default:
1527 return 0;
1528 case 2:
1529 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1530 case 4:
1531 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1532 }
1533 case MIMG:
1534 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
1535 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1536 }
1537}
1538
1539std::pair<unsigned, unsigned>
1540SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
1541
1542 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
1543 return std::make_pair(0, 0);
1544
1545 bool ReverseOrder;
1546 if (CI.InstClass == MIMG) {
1547 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1548 "No overlaps");
1549 ReverseOrder = CI.DMask > Paired.DMask;
1550 } else
1551 ReverseOrder = CI.Offset > Paired.Offset;
1552
1553 static const unsigned Idxs[4][4] = {
1554 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1555 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1556 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1557 {AMDGPU::sub3, 0, 0, 0},
1558 };
1559 unsigned Idx0;
1560 unsigned Idx1;
1561
1562 assert(CI.Width >= 1 && CI.Width <= 3);
1563 assert(Paired.Width >= 1 && Paired.Width <= 3);
1564
1565 if (ReverseOrder) {
1566 Idx1 = Idxs[0][Paired.Width - 1];
1567 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1568 } else {
1569 Idx0 = Idxs[0][CI.Width - 1];
1570 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1571 }
1572
1573 return std::make_pair(Idx0, Idx1);
1574}
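A worked example of the Idxs table above, derived from the code as listed, may help when reading the merge functions that call getSubRegIdxs:

// CI.Width = 1, Paired.Width = 2, CI.Offset < Paired.Offset (no reversal):
//   Idx0 = Idxs[0][0] = AMDGPU::sub0,      Idx1 = Idxs[1][1] = AMDGPU::sub1_sub2
// Same widths but CI.Offset > Paired.Offset (ReverseOrder):
//   Idx1 = Idxs[0][1] = AMDGPU::sub0_sub1, Idx0 = Idxs[2][0] = AMDGPU::sub2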
1575
1576const TargetRegisterClass *
1577SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1578 const CombineInfo &Paired) {
1579 if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1580 switch (CI.Width + Paired.Width) {
1581 default:
1582 return nullptr;
1583 case 2:
1584 return &AMDGPU::SReg_64_XEXECRegClass;
1585 case 4:
1586 return &AMDGPU::SGPR_128RegClass;
1587 case 8:
1588 return &AMDGPU::SGPR_256RegClass;
1589 case 16:
1590 return &AMDGPU::SGPR_512RegClass;
1591 }
1592 }
1593
1594 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1595 return TRI->hasAGPRs(getDataRegClass(*CI.I))
1596 ? TRI->getAGPRClassForBitWidth(BitWidth)
1597 : TRI->getVGPRClassForBitWidth(BitWidth);
1598}
1599
1600MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1601 CombineInfo &CI, CombineInfo &Paired,
1602 const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1603 MachineBasicBlock *MBB = CI.I->getParent();
1604 DebugLoc DL = CI.I->getDebugLoc();
1605
1606 const unsigned Opcode = getNewOpcode(CI, Paired);
1607
1608 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1609 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1610 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1611
1612 // Copy to the new source register.
1613 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1614 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1615
1616 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1617 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1618
1619 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1620 .add(*Src0)
1621 .addImm(SubRegIdx0)
1622 .add(*Src1)
1623 .addImm(SubRegIdx1);
1624
1625 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1626 .addReg(SrcReg, RegState::Kill);
1627
1628 AddressRegs Regs = getRegs(Opcode, *TII);
1629
1630 if (Regs.VAddr)
1631 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1632
1633
1634 // It shouldn't be possible to get this far if the two instructions
1635 // don't have a single memoperand, because MachineInstr::mayAlias()
1636 // will return true if this is the case.
1637 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1638
1639 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1640 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1641
1642 MachineInstr *New =
1643 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1644 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1645 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1646 .addImm(CI.CPol) // cpol
1647 .addImm(0) // tfe
1648 .addImm(0) // swz
1649 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1650
1651 moveInstsAfter(MIB, InstsToMove);
1652
1653 CI.I->eraseFromParent();
1654 Paired.I->eraseFromParent();
1655 return New;
1656}
1657
1658MachineOperand
1659SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1660 APInt V(32, Val, true);
1661 if (TII->isInlineConstant(V))
1662 return MachineOperand::CreateImm(Val);
1663
1664 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1665 MachineInstr *Mov =
1666 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1667 TII->get(AMDGPU::S_MOV_B32), Reg)
1668 .addImm(Val);
1669 (void)Mov;
1670 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1671 return MachineOperand::CreateReg(Reg, false);
1672}
1673
1674// Compute base address using Addr and return the final register.
1675Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1676 const MemAddress &Addr) const {
1677 MachineBasicBlock *MBB = MI.getParent();
1678 MachineBasicBlock::iterator MBBI = MI.getIterator();
1679 DebugLoc DL = MI.getDebugLoc();
1680
1681 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1682 Addr.Base.LoSubReg) &&
1683 "Expected 32-bit Base-Register-Low!!");
1684
1685 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1686 Addr.Base.HiSubReg) &&
1687 "Expected 32-bit Base-Register-Hi!!");
1688
1689 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1690 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1691 MachineOperand OffsetHi =
1692 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1693
1694 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1695 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1696 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1697
1698 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1699 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1700 MachineInstr *LoHalf =
1701 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1702 .addReg(CarryReg, RegState::Define)
1703 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1704 .add(OffsetLo)
1705 .addImm(0); // clamp bit
1706 (void)LoHalf;
1707 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1708
1709 MachineInstr *HiHalf =
1710 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1711 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1712 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1713 .add(OffsetHi)
1714 .addReg(CarryReg, RegState::Kill)
1715 .addImm(0); // clamp bit
1716 (void)HiHalf;
1717 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1718
1719 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1720 MachineInstr *FullBase =
1721 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1722 .addReg(DestSub0)
1723 .addImm(AMDGPU::sub0)
1724 .addReg(DestSub1)
1725 .addImm(AMDGPU::sub1);
1726 (void)FullBase;
1727 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1728
1729 return FullDestReg;
1730}
1731
1732// Update base and offset with the NewBase and NewOffset in MI.
1733void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1734 Register NewBase,
1735 int32_t NewOffset) const {
1736 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1737 Base->setReg(NewBase);
1738 Base->setIsKill(false);
1739 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1740}
1741
1742Optional<int32_t>
1743SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1744 if (Op.isImm())
1745 return Op.getImm();
1746
1747 if (!Op.isReg())
1748 return None;
1749
1750 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1751 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1752 !Def->getOperand(1).isImm())
1753 return None;
1754
1755 return Def->getOperand(1).getImm();
1756}
1757
1758// Analyze Base and extracts:
1759// - 32bit base registers, subregisters
1760// - 64bit constant offset
1761// Expecting base computation as:
1762// %OFFSET0:sgpr_32 = S_MOV_B32 8000
1763// %LO:vgpr_32, %c:sreg_64_xexec =
1764// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1765// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1766// %Base:vreg_64 =
1767// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1768void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1769 MemAddress &Addr) const {
1770 if (!Base.isReg())
1771 return;
1772
1773 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1774 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1775 || Def->getNumOperands() != 5)
1776 return;
1777
1778 MachineOperand BaseLo = Def->getOperand(1);
1779 MachineOperand BaseHi = Def->getOperand(3);
1780 if (!BaseLo.isReg() || !BaseHi.isReg())
1781 return;
1782
1783 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1784 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1785
1786 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1787 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1788 return;
1789
1790 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1791 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1792
1793 auto Offset0P = extractConstOffset(*Src0);
1794 if (Offset0P)
1795 BaseLo = *Src1;
1796 else {
1797 if (!(Offset0P = extractConstOffset(*Src1)))
1798 return;
1799 BaseLo = *Src0;
1800 }
1801
1802 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1803 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1804
1805 if (Src0->isImm())
1806 std::swap(Src0, Src1);
1807
1808 if (!Src1->isImm())
1809 return;
1810
1811 uint64_t Offset1 = Src1->getImm();
1812 BaseHi = *Src0;
1813
1814 Addr.Base.LoReg = BaseLo.getReg();
1815 Addr.Base.HiReg = BaseHi.getReg();
1816 Addr.Base.LoSubReg = BaseLo.getSubReg();
1817 Addr.Base.HiSubReg = BaseHi.getSubReg();
1818 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1819}
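As a sketch, tracing the expected pattern from the comment above through this function: the low add's second source is defined by S_MOV_B32 8000, so extractConstOffset(*Src1) yields 8000 and BaseLo becomes %BASE_LO; the high add carries the immediate 0, so Offset1 = 0 and BaseHi = %BASE_HI; the function then records those registers and computes Addr.Offset = (8000 & 0xffffffff) | (0 << 32) = 8000.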
1820
1821bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1822 MachineInstr &MI,
1823 MemInfoMap &Visited,
1824 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1825
1826 if (!(MI.mayLoad() ^ MI.mayStore()))
1827 return false;
1828
1829 // TODO: Support flat and scratch.
1830 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1831 return false;
1832
1833 if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1834 return false;
1835
1836 if (AnchorList.count(&MI))
1837 return false;
1838
1839 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1840
1841 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1842 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1843 return false;
1844 }
1845
1846 // Step1: Find the base-registers and a 64bit constant offset.
1847 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1848 MemAddress MAddr;
1849 if (Visited.find(&MI) == Visited.end()) {
1850 processBaseWithConstOffset(Base, MAddr);
1851 Visited[&MI] = MAddr;
1852 } else
1853 MAddr = Visited[&MI];
1854
1855 if (MAddr.Offset == 0) {
1856 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1857 " constant offsets that can be promoted.\n";);
1858 return false;
1859 }
1860
1861 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1862 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1863
1864 // Step2: Traverse through MI's basic block and find an anchor(that has the
1865 // same base-registers) with the highest 13bit distance from MI's offset.
1866 // E.g. (64bit loads)
1867 // bb:
1868 // addr1 = &a + 4096; load1 = load(addr1, 0)
1869 // addr2 = &a + 6144; load2 = load(addr2, 0)
1870 // addr3 = &a + 8192; load3 = load(addr3, 0)
1871 // addr4 = &a + 10240; load4 = load(addr4, 0)
1872 // addr5 = &a + 12288; load5 = load(addr5, 0)
1873 //
1874 // Starting from the first load, the optimization will try to find a new base
1875 // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
1876 // have 13-bit distance from &a + 4096. The heuristic considers &a + 8192
1877 // as the new base (anchor) because of the maximum distance, which can
1878 // accommodate more intermediate bases, presumably.
1879 //
1880 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1881 // (&a + 8192) for load1, load2, load4.
1882 // addr = &a + 8192
1883 // load1 = load(addr, -4096)
1884 // load2 = load(addr, -2048)
1885 // load3 = load(addr, 0)
1886 // load4 = load(addr, 2048)
1887 // addr5 = &a + 12288; load5 = load(addr5, 0)
1888 //
1889 MachineInstr *AnchorInst = nullptr;
1890 MemAddress AnchorAddr;
1891 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1892 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1893
1894 MachineBasicBlock *MBB = MI.getParent();
1895 MachineBasicBlock::iterator E = MBB->end();
1896 MachineBasicBlock::iterator MBBI = MI.getIterator();
1897 ++MBBI;
1898 const SITargetLowering *TLI =
1899 static_cast<const SITargetLowering *>(STM->getTargetLowering());
1900
1901 for ( ; MBBI != E; ++MBBI) {
1902 MachineInstr &MINext = *MBBI;
1903 // TODO: Support finding an anchor(with same base) from store addresses or
1904 // any other load addresses where the opcodes are different.
1905 if (MINext.getOpcode() != MI.getOpcode() ||
1906 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1907 continue;
1908
1909 const MachineOperand &BaseNext =
1910 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1911 MemAddress MAddrNext;
1912 if (Visited.find(&MINext) == Visited.end()) {
1913 processBaseWithConstOffset(BaseNext, MAddrNext);
1914 Visited[&MINext] = MAddrNext;
1915 } else
1916 MAddrNext = Visited[&MINext];
1917
1918 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1919 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1920 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1921 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1922 continue;
1923
1924 InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1925
1926 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1927 TargetLoweringBase::AddrMode AM;
1928 AM.HasBaseReg = true;
1929 AM.BaseOffs = Dist;
1930 if (TLI->isLegalGlobalAddressingMode(AM) &&
1931 (uint32_t)std::abs(Dist) > MaxDist) {
1932 MaxDist = std::abs(Dist);
1933
1934 AnchorAddr = MAddrNext;
1935 AnchorInst = &MINext;
1936 }
1937 }
1938
1939 if (AnchorInst) {
1940 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1941 AnchorInst->dump());
1942 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1943 << AnchorAddr.Offset << "\n\n");
1944
1945 // Instead of moving up, just re-compute anchor-instruction's base address.
1946 Register Base = computeBase(MI, AnchorAddr);
1947
1948 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1949 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1950
1951 for (auto P : InstsWCommonBase) {
1952 TargetLoweringBase::AddrMode AM;
1953 AM.HasBaseReg = true;
1954 AM.BaseOffs = P.second - AnchorAddr.Offset;
1955
1956 if (TLI->isLegalGlobalAddressingMode(AM)) {
1957 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1958 dbgs() << ")"; P.first->dump());
1959 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1960 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1961 }
1962 }
1963 AnchorList.insert(AnchorInst);
1964 return true;
1965 }
1966
1967 return false;
1968}
1969
1970void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1971 std::list<std::list<CombineInfo> > &MergeableInsts) const {
1972 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1973 if (AddrList.front().InstClass == CI.InstClass &&
1974 AddrList.front().hasSameBaseAddress(*CI.I)) {
1975 AddrList.emplace_back(CI);
1976 return;
1977 }
1978 }
1979
1980 // Base address not found, so add a new list.
1981 MergeableInsts.emplace_back(1, CI);
1982}
1983
1984std::pair<MachineBasicBlock::iterator, bool>
1985SILoadStoreOptimizer::collectMergeableInsts(
1986 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
1987 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
1988 std::list<std::list<CombineInfo>> &MergeableInsts) const {
1989 bool Modified = false;
1990
1991 // Sort potential mergeable instructions into lists. One list per base address.
1992 unsigned Order = 0;
1993 MachineBasicBlock::iterator BlockI = Begin;
1994 for (; BlockI != End; ++BlockI) {
1995 MachineInstr &MI = *BlockI;
1996
1997 // We run this before checking if an address is mergeable, because it can produce
1998 // better code even if the instructions aren't mergeable.
1999 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2000 Modified = true;
2001
2002 // Don't combine if volatile. We also won't be able to merge across this, so
2003 // break the search. We can look after this barrier for separate merges.
2004 if (MI.hasOrderedMemoryRef()) {
2005 LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
2006
2007 // Search will resume after this instruction in a separate merge list.
2008 ++BlockI;
2009 break;
2010 }
2011
2012 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2013 if (InstClass == UNKNOWN)
2014 continue;
2015
2016 CombineInfo CI;
2017 CI.setMI(MI, *TII, *STM);
2018 CI.Order = Order++;
2019
2020 if (!CI.hasMergeableAddress(*MRI))
2021 continue;
2022
2023 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2024
2025 addInstToMergeableList(CI, MergeableInsts);
2026 }
2027
2028 // At this point we have lists of Mergeable instructions.
2029 //
2030 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2031 // list try to find an instruction that can be merged with I. If an instruction
2032 // is found, it is stored in the Paired field. If no instructions are found, then
2033 // the CombineInfo object is deleted from the list.
2034
2035 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2036 E = MergeableInsts.end(); I != E;) {
2037
2038 std::list<CombineInfo> &MergeList = *I;
2039 if (MergeList.size() <= 1) {
2040 // This means we have found only one instruction with a given address
2041 // that can be merged, and we need at least 2 instructions to do a merge,
2042 // so this list can be discarded.
2043 I = MergeableInsts.erase(I);
2044 continue;
2045 }
2046
2047 // Sort the lists by offsets, this way mergeable instructions will be
2048 // adjacent to each other in the list, which will make it easier to find
2049 // matches.
2050 MergeList.sort(
2051 [] (const CombineInfo &A, CombineInfo &B) {
2052 return A.Offset < B.Offset;
2053 });
2054 ++I;
2055 }
2056
2057 return std::make_pair(BlockI, Modified);
2058}
2059
2060// Scan through looking for adjacent LDS operations with constant offsets from
2061// the same base register. We rely on the scheduler to do the hard work of
2062// clustering nearby loads, and assume these are all adjacent.
2063bool SILoadStoreOptimizer::optimizeBlock(
2064 std::list<std::list<CombineInfo> > &MergeableInsts) {
2065 bool Modified = false;
2066
2067 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2068 E = MergeableInsts.end(); I != E;) {
2069 std::list<CombineInfo> &MergeList = *I;
2070
2071 bool OptimizeListAgain = false;
2072 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2073 // We weren't able to make any changes, so delete the list so we don't
2074 // process the same instructions the next time we try to optimize this
2075 // block.
2076 I = MergeableInsts.erase(I);
2077 continue;
2078 }
2079
2080 Modified = true;
2081
2082 // We made changes, but also determined that there were no more optimization
2083 // opportunities, so we don't need to reprocess the list
2084 if (!OptimizeListAgain) {
2085 I = MergeableInsts.erase(I);
2086 continue;
2087 }
2088 OptimizeAgain = true;
2089 }
2090 return Modified;
2091}
2092
2093bool
2094SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2095 std::list<CombineInfo> &MergeList,
2096 bool &OptimizeListAgain) {
2097 if (MergeList.empty())
2098 return false;
2099
2100 bool Modified = false;
2101
2102 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2103 Next = std::next(I)) {
2104
2105 auto First = I;
2106 auto Second = Next;
2107
2108 if ((*First).Order > (*Second).Order)
2109 std::swap(First, Second);
2110 CombineInfo &CI = *First;
2111 CombineInfo &Paired = *Second;
2112
2113 SmallVector<MachineInstr *, 8> InstsToMove;
2114 if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
2115 ++I;
2116 continue;
2117 }
2118
2119 Modified = true;
2120
2121 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2122
2123 switch (CI.InstClass) {
2124 default:
2125 llvm_unreachable("unknown InstClass");
2126 break;
2127 case DS_READ: {
2128 MachineBasicBlock::iterator NewMI =
2129 mergeRead2Pair(CI, Paired, InstsToMove);
2130 CI.setMI(NewMI, *TII, *STM);
2131 break;
2132 }
2133 case DS_WRITE: {
2134 MachineBasicBlock::iterator NewMI =
2135 mergeWrite2Pair(CI, Paired, InstsToMove);
2136 CI.setMI(NewMI, *TII, *STM);
2137 break;
2138 }
2139 case S_BUFFER_LOAD_IMM: {
2140 MachineBasicBlock::iterator NewMI =
2141 mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
2142 CI.setMI(NewMI, *TII, *STM);
2143 OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
2144 break;
2145 }
2146 case BUFFER_LOAD: {
2147 MachineBasicBlock::iterator NewMI =
2148 mergeBufferLoadPair(CI, Paired, InstsToMove);
2149 CI.setMI(NewMI, *TII, *STM);
2150 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2151 break;
2152 }
2153 case BUFFER_STORE: {
2154 MachineBasicBlock::iterator NewMI =
2155 mergeBufferStorePair(CI, Paired, InstsToMove);
2156 CI.setMI(NewMI, *TII, *STM);
2157 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2158 break;
2159 }
2160 case MIMG: {
2161 MachineBasicBlock::iterator NewMI =
2162 mergeImagePair(CI, Paired, InstsToMove);
2163 CI.setMI(NewMI, *TII, *STM);
2164 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2165 break;
2166 }
2167 case TBUFFER_LOAD: {
2168 MachineBasicBlock::iterator NewMI =
2169 mergeTBufferLoadPair(CI, Paired, InstsToMove);
2170 CI.setMI(NewMI, *TII, *STM);
2171 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2172 break;
2173 }
2174 case TBUFFER_STORE: {
2175 MachineBasicBlock::iterator NewMI =
2176 mergeTBufferStorePair(CI, Paired, InstsToMove);
2177 CI.setMI(NewMI, *TII, *STM);
2178 OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2179 break;
2180 }
2181 }
2182 CI.Order = Paired.Order;
2183 if (I == Second)
2184 I = Next;
2185
2186 MergeList.erase(Second);
2187 }
2188
2189 return Modified;
2190}
2191
2192bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2193 if (skipFunction(MF.getFunction()))
2194 return false;
2195
2196 STM = &MF.getSubtarget<GCNSubtarget>();
2197 if (!STM->loadStoreOptEnabled())
2198 return false;
2199
2200 TII = STM->getInstrInfo();
2201 TRI = &TII->getRegisterInfo();
2202
2203 MRI = &MF.getRegInfo();
2204 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2205
2206 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2207
2208 bool Modified = false;
2209
2210 // Contains the list of instructions for which constant offsets are being
2211 // promoted to the IMM. This is tracked for an entire block at a time.
2212 SmallPtrSet<MachineInstr *, 4> AnchorList;
2213 MemInfoMap Visited;
2214
2215 for (MachineBasicBlock &MBB : MF) {
2216 MachineBasicBlock::iterator SectionEnd;
2217 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2218 I = SectionEnd) {
2219 bool CollectModified;
2220 std::list<std::list<CombineInfo>> MergeableInsts;
2221
2222 // First pass: Collect list of all instructions we know how to merge in a
2223 // subset of the block.
2224 std::tie(SectionEnd, CollectModified) =
2225 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2226
2227 Modified |= CollectModified;
2228
2229 do {
2230 OptimizeAgain = false;
2231 Modified |= optimizeBlock(MergeableInsts);
2232 } while (OptimizeAgain);
2233 }
2234
2235 Visited.clear();
2236 AnchorList.clear();
2237 }
2238
2239 return Modified;
2240}

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
117 return 32;
118
119#if __has_builtin(__builtin_ctz) || defined(__GNUC__)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll) || defined(__GNUC__)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
161}
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz) || defined(__GNUC__)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll) || defined(__GNUC__)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT * sizeof(T);
251 assert(N <= Bits && "Invalid bit index");
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
2.1: 'N' is not equal to 0
3: '?' condition is false
4: The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
253}
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
2: Calling 'maskTrailingOnes<unsigned int>'
259}
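To make the analyzer's numbers concrete: a shift count of 33 can only arise here when maskTrailingOnes<unsigned int> is reached with a wrapped N, and the assert guarding N is compiled out in this build (it expands to ((void)0)). Below is a minimal standalone sketch, assuming a hypothetical caller that asks for one bit more than the type holds; the real call chain in SILoadStoreOptimizer.cpp is not shown in this excerpt.

#include <climits>
#include <cstdint>

template <typename T> T maskTrailingOnesSketch(unsigned N) {
  const unsigned Bits = CHAR_BIT * sizeof(T);
  // With N == 0xffffffffu (an unsigned wrap in the caller), Bits - N is 33 for
  // a 32-bit T, and shifting a 32-bit value right by 33 is undefined behavior.
  return N == 0 ? 0 : (T(-1) >> (Bits - N));
}

template <typename T> T maskLeadingOnesSketch(unsigned N) {
  // Passing N == 33 for a 32-bit T wraps the argument below to 0xffffffffu.
  return ~maskTrailingOnesSketch<T>(CHAR_BIT * sizeof(T) - N);
}

int main() {
  // Hypothetical trigger matching the analyzer's report: a width one past the
  // type size reaches the shift above with a count of 33.
  return static_cast<int>(maskLeadingOnesSketch<uint32_t>(33) & 1u);
}

Building this sketch with -fsanitize=undefined should flag the same out-of-range shift exponent at run time.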
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
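A quick check of the XOR trick above: for a nonzero 32-bit value, countLeadingZeros is always in [0, 31], and 31 is all ones in those five bits, so XOR with digits - 1 equals the subtraction. For example, findLastSet(0x10u) computes countLeadingZeros(0x10u) == 27 and 27 ^ 31 == 4, the index of the highest set bit.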
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1) << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if a unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range");
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range");
441
442 return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range");
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1) << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if an signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
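// Editor's sketch, not part of MathExtras.h: the four bit-cast helpers above
// round-trip exactly (modulo the NaN caveat in their comments). The function
// name is hypothetical; the hex constants are the IEEE-754 encodings of 1.0.
inline void exampleBitCasts() {
  uint32_t FB = FloatToBits(1.0f);  // == 0x3F800000
  float F = BitsToFloat(FB);        // == 1.0f
  uint64_t DB = DoubleToBits(1.0);  // == 0x3FF0000000000000
  double D = BitsToDouble(DB);      // == 1.0
  (void)F;
  (void)D;
}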
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
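// Editor's sketch, not part of MathExtras.h: MinAlign is constexpr, and it
// returns the largest power of two dividing both arguments, as noted above.
static_assert(MinAlign(8, 12) == 4, "8 = 2^3, 12 = 2^2 * 3");
static_assert(MinAlign(32, 48) == 16, "32 = 2^5, 48 = 2^4 * 3");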
680
681/// Returns the next power of two (in 64 bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
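// Editor's sketch, not part of MathExtras.h: contrasting the three
// power-of-two helpers above. The function name is hypothetical.
inline void examplePowerOfTwoHelpers() {
  uint64_t Next = NextPowerOf2(32);   // == 64, strictly greater
  uint64_t Floor = PowerOf2Floor(33); // == 32
  uint64_t Ceil = PowerOf2Ceil(32);   // == 32, already a power of two
  (void)Next;
  (void)Floor;
  (void)Ceil;
}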
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.");
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t that is less than or equal to \p Value and
752/// is equal to \p Skew mod \p Align. \p Align must be non-zero.
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.");
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
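// Editor's sketch, not part of MathExtras.h: alignTo/alignDown round in
// opposite directions, and divideCeil/divideNearest are thin wrappers over
// the same arithmetic. The function name is hypothetical.
inline void exampleAlignmentHelpers() {
  uint64_t Up = alignTo(17, 8);          // == 24
  uint64_t UpSkewed = alignTo(17, 8, 1); // == 17, already 8*2 + 1
  uint64_t Down = alignDown(17, 8);      // == 16
  uint64_t Ceil = divideCeil(17, 8);     // == 3
  uint64_t Near = divideNearest(17, 8);  // == 2
  (void)Up; (void)UpSkewed; (void)Down; (void)Ceil; (void)Near;
}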
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 32 && "Bit width out of range.");
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.");
787 assert(B <= 64 && "Bit width out of range.");
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
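// Editor's sketch, not part of MathExtras.h: sign-extension from a narrow
// bit-field, using both the compile-time and run-time width forms above. The
// function name is hypothetical.
inline void exampleSignExtend() {
  int32_t A = SignExtend32<4>(0xF);  // == -1, 0b1111 as a 4-bit value
  int32_t B = SignExtend32(0x8, 4);  // == -8, 0b1000 as a 4-bit value
  int64_t C = SignExtend64<8>(0x80); // == -128
  (void)A;
  (void)B;
  (void)C;
}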
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
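// Editor's sketch, not part of MathExtras.h: SaturatingAdd clamps instead of
// wrapping and reports overflow through the optional out-parameter. The
// function name is hypothetical; results assume 32-bit unsigned int.
inline void exampleSaturatingAdd() {
  bool Overflowed = false;
  uint32_t Clamped = SaturatingAdd(4000000000u, 1000000000u, &Overflowed);
  // Clamped == 4294967295 (UINT32_MAX), Overflowed == true.
  uint32_t Exact = SaturatingAdd(7u, 8u); // == 15, no overflow
  (void)Clamped;
  (void)Exact;
}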
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
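// Editor's sketch, not part of MathExtras.h: the multiply variants follow the
// same clamping convention as SaturatingAdd above. The function name is
// hypothetical; results assume 32-bit unsigned int.
inline void exampleSaturatingMultiply() {
  bool Overflowed = false;
  uint32_t Clamped = SaturatingMultiply(100000u, 100000u, &Overflowed);
  // Clamped == UINT32_MAX, Overflowed == true (10^10 does not fit in 32 bits).
  uint32_t Fused = SaturatingMultiplyAdd(70000u, 60000u, 5u);
  // Fused == 4200000005, still representable, so no clamping.
  (void)Clamped;
  (void)Fused;
}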
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occurred.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow occurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
911#if __has_builtin(__builtin_sub_overflow)
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned subtraction.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow occurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
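// Editor's sketch, not part of MathExtras.h: the checked signed helpers above
// write the truncated two's complement result and return non-zero on
// overflow. The function name is hypothetical.
inline void exampleCheckedSignedArithmetic() {
  int32_t Result = 0;
  bool AddOvf =
      AddOverflow(std::numeric_limits<int32_t>::max(), int32_t(1), Result);
  // AddOvf == true, Result wrapped to INT32_MIN.
  bool SubOvf =
      SubOverflow(std::numeric_limits<int32_t>::min(), int32_t(1), Result);
  // SubOvf == true, Result wrapped to INT32_MAX.
  bool MulOvf = MulOverflow(int32_t(65536), int32_t(65536), Result);
  // MulOvf == true; 2^32 does not fit in int32_t, Result == 0 after truncation.
  (void)AddOvf;
  (void)SubOvf;
  (void)MulOvf;
}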
959
960} // End llvm namespace
961
962#endif