Bug Summary

File: src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning: line 2568, column 5
Value stored to 'Pred' is never read
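
The warning comes from the analyzer's dead-store check (the 'deadcode'
checker group enabled in the invocation below): a value is written to
'Pred' and then overwritten or discarded before it is ever read. A minimal
sketch of the flagged pattern, using hypothetical names rather than the
actual code at line 2568:

  ICmpInst::Predicate Pred = ICmpInst::ICMP_EQ; // this store is never read
  Pred = Cmp->getPredicate();                   // overwritten before any use

The usual fix is to drop the redundant initial assignment or to declare the
variable at its first real use.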

Annotated Source Code

clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -D PIC -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -D_RET_PROTECTOR -ret-protector -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize:
24// memcmp, strlen, etc.
25// Future floating point idioms to recognize in -ffast-math mode:
26// fpowi
27// Future integer operation idioms to recognize:
28// ctpop
29//
30// Beware that isel's default lowering for ctpop is highly inefficient for
31// i64 and larger types when i64 is legal and the value has few bits set. It
32// would be good to enhance isel to emit a loop for ctpop in this case.
33//
34// This could recognize common matrix multiplies and dot product idioms and
35// replace them with calls to BLAS (if linked in??).
36//
37//===----------------------------------------------------------------------===//
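
// An illustrative sketch (hypothetical code, not part of this file) of the
// transformation this pass performs: a byte-splattable strided-store loop in
// the source becomes a single library call.
#include <cstring>
void zeroFillSketch(unsigned char *p, unsigned long n) {
  for (unsigned long i = 0; i != n; ++i)
    p[i] = 0; // strided store of a byte-splattable, loop-invariant value
  // After loop idiom recognition the loop is, in effect, std::memset(p, 0, n).
}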
38
39#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40#include "llvm/ADT/APInt.h"
41#include "llvm/ADT/ArrayRef.h"
42#include "llvm/ADT/DenseMap.h"
43#include "llvm/ADT/MapVector.h"
44#include "llvm/ADT/SetVector.h"
45#include "llvm/ADT/SmallPtrSet.h"
46#include "llvm/ADT/SmallVector.h"
47#include "llvm/ADT/Statistic.h"
48#include "llvm/ADT/StringRef.h"
49#include "llvm/Analysis/AliasAnalysis.h"
50#include "llvm/Analysis/CmpInstAnalysis.h"
51#include "llvm/Analysis/LoopAccessAnalysis.h"
52#include "llvm/Analysis/LoopInfo.h"
53#include "llvm/Analysis/LoopPass.h"
54#include "llvm/Analysis/MemoryLocation.h"
55#include "llvm/Analysis/MemorySSA.h"
56#include "llvm/Analysis/MemorySSAUpdater.h"
57#include "llvm/Analysis/MustExecute.h"
58#include "llvm/Analysis/OptimizationRemarkEmitter.h"
59#include "llvm/Analysis/ScalarEvolution.h"
60#include "llvm/Analysis/ScalarEvolutionExpressions.h"
61#include "llvm/Analysis/TargetLibraryInfo.h"
62#include "llvm/Analysis/TargetTransformInfo.h"
63#include "llvm/Analysis/ValueTracking.h"
64#include "llvm/IR/Attributes.h"
65#include "llvm/IR/BasicBlock.h"
66#include "llvm/IR/Constant.h"
67#include "llvm/IR/Constants.h"
68#include "llvm/IR/DataLayout.h"
69#include "llvm/IR/DebugLoc.h"
70#include "llvm/IR/DerivedTypes.h"
71#include "llvm/IR/Dominators.h"
72#include "llvm/IR/GlobalValue.h"
73#include "llvm/IR/GlobalVariable.h"
74#include "llvm/IR/IRBuilder.h"
75#include "llvm/IR/InstrTypes.h"
76#include "llvm/IR/Instruction.h"
77#include "llvm/IR/Instructions.h"
78#include "llvm/IR/IntrinsicInst.h"
79#include "llvm/IR/Intrinsics.h"
80#include "llvm/IR/LLVMContext.h"
81#include "llvm/IR/Module.h"
82#include "llvm/IR/PassManager.h"
83#include "llvm/IR/PatternMatch.h"
84#include "llvm/IR/Type.h"
85#include "llvm/IR/User.h"
86#include "llvm/IR/Value.h"
87#include "llvm/IR/ValueHandle.h"
88#include "llvm/InitializePasses.h"
89#include "llvm/Pass.h"
90#include "llvm/Support/Casting.h"
91#include "llvm/Support/CommandLine.h"
92#include "llvm/Support/Debug.h"
93#include "llvm/Support/InstructionCost.h"
94#include "llvm/Support/raw_ostream.h"
95#include "llvm/Transforms/Scalar.h"
96#include "llvm/Transforms/Utils/BuildLibCalls.h"
97#include "llvm/Transforms/Utils/Local.h"
98#include "llvm/Transforms/Utils/LoopUtils.h"
99#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
100#include <algorithm>
101#include <cassert>
102#include <cstdint>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107
108#define DEBUG_TYPE "loop-idiom"
109
110STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
111STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
112STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
113STATISTIC(
114 NumShiftUntilBitTest,
115 "Number of uncountable loops recognized as 'shift until bitttest' idiom");
116STATISTIC(NumShiftUntilZero,
117 "Number of uncountable loops recognized as 'shift until zero' idiom");
118
119bool DisableLIRP::All;
120static cl::opt<bool, true>
121 DisableLIRPAll("disable-" DEBUG_TYPE "-all",
122 cl::desc("Options to disable Loop Idiom Recognize Pass."),
123 cl::location(DisableLIRP::All), cl::init(false),
124 cl::ReallyHidden);
125
126bool DisableLIRP::Memset;
127static cl::opt<bool, true>
128 DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
129 cl::desc("Proceed with loop idiom recognize pass, but do "
130 "not convert loop(s) to memset."),
131 cl::location(DisableLIRP::Memset), cl::init(false),
132 cl::ReallyHidden);
133
134bool DisableLIRP::Memcpy;
135static cl::opt<bool, true>
136 DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
137 cl::desc("Proceed with loop idiom recognize pass, but do "
138 "not convert loop(s) to memcpy."),
139 cl::location(DisableLIRP::Memcpy), cl::init(false),
140 cl::ReallyHidden);
141
142static cl::opt<bool> UseLIRCodeSizeHeurs(
143 "use-lir-code-size-heurs",
144 cl::desc("Use loop idiom recognition code size heuristics when compiling "
145 "with -Os/-Oz"),
146 cl::init(true), cl::Hidden);
147
148namespace {
149
150class LoopIdiomRecognize {
151 Loop *CurLoop = nullptr;
152 AliasAnalysis *AA;
153 DominatorTree *DT;
154 LoopInfo *LI;
155 ScalarEvolution *SE;
156 TargetLibraryInfo *TLI;
157 const TargetTransformInfo *TTI;
158 const DataLayout *DL;
159 OptimizationRemarkEmitter &ORE;
160 bool ApplyCodeSizeHeuristics;
161 std::unique_ptr<MemorySSAUpdater> MSSAU;
162
163public:
164 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
165 LoopInfo *LI, ScalarEvolution *SE,
166 TargetLibraryInfo *TLI,
167 const TargetTransformInfo *TTI, MemorySSA *MSSA,
168 const DataLayout *DL,
169 OptimizationRemarkEmitter &ORE)
170 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
171 if (MSSA)
172 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
173 }
174
175 bool runOnLoop(Loop *L);
176
177private:
178 using StoreList = SmallVector<StoreInst *, 8>;
179 using StoreListMap = MapVector<Value *, StoreList>;
180
181 StoreListMap StoreRefsForMemset;
182 StoreListMap StoreRefsForMemsetPattern;
183 StoreList StoreRefsForMemcpy;
184 bool HasMemset;
185 bool HasMemsetPattern;
186 bool HasMemcpy;
187
188 /// Return code for isLegalStore()
189 enum LegalStoreKind {
190 None = 0,
191 Memset,
192 MemsetPattern,
193 Memcpy,
194 UnorderedAtomicMemcpy,
195 DontUse // Dummy retval never to be used. Allows catching errors in retval
196 // handling.
197 };
198
199 /// \name Countable Loop Idiom Handling
200 /// @{
201
202 bool runOnCountableLoop();
203 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
204 SmallVectorImpl<BasicBlock *> &ExitBlocks);
205
206 void collectStores(BasicBlock *BB);
207 LegalStoreKind isLegalStore(StoreInst *SI);
208 enum class ForMemset { No, Yes };
209 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
210 ForMemset For);
211
212 template <typename MemInst>
213 bool processLoopMemIntrinsic(
214 BasicBlock *BB,
215 bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
216 const SCEV *BECount);
217 bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
218 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
219
220 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
221 MaybeAlign StoreAlignment, Value *StoredVal,
222 Instruction *TheStore,
223 SmallPtrSetImpl<Instruction *> &Stores,
224 const SCEVAddRecExpr *Ev, const SCEV *BECount,
225 bool NegStride, bool IsLoopMemset = false);
226 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
227 bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
228 unsigned StoreSize, MaybeAlign StoreAlign,
229 MaybeAlign LoadAlign, Instruction *TheStore,
230 Instruction *TheLoad,
231 const SCEVAddRecExpr *StoreEv,
232 const SCEVAddRecExpr *LoadEv,
233 const SCEV *BECount);
234 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
235 bool IsLoopMemset = false);
236
237 /// @}
238 /// \name Noncountable Loop Idiom Handling
239 /// @{
240
241 bool runOnNoncountableLoop();
242
243 bool recognizePopcount();
244 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
245 PHINode *CntPhi, Value *Var);
246 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
247 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
248 Instruction *CntInst, PHINode *CntPhi,
249 Value *Var, Instruction *DefX,
250 const DebugLoc &DL, bool ZeroCheck,
251 bool IsCntPhiUsedOutsideLoop);
252
253 bool recognizeShiftUntilBitTest();
254 bool recognizeShiftUntilZero();
255
256 /// @}
257};
258
259class LoopIdiomRecognizeLegacyPass : public LoopPass {
260public:
261 static char ID;
262
263 explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
264 initializeLoopIdiomRecognizeLegacyPassPass(
265 *PassRegistry::getPassRegistry());
266 }
267
268 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
269 if (DisableLIRP::All)
270 return false;
271
272 if (skipLoop(L))
273 return false;
274
275 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
276 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
277 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
278 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
279 TargetLibraryInfo *TLI =
280 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
281 *L->getHeader()->getParent());
282 const TargetTransformInfo *TTI =
283 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
284 *L->getHeader()->getParent());
285 const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
286 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
287 MemorySSA *MSSA = nullptr;
288 if (MSSAAnalysis)
289 MSSA = &MSSAAnalysis->getMSSA();
290
291 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
292 // pass. Function analyses need to be preserved across loop transformations
293 // but ORE cannot be preserved (see comment before the pass definition).
294 OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
295
296 LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
297 return LIR.runOnLoop(L);
298 }
299
300 /// This transformation requires natural loop information & requires that
301 /// loop preheaders be inserted into the CFG.
302 void getAnalysisUsage(AnalysisUsage &AU) const override {
303 AU.addRequired<TargetLibraryInfoWrapperPass>();
304 AU.addRequired<TargetTransformInfoWrapperPass>();
305 AU.addPreserved<MemorySSAWrapperPass>();
306 getLoopAnalysisUsage(AU);
307 }
308};
309
310} // end anonymous namespace
311
312char LoopIdiomRecognizeLegacyPass::ID = 0;
313
314PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
315 LoopStandardAnalysisResults &AR,
316 LPMUpdater &) {
317 if (DisableLIRP::All)
318 return PreservedAnalyses::all();
319
320 const auto *DL = &L.getHeader()->getModule()->getDataLayout();
321
322 // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
323 // pass. Function analyses need to be preserved across loop transformations
324 // but ORE cannot be preserved (see comment before the pass definition).
325 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
326
327 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
328 AR.MSSA, DL, ORE);
329 if (!LIR.runOnLoop(&L))
330 return PreservedAnalyses::all();
331
332 auto PA = getLoopPassPreservedAnalyses();
333 if (AR.MSSA)
334 PA.preserve<MemorySSAAnalysis>();
335 return PA;
336}
337
338INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
339 "Recognize loop idioms", false, false)
340INITIALIZE_PASS_DEPENDENCY(LoopPass)
341INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
342INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
343INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
344 "Recognize loop idioms", false, false)
345
346Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
347
348static void deleteDeadInstruction(Instruction *I) {
349 I->replaceAllUsesWith(UndefValue::get(I->getType()));
350 I->eraseFromParent();
351}
352
353//===----------------------------------------------------------------------===//
354//
355// Implementation of LoopIdiomRecognize
356//
357//===----------------------------------------------------------------------===//
358
359bool LoopIdiomRecognize::runOnLoop(Loop *L) {
360 CurLoop = L;
361 // If the loop could not be converted to canonical form, it must have an
362 // indirectbr in it, just give up.
363 if (!L->getLoopPreheader())
364 return false;
365
366 // Disable loop idiom recognition if the function's name is a common idiom.
367 StringRef Name = L->getHeader()->getParent()->getName();
368 if (Name == "memset" || Name == "memcpy")
369 return false;
370 if (Name == "_libc_memset" || Name == "_libc_memcpy")
371 return false;
372
373 // Determine if code size heuristics need to be applied.
374 ApplyCodeSizeHeuristics =
375 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
376
377 HasMemset = TLI->has(LibFunc_memset);
378 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
379 HasMemcpy = TLI->has(LibFunc_memcpy);
380
381 if (HasMemset || HasMemsetPattern || HasMemcpy)
382 if (SE->hasLoopInvariantBackedgeTakenCount(L))
383 return runOnCountableLoop();
384
385 return runOnNoncountableLoop();
386}
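
// A sketch (hypothetical libc-style code, not from this file) of why
// runOnLoop bails on functions named "memset"/"memcpy": recognizing the
// idiom inside memset itself would rewrite its own loop into a call to
// memset, i.e. infinite recursion.
#include <stddef.h>
void *memsetSketch(void *dst, int c, size_t n) { // imagine this were memset
  unsigned char *p = (unsigned char *)dst;
  for (size_t i = 0; i != n; ++i)
    p[i] = (unsigned char)c; // LIR would turn this loop into memset(dst, c, n)
  return dst;
}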
387
388bool LoopIdiomRecognize::runOnCountableLoop() {
389 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
390 assert(!isa<SCEVCouldNotCompute>(BECount) &&
391 "runOnCountableLoop() called on a loop without a predictable "
392 "backedge-taken count");
393
394 // If this loop executes exactly one time, then it should be peeled, not
395 // optimized by this pass.
396 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
397 if (BECst->getAPInt() == 0)
398 return false;
399
400 SmallVector<BasicBlock *, 8> ExitBlocks;
401 CurLoop->getUniqueExitBlocks(ExitBlocks);
402
403 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
404 << CurLoop->getHeader()->getParent()->getName()
405 << "] Countable Loop %" << CurLoop->getHeader()->getName()
406 << "\n");
407
408 // The following transforms hoist stores/memsets into the loop pre-header.
409 // Give up if the loop has instructions that may throw.
410 SimpleLoopSafetyInfo SafetyInfo;
411 SafetyInfo.computeLoopSafetyInfo(CurLoop);
412 if (SafetyInfo.anyBlockMayThrow())
413 return false;
414
415 bool MadeChange = false;
416
417 // Scan all the blocks in the loop that are not in subloops.
418 for (auto *BB : CurLoop->getBlocks()) {
419 // Ignore blocks in subloops.
420 if (LI->getLoopFor(BB) != CurLoop)
421 continue;
422
423 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
424 }
425 return MadeChange;
426}
427
428static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
429 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
430 return ConstStride->getAPInt();
431}
432
433/// getMemSetPatternValue - If a strided store of the specified value is safe to
434/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
435/// be passed in. Otherwise, return null.
436///
437/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
438/// just replicate their input array and then pass on to memset_pattern16.
439static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
440 // FIXME: This could check for UndefValue because it can be merged into any
441 // other valid pattern.
442
443 // If the value isn't a constant, we can't promote it to being in a constant
444 // array. We could theoretically do a store to an alloca or something, but
445 // that doesn't seem worthwhile.
446 Constant *C = dyn_cast<Constant>(V);
447 if (!C)
448 return nullptr;
449
450 // Only handle simple values that are a power of two bytes in size.
451 uint64_t Size = DL->getTypeSizeInBits(V->getType());
452 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
453 return nullptr;
454
455 // Don't care enough about darwin/ppc to implement this.
456 if (DL->isBigEndian())
457 return nullptr;
458
459 // Convert to size in bytes.
460 Size /= 8;
461
462 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
463 // if the top and bottom are the same (e.g. for vectors and large integers).
464 if (Size > 16)
465 return nullptr;
466
467 // If the constant is exactly 16 bytes, just use it.
468 if (Size == 16)
469 return C;
470
471 // Otherwise, we'll use an array of the constants.
472 unsigned ArraySize = 16 / Size;
473 ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
474 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
475}
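
// Worked example (hypothetical values): for a strided store of the i32
// constant 0x01020304, Size is 4 bytes, so ArraySize is 16 / 4 == 4 and the
// returned pattern is four copies of the constant, i.e. in C terms:
//
//   static const unsigned pattern[4] = {0x01020304u, 0x01020304u,
//                                       0x01020304u, 0x01020304u};
//
// memset_pattern16 then tiles these 16 bytes across the destination.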
476
477LoopIdiomRecognize::LegalStoreKind
478LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
479 // Don't touch volatile stores.
480 if (SI->isVolatile())
481 return LegalStoreKind::None;
482 // We only want simple or unordered-atomic stores.
483 if (!SI->isUnordered())
484 return LegalStoreKind::None;
485
486 // Avoid merging nontemporal stores.
487 if (SI->getMetadata(LLVMContext::MD_nontemporal))
488 return LegalStoreKind::None;
489
490 Value *StoredVal = SI->getValueOperand();
491 Value *StorePtr = SI->getPointerOperand();
492
493 // Don't convert stores of non-integral pointer types to memsets (which store
494 // integers).
495 if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
496 return LegalStoreKind::None;
497
498 // Reject stores that are so large that they overflow an unsigned.
499 // When storing out scalable vectors we bail out for now, since the code
500 // below currently only works for constant strides.
501 TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
502 if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
503 (SizeInBits.getFixedSize() >> 32) != 0)
504 return LegalStoreKind::None;
505
506 // See if the pointer expression is an AddRec like {base,+,1} on the current
507 // loop, which indicates a strided store. If we have something else, it's a
508 // random store we can't handle.
509 const SCEVAddRecExpr *StoreEv =
510 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
511 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
512 return LegalStoreKind::None;
513
514 // Check to see if we have a constant stride.
515 if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
516 return LegalStoreKind::None;
517
518 // See if the store can be turned into a memset.
519
520 // If the stored value is a byte-wise value (like i32 -1), then it may be
521 // turned into a memset of i8 -1, assuming that all the consecutive bytes
522 // are stored. A store of i32 0x01020304 can never be turned into a memset,
523 // but it can be turned into memset_pattern if the target supports it.
524 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
525
526 // Note: memset and memset_pattern on unordered-atomic stores are not yet supported.
527 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
528
529 // If we're allowed to form a memset, and the stored value would be
530 // acceptable for memset, use it.
531 if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
532 // Verify that the stored value is loop invariant. If not, we can't
533 // promote the memset.
534 CurLoop->isLoopInvariant(SplatValue)) {
535 // It looks like we can use SplatValue.
536 return LegalStoreKind::Memset;
537 }
538 if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
539 // Don't create memset_pattern16s with address spaces.
540 StorePtr->getType()->getPointerAddressSpace() == 0 &&
541 getMemSetPatternValue(StoredVal, DL)) {
542 // It looks like we can use PatternValue!
543 return LegalStoreKind::MemsetPattern;
544 }
545
546 // Otherwise, see if the store can be turned into a memcpy.
547 if (HasMemcpy && !DisableLIRP::Memcpy) {
548 // Check to see if the stride matches the size of the store. If so, then we
549 // know that every byte is touched in the loop.
550 APInt Stride = getStoreStride(StoreEv);
551 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
552 if (StoreSize != Stride && StoreSize != -Stride)
553 return LegalStoreKind::None;
554
555 // The store must be feeding a non-volatile load.
556 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
557
558 // Only allow non-volatile loads
559 if (!LI || LI->isVolatile())
560 return LegalStoreKind::None;
561 // Only allow simple or unordered-atomic loads
562 if (!LI->isUnordered())
563 return LegalStoreKind::None;
564
565 // See if the pointer expression is an AddRec like {base,+,1} on the current
566 // loop, which indicates a strided load. If we have something else, it's a
567 // random load we can't handle.
568 const SCEVAddRecExpr *LoadEv =
569 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
570 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
571 return LegalStoreKind::None;
572
573 // The store and load must share the same stride.
574 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
575 return LegalStoreKind::None;
576
577 // Success. This store can be converted into a memcpy.
578 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
579 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
580 : LegalStoreKind::Memcpy;
581 }
582 // This store can't be transformed into a memset/memcpy.
583 return LegalStoreKind::None;
584}
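
// Classification sketch (assuming i32 stores with unit element stride inside
// CurLoop; A and B are hypothetical arrays):
//
//   A[i] = 0;           // byte-splattable, loop-invariant  -> Memset
//   A[i] = 0x01020304;  // not splattable, 16-byte pattern  -> MemsetPattern
//   A[i] = B[i];        // value is a matching strided load -> Memcpy
//   A[i] = i;           // not invariant, no pattern/load   -> None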
585
586void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
587 StoreRefsForMemset.clear();
588 StoreRefsForMemsetPattern.clear();
589 StoreRefsForMemcpy.clear();
590 for (Instruction &I : *BB) {
591 StoreInst *SI = dyn_cast<StoreInst>(&I);
592 if (!SI)
593 continue;
594
595 // Make sure this is a strided store with a constant stride.
596 switch (isLegalStore(SI)) {
597 case LegalStoreKind::None:
598 // Nothing to do
599 break;
600 case LegalStoreKind::Memset: {
601 // Find the base pointer.
602 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
603 StoreRefsForMemset[Ptr].push_back(SI);
604 } break;
605 case LegalStoreKind::MemsetPattern: {
606 // Find the base pointer.
607 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
608 StoreRefsForMemsetPattern[Ptr].push_back(SI);
609 } break;
610 case LegalStoreKind::Memcpy:
611 case LegalStoreKind::UnorderedAtomicMemcpy:
612 StoreRefsForMemcpy.push_back(SI);
613 break;
614 default:
615 assert(false && "unhandled return value");
616 break;
617 }
618 }
619}
620
621/// runOnLoopBlock - Process the specified block, which lives in a counted loop
622/// with the specified backedge count. This block is known to be in the current
623/// loop and not in any subloops.
624bool LoopIdiomRecognize::runOnLoopBlock(
625 BasicBlock *BB, const SCEV *BECount,
626 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
627 // We can only promote stores in this block if they are unconditionally
628 // executed in the loop. For a block to be unconditionally executed, it has
629 // to dominate all the exit blocks of the loop. Verify this now.
630 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
631 if (!DT->dominates(BB, ExitBlocks[i]))
632 return false;
633
634 bool MadeChange = false;
635 // Look for store instructions, which may be optimized to memset/memcpy.
636 collectStores(BB);
637
638 // Look for a single store or sets of stores with a common base, which can be
639 // optimized into a memset (memset_pattern). The latter most commonly happens
640// with structs and hand-unrolled loops.
641 for (auto &SL : StoreRefsForMemset)
642 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
643
644 for (auto &SL : StoreRefsForMemsetPattern)
645 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
646
647 // Optimize the store into a memcpy, if it feeds a similarly strided load.
648 for (auto &SI : StoreRefsForMemcpy)
649 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
650
651 MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
652 BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
653 MadeChange |= processLoopMemIntrinsic<MemSetInst>(
654 BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
655
656 return MadeChange;
657}
658
659/// See if this store(s) can be promoted to a memset.
660bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
661 const SCEV *BECount, ForMemset For) {
662 // Try to find consecutive stores that can be transformed into memsets.
663 SetVector<StoreInst *> Heads, Tails;
664 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
665
666 // Do a quadratic search on all of the given stores and find
667 // all of the pairs of stores that follow each other.
668 SmallVector<unsigned, 16> IndexQueue;
669 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
670 assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
671
672 Value *FirstStoredVal = SL[i]->getValueOperand();
673 Value *FirstStorePtr = SL[i]->getPointerOperand();
674 const SCEVAddRecExpr *FirstStoreEv =
675 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
676 APInt FirstStride = getStoreStride(FirstStoreEv);
677 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
678
679 // See if we can optimize just this store in isolation.
680 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
681 Heads.insert(SL[i]);
682 continue;
683 }
684
685 Value *FirstSplatValue = nullptr;
686 Constant *FirstPatternValue = nullptr;
687
688 if (For == ForMemset::Yes)
689 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
690 else
691 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
692
693 assert((FirstSplatValue || FirstPatternValue) &&
694 "Expected either splat value or pattern value.");
695
696 IndexQueue.clear();
697 // If a store has multiple consecutive store candidates, search Stores
698 // array according to the sequence: from i+1 to e, then from i-1 to 0.
699 // This is because pairing with the immediately succeeding or preceding
700 // candidate usually creates the best chance of finding a memset opportunity.
701 unsigned j = 0;
702 for (j = i + 1; j < e; ++j)
703 IndexQueue.push_back(j);
704 for (j = i; j > 0; --j)
705 IndexQueue.push_back(j - 1);
706
707 for (auto &k : IndexQueue) {
708 assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
709 Value *SecondStorePtr = SL[k]->getPointerOperand();
710 const SCEVAddRecExpr *SecondStoreEv =
711 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
712 APInt SecondStride = getStoreStride(SecondStoreEv);
713
714 if (FirstStride != SecondStride)
715 continue;
716
717 Value *SecondStoredVal = SL[k]->getValueOperand();
718 Value *SecondSplatValue = nullptr;
719 Constant *SecondPatternValue = nullptr;
720
721 if (For == ForMemset::Yes)
722 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
723 else
724 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
725
726 assert((SecondSplatValue || SecondPatternValue) &&
727 "Expected either splat value or pattern value.");
728
729 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
730 if (For == ForMemset::Yes) {
731 if (isa<UndefValue>(FirstSplatValue))
732 FirstSplatValue = SecondSplatValue;
733 if (FirstSplatValue != SecondSplatValue)
734 continue;
735 } else {
736 if (isa<UndefValue>(FirstPatternValue))
737 FirstPatternValue = SecondPatternValue;
738 if (FirstPatternValue != SecondPatternValue)
739 continue;
740 }
741 Tails.insert(SL[k]);
742 Heads.insert(SL[i]);
743 ConsecutiveChain[SL[i]] = SL[k];
744 break;
745 }
746 }
747 }
748
749 // We may run into multiple chains that merge into a single chain. We mark the
750 // stores that we transformed so that we don't visit the same store twice.
751 SmallPtrSet<Value *, 16> TransformedStores;
752 bool Changed = false;
753
754 // For stores that start but don't end a link in the chain:
755 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
756 it != e; ++it) {
757 if (Tails.count(*it))
758 continue;
759
760 // We found a store instr that starts a chain. Now follow the chain and try
761 // to transform it.
762 SmallPtrSet<Instruction *, 8> AdjacentStores;
763 StoreInst *I = *it;
764
765 StoreInst *HeadStore = I;
766 unsigned StoreSize = 0;
767
768 // Collect the chain into a list.
769 while (Tails.count(I) || Heads.count(I)) {
770 if (TransformedStores.count(I))
771 break;
772 AdjacentStores.insert(I);
773
774 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
775 // Move to the next value in the chain.
776 I = ConsecutiveChain[I];
777 }
778
779 Value *StoredVal = HeadStore->getValueOperand();
780 Value *StorePtr = HeadStore->getPointerOperand();
781 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
782 APInt Stride = getStoreStride(StoreEv);
783
784 // Check to see if the stride matches the size of the stores. If so, then
785 // we know that every byte is touched in the loop.
786 if (StoreSize != Stride && StoreSize != -Stride)
787 continue;
788
789 bool NegStride = StoreSize == -Stride;
790
791 if (processLoopStridedStore(StorePtr, StoreSize,
792 MaybeAlign(HeadStore->getAlignment()),
793 StoredVal, HeadStore, AdjacentStores, StoreEv,
794 BECount, NegStride)) {
795 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
796 Changed = true;
797 }
798 }
799
800 return Changed;
801}
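
// A sketch of the chains this routine pairs up (hypothetical code): in a
// hand-unrolled loop such as
//
//   for (unsigned i = 0; i != n; i += 2) {
//     A[i]     = 0;  // head: per-store stride 8 bytes != store size 4 bytes
//     A[i + 1] = 0;  // consecutive tail with the same splat value
//   }
//
// neither i32 store qualifies alone, but the two-store chain accumulates
// StoreSize == 8 == Stride, covers every byte, and becomes a single memset.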
802
803/// processLoopMemIntrinsic - Template function for calling different processor
804/// functions based on mem intrinsic type.
805template <typename MemInst>
806bool LoopIdiomRecognize::processLoopMemIntrinsic(
807 BasicBlock *BB,
808 bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
809 const SCEV *BECount) {
810 bool MadeChange = false;
811 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
812 Instruction *Inst = &*I++;
813 // Look for memory instructions, which may be optimized to a larger one.
814 if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
815 WeakTrackingVH InstPtr(&*I);
816 if (!(this->*Processor)(MI, BECount))
817 continue;
818 MadeChange = true;
819
820 // If processing the instruction invalidated our iterator, start over from
821 // the top of the block.
822 if (!InstPtr)
823 I = BB->begin();
824 }
825 }
826 return MadeChange;
827}
828
829/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
830bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
831 const SCEV *BECount) {
832 // We can only handle non-volatile memcpys with a constant size.
833 if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
834 return false;
835
836 // If we're not allowed to hack on memcpy, we fail.
837 if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
838 return false;
839
840 Value *Dest = MCI->getDest();
841 Value *Source = MCI->getSource();
842 if (!Dest || !Source)
843 return false;
844
845 // See if the load and store pointer expressions are AddRec like {base,+,1} on
846 // the current loop, which indicates a strided load and store. If we have
847 // something else, it's a random load or store we can't handle.
848 const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
849 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
850 return false;
851 const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
852 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
853 return false;
854
855 // Reject memcpys that are so large that they overflow an unsigned.
856 uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
857 if ((SizeInBytes >> 32) != 0)
858 return false;
859
860 // Check if the stride matches the size of the memcpy. If so, then we know
861 // that every byte is touched in the loop.
862 const SCEVConstant *StoreStride =
863 dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
864 const SCEVConstant *LoadStride =
865 dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
866 if (!StoreStride || !LoadStride)
867 return false;
868
869 APInt StoreStrideValue = StoreStride->getAPInt();
870 APInt LoadStrideValue = LoadStride->getAPInt();
871 // Huge stride value - give up
872 if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
873 return false;
874
875 if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
876 ORE.emit([&]() {
877 return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
878 << ore::NV("Inst", "memcpy") << " in "
879 << ore::NV("Function", MCI->getFunction())
880 << " function will not be hoised: "
881 << ore::NV("Reason", "memcpy size is not equal to stride");
882 });
883 return false;
884 }
885
886 int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
887 int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
888 // Check if the load stride matches the store stride.
889 if (StoreStrideInt != LoadStrideInt)
890 return false;
891
892 return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes,
893 MCI->getDestAlign(), MCI->getSourceAlign(),
894 MCI, MCI, StoreEv, LoadEv, BECount);
895}
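
// Promotion sketch (hypothetical loop): a constant-size memcpy whose source
// and destination both advance by exactly the copy size each iteration,
//
//   for (unsigned i = 0; i != n; ++i)
//     memcpy(&A[i], &B[i], sizeof(A[0]));  // size == store stride == load stride
//
// passes the checks above and is handed to processLoopStoreOfLoopLoad, which
// can emit one memcpy(&A[0], &B[0], n * sizeof(A[0])) in the preheader.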
896
897/// processLoopMemSet - See if this memset can be promoted to a large memset.
898bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
899 const SCEV *BECount) {
900 // We can only handle non-volatile memsets with a constant size.
901 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
902 return false;
903
904 // If we're not allowed to hack on memset, we fail.
905 if (!HasMemset || DisableLIRP::Memset)
906 return false;
907
908 Value *Pointer = MSI->getDest();
909
910 // See if the pointer expression is an AddRec like {base,+,1} on the current
911 // loop, which indicates a strided store. If we have something else, it's a
912 // random store we can't handle.
913 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
914 if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
915 return false;
916
917 // Reject memsets that are so large that they overflow an unsigned.
918 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
919 if ((SizeInBytes >> 32) != 0)
920 return false;
921
922 // Check to see if the stride matches the size of the memset. If so, then we
923 // know that every byte is touched in the loop.
924 const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
925 if (!ConstStride)
926 return false;
927
928 APInt Stride = ConstStride->getAPInt();
929 if (SizeInBytes != Stride && SizeInBytes != -Stride)
930 return false;
931
932 // Verify that the memset value is loop invariant. If not, we can't promote
933 // the memset.
934 Value *SplatValue = MSI->getValue();
935 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
936 return false;
937
938 SmallPtrSet<Instruction *, 1> MSIs;
939 MSIs.insert(MSI);
940 bool NegStride = SizeInBytes == -Stride;
941 return processLoopStridedStore(
942 Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
943 SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
944}
945
946/// mayLoopAccessLocation - Return true if the specified loop might access the
947/// specified pointer location, which is a loop-strided access. The 'Access'
948/// argument specifies what the verboten forms of access are (read or write).
949static bool
950mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
951 const SCEV *BECount, unsigned StoreSize,
952 AliasAnalysis &AA,
953 SmallPtrSetImpl<Instruction *> &IgnoredStores) {
954 // Get the location that may be stored across the loop. Since the access is
955 // strided positively through memory, we say that the modified location starts
956 // at the pointer and has infinite size.
957 LocationSize AccessSize = LocationSize::afterPointer();
958
959 // If the loop iterates a fixed number of times, we can refine the access size
960 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
961 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
962 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
963 StoreSize);
964
965 // TODO: For this to be really effective, we have to dive into the pointer
966 // operand in the store. Store to &A[i] of 100 will always return may alias
967// with store of &A[100]; we need StoreLoc to be "A" with size of 100,
968 // which will then no-alias a store to &A[100].
969 MemoryLocation StoreLoc(Ptr, AccessSize);
970
971 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
972 ++BI)
973 for (Instruction &I : **BI)
974 if (IgnoredStores.count(&I) == 0 &&
975 isModOrRefSet(
976 intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
977 return true;
978
979 return false;
980}
981
982// If we have a negative stride, Start refers to the end of the memory location
983// we're trying to memset. Therefore, we need to recompute the base pointer,
984// which is just Start - BECount*Size.
985static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
986 Type *IntPtr, unsigned StoreSize,
987 ScalarEvolution *SE) {
988 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
989 if (StoreSize != 1)
990 Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
991 SCEV::FlagNUW);
992 return SE->getMinusSCEV(Start, Index);
993}
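
// Worked example (hypothetical values): a loop storing 4-byte elements
// downward from A[n-1] to A[0] has Start == &A[n-1] and BECount == n - 1,
// so the recomputed base is
//
//   &A[n-1] - (n - 1) * 4 bytes == &A[0]
//
// the lowest address the loop touches, which is the base memset/memcpy need.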
994
995/// Compute the number of bytes as a SCEV from the backedge taken count.
996///
997/// This also maps the SCEV into the provided type and tries to handle the
998/// computation in a way that will fold cleanly.
999static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
1000 unsigned StoreSize, Loop *CurLoop,
1001 const DataLayout *DL, ScalarEvolution *SE) {
1002 const SCEV *NumBytesS;
1003 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
1004 // pointer size if it isn't already.
1005 //
1006 // If we're going to need to zero extend the BE count, check if we can add
1007 // one to it prior to zero extending without overflow. Provided this is safe,
1008 // it allows better simplification of the +1.
1009 if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
1010 DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
1011 SE->isLoopEntryGuardedByCond(
1012 CurLoop, ICmpInst::ICMP_NE, BECount,
1013 SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
1014 NumBytesS = SE->getZeroExtendExpr(
1015 SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
1016 IntPtr);
1017 } else {
1018 NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
1019 SE->getOne(IntPtr), SCEV::FlagNUW);
1020 }
1021
1022 // And scale it based on the store size.
1023 if (StoreSize != 1) {
1024 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
1025 SCEV::FlagNUW);
1026 }
1027 return NumBytesS;
1028}
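
// Worked example (hypothetical values): with BECount == n - 1 and
// StoreSize == 4, the loop writes (BECount + 1) * StoreSize == 4 * n bytes;
// for n == 256 the expanded expression folds to the constant 1024.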
1029
1030/// processLoopStridedStore - We see a strided store of some value. If we can
1031/// transform this into a memset or memset_pattern in the loop preheader, do so.
1032bool LoopIdiomRecognize::processLoopStridedStore(
1033 Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
1034 Value *StoredVal, Instruction *TheStore,
1035 SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
1036 const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
1037 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
1038 Constant *PatternValue = nullptr;
1039
1040 if (!SplatValue)
1041 PatternValue = getMemSetPatternValue(StoredVal, DL);
1042
1043 assert((SplatValue || PatternValue) &&
1044 "Expected either splat value or pattern value.");
1045
1046 // The trip count of the loop and the base pointer of the addrec SCEV is
1047 // guaranteed to be loop invariant, which means that it should dominate the
1048 // header. This allows us to insert code for it in the preheader.
1049 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
1050 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1051 IRBuilder<> Builder(Preheader->getTerminator());
1052 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1053 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1054
1055 Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
1056 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
1057
1058 bool Changed = false;
1059 const SCEV *Start = Ev->getStart();
1060 // Handle negative strided loops.
1061 if (NegStride)
1062 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
1063
1064 // TODO: ideally we should still be able to generate memset if SCEV expander
1065 // is taught to generate the dependencies at the latest point.
1066 if (!isSafeToExpand(Start, *SE))
1067 return Changed;
1068
1069 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
1070 // this into a memset in the loop preheader now if we want. However, this
1071 // would be unsafe to do if there is anything else in the loop that may read
1072 // or write to the aliased location. Check for any overlap by generating the
1073 // base pointer and checking the region.
1074 Value *BasePtr =
1075 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
1076
1077 // From here on out, conservatively report to the pass manager that we've
1078 // changed the IR, even if we later clean up these added instructions. There
1079 // may be structural differences e.g. in the order of use lists not accounted
1080 // for in just a textual dump of the IR. This is written as a variable, even
1081 // though statically all the places this dominates could be replaced with
1082 // 'true', with the hope that anyone trying to be clever / "more precise" with
1083 // the return value will read this comment, and leave them alone.
1084 Changed = true;
1085
1086 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1087 StoreSize, *AA, Stores))
1088 return Changed;
1089
1090 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
1091 return Changed;
1092
1093 // Okay, everything looks good, insert the memset.
1094
1095 const SCEV *NumBytesS =
1096 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1097
1098 // TODO: ideally we should still be able to generate memset if SCEV expander
1099 // is taught to generate the dependencies at the latest point.
1100 if (!isSafeToExpand(NumBytesS, *SE))
1101 return Changed;
1102
1103 Value *NumBytes =
1104 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1105
1106 CallInst *NewCall;
1107 if (SplatValue) {
1108 NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
1109 MaybeAlign(StoreAlignment));
1110 } else {
1111 // Everything is emitted in default address space
1112 Type *Int8PtrTy = DestInt8PtrTy;
1113
1114 Module *M = TheStore->getModule();
1115 StringRef FuncName = "memset_pattern16";
1116 FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
1117 Int8PtrTy, Int8PtrTy, IntIdxTy);
1118 inferLibFuncAttributes(M, FuncName, *TLI);
1119
1120 // Otherwise we should form a memset_pattern16. PatternValue is known to be
1121 // a constant array of 16 bytes. Plop the value into a mergeable global.
1122 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1123 GlobalValue::PrivateLinkage,
1124 PatternValue, ".memset_pattern");
1125 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1126 GV->setAlignment(Align(16));
1127 Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1128 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1129 }
1130 NewCall->setDebugLoc(TheStore->getDebugLoc());
1131
1132 if (MSSAU) {
1133 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1134 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1135 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1136 }
1137
1138 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
1139 << " from store to: " << *Ev << " at: " << *TheStore
1140 << "\n");
1141
1142 ORE.emit([&]() {
1143 return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
1144 NewCall->getDebugLoc(), Preheader)
1145 << "Transformed loop-strided store in "
1146 << ore::NV("Function", TheStore->getFunction())
1147 << " function into a call to "
1148 << ore::NV("NewFunction", NewCall->getCalledFunction())
1149 << "() intrinsic";
1150 });
1151
1152 // Okay, the memset has been formed. Zap the original store and anything that
1153 // feeds into it.
1154 for (auto *I : Stores) {
1155 if (MSSAU)
1156 MSSAU->removeMemoryAccess(I, true);
1157 deleteDeadInstruction(I);
1158 }
1159 if (MSSAU && VerifyMemorySSA)
1160 MSSAU->getMemorySSA()->verifyMemorySSA();
1161 ++NumMemSet;
1162 ExpCleaner.markResultUsed();
1163 return true;
1164}
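
// End-to-end sketch of the non-splat path above (hypothetical IR): for
// stores of 0x01020304, the pass emits a private 16-byte pattern global and
// one library call in the preheader, roughly:
//
//   @.memset_pattern = private unnamed_addr constant [4 x i32]
//       [i32 16909060, i32 16909060, i32 16909060, i32 16909060], align 16
//   call void @memset_pattern16(i8* %baseptr, i8* %patternptr, i64 %numbytes)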
1165
1166/// If the stored value is a strided load in the same loop with the same stride
1167/// this may be transformable into a memcpy. This kicks in for stuff like
1168/// for (i) A[i] = B[i];
1169bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1170 const SCEV *BECount) {
1171 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1172
1173 Value *StorePtr = SI->getPointerOperand();
1174 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1175 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1176
1177 // The store must be feeding a non-volatile load.
1178 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1179 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1180
1181 // See if the pointer expression is an AddRec like {base,+,1} on the current
1182 // loop, which indicates a strided load. If we have something else, it's a
1183 // random load we can't handle.
1184 Value *LoadPtr = LI->getPointerOperand();
1185 const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1186 return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize,
1187 SI->getAlign(), LI->getAlign(), SI, LI,
1188 StoreEv, LoadEv, BECount);
1189}
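
// Conceptually (a sketch with made-up names, not emitted verbatim), the idiom
// matched here and its replacement are:
//   for (i = 0; i < n; ++i)   // strided store fed by an equally-strided load
//     A[i] = B[i];
// becomes, in the preheader:
//   memcpy(&A[0], &B[0], n * sizeof(A[0]));
// provided the alias checks in the callee below prove that nothing else in
// the loop reads or writes either region.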
1190
1191bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1192 Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign,
1193 MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
1194 const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
1195 const SCEV *BECount) {
1196
1197 // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1198 // conservatively bail here, since otherwise we may have to transform
1199 // llvm.memcpy.inline into llvm.memcpy which is illegal.
1200 if (isa<MemCpyInlineInst>(TheStore))
1201 return false;
1202
1203 // The trip count of the loop and the base pointer of the addrec SCEV are
1204 // guaranteed to be loop invariant, which means they should dominate the
1205 // header. This allows us to insert code for them in the preheader.
1206 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1207 IRBuilder<> Builder(Preheader->getTerminator());
1208 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1209
1210 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1211
1212 bool Changed = false;
1213 const SCEV *StrStart = StoreEv->getStart();
1214 unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1215 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1216
1217 APInt Stride = getStoreStride(StoreEv);
1218 bool NegStride = StoreSize == -Stride;
1219
1220 // Handle negative strided loops.
1221 if (NegStride)
1222 StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
1223
1224 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1225 // this into a memcpy in the loop preheader now if we want. However, this
1226 // would be unsafe to do if there is anything else in the loop that may read
1227 // or write the memory region we're storing to. This includes the load that
1228 // feeds the stores. Check for an alias by generating the base address and
1229 // checking everything.
1230 Value *StoreBasePtr = Expander.expandCodeFor(
1231 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1232
1233 // From here on out, conservatively report to the pass manager that we've
1234 // changed the IR, even if we later clean up these added instructions. There
1235 // may be structural differences e.g. in the order of use lists not accounted
1236 // for in just a textual dump of the IR. This is written as a variable, even
1237 // though statically all the places this dominates could be replaced with
1238 // 'true', with the hope that anyone trying to be clever / "more precise" with
1239 // the return value will read this comment, and leave them alone.
1240 Changed = true;
1241
1242 SmallPtrSet<Instruction *, 2> Stores;
1243 Stores.insert(TheStore);
1244
1245 bool IsMemCpy = isa<MemCpyInst>(TheStore);
1246 const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1247
1248 bool UseMemMove =
1249 mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1250 StoreSize, *AA, Stores);
1251 if (UseMemMove) {
1252 // For the memmove case it's not enough to guarantee that the loop doesn't
1253 // access TheStore and TheLoad. Additionally we need to make sure that
1254 // TheStore is the only user of TheLoad.
1255 if (!TheLoad->hasOneUse())
1256 return Changed;
1257 Stores.insert(TheLoad);
1258 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
1259 BECount, StoreSize, *AA, Stores)) {
1260 ORE.emit([&]() {
1261 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
1262 TheStore)
1263 << ore::NV("Inst", InstRemark) << " in "
1264 << ore::NV("Function", TheStore->getFunction())
1265 << " function will not be hoisted: "
1266 << ore::NV("Reason", "The loop may access store location");
1267 });
1268 return Changed;
1269 }
1270 Stores.erase(TheLoad);
1271 }
1272
1273 const SCEV *LdStart = LoadEv->getStart();
1274 unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1275
1276 // Handle negative strided loops.
1277 if (NegStride)
1278 LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
1279
1280 // For a memcpy, we have to make sure that the input array is not being
1281 // mutated by the loop.
1282 Value *LoadBasePtr = Expander.expandCodeFor(
1283 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1284
1285 // If the store is a memcpy instruction, we must check if it will write to
1286 // the load memory locations. So remove it from the ignored stores.
1287 if (IsMemCpy)
1288 Stores.erase(TheStore);
1289 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1290 StoreSize, *AA, Stores)) {
1291 ORE.emit([&]() {
1292 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1293 << ore::NV("Inst", InstRemark) << " in "
1294 << ore::NV("Function", TheStore->getFunction())
1295 << " function will not be hoisted: "
1296 << ore::NV("Reason", "The loop may access load location");
1297 });
1298 return Changed;
1299 }
1300 if (UseMemMove) {
1301 // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr for
1302 // negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
1303 int64_t LoadOff = 0, StoreOff = 0;
1304 const Value *BP1 = llvm::GetPointerBaseWithConstantOffset(
1305 LoadBasePtr->stripPointerCasts(), LoadOff, *DL);
1306 const Value *BP2 = llvm::GetPointerBaseWithConstantOffset(
1307 StoreBasePtr->stripPointerCasts(), StoreOff, *DL);
1308 int64_t LoadSize =
1309 DL->getTypeSizeInBits(TheLoad->getType()).getFixedSize() / 8;
1310 if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
1311 return Changed;
1312 if ((!NegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
1313 (NegStride && LoadOff + LoadSize > StoreOff))
1314 return Changed;
1315 }
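
  // A worked example of the overlap check above (hypothetical offsets): with a
  // positive stride, StoreSize == 4, BP1 == BP2, StoreOff == 0 and LoadOff == 4,
  // the test LoadOff < StoreOff + StoreSize is 4 < 4, i.e. false, so the
  // memmove is allowed; with LoadOff == 2 it would be 2 < 4, and we bail out
  // having only set Changed.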
1316
1317 if (avoidLIRForMultiBlockLoop())
1318 return Changed;
1319
1320 // Okay, everything is safe, we can transform this!
1321
1322 const SCEV *NumBytesS =
1323 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1324
1325 Value *NumBytes =
1326 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1327
1328 CallInst *NewCall = nullptr;
1329 // Check whether to generate an unordered atomic memcpy:
1330 // If the load or store are atomic, then they must necessarily be unordered
1331 // by previous checks.
1332 if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
1333 if (UseMemMove)
1334 NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
1335 LoadAlign, NumBytes);
1336 else
1337 NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
1338 LoadAlign, NumBytes);
1339 } else {
1340 // For now don't support unordered atomic memmove.
1341 if (UseMemMove)
1342 return Changed;
1343 // We cannot allow unaligned ops for unordered load/store, so reject
1344 // anything where the alignment isn't at least the element size.
1345 assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
1346 "Expect unordered load/store to have align.");
1347 if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
1348 return Changed;
1349
1350 // If the element.atomic memcpy is not lowered into explicit
1351 // loads/stores later, then it will be lowered into an element-size
1352 // specific lib call. If the lib call doesn't exist for our store size, then
1353 // we shouldn't generate the memcpy.
1354 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1355 return Changed;
1356
1357 // Create the call.
1358 // Note that unordered atomic loads/stores are *required* by the spec to
1359 // have an alignment but non-atomic loads/stores may not.
1360 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1361 StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
1362 NumBytes, StoreSize);
1363 }
1364 NewCall->setDebugLoc(TheStore->getDebugLoc());
1365
1366 if (MSSAU) {
1367 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1368 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1369 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1370 }
1371
1372 LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1373 << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1374 << "\n"
1375 << " from store ptr=" << *StoreEv << " at: " << *TheStore
1376 << "\n");
1377
1378 ORE.emit([&]() {
1379 return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1380 NewCall->getDebugLoc(), Preheader)
1381 << "Formed a call to "
1382 << ore::NV("NewFunction", NewCall->getCalledFunction())
1383 << "() intrinsic from " << ore::NV("Inst", InstRemark)
1384 << " instruction in " << ore::NV("Function", TheStore->getFunction())
1385 << " function";
1386 });
1387
1388 // Okay, the memcpy has been formed. Zap the original store and anything that
1389 // feeds into it.
1390 if (MSSAU)
1391 MSSAU->removeMemoryAccess(TheStore, true);
1392 deleteDeadInstruction(TheStore);
1393 if (MSSAU && VerifyMemorySSA)
1394 MSSAU->getMemorySSA()->verifyMemorySSA();
1395 if (UseMemMove)
1396 ++NumMemMove;
1397 else
1398 ++NumMemCpy;
1399 ExpCleaner.markResultUsed();
1400 return true;
1401}
1402
1403// When compiling for codesize we avoid idiom recognition for a multi-block loop
1404// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1405//
1406bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1407 bool IsLoopMemset) {
1408 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1409 if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1410 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1411 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1412 << " avoided: multi-block top-level loop\n");
1413 return true;
1414 }
1415 }
1416
1417 return false;
1418}
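
// For example (an illustrative sketch): when optimizing for size, a
// multi-block top-level loop such as
//   for (i = 0; i < n; ++i) { if (f(i)) g(); A[i] = 0; }
// is left alone, whereas the memset in the inner loop of
//   for (i = 0; i < n; ++i)
//     for (j = 0; j < m; ++j) A[i][j] = 0;
// may still be formed: the inner loop is not outermost, so the heuristic
// above does not fire for it.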
1419
1420bool LoopIdiomRecognize::runOnNoncountableLoop() {
1421 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1422 << CurLoop->getHeader()->getParent()->getName()
1423 << "] Noncountable Loop %"
1424 << CurLoop->getHeader()->getName() << "\n");
1425
1426 return recognizePopcount() || recognizeAndInsertFFS() ||
1427 recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
1428}
1429
1430 /// Check if the given conditional branch is based on a comparison between
1431 /// a variable and zero, and whether, when the variable is non-zero (or zero,
1432 /// if JmpOnZero is true), control transfers to the loop entry. If the branch
1433 /// matches this behavior, the variable involved in the comparison is returned.
1434 /// This function will be called to see if the precondition and postcondition
1435 /// of the loop are in desirable form.
1436static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1437 bool JmpOnZero = false) {
1438 if (!BI || !BI->isConditional())
1439 return nullptr;
1440
1441 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1442 if (!Cond)
1443 return nullptr;
1444
1445 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1446 if (!CmpZero || !CmpZero->isZero())
1447 return nullptr;
1448
1449 BasicBlock *TrueSucc = BI->getSuccessor(0);
1450 BasicBlock *FalseSucc = BI->getSuccessor(1);
1451 if (JmpOnZero)
1452 std::swap(TrueSucc, FalseSucc);
1453
1454 ICmpInst::Predicate Pred = Cond->getPredicate();
1455 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1456 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1457 return Cond->getOperand(0);
1458
1459 return nullptr;
1460}
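
// For instance (an illustrative IR sketch), given the terminator
//   %cmp = icmp ne i32 %x, 0
//   br i1 %cmp, label %loop.entry, label %exit
// matchCondition returns %x. With JmpOnZero == true the successors are
// swapped first, so the matching branch is instead
//   br i1 %cmp, label %exit, label %loop.entry
// i.e. control reaches the loop entry when %x is zero.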
1461
1462// Check if the recurrence variable `VarX` is in the right form to create
1463// the idiom. Returns the value coerced to a PHINode if so.
1464static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1465 BasicBlock *LoopEntry) {
1466 auto *PhiX = dyn_cast<PHINode>(VarX);
1467 if (PhiX && PhiX->getParent() == LoopEntry &&
1468 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1469 return PhiX;
1470 return nullptr;
1471}
1472
1473/// Return true iff the idiom is detected in the loop.
1474///
1475/// Additionally:
1476/// 1) \p CntInst is set to the instruction counting the population bit.
1477/// 2) \p CntPhi is set to the corresponding phi node.
1478/// 3) \p Var is set to the value whose population bits are being counted.
1479///
1480/// The core idiom we are trying to detect is:
1481/// \code
1482/// if (x0 != 0)
1483/// goto loop-exit // the precondition of the loop
1484/// cnt0 = init-val;
1485/// do {
1486/// x1 = phi (x0, x2);
1487/// cnt1 = phi(cnt0, cnt2);
1488///
1489/// cnt2 = cnt1 + 1;
1490/// ...
1491/// x2 = x1 & (x1 - 1);
1492/// ...
1493/// } while(x != 0);
1494///
1495/// loop-exit:
1496/// \endcode
1497static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1498 Instruction *&CntInst, PHINode *&CntPhi,
1499 Value *&Var) {
1500 // step 1: Check to see if the loop-back branch matches this pattern:
1501 // "if (a!=0) goto loop-entry".
1502 BasicBlock *LoopEntry;
1503 Instruction *DefX2, *CountInst;
1504 Value *VarX1, *VarX0;
1505 PHINode *PhiX, *CountPhi;
1506
1507 DefX2 = CountInst = nullptr;
1508 VarX1 = VarX0 = nullptr;
1509 PhiX = CountPhi = nullptr;
1510 LoopEntry = *(CurLoop->block_begin());
1511
1512 // step 1: Check if the loop-back branch is in desirable form.
1513 {
1514 if (Value *T = matchCondition(
1515 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1516 DefX2 = dyn_cast<Instruction>(T);
1517 else
1518 return false;
1519 }
1520
1521 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1522 {
1523 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1524 return false;
1525
1526 BinaryOperator *SubOneOp;
1527
1528 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1529 VarX1 = DefX2->getOperand(1);
1530 else {
1531 VarX1 = DefX2->getOperand(0);
1532 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1533 }
1534 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1535 return false;
1536
1537 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1538 if (!Dec ||
1539 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1540 (SubOneOp->getOpcode() == Instruction::Add &&
1541 Dec->isMinusOne()))) {
1542 return false;
1543 }
1544 }
1545
1546 // step 3: Check the recurrence of variable X
1547 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1548 if (!PhiX)
1549 return false;
1550
1551 // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
1552 {
1553 CountInst = nullptr;
1554 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1555 IterE = LoopEntry->end();
1556 Iter != IterE; Iter++) {
1557 Instruction *Inst = &*Iter;
1558 if (Inst->getOpcode() != Instruction::Add)
1559 continue;
1560
1561 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1562 if (!Inc || !Inc->isOne())
1563 continue;
1564
1565 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1566 if (!Phi)
1567 continue;
1568
1569 // Check if the result of the instruction is live out of the loop.
1570 bool LiveOutLoop = false;
1571 for (User *U : Inst->users()) {
1572 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1573 LiveOutLoop = true;
1574 break;
1575 }
1576 }
1577
1578 if (LiveOutLoop) {
1579 CountInst = Inst;
1580 CountPhi = Phi;
1581 break;
1582 }
1583 }
1584
1585 if (!CountInst)
1586 return false;
1587 }
1588
1589 // step 5: check if the precondition is in this form:
1590 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1591 {
1592 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1593 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1594 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1595 return false;
1596
1597 CntInst = CountInst;
1598 CntPhi = CountPhi;
1599 Var = T;
1600 }
1601
1602 return true;
1603}
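
// A concrete trace of the idiom (hypothetical input): for x0 = 6 (0b110) the
// loop computes x2 = 6 & 5 = 4, then 4 & 3 = 0, incrementing cnt once per
// iteration, so cnt == 2 == popcount(6). Each x & (x - 1) clears exactly the
// lowest set bit, which is why cnt ends up being the population count.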
1604
1605/// Return true if the idiom is detected in the loop.
1606///
1607/// Additionally:
1608/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1609/// or nullptr if there is no such.
1610/// 2) \p CntPhi is set to the corresponding phi node
1611/// or nullptr if there is no such.
1612/// 3) \p Var is set to the value whose CTLZ could be used.
1613/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1614///
1615/// The core idiom we are trying to detect is:
1616/// \code
1617/// if (x0 == 0)
1618/// goto loop-exit // the precondition of the loop
1619/// cnt0 = init-val;
1620/// do {
1621/// x = phi (x0, x.next); //PhiX
1622/// cnt = phi(cnt0, cnt.next);
1623///
1624/// cnt.next = cnt + 1;
1625/// ...
1626/// x.next = x >> 1; // DefX
1627/// ...
1628/// } while(x.next != 0);
1629///
1630/// loop-exit:
1631/// \endcode
1632static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1633 Intrinsic::ID &IntrinID, Value *&InitX,
1634 Instruction *&CntInst, PHINode *&CntPhi,
1635 Instruction *&DefX) {
1636 BasicBlock *LoopEntry;
1637 Value *VarX = nullptr;
1638
1639 DefX = nullptr;
1640 CntInst = nullptr;
1641 CntPhi = nullptr;
1642 LoopEntry = *(CurLoop->block_begin());
1643
1644 // step 1: Check if the loop-back branch is in desirable form.
1645 if (Value *T = matchCondition(
1646 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1647 DefX = dyn_cast<Instruction>(T);
1648 else
1649 return false;
1650
1651 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1652 if (!DefX || !DefX->isShift())
1653 return false;
1654 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1655 Intrinsic::ctlz;
1656 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1657 if (!Shft || !Shft->isOne())
1658 return false;
1659 VarX = DefX->getOperand(0);
1660
1661 // step 3: Check the recurrence of variable X
1662 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1663 if (!PhiX)
1664 return false;
1665
1666 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1667
1668 // Make sure the initial value can't be negative; otherwise the ashr in the
1669 // loop might never reach zero, which would make the loop infinite.
1670 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1671 return false;
1672
1673 // step 4: Find the instruction which counts the iterations: cnt.next = cnt + 1
1674 // or cnt.next = cnt + -1.
1675 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1676 // then all uses of "cnt.next" could be optimized to the trip count
1677 // plus "cnt0". Currently it is not optimized.
1678 // This step could be used to detect POPCNT instruction:
1679 // cnt.next = cnt + (x.next & 1)
1680 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1681 IterE = LoopEntry->end();
1682 Iter != IterE; Iter++) {
1683 Instruction *Inst = &*Iter;
1684 if (Inst->getOpcode() != Instruction::Add)
1685 continue;
1686
1687 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1688 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1689 continue;
1690
1691 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1692 if (!Phi)
1693 continue;
1694
1695 CntInst = Inst;
1696 CntPhi = Phi;
1697 break;
1698 }
1699 if (!CntInst)
1700 return false;
1701
1702 return true;
1703}
1704
1705 /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1706 /// to countable (with a CTLZ / CTTZ trip count). If a CTLZ / CTTZ is inserted
1707 /// as the new trip count, returns true; otherwise, returns false.
1708bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1709 // Give up if the loop has multiple blocks or multiple backedges.
1710 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1711 return false;
1712
1713 Intrinsic::ID IntrinID;
1714 Value *InitX;
1715 Instruction *DefX = nullptr;
1716 PHINode *CntPhi = nullptr;
1717 Instruction *CntInst = nullptr;
1718 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1719 // this is always 6.
1720 size_t IdiomCanonicalSize = 6;
1721
1722 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1723 CntInst, CntPhi, DefX))
1724 return false;
1725
1726 bool IsCntPhiUsedOutsideLoop = false;
1727 for (User *U : CntPhi->users())
1728 if (!CurLoop->contains(cast<Instruction>(U))) {
1729 IsCntPhiUsedOutsideLoop = true;
1730 break;
1731 }
1732 bool IsCntInstUsedOutsideLoop = false;
1733 for (User *U : CntInst->users())
1734 if (!CurLoop->contains(cast<Instruction>(U))) {
1735 IsCntInstUsedOutsideLoop = true;
1736 break;
1737 }
1738 // If both CntInst and CntPhi are used outside the loop the profitability
1739 // is questionable.
1740 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1741 return false;
1742
1743 // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1744 // when X is 0. If we cannot guarantee X != 0, we need to check this
1745 // when expanding.
1746 bool ZeroCheck = false;
1747 // It is safe to assume the Preheader exists, as it was checked in the
1748 // parent function RunOnLoop.
1749 BasicBlock *PH = CurLoop->getLoopPreheader();
1750
1751 // If we are using the count instruction outside the loop, make sure we
1752 // have a zero check as a precondition. Without the check the loop would run
1753 // one iteration before any check of the input value. This means 0 and 1
1754 // would have identical behavior in the original loop, and thus the zero check is required.
1755 if (!IsCntPhiUsedOutsideLoop) {
1756 auto *PreCondBB = PH->getSinglePredecessor();
1757 if (!PreCondBB)
1758 return false;
1759 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1760 if (!PreCondBI)
1761 return false;
1762 if (matchCondition(PreCondBI, PH) != InitX)
1763 return false;
1764 ZeroCheck = true;
1765 }
1766
1767 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1768 // profitable if we delete the loop.
1769
1770 // the loop has only 6 instructions:
1771 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1772 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1773 // %shr = ashr %n.addr.0, 1
1774 // %tobool = icmp eq %shr, 0
1775 // %inc = add nsw %i.0, 1
1776 // br i1 %tobool
1777
1778 const Value *Args[] = {InitX,
1779 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1780
1781 // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1782 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1783 uint32_t HeaderSize =
1784 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1785
1786 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1787 InstructionCost Cost =
1788 TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1789 if (HeaderSize != IdiomCanonicalSize &&
1790 Cost > TargetTransformInfo::TCC_Basic)
1791 return false;
1792
1793 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1794 DefX->getDebugLoc(), ZeroCheck,
1795 IsCntPhiUsedOutsideLoop);
1796 return true;
1797}
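
// A worked example of this rewrite (hypothetical 8-bit input): for
// InitX = 0b00010110 (22), ctlz(InitX) == 3, so the shift-until-zero loop
// runs Count = 8 - 3 = 5 times (22 >> 1 gives 11, then 5, 2, 1, 0);
// transformLoopToCountable materializes that count in the preheader and
// rewrites the latch to count down from it instead of testing the shifted
// value.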
1798
1799/// Recognizes a population count idiom in a non-countable loop.
1800///
1801/// If detected, transforms the relevant code to issue the popcount intrinsic
1802/// function call, and returns true; otherwise, returns false.
1803bool LoopIdiomRecognize::recognizePopcount() {
1804 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1805 return false;
1806
1807 // Counting population is usually done with a few arithmetic instructions.
1808 // Such instructions can be easily "absorbed" by vacant slots in a
1809 // non-compact loop. Therefore, recognizing the popcount idiom only makes
1810 // sense in a compact loop.
1811
1812 // Give up if the loop has multiple blocks or multiple backedges.
1813 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1814 return false;
1815
1816 BasicBlock *LoopBody = *(CurLoop->block_begin());
1817 if (LoopBody->size() >= 20) {
1818 // The loop is too big, bail out.
1819 return false;
1820 }
1821
1822 // It should have a preheader containing nothing but an unconditional branch.
1823 BasicBlock *PH = CurLoop->getLoopPreheader();
1824 if (!PH || &PH->front() != PH->getTerminator())
1825 return false;
1826 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1827 if (!EntryBI || EntryBI->isConditional())
1828 return false;
1829
1830 // It should have a precondition block where the generated popcount intrinsic
1831 // function can be inserted.
1832 auto *PreCondBB = PH->getSinglePredecessor();
1833 if (!PreCondBB)
1834 return false;
1835 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1836 if (!PreCondBI || PreCondBI->isUnconditional())
1837 return false;
1838
1839 Instruction *CntInst;
1840 PHINode *CntPhi;
1841 Value *Val;
1842 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1843 return false;
1844
1845 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1846 return true;
1847}
1848
1849static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1850 const DebugLoc &DL) {
1851 Value *Ops[] = {Val};
1852 Type *Tys[] = {Val->getType()};
1853
1854 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1855 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1856 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1857 CI->setDebugLoc(DL);
1858
1859 return CI;
1860}
1861
1862static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1863 const DebugLoc &DL, bool ZeroCheck,
1864 Intrinsic::ID IID) {
1865 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1866 Type *Tys[] = {Val->getType()};
1867
1868 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1869 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1870 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1871 CI->setDebugLoc(DL);
1872
1873 return CI;
1874}
1875
1876/// Transform the following loop (Using CTLZ, CTTZ is similar):
1877/// loop:
1878/// CntPhi = PHI [Cnt0, CntInst]
1879/// PhiX = PHI [InitX, DefX]
1880/// CntInst = CntPhi + 1
1881/// DefX = PhiX >> 1
1882/// LOOP_BODY
1883/// Br: loop if (DefX != 0)
1884/// Use(CntPhi) or Use(CntInst)
1885///
1886/// Into:
1887/// If CntPhi used outside the loop:
1888/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1889/// Count = CountPrev + 1
1890/// else
1891/// Count = BitWidth(InitX) - CTLZ(InitX)
1892/// loop:
1893/// CntPhi = PHI [Cnt0, CntInst]
1894/// PhiX = PHI [InitX, DefX]
1895/// PhiCount = PHI [Count, Dec]
1896/// CntInst = CntPhi + 1
1897/// DefX = PhiX >> 1
1898/// Dec = PhiCount - 1
1899/// LOOP_BODY
1900/// Br: loop if (Dec != 0)
1901/// Use(CountPrev + Cnt0) // Use(CntPhi)
1902/// or
1903/// Use(Count + Cnt0) // Use(CntInst)
1904///
1905/// If LOOP_BODY is empty the loop will be deleted.
1906/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1907void LoopIdiomRecognize::transformLoopToCountable(
1908 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1909 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1910 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1911 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1912
1913 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1914 IRBuilder<> Builder(PreheaderBr);
1915 Builder.SetCurrentDebugLocation(DL);
1916
1917 // If there are no uses of CntPhi create:
1918 // Count = BitWidth - CTLZ(InitX);
1919 // NewCount = Count;
1920 // If there are uses of CntPhi create:
1921 // NewCount = BitWidth - CTLZ(InitX >> 1);
1922 // Count = NewCount + 1;
1923 Value *InitXNext;
1924 if (IsCntPhiUsedOutsideLoop) {
1925 if (DefX->getOpcode() == Instruction::AShr)
1926 InitXNext = Builder.CreateAShr(InitX, 1);
1927 else if (DefX->getOpcode() == Instruction::LShr)
1928 InitXNext = Builder.CreateLShr(InitX, 1);
1929 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1930 InitXNext = Builder.CreateShl(InitX, 1);
1931 else
1932 llvm_unreachable("Unexpected opcode!");
1933 } else
1934 InitXNext = InitX;
1935 Value *Count =
1936 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1937 Type *CountTy = Count->getType();
1938 Count = Builder.CreateSub(
1939 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
1940 Value *NewCount = Count;
1941 if (IsCntPhiUsedOutsideLoop)
1942 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
1943
1944 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
1945
1946 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1947 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
1948 // If the counter was being incremented in the loop, add NewCount to the
1949 // counter's initial value, but only if the initial value is not zero.
1950 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1951 if (!InitConst || !InitConst->isZero())
1952 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1953 } else {
1954 // If the count was being decremented in the loop, subtract NewCount from
1955 // the counter's initial value.
1956 NewCount = Builder.CreateSub(CntInitVal, NewCount);
1957 }
1958
1959 // Step 2: Insert new IV and loop condition:
1960 // loop:
1961 // ...
1962 // PhiCount = PHI [Count, Dec]
1963 // ...
1964 // Dec = PhiCount - 1
1965 // ...
1966 // Br: loop if (Dec != 0)
1967 BasicBlock *Body = *(CurLoop->block_begin());
1968 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1969 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1970
1971 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
1972
1973 Builder.SetInsertPoint(LbCond);
1974 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
1975 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
1976
1977 TcPhi->addIncoming(Count, Preheader);
1978 TcPhi->addIncoming(TcDec, Body);
1979
1980 CmpInst::Predicate Pred =
1981 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1982 LbCond->setPredicate(Pred);
1983 LbCond->setOperand(0, TcDec);
1984 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
1985
1986 // Step 3: All the references to the original counter outside
1987 // the loop are replaced with the NewCount
1988 if (IsCntPhiUsedOutsideLoop)
1989 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1990 else
1991 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1992
1993 // step 4: Forget the "non-computable" trip-count SCEV associated with the
1994 // loop. The loop would otherwise not be deleted even if it becomes empty.
1995 SE->forgetLoop(CurLoop);
1996}
1997
1998void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1999 Instruction *CntInst,
2000 PHINode *CntPhi, Value *Var) {
2001 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2002 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2003 const DebugLoc &DL = CntInst->getDebugLoc();
2004
2005 // Assuming before transformation, the loop is following:
2006 // if (x) // the precondition
2007 // do { cnt++; x &= x - 1; } while(x);
2008
2009 // Step 1: Insert the ctpop instruction at the end of the precondition block
2010 IRBuilder<> Builder(PreCondBr);
2011 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2012 {
2013 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2014 NewCount = PopCntZext =
2015 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2016
2017 if (NewCount != PopCnt)
2018 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2019
2020 // TripCnt is exactly the number of iterations the loop has
2021 TripCnt = NewCount;
2022
2023 // If the population counter's initial value is not zero, insert Add Inst.
2024 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2025 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2026 if (!InitConst || !InitConst->isZero()) {
2027 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2028 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2029 }
2030 }
2031
2032 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2033 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2034 // function would be partially dead code, and downstream passes will drag
2035 // it back from the precondition block to the preheader.
2036 {
2037 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2038
2039 Value *Opnd0 = PopCntZext;
2040 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2041 if (PreCond->getOperand(0) != Var)
2042 std::swap(Opnd0, Opnd1);
2043
2044 ICmpInst *NewPreCond = cast<ICmpInst>(
2045 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2046 PreCondBr->setCondition(NewPreCond);
2047
2048 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2049 }
2050
2051 // Step 3: Note that the population count is exactly the trip count of the
2052 // loop in question, which enables us to convert the loop from a noncountable
2053 // loop into a countable one. The benefit is twofold:
2054 //
2055 // - If the loop only counts population, the entire loop becomes dead after
2056 // the transformation. It is a lot easier to prove a countable loop dead
2057 // than to prove a noncountable one. (In some C dialects, an infinite loop
2058 // isn't dead even if it computes nothing useful. In general, DCE needs
2059 // to prove a noncountable loop finite before safely deleting it.)
2060 //
2061 // - If the loop also performs something else, it remains alive.
2062 // Since it is transformed to countable form, it can be aggressively
2063 // optimized by some optimizations which are in general not applicable
2064 // to a noncountable loop.
2065 //
2066 // After this step, this loop (conceptually) would look like following:
2067 // newcnt = __builtin_ctpop(x);
2068 // t = newcnt;
2069 // if (x)
2070 // do { cnt++; x &= x-1; t--; } while (t > 0);
2071 BasicBlock *Body = *(CurLoop->block_begin());
2072 {
2073 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2074 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2075 Type *Ty = TripCnt->getType();
2076
2077 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2078
2079 Builder.SetInsertPoint(LbCond);
2080 Instruction *TcDec = cast<Instruction>(
2081 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2082 "tcdec", false, true));
2083
2084 TcPhi->addIncoming(TripCnt, PreHead);
2085 TcPhi->addIncoming(TcDec, Body);
2086
2087 CmpInst::Predicate Pred =
2088 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2089 LbCond->setPredicate(Pred);
2090 LbCond->setOperand(0, TcDec);
2091 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2092 }
2093
2094 // Step 4: All the references to the original population counter outside
2095 // the loop are replaced with the NewCount -- the value returned from
2096 // __builtin_ctpop().
2097 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2098
2099 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2100 // loop. The loop would otherwise not be deleted even if it becomes empty.
2101 SE->forgetLoop(CurLoop);
2102}
2103
2104/// Match loop-invariant value.
2105template <typename SubPattern_t> struct match_LoopInvariant {
2106 SubPattern_t SubPattern;
2107 const Loop *L;
2108
2109 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2110 : SubPattern(SP), L(L) {}
2111
2112 template <typename ITy> bool match(ITy *V) {
2113 return L->isLoopInvariant(V) && SubPattern.match(V);
2114 }
2115};
2116
2117/// Matches if the value is loop-invariant.
2118template <typename Ty>
2119inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2120 return match_LoopInvariant<Ty>(M, L);
2121}
2122
2123/// Return true if the idiom is detected in the loop.
2124///
2125/// The core idiom we are trying to detect is:
2126/// \code
2127/// entry:
2128/// <...>
2129/// %bitmask = shl i32 1, %bitpos
2130/// br label %loop
2131///
2132/// loop:
2133/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2134/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2135/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2136/// %x.next = shl i32 %x.curr, 1
2137/// <...>
2138/// br i1 %x.curr.isbitunset, label %loop, label %end
2139///
2140/// end:
2141/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2142/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2143/// <...>
2144/// \endcode
2145static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2146 Value *&BitMask, Value *&BitPos,
2147 Value *&CurrX, Instruction *&NextX) {
2148 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2149 " Performing shift-until-bittest idiom detection.\n");
2150
2151 // Give up if the loop has multiple blocks or multiple backedges.
2152 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2153 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2154 return false;
2155 }
2156
2157 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2158 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2159 assert(LoopPreheaderBB && "There is always a loop preheader.");
2160
2161 using namespace PatternMatch;
2162
2163 // Step 1: Check if the loop backedge is in desirable form.
2164
2165 ICmpInst::Predicate Pred;
2166 Value *CmpLHS, *CmpRHS;
2167 BasicBlock *TrueBB, *FalseBB;
2168 if (!match(LoopHeaderBB->getTerminator(),
2169 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2170 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2171 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2172 return false;
2173 }
2174
2175 // Step 2: Check if the backedge's condition is in desirable form.
2176
2177 auto MatchVariableBitMask = [&]() {
2178 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2179 match(CmpLHS,
2180 m_c_And(m_Value(CurrX),
2181 m_CombineAnd(
2182 m_Value(BitMask),
2183 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2184 CurLoop))));
2185 };
2186 auto MatchConstantBitMask = [&]() {
2187 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2188 match(CmpLHS, m_And(m_Value(CurrX),
2189 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2190 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2191 };
2192 auto MatchDecomposableConstantBitMask = [&]() {
2193 APInt Mask;
2194 return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2195 ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2196 (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2197 (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2198 };
2199
2200 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2201 !MatchDecomposableConstantBitMask()) {
2202 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2203 return false;
2204 }
2205
2206 // Step 3: Check if the recurrence is in desirable form.
2207 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2208 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2209 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2210 return false;
2211 }
2212
2213 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2214 NextX =
2215 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2216
2217 assert(CurLoop->isLoopInvariant(BaseX) &&
2218 "Expected BaseX to be available in the preheader!");
2219
2220 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2221 // FIXME: support right-shift?
2222 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2223 return false;
2224 }
2225
2226 // Step 4: Check if the backedge's destinations are in desirable form.
2227
2228 assert(ICmpInst::isEquality(Pred) &&
2229 "Should only get equality predicates here.");
2230
2231 // cmp-br is commutative, so canonicalize to a single variant.
2232 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2233 Pred = ICmpInst::getInversePredicate(Pred);
2234 std::swap(TrueBB, FalseBB);
2235 }
2236
2237 // We expect to exit loop when comparison yields false,
2238 // so when it yields true we should branch back to loop header.
2239 if (TrueBB != LoopHeaderBB) {
2240 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2241 return false;
2242 }
2243
2244 // Okay, idiom checks out.
2245 return true;
2246}
2247
2248/// Look for the following loop:
2249/// \code
2250/// entry:
2251/// <...>
2252/// %bitmask = shl i32 1, %bitpos
2253/// br label %loop
2254///
2255/// loop:
2256/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2257/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2258/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2259/// %x.next = shl i32 %x.curr, 1
2260/// <...>
2261/// br i1 %x.curr.isbitunset, label %loop, label %end
2262///
2263/// end:
2264/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2265/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2266/// <...>
2267/// \endcode
2268///
2269/// And transform it into:
2270/// \code
2271/// entry:
2272/// %bitmask = shl i32 1, %bitpos
2273/// %lowbitmask = add i32 %bitmask, -1
2274/// %mask = or i32 %lowbitmask, %bitmask
2275/// %x.masked = and i32 %x, %mask
2276/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2277/// i1 true)
2278/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2279/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2280/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2281/// %tripcount = add i32 %backedgetakencount, 1
2282/// %x.curr = shl i32 %x, %backedgetakencount
2283/// %x.next = shl i32 %x, %tripcount
2284/// br label %loop
2285///
2286/// loop:
2287/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2288/// %loop.iv.next = add nuw i32 %loop.iv, 1
2289/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2290/// <...>
2291/// br i1 %loop.ivcheck, label %end, label %loop
2292///
2293/// end:
2294/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2295/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2296/// <...>
2297/// \endcode
2298bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2299 bool MadeChange = false;
2300
2301 Value *X, *BitMask, *BitPos, *XCurr;
2302 Instruction *XNext;
2303 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2304 XNext)) {
2305 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2306 " shift-until-bittest idiom detection failed.\n");
2307 return MadeChange;
2308 }
2309 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2310
2311 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2312 // but is it profitable to transform?
2313
2314 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2315 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2316 assert(LoopPreheaderBB && "There is always a loop preheader.");
2317
2318 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2319 assert(SuccessorBB && "There is only a single successor.");
2320
2321 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2322 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2323
2324 Intrinsic::ID IntrID = Intrinsic::ctlz;
2325 Type *Ty = X->getType();
2326 unsigned Bitwidth = Ty->getScalarSizeInBits();
2327
2328 TargetTransformInfo::TargetCostKind CostKind =
2329 TargetTransformInfo::TCK_SizeAndLatency;
2330
2331 // The rewrite is considered to be unprofitable if and only if the
2332 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2333 // making the loop countable, even if nothing else changes.
2334 IntrinsicCostAttributes Attrs(
2335 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2336 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2337 if (Cost > TargetTransformInfo::TCC_Basic) {
2338 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2339 " Intrinsic is too costly, not beneficial\n");
2340 return MadeChange;
2341 }
2342 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2343 TargetTransformInfo::TCC_Basic) {
2344 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2345 return MadeChange;
2346 }
2347
2348 // Ok, transform appears worthwhile.
2349 MadeChange = true;
2350
2351 // Step 1: Compute the loop trip count.
2352
2353 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2354 BitPos->getName() + ".lowbitmask");
2355 Value *Mask =
2356 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2357 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2358 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2359 IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2360 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2361 Value *XMaskedNumActiveBits = Builder.CreateSub(
2362 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2363 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2364 /*HasNSW=*/Bitwidth != 2);
2365 Value *XMaskedLeadingOnePos =
2366 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2367 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2368 /*HasNSW=*/Bitwidth > 2);
2369
2370 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2371 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2372 /*HasNUW=*/true, /*HasNSW=*/true);
2373 // We know the loop's backedge-taken count, but what's the loop's trip count?
2374 // Note that NUW is always safe, while NSW only holds for bitwidths != 2.
2375 Value *LoopTripCount =
2376 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2377 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2378 /*HasNSW=*/Bitwidth != 2);
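
  // A worked example of the count math (hypothetical i32 values): for %x = 1
  // and %bitpos = 5, mask = 0b111111, x.masked = 1, numleadingzeros = 31,
  // numactivebits = 1, leadingonepos = 0, so backedgetakencount = 5 - 0 = 5
  // and tripcount = 6: the body runs six times, and bit 5 of %x.curr is first
  // set after five shifts (%x.curr == 32).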
2379
2380 // Step 2: Compute the recurrence's final value without a loop.
2381
2382 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2383 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2384 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2385 NewX->takeName(XCurr);
2386 if (auto *I = dyn_cast<Instruction>(NewX))
2387 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2388
2389 Value *NewXNext;
2390 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2391 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2392 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2393 // that isn't the case, we'll need to emit an alternative, safe IR.
2394 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2395 PatternMatch::match(
2396 BitPos, PatternMatch::m_SpecificInt_ICMP(
2397 ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2398 Ty->getScalarSizeInBits() - 1))))
2399 NewXNext = Builder.CreateShl(X, LoopTripCount);
2400 else {
2401 // Otherwise, just additionally shift by one. It's the smallest solution;
2402 // alternatively, we could check that NewX is INT_MIN (or BitPos is )
2403 // and select 0 instead.
2404 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2405 }
2406
2407 NewXNext->takeName(XNext);
2408 if (auto *I = dyn_cast<Instruction>(NewXNext))
2409 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2410
2411 // Step 3: Adjust the successor basic block to receive the computed
2412 // recurrence's final value instead of the recurrence itself.
2413
2414 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2415 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2416
2417 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2418
2419 // The new canonical induction variable.
2420 Builder.SetInsertPoint(&LoopHeaderBB->front());
2421 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2422
2423 // The induction itself.
2424 // Note that NUW is always safe, while NSW only holds for bitwidths != 2.
2425 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2426 auto *IVNext =
2427 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2428 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2429
2430 // The loop trip count check.
2431 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2432 CurLoop->getName() + ".ivcheck");
2433 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2434 LoopHeaderBB->getTerminator()->eraseFromParent();
2435
2436 // Populate the IV PHI.
2437 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2438 IV->addIncoming(IVNext, LoopHeaderBB);
2439
2440 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2441 // loop. The loop would otherwise not be deleted even if it becomes empty.
2442
2443 SE->forgetLoop(CurLoop);
2444
2445 // Other passes will take care of actually deleting the loop if possible.
2446
2447 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2448
2449 ++NumShiftUntilBitTest;
2450 return MadeChange;
2451}
2452
2453/// Return true if the idiom is detected in the loop.
2454///
2455/// The core idiom we are trying to detect is:
2456/// \code
2457/// entry:
2458/// <...>
2459/// %start = <...>
2460/// %extraoffset = <...>
2461/// <...>
2462/// br label %for.cond
2463///
2464/// loop:
2465/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2466/// %nbits = add nsw i8 %iv, %extraoffset
2467/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2468/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2469/// %iv.next = add i8 %iv, 1
2470/// <...>
2471/// br i1 %val.shifted.iszero, label %end, label %loop
2472///
2473/// end:
2474/// %iv.res = phi i8 [ %iv, %loop ] <...>
2475/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2476/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2477/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2478/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2479/// <...>
2480/// \endcode
2481static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2482 Instruction *&ValShiftedIsZero,
2483 Intrinsic::ID &IntrinID, Instruction *&IV,
2484 Value *&Start, Value *&Val,
2485 const SCEV *&ExtraOffsetExpr,
2486 bool &InvertedCond) {
2487 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2488 " Performing shift-until-zero idiom detection.\n");
2489
2490 // Give up if the loop has multiple blocks or multiple backedges.
2491 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2492 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2493 return false;
2494 }
2495
2496 Instruction *ValShifted, *NBits, *IVNext;
2497 Value *ExtraOffset;
2498
2499 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2500 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2501 assert(LoopPreheaderBB && "There is always a loop preheader.");
2502
2503 using namespace PatternMatch;
2504
2505 // Step 1: Check if the loop backedge and its condition are in desirable form.
2506
2507 ICmpInst::Predicate Pred;
2508 BasicBlock *TrueBB, *FalseBB;
2509 if (!match(LoopHeaderBB->getTerminator(),
2510 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2511 m_BasicBlock(FalseBB))) ||
2512 !match(ValShiftedIsZero,
2513 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2514 !ICmpInst::isEquality(Pred)) {
2515 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2516 return false;
2517 }
2518
2519 // Step 2: Check if the comparison's operand is in desirable form.
2520 // FIXME: Val could be a one-input PHI node, which we should look past.
2521 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2522 m_Instruction(NBits)))) {
2523 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
2524 return false;
2525 }
2526 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2527 : Intrinsic::ctlz;
2528
2529 // Step 3: Check if the shift amount is in desirable form.
2530
2531 if (match(NBits, m_c_Add(m_Instruction(IV),
2532 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2533 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2534 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2535 else if (match(NBits,
2536 m_Sub(m_Instruction(IV),
2537 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2538 NBits->hasNoSignedWrap())
2539 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2540 else {
2541 IV = NBits;
2542 ExtraOffsetExpr = SE->getZero(NBits->getType());
2543 }
2544
2545 // Step 4: Check if the recurrence is in desirable form.
2546 auto *IVPN = dyn_cast<PHINode>(IV);
2547 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2548 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2549 return false;
2550 }
2551
2552 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2553 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2554
2555 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2556 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2557 return false;
2558 }
2559
2560 // Step 5: Check if the backedge's destinations are in desirable form.
2561
2562 assert(ICmpInst::isEquality(Pred) &&
2563 "Should only get equality predicates here.");
2564
2565 // cmp-br is commutative, so canonicalize to a single variant.
2566 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2567 if (InvertedCond) {
2568 Pred = ICmpInst::getInversePredicate(Pred);
Value stored to 'Pred' is never read
2569 std::swap(TrueBB, FalseBB);
2570 }
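
  // Regarding the analyzer warning above: past this point only the swapped
  // successors are consulted, so the inverted predicate is dead; a minimal
  // fix (a sketch, not applied here) would keep just the swap:
  //   if (InvertedCond)
  //     std::swap(TrueBB, FalseBB);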
2571
2572 // We expect to exit loop when comparison yields true,
2573 // so when it yields false we should branch back to loop header.
2574 if (FalseBB != LoopHeaderBB) {
2575 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2576 return false;
2577 }
2578
2579 // The new, countable loop will certainly only run a known number of
2580 // iterations; it won't be infinite. But the old loop might be infinite
2581 // under certain conditions. For logical shifts, the value will become zero
2582 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
2583 // right-shift, iff the sign bit was set, the value will never become zero,
2584 // and the loop may never finish.
2585 if (ValShifted->getOpcode() == Instruction::AShr &&
2586 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2587 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2588 return false;
2589 }
2590
2591 // Okay, idiom checks out.
2592 return true;
2593}
2594
2595/// Look for the following loop:
2596/// \code
2597/// entry:
2598/// <...>
2599/// %start = <...>
2600/// %extraoffset = <...>
2601/// <...>
2602/// br label %for.cond
2603///
2604/// loop:
2605/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2606/// %nbits = add nsw i8 %iv, %extraoffset
2607/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2608/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2609/// %iv.next = add i8 %iv, 1
2610/// <...>
2611/// br i1 %val.shifted.iszero, label %end, label %loop
2612///
2613/// end:
2614/// %iv.res = phi i8 [ %iv, %loop ] <...>
2615/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2616/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2617/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2618/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2619/// <...>
2620/// \endcode
2621///
2622/// And transform it into:
2623/// \code
2624/// entry:
2625/// <...>
2626/// %start = <...>
2627/// %extraoffset = <...>
2628/// <...>
2629/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2630/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2631/// %extraoffset.neg = sub i8 0, %extraoffset
2632/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2633/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2634/// %loop.tripcount = sub i8 %iv.final, %start
2635/// br label %loop
2636///
2637/// loop:
2638/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2639/// %loop.iv.next = add i8 %loop.iv, 1
2640/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2641/// %iv = add i8 %loop.iv, %start
2642/// <...>
2643/// br i1 %loop.ivcheck, label %end, label %loop
2644///
2645/// end:
2646/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2647/// <...>
2648/// \endcode
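///
/// For orientation, a hypothetical C-level loop (with %start = 0 and
/// %extraoffset = 0) that lowers to the "shift until zero" shape above:
/// \code
/// #include <cstdint>
///
/// // Counts how many shifts it takes until val >> iv becomes zero; after
/// // the transform this count is computed up front from ctlz(val).
/// uint8_t count_shifts(uint8_t val) {
///   uint8_t iv = 0;
///   while ((uint8_t)(val >> iv) != 0) // %val.shifted / %val.shifted.iszero
///     ++iv;                           // %iv.next
///   return iv;                        // %iv.res
/// }
/// \endcode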
2649bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2650 bool MadeChange = false;
2651
2652 Instruction *ValShiftedIsZero;
2653 Intrinsic::ID IntrID;
2654 Instruction *IV;
2655 Value *Start, *Val;
2656 const SCEV *ExtraOffsetExpr;
2657 bool InvertedCond;
2658 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2659 Start, Val, ExtraOffsetExpr, InvertedCond)) {
2660 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2661 " shift-until-zero idiom detection failed.\n");
2662 return MadeChange;
2663 }
2664 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2665
2666 // Ok, it is the idiom we were looking for; we *could* transform this loop,
2667 // but is it profitable to do so?
2668
2669 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2670 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2671 assert(LoopPreheaderBB && "There is always a loop preheader.");
2672
2673 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2674 assert(SuccessorBB && "There is only a single successor.");
2675
2676 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2677 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2678
2679 Type *Ty = Val->getType();
2680 unsigned Bitwidth = Ty->getScalarSizeInBits();
2681
2682 TargetTransformInfo::TargetCostKind CostKind =
2683 TargetTransformInfo::TCK_SizeAndLatency;
2684
2685 // The rewrite is considered to be unprofitable if and only if the
2686 // intrinsic we'll use is not cheap. Note that we are okay with *just*
2687 // making the loop countable, even if nothing else changes.
2688 IntrinsicCostAttributes Attrs(
2689 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
2690 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2691 if (Cost > TargetTransformInfo::TCC_Basic) {
2692 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2693 " Intrinsic is too costly, not beneficial\n");
2694 return MadeChange;
2695 }
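For reference, TargetTransformInfo::TCC_Basic models the cost of a typical single instruction (an 'add'), so the gate above only accepts targets where the ct{l,t}z intrinsic is essentially as cheap as ordinary ALU code.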
2696
2697 // Ok, transform appears worthwhile.
2698 MadeChange = true;
2699
2700 bool OffsetIsZero = false;
2701 if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2702 OffsetIsZero = ExtraOffsetExprC->isZero();
2703
2704 // Step 1: Compute the loop's final IV value / trip count.
2705
2706 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2707 IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
2708 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2709 Value *ValNumActiveBits = Builder.CreateSub(
2710 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2711 Val->getName() + ".numactivebits", /*HasNUW=*/true,
2712 /*HasNSW=*/Bitwidth != 2);
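A brief note on the /*HasNSW=*/Bitwidth != 2 operand above: at width 2, the bit-width constant itself (2) is not representable as a signed i2 (whose range is [-2, 1]), so the subtraction cannot be guaranteed nsw there; that is presumably why i2 is the one width excluded here and in the similar flags below.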
2713
2714 SCEVExpander Expander(*SE, *DL, "loop-idiom");
2715 Expander.setInsertPoint(&*Builder.GetInsertPoint());
2716 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2717
2718 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2719 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2720 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2721 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2722 {ValNumActiveBitsOffset, Start},
2723 /*FMFSource=*/nullptr, "iv.final");
2724
2725 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2726 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2727 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2728 // FIXME: or when the offset was `add nuw`
2729
2730 // We know the loop's backedge-taken count, but what's the loop's trip count?
2731 Value *LoopTripCount =
2732 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2733 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2734 /*HasNSW=*/Bitwidth != 2);
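A worked example of the Step 1 computation (hypothetical i8 values, mirroring the emitted instructions with plain C++ arithmetic):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  uint8_t val = 16;                   // %val = 0b00010000
  uint8_t start = 0, extraoffset = 0; // %start, %extraoffset
  uint8_t numleadingzeros = 3;        // @llvm.ctlz.i8(16) == 3
  uint8_t numactivebits = 8 - numleadingzeros;                         // 5
  uint8_t ivfinal =
      (uint8_t)std::max<int>(numactivebits + extraoffset, start);     // smax
  uint8_t backedgetakencount = ivfinal - start;                       // 5
  uint8_t tripcount = backedgetakencount + 1;                         // 6
  // The original loop indeed runs for iv = 0,1,2,3,4,5 and exits once
  // 16 >> 5 == 0: six iterations with five backedges taken.
  assert(tripcount == 6);
  return 0;
}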
2735
2736 // Step 2: Adjust the successor basic block to receive the original
2737 // induction variable's final value instead of the original IV itself.
2738
2739 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2740
2741 // Step 3: Rewrite the loop into a countable form, with canonical IV.
2742
2743 // The new canonical induction variable.
2744 Builder.SetInsertPoint(&LoopHeaderBB->front());
2745 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2746
2747 // The induction itself.
2748 Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2749 auto *CIVNext =
2750 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2751 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2752
2753 // The loop trip count check.
2754 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2755 CurLoop->getName() + ".ivcheck");
2756 auto *NewIVCheck = CIVCheck;
2757 if (InvertedCond) {
2758 NewIVCheck = Builder.CreateNot(CIVCheck);
2759 NewIVCheck->takeName(ValShiftedIsZero);
2760 }
2761
2762 // The original IV, but rebased to be an offset to the CIV.
2763 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2764 /*HasNSW=*/true); // FIXME: what about NUW?
2765 IVDePHId->takeName(IV);
2766
2767 // The loop terminator.
2768 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2769 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2770 LoopHeaderBB->getTerminator()->eraseFromParent();
2771
2772 // Populate the IV PHI.
2773 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2774 CIV->addIncoming(CIVNext, LoopHeaderBB);
2775
2776 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2777 // loop. The loop would otherwise not be deleted even if it becomes empty.
2778
2779 SE->forgetLoop(CurLoop);
2780
2781 // Step 5: Try to cleanup the loop's body somewhat.
2782 IV->replaceAllUsesWith(IVDePHId);
2783 IV->eraseFromParent();
2784
2785 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2786 ValShiftedIsZero->eraseFromParent();
2787
2788 // Other passes will take care of actually deleting the loop if possible.
2789
2790 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2791
2792 ++NumShiftUntilZero;
2793 return MadeChange;
2794}