Bug Summary

File: src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Warning: line 4306, column 43
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
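
As context for the diagnostic: in C++ a shift count that is greater than or equal to the bit width of the (promoted) left operand is undefined behavior, so a 32-bit unsigned value shifted right by 32 has no defined result. Below is a minimal, hypothetical sketch of the kind of pattern this checker flags, plus a guarded alternative; the helper names are illustrative and are not taken from AMDGPUISelLowering.cpp.

    // Hypothetical illustration only, not code from the analyzed file.
    #include <cstdint>

    // Undefined when NumBits == 0: the shift count becomes 32, the full
    // width of the 32-bit unsigned operand.
    uint32_t highBitsUnsafe(uint32_t Val, unsigned NumBits) {
      return Val >> (32 - NumBits);
    }

    // Guarded variant: handle the zero-width case before shifting.
    uint32_t highBitsSafe(uint32_t Val, unsigned NumBits) {
      return NumBits == 0 ? 0 : Val >> (32 - NumBits);
    }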

Annotated Source Code


clang -cc1 -cc1 -triple amd64-unknown-openbsd7.0 -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -resource-dir /usr/local/lib/clang/13.0.0 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Analysis -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ASMParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/BinaryFormat -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitcode -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Bitstream -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /include/llvm/CodeGen -I /include/llvm/CodeGen/PBQP -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IR -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Coroutines -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData/Coverage -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/CodeView -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/DWARF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/MSF -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/PDB -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Demangle -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/JITLink -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ExecutionEngine/Orc -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenACC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Frontend/OpenMP -I /include/llvm/CodeGen/GlobalISel -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/IRReader -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/Transforms/InstCombine -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/LTO -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Linker -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/MC/MCParser -I /include/llvm/CodeGen/MIRParser -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Object -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Option -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Passes -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ProfileData -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Scalar -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/ADT -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/DebugInfo/Symbolize -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Target -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Utils -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/Vectorize -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include/llvm/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/X86 -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms -I /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Transforms/IPO -I 
/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include -I /usr/src/gnu/usr.bin/clang/libLLVM/../include -I /usr/src/gnu/usr.bin/clang/libLLVM/obj -I /usr/src/gnu/usr.bin/clang/libLLVM/obj/../include -D NDEBUG -D __STDC_LIMIT_MACROS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D LLVM_PREFIX="/usr" -D PIC -internal-isystem /usr/include/c++/v1 -internal-isystem /usr/local/lib/clang/13.0.0/include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/usr/src/gnu/usr.bin/clang/libLLVM/obj -ferror-limit 19 -fvisibility-inlines-hidden -fwrapv -D_RET_PROTECTOR -ret-protector -fno-rtti -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-valloc -fno-builtin-free -fno-builtin-strdup -fno-builtin-strndup -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /home/ben/Projects/vmm/scan-build/2022-01-12-194120-40624-1 -x c++ /usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "GCNSubtarget.h"
20#include "SIMachineFunctionInfo.h"
21#include "llvm/CodeGen/Analysis.h"
22#include "llvm/IR/DiagnosticInfo.h"
23#include "llvm/IR/IntrinsicsAMDGPU.h"
24#include "llvm/Support/CommandLine.h"
25#include "llvm/Support/KnownBits.h"
26#include "llvm/Target/TargetMachine.h"
27
28using namespace llvm;
29
30#include "AMDGPUGenCallingConv.inc"
31
32static cl::opt<bool> AMDGPUBypassSlowDiv(
33 "amdgpu-bypass-slow-div",
34 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
35 cl::init(true));
36
37// Find a larger type to do a load / store of a vector with.
38EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
39 unsigned StoreSize = VT.getStoreSizeInBits();
40 if (StoreSize <= 32)
41 return EVT::getIntegerVT(Ctx, StoreSize);
42
43 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
44 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
45}
46
47unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
48 EVT VT = Op.getValueType();
49 KnownBits Known = DAG.computeKnownBits(Op);
50 return VT.getSizeInBits() - Known.countMinLeadingZeros();
51}
52
53unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
54 EVT VT = Op.getValueType();
55
56 // In order for this to be a signed 24-bit value, bit 23 must
57 // be a sign bit.
58 return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
59}
60
61AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Lower floating point store/load to integer store/load to reduce the number
65 // of patterns in tablegen.
66 setOperationAction(ISD::LOAD, MVT::f32, Promote);
67 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
68
69 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
70 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
71
72 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
73 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
74
75 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
77
78 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
80
81 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
83
84 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
86
87 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
89
90 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
92
93 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
95
96 setOperationAction(ISD::LOAD, MVT::i64, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
98
99 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
101
102 setOperationAction(ISD::LOAD, MVT::f64, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
104
105 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
107
108 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
110
111 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
113
114 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
116
117 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
119
120 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
122
123 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
125
126 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
128
129 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
130 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
131
132 // There are no 64-bit extloads. These should be done as a 32-bit extload and
133 // an extension to 64-bit.
134 for (MVT VT : MVT::integer_valuetypes()) {
135 setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
136 setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
137 setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
138 }
139
140 for (MVT VT : MVT::integer_valuetypes()) {
141 if (VT == MVT::i64)
142 continue;
143
144 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
145 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
146 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
147 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
148
149 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
150 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
151 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
152 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
153
154 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
155 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
156 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
157 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
158 }
159
160 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
161 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
162 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
163 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
164 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
165 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
166 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
167 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
168 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
169 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
171 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
172 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
173 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
174 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
175 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
176 }
177
178 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
179 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
180 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
181 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
182 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
183 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
184 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
185
186 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
187 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
188 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
189 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
190 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
191 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
192
193 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
194 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
195 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
196 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
197 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
198 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
199
200 setOperationAction(ISD::STORE, MVT::f32, Promote);
201 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
202
203 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
204 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
205
206 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
207 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
208
209 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
210 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
211
212 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
213 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
214
215 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
216 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
217
218 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
219 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
220
221 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
222 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
223
224 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
225 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
226
227 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
228 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
229
230 setOperationAction(ISD::STORE, MVT::i64, Promote);
231 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
232
233 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
234 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
235
236 setOperationAction(ISD::STORE, MVT::f64, Promote);
237 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
238
239 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
240 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
241
242 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
243 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
244
245 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
246 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
247
248 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
249 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
250
251 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
252 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
253
254 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
255 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
256
257 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
258 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
259
260 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
261 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
262
263 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
264 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
265
266 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
267 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
268 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
269 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
270
271 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
272 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
273 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
274 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
275
276 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
277 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
278 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
279 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
280 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
281 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
282 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
283
284 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
285 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
286
287 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
288 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
289
290 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
291 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
292 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
293 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
294
295 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
296 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
297 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
298 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
299
300 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
301 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
302
303 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
304 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
305 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
306 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
307 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
308 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
309 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
310
311 setOperationAction(ISD::Constant, MVT::i32, Legal);
312 setOperationAction(ISD::Constant, MVT::i64, Legal);
313 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
314 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
315
316 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
317 setOperationAction(ISD::BRIND, MVT::Other, Expand);
318
319 // This is totally unsupported, just custom lower to produce an error.
320 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
321
322 // Library functions. These default to Expand, but we have instructions
323 // for them.
324 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
325 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
326 setOperationAction(ISD::FPOW, MVT::f32, Legal);
327 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
328 setOperationAction(ISD::FABS, MVT::f32, Legal);
329 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
330 setOperationAction(ISD::FRINT, MVT::f32, Legal);
331 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
332 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
333 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
334
335 setOperationAction(ISD::FROUND, MVT::f32, Custom);
336 setOperationAction(ISD::FROUND, MVT::f64, Custom);
337
338 setOperationAction(ISD::FLOG, MVT::f32, Custom);
339 setOperationAction(ISD::FLOG10, MVT::f32, Custom);
340 setOperationAction(ISD::FEXP, MVT::f32, Custom);
341
342
343 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
344 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
345
346 setOperationAction(ISD::FREM, MVT::f16, Custom);
347 setOperationAction(ISD::FREM, MVT::f32, Custom);
348 setOperationAction(ISD::FREM, MVT::f64, Custom);
349
350 // Expand to fneg + fadd.
351 setOperationAction(ISD::FSUB, MVT::f64, Expand);
352
353 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
354 setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
355 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
356 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
357 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
358 setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
359 setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
360 setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
361 setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
362 setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
363 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
364 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
365 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
366 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
367 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
368 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
369 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
370 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
371 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
372 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
373 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
374 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
375 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
376 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
377 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
378 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
379 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
380 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
381 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
382 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
383 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
384 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
385 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
386 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
387 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
388 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
389 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
390 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
391 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
392 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
393 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
394 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
395
396 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
397 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
398 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
399
400 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
401 for (MVT VT : ScalarIntVTs) {
402 // These should use [SU]DIVREM, so set them to expand
403 setOperationAction(ISD::SDIV, VT, Expand);
404 setOperationAction(ISD::UDIV, VT, Expand);
405 setOperationAction(ISD::SREM, VT, Expand);
406 setOperationAction(ISD::UREM, VT, Expand);
407
408 // GPU does not have divrem function for signed or unsigned.
409 setOperationAction(ISD::SDIVREM, VT, Custom);
410 setOperationAction(ISD::UDIVREM, VT, Custom);
411
412 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
413 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
414 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
415
416 setOperationAction(ISD::BSWAP, VT, Expand);
417 setOperationAction(ISD::CTTZ, VT, Expand);
418 setOperationAction(ISD::CTLZ, VT, Expand);
419
420 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
421 setOperationAction(ISD::ADDC, VT, Legal);
422 setOperationAction(ISD::SUBC, VT, Legal);
423 setOperationAction(ISD::ADDE, VT, Legal);
424 setOperationAction(ISD::SUBE, VT, Legal);
425 }
426
427 // The hardware supports 32-bit FSHR, but not FSHL.
428 setOperationAction(ISD::FSHR, MVT::i32, Legal);
429
430 // The hardware supports 32-bit ROTR, but not ROTL.
431 setOperationAction(ISD::ROTL, MVT::i32, Expand);
432 setOperationAction(ISD::ROTL, MVT::i64, Expand);
433 setOperationAction(ISD::ROTR, MVT::i64, Expand);
434
435 setOperationAction(ISD::MULHU, MVT::i16, Expand);
436 setOperationAction(ISD::MULHS, MVT::i16, Expand);
437
438 setOperationAction(ISD::MUL, MVT::i64, Expand);
439 setOperationAction(ISD::MULHU, MVT::i64, Expand);
440 setOperationAction(ISD::MULHS, MVT::i64, Expand);
441 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
442 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
443 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
444 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
445 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
446
447 setOperationAction(ISD::SMIN, MVT::i32, Legal);
448 setOperationAction(ISD::UMIN, MVT::i32, Legal);
449 setOperationAction(ISD::SMAX, MVT::i32, Legal);
450 setOperationAction(ISD::UMAX, MVT::i32, Legal);
451
452 setOperationAction(ISD::CTTZ, MVT::i64, Custom);
453 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
454 setOperationAction(ISD::CTLZ, MVT::i64, Custom);
455 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
456
457 static const MVT::SimpleValueType VectorIntTypes[] = {
458 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
459
460 for (MVT VT : VectorIntTypes) {
461 // Expand the following operations for the current type by default.
462 setOperationAction(ISD::ADD, VT, Expand);
463 setOperationAction(ISD::AND, VT, Expand);
464 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
465 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
466 setOperationAction(ISD::MUL, VT, Expand);
467 setOperationAction(ISD::MULHU, VT, Expand);
468 setOperationAction(ISD::MULHS, VT, Expand);
469 setOperationAction(ISD::OR, VT, Expand);
470 setOperationAction(ISD::SHL, VT, Expand);
471 setOperationAction(ISD::SRA, VT, Expand);
472 setOperationAction(ISD::SRL, VT, Expand);
473 setOperationAction(ISD::ROTL, VT, Expand);
474 setOperationAction(ISD::ROTR, VT, Expand);
475 setOperationAction(ISD::SUB, VT, Expand);
476 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
477 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
478 setOperationAction(ISD::SDIV, VT, Expand);
479 setOperationAction(ISD::UDIV, VT, Expand);
480 setOperationAction(ISD::SREM, VT, Expand);
481 setOperationAction(ISD::UREM, VT, Expand);
482 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
483 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
484 setOperationAction(ISD::SDIVREM, VT, Expand);
485 setOperationAction(ISD::UDIVREM, VT, Expand);
486 setOperationAction(ISD::SELECT, VT, Expand);
487 setOperationAction(ISD::VSELECT, VT, Expand);
488 setOperationAction(ISD::SELECT_CC, VT, Expand);
489 setOperationAction(ISD::XOR, VT, Expand);
490 setOperationAction(ISD::BSWAP, VT, Expand);
491 setOperationAction(ISD::CTPOP, VT, Expand);
492 setOperationAction(ISD::CTTZ, VT, Expand);
493 setOperationAction(ISD::CTLZ, VT, Expand);
494 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
495 setOperationAction(ISD::SETCC, VT, Expand);
496 }
497
498 static const MVT::SimpleValueType FloatVectorTypes[] = {
499 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
500
501 for (MVT VT : FloatVectorTypes) {
502 setOperationAction(ISD::FABS, VT, Expand);
503 setOperationAction(ISD::FMINNUM, VT, Expand);
504 setOperationAction(ISD::FMAXNUM, VT, Expand);
505 setOperationAction(ISD::FADD, VT, Expand);
506 setOperationAction(ISD::FCEIL, VT, Expand);
507 setOperationAction(ISD::FCOS, VT, Expand);
508 setOperationAction(ISD::FDIV, VT, Expand);
509 setOperationAction(ISD::FEXP2, VT, Expand);
510 setOperationAction(ISD::FEXP, VT, Expand);
511 setOperationAction(ISD::FLOG2, VT, Expand);
512 setOperationAction(ISD::FREM, VT, Expand);
513 setOperationAction(ISD::FLOG, VT, Expand);
514 setOperationAction(ISD::FLOG10, VT, Expand);
515 setOperationAction(ISD::FPOW, VT, Expand);
516 setOperationAction(ISD::FFLOOR, VT, Expand);
517 setOperationAction(ISD::FTRUNC, VT, Expand);
518 setOperationAction(ISD::FMUL, VT, Expand);
519 setOperationAction(ISD::FMA, VT, Expand);
520 setOperationAction(ISD::FRINT, VT, Expand);
521 setOperationAction(ISD::FNEARBYINT, VT, Expand);
522 setOperationAction(ISD::FSQRT, VT, Expand);
523 setOperationAction(ISD::FSIN, VT, Expand);
524 setOperationAction(ISD::FSUB, VT, Expand);
525 setOperationAction(ISD::FNEG, VT, Expand);
526 setOperationAction(ISD::VSELECT, VT, Expand);
527 setOperationAction(ISD::SELECT_CC, VT, Expand);
528 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
529 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
530 setOperationAction(ISD::SETCC, VT, Expand);
531 setOperationAction(ISD::FCANONICALIZE, VT, Expand);
532 }
533
534 // This causes using an unrolled select operation rather than expansion with
535 // bit operations. This is in general better, but the alternative using BFI
536 // instructions may be better if the select sources are SGPRs.
537 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
538 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
539
540 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
541 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
542
543 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
544 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
545
546 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
547 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
548
549 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
550 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
551
552 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
553 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
554
555 // There are no libcalls of any kind.
556 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
557 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
558
559 setSchedulingPreference(Sched::RegPressure);
560 setJumpIsExpensive(true);
561
562 // FIXME: This is only partially true. If we have to do vector compares, any
563 // SGPR pair can be a condition register. If we have a uniform condition, we
564 // are better off doing SALU operations, where there is only one SCC. For now,
565 // we don't have a way of knowing during instruction selection if a condition
566 // will be uniform and we always use vector compares. Assume we are using
567 // vector compares until that is fixed.
568 setHasMultipleConditionRegisters(true);
569
570 setMinCmpXchgSizeInBits(32);
571 setSupportsUnalignedAtomics(false);
572
573 PredictableSelectIsExpensive = false;
574
575 // We want to find all load dependencies for long chains of stores to enable
576 // merging into very wide vectors. The problem is with vectors with > 4
577 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
578 // vectors are a legal type, even though we have to split the loads
579 // usually. When we can more precisely specify load legality per address
580 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
581 // smarter so that they can figure out what to do in 2 iterations without all
582 // N > 4 stores on the same chain.
583 GatherAllAliasesMaxDepth = 16;
584
585 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
586 // about these during lowering.
587 MaxStoresPerMemcpy = 0xffffffff;
588 MaxStoresPerMemmove = 0xffffffff;
589 MaxStoresPerMemset = 0xffffffff;
590
591 // The expansion for 64-bit division is enormous.
592 if (AMDGPUBypassSlowDiv)
593 addBypassSlowDiv(64, 32);
594
595 setTargetDAGCombine(ISD::BITCAST);
596 setTargetDAGCombine(ISD::SHL);
597 setTargetDAGCombine(ISD::SRA);
598 setTargetDAGCombine(ISD::SRL);
599 setTargetDAGCombine(ISD::TRUNCATE);
600 setTargetDAGCombine(ISD::MUL);
601 setTargetDAGCombine(ISD::MULHU);
602 setTargetDAGCombine(ISD::MULHS);
603 setTargetDAGCombine(ISD::SELECT);
604 setTargetDAGCombine(ISD::SELECT_CC);
605 setTargetDAGCombine(ISD::STORE);
606 setTargetDAGCombine(ISD::FADD);
607 setTargetDAGCombine(ISD::FSUB);
608 setTargetDAGCombine(ISD::FNEG);
609 setTargetDAGCombine(ISD::FABS);
610 setTargetDAGCombine(ISD::AssertZext);
611 setTargetDAGCombine(ISD::AssertSext);
612 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
613}
614
615bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
616 if (getTargetMachine().Options.NoSignedZerosFPMath)
617 return true;
618
619 const auto Flags = Op.getNode()->getFlags();
620 if (Flags.hasNoSignedZeros())
621 return true;
622
623 return false;
624}
625
626//===----------------------------------------------------------------------===//
627// Target Information
628//===----------------------------------------------------------------------===//
629
630LLVM_READNONE
631static bool fnegFoldsIntoOp(unsigned Opc) {
632 switch (Opc) {
633 case ISD::FADD:
634 case ISD::FSUB:
635 case ISD::FMUL:
636 case ISD::FMA:
637 case ISD::FMAD:
638 case ISD::FMINNUM:
639 case ISD::FMAXNUM:
640 case ISD::FMINNUM_IEEE:
641 case ISD::FMAXNUM_IEEE:
642 case ISD::FSIN:
643 case ISD::FTRUNC:
644 case ISD::FRINT:
645 case ISD::FNEARBYINT:
646 case ISD::FCANONICALIZE:
647 case AMDGPUISD::RCP:
648 case AMDGPUISD::RCP_LEGACY:
649 case AMDGPUISD::RCP_IFLAG:
650 case AMDGPUISD::SIN_HW:
651 case AMDGPUISD::FMUL_LEGACY:
652 case AMDGPUISD::FMIN_LEGACY:
653 case AMDGPUISD::FMAX_LEGACY:
654 case AMDGPUISD::FMED3:
655 // TODO: handle llvm.amdgcn.fma.legacy
656 return true;
657 default:
658 return false;
659 }
660}
661
662/// \p returns true if the operation will definitely need to use a 64-bit
663/// encoding, and thus will use a VOP3 encoding regardless of the source
664/// modifiers.
665LLVM_READONLY
666static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
667 return N->getNumOperands() > 2 || VT == MVT::f64;
668}
669
670// Most FP instructions support source modifiers, but this could be refined
671// slightly.
672LLVM_READONLY
673static bool hasSourceMods(const SDNode *N) {
674 if (isa<MemSDNode>(N))
675 return false;
676
677 switch (N->getOpcode()) {
678 case ISD::CopyToReg:
679 case ISD::SELECT:
680 case ISD::FDIV:
681 case ISD::FREM:
682 case ISD::INLINEASM:
683 case ISD::INLINEASM_BR:
684 case AMDGPUISD::DIV_SCALE:
685 case ISD::INTRINSIC_W_CHAIN:
686
687 // TODO: Should really be looking at the users of the bitcast. These are
688 // problematic because bitcasts are used to legalize all stores to integer
689 // types.
690 case ISD::BITCAST:
691 return false;
692 case ISD::INTRINSIC_WO_CHAIN: {
693 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
694 case Intrinsic::amdgcn_interp_p1:
695 case Intrinsic::amdgcn_interp_p2:
696 case Intrinsic::amdgcn_interp_mov:
697 case Intrinsic::amdgcn_interp_p1_f16:
698 case Intrinsic::amdgcn_interp_p2_f16:
699 return false;
700 default:
701 return true;
702 }
703 }
704 default:
705 return true;
706 }
707}
708
709bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
710 unsigned CostThreshold) {
711 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
712 // it is truly free to use a source modifier in all cases. If there are
713 // multiple users but for each one will necessitate using VOP3, there will be
714 // a code size increase. Try to avoid increasing code size unless we know it
715 // will save on the instruction count.
716 unsigned NumMayIncreaseSize = 0;
717 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
718
719 // XXX - Should this limit number of uses to check?
720 for (const SDNode *U : N->uses()) {
721 if (!hasSourceMods(U))
722 return false;
723
724 if (!opMustUseVOP3Encoding(U, VT)) {
725 if (++NumMayIncreaseSize > CostThreshold)
726 return false;
727 }
728 }
729
730 return true;
731}
732
733EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
734 ISD::NodeType ExtendKind) const {
735 assert(!VT.isVector() && "only scalar expected");
736
737 // Round to the next multiple of 32-bits.
738 unsigned Size = VT.getSizeInBits();
739 if (Size <= 32)
740 return MVT::i32;
741 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
742}
743
744MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
745 return MVT::i32;
746}
747
748bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
749 return true;
750}
751
752// The backend supports 32 and 64 bit floating point immediates.
753// FIXME: Why are we reporting vectors of FP immediates as legal?
754bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
755 bool ForCodeSize) const {
756 EVT ScalarVT = VT.getScalarType();
757 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
758 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
759}
760
761// We don't want to shrink f64 / f32 constants.
762bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
763 EVT ScalarVT = VT.getScalarType();
764 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
765}
766
767bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
768 ISD::LoadExtType ExtTy,
769 EVT NewVT) const {
770 // TODO: This may be worth removing. Check regression tests for diffs.
771 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
772 return false;
773
774 unsigned NewSize = NewVT.getStoreSizeInBits();
775
776 // If we are reducing to a 32-bit load or a smaller multi-dword load,
777 // this is always better.
778 if (NewSize >= 32)
779 return true;
780
781 EVT OldVT = N->getValueType(0);
782 unsigned OldSize = OldVT.getStoreSizeInBits();
783
784 MemSDNode *MN = cast<MemSDNode>(N);
785 unsigned AS = MN->getAddressSpace();
786 // Do not shrink an aligned scalar load to sub-dword.
787 // Scalar engine cannot do sub-dword loads.
788 if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
789 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
790 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
791 (isa<LoadSDNode>(N) &&
792 AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
793 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
794 return false;
795
796 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
797 // extloads, so doing one requires using a buffer_load. In cases where we
798 // still couldn't use a scalar load, using the wider load shouldn't really
799 // hurt anything.
800
801 // If the old size already had to be an extload, there's no harm in continuing
802 // to reduce the width.
803 return (OldSize < 32);
804}
805
806bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
807 const SelectionDAG &DAG,
808 const MachineMemOperand &MMO) const {
809
810 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
811
812 if (LoadTy.getScalarType() == MVT::i32)
813 return false;
814
815 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
816 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
817
818 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
819 return false;
820
821 bool Fast = false;
822 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
823 CastTy, MMO, &Fast) &&
824 Fast;
825}
826
827// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
828// profitable with the expansion for 64-bit since it's generally good to
829// speculate things.
830// FIXME: These should really have the size as a parameter.
831bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
832 return true;
833}
834
835bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
836 return true;
837}
838
839bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
840 switch (N->getOpcode()) {
841 case ISD::EntryToken:
842 case ISD::TokenFactor:
843 return true;
844 case ISD::INTRINSIC_WO_CHAIN: {
845 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
846 switch (IntrID) {
847 case Intrinsic::amdgcn_readfirstlane:
848 case Intrinsic::amdgcn_readlane:
849 return true;
850 }
851 return false;
852 }
853 case ISD::LOAD:
854 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
855 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
856 return true;
857 return false;
858 }
859 return false;
860}
861
862SDValue AMDGPUTargetLowering::getNegatedExpression(
863 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
864 NegatibleCost &Cost, unsigned Depth) const {
865
866 switch (Op.getOpcode()) {
867 case ISD::FMA:
868 case ISD::FMAD: {
869 // Negating a fma is not free if it has users without source mods.
870 if (!allUsesHaveSourceMods(Op.getNode()))
871 return SDValue();
872 break;
873 }
874 default:
875 break;
876 }
877
878 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
879 ForCodeSize, Cost, Depth);
880}
881
882//===---------------------------------------------------------------------===//
883// Target Properties
884//===---------------------------------------------------------------------===//
885
886bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
887 assert(VT.isFloatingPoint());
888
889 // Packed operations do not have a fabs modifier.
890 return VT == MVT::f32 || VT == MVT::f64 ||
891 (Subtarget->has16BitInsts() && VT == MVT::f16);
892}
893
894bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
895 assert(VT.isFloatingPoint());
896 // Report this based on the end legalized type.
897 VT = VT.getScalarType();
898 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
899}
900
901bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
902 unsigned NumElem,
903 unsigned AS) const {
904 return true;
905}
906
907bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
908 // There are few operations which truly have vector input operands. Any vector
909 // operation is going to involve operations on each component, and a
910 // build_vector will be a copy per element, so it always makes sense to use a
911 // build_vector input in place of the extracted element to avoid a copy into a
912 // super register.
913 //
914 // We should probably only do this if all users are extracts only, but this
915 // should be the common case.
916 return true;
917}
918
919bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
920 // Truncate is just accessing a subregister.
921
922 unsigned SrcSize = Source.getSizeInBits();
923 unsigned DestSize = Dest.getSizeInBits();
924
925 return DestSize < SrcSize && DestSize % 32 == 0;
926}
927
928bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
929 // Truncate is just accessing a subregister.
930
931 unsigned SrcSize = Source->getScalarSizeInBits();
932 unsigned DestSize = Dest->getScalarSizeInBits();
933
934 if (DestSize == 16 && Subtarget->has16BitInsts())
935 return SrcSize >= 32;
936
937 return DestSize < SrcSize && DestSize % 32 == 0;
938}
939
940bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
941 unsigned SrcSize = Src->getScalarSizeInBits();
942 unsigned DestSize = Dest->getScalarSizeInBits();
943
944 if (SrcSize == 16 && Subtarget->has16BitInsts())
945 return DestSize >= 32;
946
947 return SrcSize == 32 && DestSize == 64;
948}
949
950bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
951 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
952 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
953 // this will enable reducing 64-bit operations to 32-bit, which is always
954 // good.
955
956 if (Src == MVT::i16)
957 return Dest == MVT::i32 || Dest == MVT::i64;
958
959 return Src == MVT::i32 && Dest == MVT::i64;
960}
961
962bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
963 return isZExtFree(Val.getValueType(), VT2);
964}
965
966bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
967 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
968 // limited number of native 64-bit operations. Shrinking an operation to fit
969 // in a single 32-bit register should always be helpful. As currently used,
970 // this is much less general than the name suggests, and is only used in
971 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
972 // not profitable, and may actually be harmful.
973 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
974}
975
976//===---------------------------------------------------------------------===//
977// TargetLowering Callbacks
978//===---------------------------------------------------------------------===//
979
980CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
981 bool IsVarArg) {
982 switch (CC) {
983 case CallingConv::AMDGPU_VS:
984 case CallingConv::AMDGPU_GS:
985 case CallingConv::AMDGPU_PS:
986 case CallingConv::AMDGPU_CS:
987 case CallingConv::AMDGPU_HS:
988 case CallingConv::AMDGPU_ES:
989 case CallingConv::AMDGPU_LS:
990 return CC_AMDGPU;
991 case CallingConv::C:
992 case CallingConv::Fast:
993 case CallingConv::Cold:
994 return CC_AMDGPU_Func;
995 case CallingConv::AMDGPU_Gfx:
996 return CC_SI_Gfx;
997 case CallingConv::AMDGPU_KERNEL:
998 case CallingConv::SPIR_KERNEL:
999 default:
1000 report_fatal_error("Unsupported calling convention for call");
1001 }
1002}
1003
1004CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1005 bool IsVarArg) {
1006 switch (CC) {
1007 case CallingConv::AMDGPU_KERNEL:
1008 case CallingConv::SPIR_KERNEL:
1009 llvm_unreachable("kernels should not be handled here");
1010 case CallingConv::AMDGPU_VS:
1011 case CallingConv::AMDGPU_GS:
1012 case CallingConv::AMDGPU_PS:
1013 case CallingConv::AMDGPU_CS:
1014 case CallingConv::AMDGPU_HS:
1015 case CallingConv::AMDGPU_ES:
1016 case CallingConv::AMDGPU_LS:
1017 return RetCC_SI_Shader;
1018 case CallingConv::AMDGPU_Gfx:
1019 return RetCC_SI_Gfx;
1020 case CallingConv::C:
1021 case CallingConv::Fast:
1022 case CallingConv::Cold:
1023 return RetCC_AMDGPU_Func;
1024 default:
1025 report_fatal_error("Unsupported calling convention.");
1026 }
1027}
1028
1029/// The SelectionDAGBuilder will automatically promote function arguments
1030/// with illegal types. However, this does not work for the AMDGPU targets
1031/// since the function arguments are stored in memory as these illegal types.
1032/// In order to handle this properly we need to get the original types sizes
1033/// from the LLVM IR Function and fixup the ISD:InputArg values before
1034/// passing them to AnalyzeFormalArguments()
1035
1036/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1037/// input values across multiple registers. Each item in the Ins array
1038/// represents a single value that will be stored in registers. Ins[x].VT is
1039/// the value type of the value that will be stored in the register, so
1040/// whatever SDNode we lower the argument to needs to be this type.
1041///
1042/// In order to correctly lower the arguments we need to know the size of each
1043/// argument. Since Ins[x].VT gives us the size of the register that will
1044/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1045/// for the original function argument so that we can deduce the correct memory
1046/// type to use for Ins[x]. In most cases the correct memory type will be
1047/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1048/// we have a kernel argument of type v8i8, this argument will be split into
1049/// 8 parts and each part will be represented by its own item in the Ins array.
1050/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1051/// the argument before it was split. From this, we deduce that the memory type
1052/// for each individual part is i8. We pass the memory type as LocVT to the
1053/// calling convention analysis function and the register type (Ins[x].VT) as
1054/// the ValVT.
1055void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1056 CCState &State,
1057 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1058 const MachineFunction &MF = State.getMachineFunction();
1059 const Function &Fn = MF.getFunction();
1060 LLVMContext &Ctx = Fn.getParent()->getContext();
1061 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1062 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1063 CallingConv::ID CC = Fn.getCallingConv();
1064
1065 Align MaxAlign = Align(1);
1066 uint64_t ExplicitArgOffset = 0;
1067 const DataLayout &DL = Fn.getParent()->getDataLayout();
1068
1069 unsigned InIndex = 0;
1070
1071 for (const Argument &Arg : Fn.args()) {
1072 const bool IsByRef = Arg.hasByRefAttr();
1073 Type *BaseArgTy = Arg.getType();
1074 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1075 MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
1076 if (!Alignment)
1077 Alignment = DL.getABITypeAlign(MemArgTy);
1078 MaxAlign = max(Alignment, MaxAlign);
1079 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1080
1081 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1082 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1083
1084 // We're basically throwing away everything passed into us and starting over
1085 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1086 // to us as computed in Ins.
1087 //
1088 // We also need to figure out what type legalization is trying to do to get
1089 // the correct memory offsets.
1090
1091 SmallVector<EVT, 16> ValueVTs;
1092 SmallVector<uint64_t, 16> Offsets;
1093 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1094
1095 for (unsigned Value = 0, NumValues = ValueVTs.size();
1096 Value != NumValues; ++Value) {
1097 uint64_t BasePartOffset = Offsets[Value];
1098
1099 EVT ArgVT = ValueVTs[Value];
1100 EVT MemVT = ArgVT;
1101 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1102 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1103
1104 if (NumRegs == 1) {
1105 // This argument is not split, so the IR type is the memory type.
1106 if (ArgVT.isExtended()) {
1107 // We have an extended type, like i24, so we should just use the
1108 // register type.
1109 MemVT = RegisterVT;
1110 } else {
1111 MemVT = ArgVT;
1112 }
1113 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1114 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1115 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1116 // We have a vector value which has been split into a vector with
1117 // the same scalar type, but fewer elements. This should handle
1118 // all the floating-point vector types.
1119 MemVT = RegisterVT;
1120 } else if (ArgVT.isVector() &&
1121 ArgVT.getVectorNumElements() == NumRegs) {
1122 // This arg has been split so that each element is stored in a separate
1123 // register.
1124 MemVT = ArgVT.getScalarType();
1125 } else if (ArgVT.isExtended()) {
1126 // We have an extended type, like i65.
1127 MemVT = RegisterVT;
1128 } else {
1129 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1130 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1131 if (RegisterVT.isInteger()) {
1132 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1133 } else if (RegisterVT.isVector()) {
1134 assert(!RegisterVT.getScalarType().isFloatingPoint());
1135 unsigned NumElements = RegisterVT.getVectorNumElements();
1136 assert(MemoryBits % NumElements == 0);
1137 // This vector type has been split into another vector type with
1138 // a different elements size.
1139 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1140 MemoryBits / NumElements);
1141 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1142 } else {
1143 llvm_unreachable("cannot deduce memory type.");
1144 }
1145 }
1146
1147 // Convert one element vectors to scalar.
1148 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1149 MemVT = MemVT.getScalarType();
1150
1151 // Round up vec3/vec5 argument.
1152 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1153 assert(MemVT.getVectorNumElements() == 3 ||
1154 MemVT.getVectorNumElements() == 5);
1155 MemVT = MemVT.getPow2VectorType(State.getContext());
1156 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1157 MemVT = MemVT.getRoundIntegerType(State.getContext());
1158 }
1159
1160 unsigned PartOffset = 0;
1161 for (unsigned i = 0; i != NumRegs; ++i) {
1162 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1163 BasePartOffset + PartOffset,
1164 MemVT.getSimpleVT(),
1165 CCValAssign::Full));
1166 PartOffset += MemVT.getStoreSize();
1167 }
1168 }
1169 }
1170}
1171
1172SDValue AMDGPUTargetLowering::LowerReturn(
1173 SDValue Chain, CallingConv::ID CallConv,
1174 bool isVarArg,
1175 const SmallVectorImpl<ISD::OutputArg> &Outs,
1176 const SmallVectorImpl<SDValue> &OutVals,
1177 const SDLoc &DL, SelectionDAG &DAG) const {
1178 // FIXME: Fails for r600 tests
1179 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1180 // "wave terminate should not have return values");
1181 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1182}
1183
1184//===---------------------------------------------------------------------===//
1185// Target specific lowering
1186//===---------------------------------------------------------------------===//
1187
1188/// Selects the correct CCAssignFn for a given CallingConvention value.
1189CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1190 bool IsVarArg) {
1191 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1192}
1193
1194CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1195 bool IsVarArg) {
1196 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1197}
1198
1199SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1200 SelectionDAG &DAG,
1201 MachineFrameInfo &MFI,
1202 int ClobberedFI) const {
1203 SmallVector<SDValue, 8> ArgChains;
1204 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1205 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1206
1207 // Include the original chain at the beginning of the list. When this is
1208 // used by target LowerCall hooks, this helps legalize find the
1209 // CALLSEQ_BEGIN node.
1210 ArgChains.push_back(Chain);
1211
1212 // Add a chain value for each stack argument corresponding to the clobbered frame object.
1213 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1214 UE = DAG.getEntryNode().getNode()->use_end();
1215 U != UE; ++U) {
1216 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1217 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1218 if (FI->getIndex() < 0) {
1219 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1220 int64_t InLastByte = InFirstByte;
1221 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1222
1223 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1224 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1225 ArgChains.push_back(SDValue(L, 1));
1226 }
1227 }
1228 }
1229 }
1230
1231 // Build a tokenfactor for all the chains.
1232 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1233}
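
The inner condition above is a plain closed-interval overlap test between the clobbered frame object and each in-flight argument load. A minimal standalone sketch of just that test (the helper name rangesOverlap is made up for illustration; the real code additionally restricts itself to fixed stack objects, i.e. FI->getIndex() < 0):

#include <cstdint>

// Hypothetical helper: true when [FirstByte, LastByte] and
// [InFirstByte, InLastByte] share at least one byte.
static bool rangesOverlap(int64_t FirstByte, int64_t LastByte,
                          int64_t InFirstByte, int64_t InLastByte) {
  return (InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
         (FirstByte <= InFirstByte && InFirstByte <= LastByte);
}
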
1234
1235SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1236 SmallVectorImpl<SDValue> &InVals,
1237 StringRef Reason) const {
1238 SDValue Callee = CLI.Callee;
1239 SelectionDAG &DAG = CLI.DAG;
1240
1241 const Function &Fn = DAG.getMachineFunction().getFunction();
1242
1243 StringRef FuncName("<unknown>");
1244
1245 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1246 FuncName = G->getSymbol();
1247 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1248 FuncName = G->getGlobal()->getName();
1249
1250 DiagnosticInfoUnsupported NoCalls(
1251 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1252 DAG.getContext()->diagnose(NoCalls);
1253
1254 if (!CLI.IsTailCall) {
1255 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1256 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1257 }
1258
1259 return DAG.getEntryNode();
1260}
1261
1262SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1263 SmallVectorImpl<SDValue> &InVals) const {
1264 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1265}
1266
1267SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1268 SelectionDAG &DAG) const {
1269 const Function &Fn = DAG.getMachineFunction().getFunction();
1270
1271 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1272 SDLoc(Op).getDebugLoc());
1273 DAG.getContext()->diagnose(NoDynamicAlloca);
1274 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1275 return DAG.getMergeValues(Ops, SDLoc());
1276}
1277
1278SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1279 SelectionDAG &DAG) const {
1280 switch (Op.getOpcode()) {
1281 default:
1282 Op->print(errs(), &DAG);
1283 llvm_unreachable("Custom lowering code for this "
1284 "instruction is not implemented yet!");
1285 break;
1286 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1287 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1288 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1289 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1290 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1291 case ISD::FREM: return LowerFREM(Op, DAG);
1292 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1293 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1294 case ISD::FRINT: return LowerFRINT(Op, DAG);
1295 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1296 case ISD::FROUND: return LowerFROUND(Op, DAG);
1297 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1298 case ISD::FLOG:
1299 return LowerFLOG(Op, DAG, numbers::ln2f);
1300 case ISD::FLOG10:
1301 return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1302 case ISD::FEXP:
1303 return lowerFEXP(Op, DAG);
1304 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1305 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1306 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1307 case ISD::FP_TO_SINT:
1308 case ISD::FP_TO_UINT:
1309 return LowerFP_TO_INT(Op, DAG);
1310 case ISD::CTTZ:
1311 case ISD::CTTZ_ZERO_UNDEF:
1312 case ISD::CTLZ:
1313 case ISD::CTLZ_ZERO_UNDEF:
1314 return LowerCTLZ_CTTZ(Op, DAG);
1315 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1316 }
1317 return Op;
1318}
1319
1320void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1321 SmallVectorImpl<SDValue> &Results,
1322 SelectionDAG &DAG) const {
1323 switch (N->getOpcode()) {
1324 case ISD::SIGN_EXTEND_INREG:
1325 // Different parts of legalization seem to interpret which type of
1326 // sign_extend_inreg is the one to check for custom lowering. The extended
1327 // from type is what really matters, but some places check for custom
1328 // lowering of the result type. This results in trying to use
1329 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1330 // nothing here and let the illegal result integer be handled normally.
1331 return;
1332 default:
1333 return;
1334 }
1335}
1336
1337bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
1338 const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1339 if (!GVar || !GVar->hasInitializer())
1340 return false;
1341
1342 return !isa<UndefValue>(GVar->getInitializer());
1343}
1344
1345SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1346 SDValue Op,
1347 SelectionDAG &DAG) const {
1348
1349 const DataLayout &DL = DAG.getDataLayout();
1350 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1351 const GlobalValue *GV = G->getGlobal();
1352
1353 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1354 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1355 if (!MFI->isModuleEntryFunction() &&
1356 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1357 SDLoc DL(Op);
1358 const Function &Fn = DAG.getMachineFunction().getFunction();
1359 DiagnosticInfoUnsupported BadLDSDecl(
1360 Fn, "local memory global used by non-kernel function",
1361 DL.getDebugLoc(), DS_Warning);
1362 DAG.getContext()->diagnose(BadLDSDecl);
1363
1364 // We currently don't have a way to correctly allocate LDS objects that
1365 // aren't directly associated with a kernel. We do force inlining of
1366 // functions that use local objects. However, if these dead functions are
1367 // not eliminated, we don't want a compile time error. Just emit a warning
1368 // and a trap, since there should be no callable path here.
1369 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1370 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1371 Trap, DAG.getRoot());
1372 DAG.setRoot(OutputChain);
1373 return DAG.getUNDEF(Op.getValueType());
1374 }
1375
1376 // XXX: What does the value of G->getOffset() mean?
1377 assert(G->getOffset() == 0 &&
1378 "Do not know what to do with a non-zero offset");
1379
1380 // TODO: We could emit code to handle the initialization somewhere.
1381 if (!hasDefinedInitializer(GV)) {
1382 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1383 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1384 }
1385 }
1386
1387 const Function &Fn = DAG.getMachineFunction().getFunction();
1388 DiagnosticInfoUnsupported BadInit(
1389 Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1390 DAG.getContext()->diagnose(BadInit);
1391 return SDValue();
1392}
1393
1394SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1395 SelectionDAG &DAG) const {
1396 SmallVector<SDValue, 8> Args;
1397
1398 EVT VT = Op.getValueType();
1399 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1400 SDLoc SL(Op);
1401 SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1402 SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1403
1404 SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1405 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1406 }
1407
1408 for (const SDUse &U : Op->ops())
1409 DAG.ExtractVectorElements(U.get(), Args);
1410
1411 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1412}
1413
1414SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1415 SelectionDAG &DAG) const {
1416
1417 SmallVector<SDValue, 8> Args;
1418 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1419 EVT VT = Op.getValueType();
1420 EVT SrcVT = Op.getOperand(0).getValueType();
1421
1422 // For these types, we have some TableGen patterns except if the index is 1
1423 if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1424 (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1425 Start != 1)
1426 return Op;
1427
1428 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1429 VT.getVectorNumElements());
1430
1431 return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1432}
1433
1434/// Generate Min/Max node
1435SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1436 SDValue LHS, SDValue RHS,
1437 SDValue True, SDValue False,
1438 SDValue CC,
1439 DAGCombinerInfo &DCI) const {
1440 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1441 return SDValue();
1442
1443 SelectionDAG &DAG = DCI.DAG;
1444 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1445 switch (CCOpcode) {
1446 case ISD::SETOEQ:
1447 case ISD::SETONE:
1448 case ISD::SETUNE:
1449 case ISD::SETNE:
1450 case ISD::SETUEQ:
1451 case ISD::SETEQ:
1452 case ISD::SETFALSE:
1453 case ISD::SETFALSE2:
1454 case ISD::SETTRUE:
1455 case ISD::SETTRUE2:
1456 case ISD::SETUO:
1457 case ISD::SETO:
1458 break;
1459 case ISD::SETULE:
1460 case ISD::SETULT: {
1461 if (LHS == True)
1462 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1463 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1464 }
1465 case ISD::SETOLE:
1466 case ISD::SETOLT:
1467 case ISD::SETLE:
1468 case ISD::SETLT: {
1469 // Ordered. Assume ordered for undefined.
1470
1471 // Only do this after legalization to avoid interfering with other combines
1472 // which might occur.
1473 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1474 !DCI.isCalledByLegalizer())
1475 return SDValue();
1476
1477 // We need to permute the operands to get the correct NaN behavior. The
1478 // selected operand is the second one based on the failing compare with NaN,
1479 // so permute it based on the compare type the hardware uses.
1480 if (LHS == True)
1481 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1482 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1483 }
1484 case ISD::SETUGE:
1485 case ISD::SETUGT: {
1486 if (LHS == True)
1487 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1488 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1489 }
1490 case ISD::SETGT:
1491 case ISD::SETGE:
1492 case ISD::SETOGE:
1493 case ISD::SETOGT: {
1494 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1495 !DCI.isCalledByLegalizer())
1496 return SDValue();
1497
1498 if (LHS == True)
1499 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1500 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1501 }
1502 case ISD::SETCC_INVALID:
1503 llvm_unreachable("Invalid setcc condcode!");
1504 }
1505 return SDValue();
1506}
1507
1508std::pair<SDValue, SDValue>
1509AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1510 SDLoc SL(Op);
1511
1512 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1513
1514 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1515 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1516
1517 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1518 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1519
1520 return std::make_pair(Lo, Hi);
1521}
1522
1523SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1524 SDLoc SL(Op);
1525
1526 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1527 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1528 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1529}
1530
1531SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1532 SDLoc SL(Op);
1533
1534 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1535 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1536 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1537}
1538
1539// Split a vector type into two parts. The first part is a power of two vector.
1540// The second part is whatever is left over, and is a scalar if it would
1541// otherwise be a 1-vector.
1542std::pair<EVT, EVT>
1543AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1544 EVT LoVT, HiVT;
1545 EVT EltVT = VT.getVectorElementType();
1546 unsigned NumElts = VT.getVectorNumElements();
1547 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1548 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1549 HiVT = NumElts - LoNumElts == 1
1550 ? EltVT
1551 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1552 return std::make_pair(LoVT, HiVT);
1553}
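
A minimal scalar sketch of the element-count split performed above; the helper name splitCounts is made up, and a high count of 1 corresponds to the scalar (non-vector) high part:

#include <utility>

// Hypothetical helper mirroring getSplitDestVTs: returns {LoNumElts, HiNumElts}.
static std::pair<unsigned, unsigned> splitCounts(unsigned NumElts) {
  unsigned Lo = 1;
  while (Lo < (NumElts + 1) / 2)      // PowerOf2Ceil((NumElts + 1) / 2)
    Lo <<= 1;
  return {Lo, NumElts - Lo};
}
// Examples: 3 -> {2, 1}, 5 -> {4, 1}, 6 -> {4, 2}, 8 -> {4, 4}.
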
1554
1555// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1556// scalar.
1557std::pair<SDValue, SDValue>
1558AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1559 const EVT &LoVT, const EVT &HiVT,
1560 SelectionDAG &DAG) const {
1561 assert(LoVT.getVectorNumElements() +
1562 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1563 N.getValueType().getVectorNumElements() &&
1564 "More vector elements requested than available!");
1565 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1566 DAG.getVectorIdxConstant(0, DL));
1567 SDValue Hi = DAG.getNode(
1568 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1569 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1570 return std::make_pair(Lo, Hi);
1571}
1572
1573SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1574 SelectionDAG &DAG) const {
1575 LoadSDNode *Load = cast<LoadSDNode>(Op);
1576 EVT VT = Op.getValueType();
1577 SDLoc SL(Op);
1578
1579
1580 // If this is a 2 element vector, we really want to scalarize and not create
1581 // weird 1 element vectors.
1582 if (VT.getVectorNumElements() == 2) {
1583 SDValue Ops[2];
1584 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1585 return DAG.getMergeValues(Ops, SL);
1586 }
1587
1588 SDValue BasePtr = Load->getBasePtr();
1589 EVT MemVT = Load->getMemoryVT();
1590
1591 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1592
1593 EVT LoVT, HiVT;
1594 EVT LoMemVT, HiMemVT;
1595 SDValue Lo, Hi;
1596
1597 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1598 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1599 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1600
1601 unsigned Size = LoMemVT.getStoreSize();
1602 unsigned BaseAlign = Load->getAlignment();
1603 unsigned HiAlign = MinAlign(BaseAlign, Size);
1604
1605 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1606 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1607 BaseAlign, Load->getMemOperand()->getFlags());
1608 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1609 SDValue HiLoad =
1610 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1611 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1612 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1613
1614 SDValue Join;
1615 if (LoVT == HiVT) {
1616 // This is the case that the vector is power of two so was evenly split.
1617 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1618 } else {
1619 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1620 DAG.getVectorIdxConstant(0, SL));
1621 Join = DAG.getNode(
1622 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1623 VT, Join, HiLoad,
1624 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1625 }
1626
1627 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1628 LoLoad.getValue(1), HiLoad.getValue(1))};
1629
1630 return DAG.getMergeValues(Ops, SL);
1631}
1632
1633SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1634 SelectionDAG &DAG) const {
1635 LoadSDNode *Load = cast<LoadSDNode>(Op);
1636 EVT VT = Op.getValueType();
1637 SDValue BasePtr = Load->getBasePtr();
1638 EVT MemVT = Load->getMemoryVT();
1639 SDLoc SL(Op);
1640 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1641 unsigned BaseAlign = Load->getAlignment();
1642 unsigned NumElements = MemVT.getVectorNumElements();
1643
1644 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1645 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1646 if (NumElements != 3 ||
1647 (BaseAlign < 8 &&
1648 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1649 return SplitVectorLoad(Op, DAG);
1650
1651 assert(NumElements == 3);
1652
1653 EVT WideVT =
1654 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1655 EVT WideMemVT =
1656 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1657 SDValue WideLoad = DAG.getExtLoad(
1658 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1659 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1660 return DAG.getMergeValues(
1661 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1662 DAG.getVectorIdxConstant(0, SL)),
1663 WideLoad.getValue(1)},
1664 SL);
1665}
1666
1667SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1668 SelectionDAG &DAG) const {
1669 StoreSDNode *Store = cast<StoreSDNode>(Op);
1670 SDValue Val = Store->getValue();
1671 EVT VT = Val.getValueType();
1672
1673 // If this is a 2 element vector, we really want to scalarize and not create
1674 // weird 1 element vectors.
1675 if (VT.getVectorNumElements() == 2)
1676 return scalarizeVectorStore(Store, DAG);
1677
1678 EVT MemVT = Store->getMemoryVT();
1679 SDValue Chain = Store->getChain();
1680 SDValue BasePtr = Store->getBasePtr();
1681 SDLoc SL(Op);
1682
1683 EVT LoVT, HiVT;
1684 EVT LoMemVT, HiMemVT;
1685 SDValue Lo, Hi;
1686
1687 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1688 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1689 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1690
1691 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1692
1693 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1694 unsigned BaseAlign = Store->getAlignment();
1695 unsigned Size = LoMemVT.getStoreSize();
1696 unsigned HiAlign = MinAlign(BaseAlign, Size);
1697
1698 SDValue LoStore =
1699 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1700 Store->getMemOperand()->getFlags());
1701 SDValue HiStore =
1702 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1703 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1704
1705 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1706}
1707
1708// This is a shortcut for integer division because we have fast i32<->f32
1709// conversions, and fast f32 reciprocal instructions. The fractional part of a
1710// float is enough to accurately represent up to a 24-bit signed integer.
1711SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1712 bool Sign) const {
1713 SDLoc DL(Op);
1714 EVT VT = Op.getValueType();
1715 SDValue LHS = Op.getOperand(0);
1716 SDValue RHS = Op.getOperand(1);
1717 MVT IntVT = MVT::i32;
1718 MVT FltVT = MVT::f32;
1719
1720 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1721 if (LHSSignBits < 9)
1722 return SDValue();
1723
1724 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1725 if (RHSSignBits < 9)
1726 return SDValue();
1727
1728 unsigned BitSize = VT.getSizeInBits();
1729 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1730 unsigned DivBits = BitSize - SignBits;
1731 if (Sign)
1732 ++DivBits;
1733
1734 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1735 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1736
1737 SDValue jq = DAG.getConstant(1, DL, IntVT);
1738
1739 if (Sign) {
1740 // char|short jq = ia ^ ib;
1741 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1742
1743 // jq = jq >> (bitsize - 2)
1744 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1745 DAG.getConstant(BitSize - 2, DL, VT));
1746
1747 // jq = jq | 0x1
1748 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1749 }
1750
1751 // int ia = (int)LHS;
1752 SDValue ia = LHS;
1753
1754 // int ib = (int)RHS;
1755 SDValue ib = RHS;
1756
1757 // float fa = (float)ia;
1758 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1759
1760 // float fb = (float)ib;
1761 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1762
1763 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1764 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1765
1766 // fq = trunc(fq);
1767 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1768
1769 // float fqneg = -fq;
1770 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1771
1772 MachineFunction &MF = DAG.getMachineFunction();
1773 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1774
1775 // float fr = mad(fqneg, fb, fa);
1776 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1777 (unsigned)ISD::FMA :
1778 !MFI->getMode().allFP32Denormals() ?
1779 (unsigned)ISD::FMAD :
1780 (unsigned)AMDGPUISD::FMAD_FTZ;
1781 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1782
1783 // int iq = (int)fq;
1784 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1785
1786 // fr = fabs(fr);
1787 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1788
1789 // fb = fabs(fb);
1790 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1791
1792 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1793
1794 // int cv = fr >= fb;
1795 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1796
1797 // jq = (cv ? jq : 0);
1798 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1799
1800 // dst = iq + jq;
1801 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1802
1803 // Rem needs compensation, it's easier to recompute it
1804 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1805 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1806
1807 // Truncate to number of bits this divide really is.
1808 if (Sign) {
1809 SDValue InRegSize
1810 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1811 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1812 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1813 } else {
1814 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1815 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1816 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1817 }
1818
1819 return DAG.getMergeValues({ Div, Rem }, DL);
1820}
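
For reference, a scalar C++ sketch of the same shortcut (the helper name divRem24 is made up; the plain float division stands in for the hardware RCP node, whose limited accuracy is exactly what the jq correction step compensates for):

#include <cmath>
#include <cstdint>
#include <utility>

// Signed flavour: a sketch, valid when both operands fit in 24 signed bits.
static std::pair<int32_t, int32_t> divRem24(int32_t ia, int32_t ib) {
  int32_t jq = ((ia ^ ib) >> 30) | 1;        // +1/-1, quotient sign (arithmetic shift, like ISD::SRA)
  float fa = (float)ia;
  float fb = (float)ib;
  float fq = truncf(fa * (1.0f / fb));       // fq = trunc(fa * rcp(fb))
  float fr = fmaf(-fq, fb, fa);              // fr = mad(fqneg, fb, fa)
  int32_t iq = (int32_t)fq;                  // iq = (int)fq
  if (fabsf(fr) < fabsf(fb))                 // cv = fabs(fr) >= fabs(fb)
    jq = 0;                                  // jq = cv ? jq : 0
  int32_t div = iq + jq;                     // dst = iq + jq
  int32_t rem = ia - div * ib;               // recompute the remainder
  return {div, rem};
}
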
1821
1822void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1823 SelectionDAG &DAG,
1824 SmallVectorImpl<SDValue> &Results) const {
1825 SDLoc DL(Op);
1826 EVT VT = Op.getValueType();
1827
1828 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1829
1830 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1831
1832 SDValue One = DAG.getConstant(1, DL, HalfVT);
1833 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1834
1835 //HiLo split
1836 SDValue LHS = Op.getOperand(0);
1837 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1838 SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1839
1840 SDValue RHS = Op.getOperand(1);
1841 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1842 SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1843
1844 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1845 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1846
1847 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1848 LHS_Lo, RHS_Lo);
1849
1850 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1851 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1852
1853 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1854 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1855 return;
1856 }
1857
1858 if (isTypeLegal(MVT::i64)) {
1859 MachineFunction &MF = DAG.getMachineFunction();
1860 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1861
1862 // Compute denominator reciprocal.
1863 unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1864 (unsigned)ISD::FMA :
1865 !MFI->getMode().allFP32Denormals() ?
1866 (unsigned)ISD::FMAD :
1867 (unsigned)AMDGPUISD::FMAD_FTZ;
1868
1869 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1870 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1871 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1872 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1873 Cvt_Lo);
1874 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1875 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1876 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1877 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1878 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1879 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1880 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1881 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1882 Mul1);
1883 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1884 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1885 SDValue Rcp64 = DAG.getBitcast(VT,
1886 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1887
1888 SDValue Zero64 = DAG.getConstant(0, DL, VT);
1889 SDValue One64 = DAG.getConstant(1, DL, VT);
1890 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1891 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1892
1893 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1894 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1895 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1896 SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1897 Zero);
1898 SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1899 One);
1900
1901 SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1902 Mulhi1_Lo, Zero1);
1903 SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1904 Mulhi1_Hi, Add1_Lo.getValue(1));
1905 SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1906 SDValue Add1 = DAG.getBitcast(VT,
1907 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1908
1909 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1910 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1911 SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1912 Zero);
1913 SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1914 One);
1915
1916 SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1917 Mulhi2_Lo, Zero1);
1918 SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1919 Mulhi2_Hi, Add1_Lo.getValue(1));
1920 SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1921 Zero, Add2_Lo.getValue(1));
1922 SDValue Add2 = DAG.getBitcast(VT,
1923 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1924 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1925
1926 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1927
1928 SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1929 SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1930 SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1931 Mul3_Lo, Zero1);
1932 SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1933 Mul3_Hi, Sub1_Lo.getValue(1));
1934 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1935 SDValue Sub1 = DAG.getBitcast(VT,
1936 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1937
1938 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1939 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1940 ISD::SETUGE);
1941 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1942 ISD::SETUGE);
1943 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1944
1945 // TODO: Here and below portions of the code can be enclosed into if/endif.
1946 // Currently control flow is unconditional and we have 4 selects after
1947 // potential endif to substitute PHIs.
1948
1949 // if C3 != 0 ...
1950 SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1951 RHS_Lo, Zero1);
1952 SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1953 RHS_Hi, Sub1_Lo.getValue(1));
1954 SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1955 Zero, Sub2_Lo.getValue(1));
1956 SDValue Sub2 = DAG.getBitcast(VT,
1957 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1958
1959 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1960
1961 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1962 ISD::SETUGE);
1963 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1964 ISD::SETUGE);
1965 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1966
1967 // if (C6 != 0)
1968 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1969
1970 SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1971 RHS_Lo, Zero1);
1972 SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1973 RHS_Hi, Sub2_Lo.getValue(1));
1974 SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1975 Zero, Sub3_Lo.getValue(1));
1976 SDValue Sub3 = DAG.getBitcast(VT,
1977 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1978
1979 // endif C6
1980 // endif C3
1981
1982 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1983 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1984
1985 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1986 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1987
1988 Results.push_back(Div);
1989 Results.push_back(Rem);
1990
1991 return;
1992 }
1993
1994 // r600 expansion.
1995 // Get Speculative values
1996 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1997 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1998
1999 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2000 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2001 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2002
2003 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2004 SDValue DIV_Lo = Zero;
2005
2006 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2007
2008 for (unsigned i = 0; i < halfBitWidth; ++i) {
2009 const unsigned bitPos = halfBitWidth - i - 1;
2010 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2011 // Get value of high bit
2012 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2013 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2014 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2015
2016 // Shift
2017 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2018 // Add LHS high bit
2019 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2020
2021 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2022 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2023
2024 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2025
2026 // Update REM
2027 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2028 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2029 }
2030
2031 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2032 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2033 Results.push_back(DIV);
2034 Results.push_back(REM);
2035}
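
A scalar model of the r600 fallback path above (the helper name udivRem64R600 is made up): divide the high word speculatively with a 32-bit divide, then run a 32-step restoring shift/subtract loop over the low word, mirroring the SELECT_CC chain inside the loop. The divisor must be nonzero, as for any udiv.

#include <cstdint>
#include <utility>

static std::pair<uint64_t, uint64_t> udivRem64R600(uint64_t lhs, uint64_t rhs) {
  uint32_t lhsLo = (uint32_t)lhs, lhsHi = (uint32_t)(lhs >> 32);
  uint32_t rhsLo = (uint32_t)rhs, rhsHi = (uint32_t)(rhs >> 32);

  // Speculative values for the high half (only meaningful when rhsHi == 0).
  uint32_t divHi = rhsHi == 0 ? lhsHi / rhsLo : 0;
  uint64_t rem   = rhsHi == 0 ? lhsHi % rhsLo : lhsHi;
  uint32_t divLo = 0;

  for (unsigned i = 0; i < 32; ++i) {
    unsigned bitPos = 31 - i;
    uint64_t hbit = (lhsLo >> bitPos) & 1u;   // next dividend bit
    rem = (rem << 1) | hbit;                  // shift it into the remainder
    if (rem >= rhs) {                         // restoring step
      divLo |= 1u << bitPos;
      rem -= rhs;
    }
  }
  return {((uint64_t)divHi << 32) | divLo, rem};
}
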
2036
2037SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2038 SelectionDAG &DAG) const {
2039 SDLoc DL(Op);
2040 EVT VT = Op.getValueType();
2041
2042 if (VT == MVT::i64) {
2043 SmallVector<SDValue, 2> Results;
2044 LowerUDIVREM64(Op, DAG, Results);
2045 return DAG.getMergeValues(Results, DL);
2046 }
2047
2048 if (VT == MVT::i32) {
2049 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2050 return Res;
2051 }
2052
2053 SDValue X = Op.getOperand(0);
2054 SDValue Y = Op.getOperand(1);
2055
2056 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2057 // algorithm used here.
2058
2059 // Initial estimate of inv(y).
2060 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2061
2062 // One round of UNR.
2063 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2064 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2065 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2066 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2067
2068 // Quotient/remainder estimate.
2069 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2070 SDValue R =
2071 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2072
2073 // First quotient/remainder refinement.
2074 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2075 SDValue One = DAG.getConstant(1, DL, VT);
2076 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2077 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2078 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2079 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2080 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2081
2082 // Second quotient/remainder refinement.
2083 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2084 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2085 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2086 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2087 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2088
2089 return DAG.getMergeValues({Q, R}, DL);
2090}
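
A scalar model of the i32 path above (helper names urecipApprox and udivRem32 are made up): one Newton-Raphson step on a reciprocal estimate, then two quotient/remainder corrections exactly as in the selects. The reciprocal here is only a stand-in; the real lowering's URECIP node and constants are what guarantee two corrections suffice.

#include <cstdint>
#include <utility>

// Stand-in for AMDGPUISD::URECIP: an under-estimate of 2^32 / y built from a
// float reciprocal (the hardware uses its rcp instruction). y must be nonzero.
static uint32_t urecipApprox(uint32_t y) {
  return (uint32_t)((1.0f / (float)y) * 4294967040.0f);  // scale by just under 2^32
}

static std::pair<uint32_t, uint32_t> udivRem32(uint32_t x, uint32_t y) {
  uint32_t z = urecipApprox(y);
  // One round of UNR: z += mulhu(z, -y * z).
  uint32_t negYZ = 0u - y * z;
  z += (uint32_t)(((uint64_t)z * negYZ) >> 32);
  // Quotient/remainder estimate.
  uint32_t q = (uint32_t)(((uint64_t)x * z) >> 32);
  uint32_t r = x - q * y;
  // Two refinement steps, as in the SELECT nodes above.
  if (r >= y) { ++q; r -= y; }
  if (r >= y) { ++q; r -= y; }
  return {q, r};
}
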
2091
2092SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2093 SelectionDAG &DAG) const {
2094 SDLoc DL(Op);
2095 EVT VT = Op.getValueType();
2096
2097 SDValue LHS = Op.getOperand(0);
2098 SDValue RHS = Op.getOperand(1);
2099
2100 SDValue Zero = DAG.getConstant(0, DL, VT);
2101 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2102
2103 if (VT == MVT::i32) {
2104 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2105 return Res;
2106 }
2107
2108 if (VT == MVT::i64 &&
2109 DAG.ComputeNumSignBits(LHS) > 32 &&
2110 DAG.ComputeNumSignBits(RHS) > 32) {
2111 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2112
2113 //HiLo split
2114 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2115 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2116 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2117 LHS_Lo, RHS_Lo);
2118 SDValue Res[2] = {
2119 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2120 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2121 };
2122 return DAG.getMergeValues(Res, DL);
2123 }
2124
2125 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2126 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2127 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2128 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2129
2130 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2131 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2132
2133 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2134 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2135
2136 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2137 SDValue Rem = Div.getValue(1);
2138
2139 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2140 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2141
2142 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2143 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2144
2145 SDValue Res[2] = {
2146 Div,
2147 Rem
2148 };
2149 return DAG.getMergeValues(Res, DL);
2150}
2151
2152// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2153SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2154 SDLoc SL(Op);
2155 EVT VT = Op.getValueType();
2156 auto Flags = Op->getFlags();
2157 SDValue X = Op.getOperand(0);
2158 SDValue Y = Op.getOperand(1);
2159
2160 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2161 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2162 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2163 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2164 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2165}
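
In scalar form the expansion above is simply the following (a sketch; the DAG version also forwards the node's fast-math flags):

#include <cmath>

static double fremExpansion(double x, double y) {
  double q = trunc(x / y);       // FDIV + FTRUNC
  return fma(-q, y, x);          // FMA(FNEG(q), y, x)
}
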
2166
2167SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2168 SDLoc SL(Op);
2169 SDValue Src = Op.getOperand(0);
2170
2171 // result = trunc(src)
2172 // if (src > 0.0 && src != result)
2173 // result += 1.0
2174
2175 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2176
2177 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2178 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2179
2180 EVT SetCCVT =
2181 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2182
2183 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2184 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2185 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2186
2187 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2188 // TODO: Should this propagate fast-math-flags?
2189 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2190}
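
A scalar sketch of the f64 ceil expansion above; both compares are ordered, so a NaN input skips the add and simply propagates through the final FADD:

#include <cmath>

static double ceilExpansion(double src) {
  double result = trunc(src);
  double add = (src > 0.0 && src != result) ? 1.0 : 0.0;
  return result + add;
}
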
2191
2192static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2193 SelectionDAG &DAG) {
2194 const unsigned FractBits = 52;
2195 const unsigned ExpBits = 11;
2196
2197 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2198 Hi,
2199 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2200 DAG.getConstant(ExpBits, SL, MVT::i32));
2201 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2202 DAG.getConstant(1023, SL, MVT::i32));
2203
2204 return Exp;
2205}
2206
2207SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2208 SDLoc SL(Op);
2209 SDValue Src = Op.getOperand(0);
2210
2211 assert(Op.getValueType() == MVT::f64);
2212
2213 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2214 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2215
2216 SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2217
2218 // Extract the upper half, since this is where we will find the sign and
2219 // exponent.
2220 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
2221
2222 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2223
2224 const unsigned FractBits = 52;
2225
2226 // Extract the sign bit.
2227 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2228 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2229
2230 // Extend back to 64-bits.
2231 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2232 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2233
2234 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2235 const SDValue FractMask
2236 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2237
2238 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2239 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2240 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2241
2242 EVT SetCCVT =
2243 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2244
2245 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2246
2247 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2248 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2249
2250 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2251 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2252
2253 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2254}
2255
2256SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2257 SDLoc SL(Op);
2258 SDValue Src = Op.getOperand(0);
2259
2260 assert(Op.getValueType() == MVT::f64);
2261
2262 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2263 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2264 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2265
2266 // TODO: Should this propagate fast-math-flags?
2267
2268 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2269 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2270
2271 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2272
2273 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2274 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2275
2276 EVT SetCCVT =
2277 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2278 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2279
2280 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2281}
2282
2283SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2284 // FNEARBYINT and FRINT are the same, except in their handling of FP
2285 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2286 // rint, so just treat them as equivalent.
2287 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2288}
2289
2290// XXX - May require not supporting f32 denormals?
2291
2292// Don't handle v2f16. The extra instructions to scalarize and repack around the
2293// compare and vselect end up producing worse code than scalarizing the whole
2294// operation.
2295SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2296 SDLoc SL(Op);
2297 SDValue X = Op.getOperand(0);
2298 EVT VT = Op.getValueType();
2299
2300 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2301
2302 // TODO: Should this propagate fast-math-flags?
2303
2304 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2305
2306 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2307
2308 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2309 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2310 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2311
2312 SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2313
2314 EVT SetCCVT =
2315 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2316
2317 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2318
2319 SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2320
2321 return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2322}
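
The scalar equivalent of the expansion above is round-half-away-from-zero built from trunc and copysign (a sketch; as in the DAG code, the >= 0.5 compare is ordered, so NaN falls through as the truncated value):

#include <cmath>

static double roundExpansion(double x) {
  double t = trunc(x);
  double sel = (fabs(x - t) >= 0.5) ? copysign(1.0, x) : 0.0;
  return t + sel;
}
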
2323
2324SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2325 SDLoc SL(Op);
2326 SDValue Src = Op.getOperand(0);
2327
2328 // result = trunc(src);
2329 // if (src < 0.0 && src != result)
2330 // result += -1.0.
2331
2332 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2333
2334 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2335 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2336
2337 EVT SetCCVT =
2338 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2339
2340 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2341 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2342 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2343
2344 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2345 // TODO: Should this propagate fast-math-flags?
2346 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2347}
2348
2349SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2350 double Log2BaseInverted) const {
2351 EVT VT = Op.getValueType();
2352
2353 SDLoc SL(Op);
2354 SDValue Operand = Op.getOperand(0);
2355 SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2356 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2357
2358 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2359}
2360
2361// exp2(M_LOG2E_F * f);
2362SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2363 EVT VT = Op.getValueType();
2364 SDLoc SL(Op);
2365 SDValue Src = Op.getOperand(0);
2366
2367 const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2368 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2369 return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2370}
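
LowerFLOG and lowerFEXP above both reduce to the hardware's base-2 operations. In scalar form, with the llvm::numbers constants written out numerically (a sketch):

#include <cmath>

static float logViaLog2(float x)   { return log2f(x) * 0.6931471805599453f; }   // ln x = log2(x) * ln 2
static float log10ViaLog2(float x) { return log2f(x) * (0.6931471805599453f /
                                                        2.302585092994046f); }  // log10 x = log2(x) * ln2/ln10
static float expViaExp2(float x)   { return exp2f(x * 1.4426950408889634f); }   // e^x = exp2(x * log2 e)
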
2371
2372static bool isCtlzOpc(unsigned Opc) {
2373 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2374}
2375
2376static bool isCttzOpc(unsigned Opc) {
2377 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2378}
2379
2380SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2381 SDLoc SL(Op);
2382 SDValue Src = Op.getOperand(0);
2383 bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2384 Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2385
2386 unsigned ISDOpc, NewOpc;
2387 if (isCtlzOpc(Op.getOpcode())) {
2388 ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2389 NewOpc = AMDGPUISD::FFBH_U32;
2390 } else if (isCttzOpc(Op.getOpcode())) {
2391 ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2392 NewOpc = AMDGPUISD::FFBL_B32;
2393 } else
2394 llvm_unreachable("Unexpected OPCode!!!");
2395
2396
2397 if (ZeroUndef && Src.getValueType() == MVT::i32)
2398 return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2399
2400 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2401
2402 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2403 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2404
2405 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2406 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2407
2408 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2409 *DAG.getContext(), MVT::i32);
2410
2411 SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2412 SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2413
2414 SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2415 SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2416
2417 const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2418 SDValue Add, NewOpr;
2419 if (isCtlzOpc(Op.getOpcode())) {
2420 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2421 // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2422 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2423 } else {
2424 Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2425 // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2426 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2427 }
2428
2429 if (!ZeroUndef) {
2430 // Test if the full 64-bit input is zero.
2431
2432 // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2433 // which we probably don't want.
2434 SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2435 SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2436 SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2437
2438 // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2439 // with the same cycles, otherwise it is slower.
2440 // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2441 // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2442
2443 const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2444
2445 // The instruction returns -1 for 0 input, but the defined intrinsic
2446 // behavior is to return the number of bits.
2447 NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2448 SrcIsZero, Bits32, NewOpr);
2449 }
2450
2451 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2452}
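
A scalar sketch of the 64-bit count-leading-zeros path above, using the gcc/clang builtin in place of the FFBH_U32 node; the cttz path is symmetric with the two halves swapped:

#include <cstdint>

static unsigned ctlz64(uint64_t x) {
  if (x == 0)
    return 64;                               // defined result for zero input
  uint32_t lo = (uint32_t)x;
  uint32_t hi = (uint32_t)(x >> 32);
  // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
  return hi == 0 ? __builtin_clz(lo) + 32 : __builtin_clz(hi);
}
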
2453
2454SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2455 bool Signed) const {
2456 // Unsigned
2457 // cul2f(ulong u)
2458 //{
2459 // uint lz = clz(u);
2460 // uint e = (u != 0) ? 127U + 63U - lz : 0;
2461 // u = (u << lz) & 0x7fffffffffffffffUL;
2462 // ulong t = u & 0xffffffffffUL;
2463 // uint v = (e << 23) | (uint)(u >> 40);
2464 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2465 // return as_float(v + r);
2466 //}
2467 // Signed
2468 // cl2f(long l)
2469 //{
2470 // long s = l >> 63;
2471 // float r = cul2f((l + s) ^ s);
2472 // return s ? -r : r;
2473 //}
2474
2475 SDLoc SL(Op);
2476 SDValue Src = Op.getOperand(0);
2477 SDValue L = Src;
2478
2479 SDValue S;
2480 if (Signed) {
2481 const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2482 S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2483
2484 SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2485 L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2486 }
2487
2488 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2489 *DAG.getContext(), MVT::f32);
2490
2491
2492 SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2493 SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2494 SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2495 LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2496
2497 SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2498 SDValue E = DAG.getSelect(SL, MVT::i32,
2499 DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2500 DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2501 ZeroI32);
2502
2503 SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2504 DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2505 DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2506
2507 SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2508 DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2509
2510 SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2511 U, DAG.getConstant(40, SL, MVT::i64));
2512
2513 SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2514 DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2515 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));
2516
2517 SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2518 SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2519 SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2520
2521 SDValue One = DAG.getConstant(1, SL, MVT::i32);
2522
2523 SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2524
2525 SDValue R = DAG.getSelect(SL, MVT::i32,
2526 RCmp,
2527 One,
2528 DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2529 R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2530 R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2531
2532 if (!Signed)
2533 return R;
2534
2535 SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2536 return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2537}
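
The cul2f() pseudocode in the comment above compiles almost verbatim. A C++ transcription for reference: the u == 0 case is peeled off so the shift amount stays in range, and __builtin_clzll (a gcc/clang builtin) stands in for the CTLZ_ZERO_UNDEF node:

#include <cstdint>
#include <cstring>

static float cul2f(uint64_t u) {
  if (u == 0)
    return 0.0f;                              // the e = 0 branch of the pseudocode
  unsigned lz = __builtin_clzll(u);
  uint32_t e = 127u + 63u - lz;
  u = (u << lz) & 0x7fffffffffffffffULL;      // normalize, drop the implicit leading 1
  uint64_t t = u & 0xffffffffffULL;           // dropped low 40 bits
  uint32_t v = (e << 23) | (uint32_t)(u >> 40);
  uint32_t r = t > 0x8000000000ULL ? 1u
             : (t == 0x8000000000ULL ? (v & 1u) : 0u);   // round to nearest even
  uint32_t bits = v + r;
  float f;
  std::memcpy(&f, &bits, sizeof(f));          // as_float(v + r)
  return f;
}
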
2538
2539SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2540 bool Signed) const {
2541 SDLoc SL(Op);
2542 SDValue Src = Op.getOperand(0);
2543
2544 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2545
2546 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2547 DAG.getConstant(0, SL, MVT::i32));
2548 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2549 DAG.getConstant(1, SL, MVT::i32));
2550
2551 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2552 SL, MVT::f64, Hi);
2553
2554 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2555
2556 SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2557 DAG.getConstant(32, SL, MVT::i32));
2558 // TODO: Should this propagate fast-math-flags?
2559 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2560}
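
In scalar form the unsigned flavour of the lowering above is the following (a sketch; the signed flavour converts the high word with a signed conversion instead, exactly as the Signed flag selects above):

#include <cmath>
#include <cstdint>

static double uint64ToDouble(uint64_t x) {
  uint32_t lo = (uint32_t)x;
  uint32_t hi = (uint32_t)(x >> 32);
  return ldexp((double)hi, 32) + (double)lo;  // LDEXP(CvtHi, 32) + CvtLo
}
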
2561
2562SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2563 SelectionDAG &DAG) const {
2564 // TODO: Factor out code common with LowerSINT_TO_FP.
2565 EVT DestVT = Op.getValueType();
2566 SDValue Src = Op.getOperand(0);
2567 EVT SrcVT = Src.getValueType();
2568
2569 if (SrcVT == MVT::i16) {
2570 if (DestVT == MVT::f16)
2571 return Op;
2572 SDLoc DL(Op);
2573
2574 // Promote src to i32
2575 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2576 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2577 }
2578
2579 assert(SrcVT == MVT::i64 && "operation should be legal");
2580
2581 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2582 SDLoc DL(Op);
2583
2584 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2585 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2586 SDValue FPRound =
2587 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2588
2589 return FPRound;
2590 }
2591
2592 if (DestVT == MVT::f32)
2593 return LowerINT_TO_FP32(Op, DAG, false);
2594
2595 assert(DestVT == MVT::f64);
2596 return LowerINT_TO_FP64(Op, DAG, false);
2597}
2598
2599SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2600 SelectionDAG &DAG) const {
2601 EVT DestVT = Op.getValueType();
2602
2603 SDValue Src = Op.getOperand(0);
2604 EVT SrcVT = Src.getValueType();
2605
2606 if (SrcVT == MVT::i16) {
2607 if (DestVT == MVT::f16)
2608 return Op;
2609
2610 SDLoc DL(Op);
2611 // Promote src to i32
2612 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2613 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2614 }
2615
2616 assert(SrcVT == MVT::i64 && "operation should be legal");
2617
2618 // TODO: Factor out code common with LowerUINT_TO_FP.
2619
2620 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2621 SDLoc DL(Op);
2622 SDValue Src = Op.getOperand(0);
2623
2624 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2625 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2626 SDValue FPRound =
2627 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2628
2629 return FPRound;
2630 }
2631
2632 if (DestVT == MVT::f32)
2633 return LowerINT_TO_FP32(Op, DAG, true);
2634
2635 assert(DestVT == MVT::f64);
2636 return LowerINT_TO_FP64(Op, DAG, true);
2637}
2638
2639SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2640 bool Signed) const {
2641 SDLoc SL(Op);
2642
2643 SDValue Src = Op.getOperand(0);
2644 EVT SrcVT = Src.getValueType();
2645
2646 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2647
2648 // The basic idea of converting a floating point number into a pair of 32-bit
2649 // integers is illustrated as follows:
2650 //
2651 // tf := trunc(val);
2652 // hif := floor(tf * 2^-32);
2653 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2654 // hi := fptoi(hif);
2655 // lo := fptoi(lof);
2656 //
2657 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2658 SDValue Sign;
2659 if (Signed && SrcVT == MVT::f32) {
2660 // However, a 32-bit floating point number has only 23 bits mantissa and
2661 // it's not enough to hold all the significant bits of `lof` if val is
2662 // negative. To avoid the loss of precision, we need to take the absolute
2663 // value after truncating and flip the result back based on the original
2664 // signedness.
2665 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2666 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2667 DAG.getConstant(31, SL, MVT::i32));
2668 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2669 }
2670
2671 SDValue K0, K1;
2672 if (SrcVT == MVT::f64) {
2673 K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2674 SL, SrcVT);
2675 K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2676 SL, SrcVT);
2677 } else {
2678 K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2679 SrcVT);
2680 K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2681 SrcVT);
2682 }
2683 // TODO: Should this propagate fast-math-flags?
2684 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2685
2686 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2687
2688 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2689
2690 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2691 : ISD::FP_TO_UINT,
2692 SL, MVT::i32, FloorMul);
2693 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2694
2695 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2696 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2697
2698 if (Signed && SrcVT == MVT::f32) {
2699 assert(Sign);
2700 // Flip the result based on the signedness, which is either all 0s or 1s.
2701 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2702 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2703 // r := xor(r, sign) - sign;
2704 Result =
2705 DAG.getNode(ISD::SUB, SL, MVT::i64,
2706 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2707 }
2708
2709 return Result;
2710}
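The comment block near the top of LowerFP_TO_INT64 gives the conversion in pseudo-code. As a minimal scalar sketch of the same arithmetic (illustration only, not part of the analyzed file; it assumes a non-negative f64 input whose truncated value fits in 64 bits, and the helper name is made up):

    #include <cmath>
    #include <cstdint>

    uint64_t fpToUInt64ViaHalves(double Val) {
      double Tf  = std::trunc(Val);              // FTRUNC
      double Hif = std::floor(Tf * 0x1p-32);     // FFLOOR of Trunc * K0 (2^-32)
      double Lof = std::fma(Hif, -0x1p32, Tf);   // FMA with K1 (-2^32); never negative
      uint32_t Hi = uint32_t(Hif);               // FP_TO_UINT of the high half
      uint32_t Lo = uint32_t(Lof);               // FP_TO_UINT of the low half
      return (uint64_t(Hi) << 32) | Lo;          // the bitcast {Lo, Hi} pair
    }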
2711
2712SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2713 SDLoc DL(Op);
2714 SDValue N0 = Op.getOperand(0);
2715
2716 // Convert to target node to get known bits
2717 if (N0.getValueType() == MVT::f32)
2718 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2719
2720 if (getTargetMachine().Options.UnsafeFPMath) {
2721 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2722 return SDValue();
2723 }
2724
2725 assert(N0.getSimpleValueType() == MVT::f64);
2726
2727 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2728 const unsigned ExpMask = 0x7ff;
2729 const unsigned ExpBiasf64 = 1023;
2730 const unsigned ExpBiasf16 = 15;
2731 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2732 SDValue One = DAG.getConstant(1, DL, MVT::i32);
2733 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2734 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2735 DAG.getConstant(32, DL, MVT::i64));
2736 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2737 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2738 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2739 DAG.getConstant(20, DL, MVT::i64));
2740 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2741 DAG.getConstant(ExpMask, DL, MVT::i32));
2742 // Subtract the fp64 exponent bias (1023) to get the real exponent and
2743 // add the f16 bias (15) to get the biased exponent for the f16 format.
2744 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2745 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2746
2747 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2748 DAG.getConstant(8, DL, MVT::i32));
2749 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2750 DAG.getConstant(0xffe, DL, MVT::i32));
2751
2752 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2753 DAG.getConstant(0x1ff, DL, MVT::i32));
2754 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2755
2756 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2757 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2758
2759 // (M != 0 ? 0x0200 : 0) | 0x7c00;
2760 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2761 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2762 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2763
2764 // N = M | (E << 12);
2765 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2766 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2767 DAG.getConstant(12, DL, MVT::i32)));
2768
2769 // B = clamp(1-E, 0, 13);
2770 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2771 One, E);
2772 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2773 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2774 DAG.getConstant(13, DL, MVT::i32));
2775
2776 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2777 DAG.getConstant(0x1000, DL, MVT::i32));
2778
2779 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2780 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2781 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2782 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2783
2784 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2785 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2786 DAG.getConstant(0x7, DL, MVT::i32));
2787 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2788 DAG.getConstant(2, DL, MVT::i32));
2789 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2790 One, Zero, ISD::SETEQ);
2791 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2792 One, Zero, ISD::SETGT);
2793 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2794 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2795
2796 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2797 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2798 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2799 I, V, ISD::SETEQ);
2800
2801 // Extract the sign bit.
2802 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2803 DAG.getConstant(16, DL, MVT::i32));
2804 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2805 DAG.getConstant(0x8000, DL, MVT::i32));
2806
2807 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2808 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2809}
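For reference (not part of the analyzed file), the magic numbers in this f64-to-f16 lowering follow from the IEEE-754 layouts it assumes: f64 has 11 exponent bits with bias 1023; f16 has 5 exponent bits, 10 mantissa bits and bias 15.

    // E == 1039 <=> raw f64 exponent == 0x7ff (Inf/NaN), since 2047 - 1023 + 15 == 1039
    // 0x7c00     ==  f16 exponent field all ones, i.e. the f16 Inf/NaN exponent
    // 0x0200     ==  most significant f16 mantissa bit, set to keep a NaN quiet
    // 0x8000     ==  f16 sign bit; (UH >> 16) & 0x8000 moves the f64 sign (bit 63) there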
2810
2811SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2812 SelectionDAG &DAG) const {
2813 SDValue Src = Op.getOperand(0);
2814 unsigned OpOpcode = Op.getOpcode();
2815 EVT SrcVT = Src.getValueType();
2816 EVT DestVT = Op.getValueType();
2817
2818 // Will be selected natively
2819 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2820 return Op;
2821
2822 // Promote i16 to i32
2823 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2824 SDLoc DL(Op);
2825
2826 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2827 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2828 }
2829
2830 if (SrcVT == MVT::f16 ||
2831 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2832 SDLoc DL(Op);
2833
2834 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2835 unsigned Ext =
2836 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2837 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2838 }
2839
2840 if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2841 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2842
2843 return SDValue();
2844}
2845
2846SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2847 SelectionDAG &DAG) const {
2848 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2849 MVT VT = Op.getSimpleValueType();
2850 MVT ScalarVT = VT.getScalarType();
2851
2852 assert(VT.isVector());
2853
2854 SDValue Src = Op.getOperand(0);
2855 SDLoc DL(Op);
2856
2857 // TODO: Don't scalarize on Evergreen?
2858 unsigned NElts = VT.getVectorNumElements();
2859 SmallVector<SDValue, 8> Args;
2860 DAG.ExtractVectorElements(Src, Args, 0, NElts);
2861
2862 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2863 for (unsigned I = 0; I < NElts; ++I)
2864 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2865
2866 return DAG.getBuildVector(VT, DL, Args);
2867}
2868
2869//===----------------------------------------------------------------------===//
2870// Custom DAG optimizations
2871//===----------------------------------------------------------------------===//
2872
2873static bool isU24(SDValue Op, SelectionDAG &DAG) {
2874 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2875}
2876
2877static bool isI24(SDValue Op, SelectionDAG &DAG) {
2878 EVT VT = Op.getValueType();
2879 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2880 // as unsigned 24-bit values.
2881 AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2882}
2883
2884static SDValue simplifyMul24(SDNode *Node24,
2885 TargetLowering::DAGCombinerInfo &DCI) {
2886 SelectionDAG &DAG = DCI.DAG;
2887 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2888 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2889
2890 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2891 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2892 unsigned NewOpcode = Node24->getOpcode();
2893 if (IsIntrin) {
2894 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2895 NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2896 AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2897 }
2898
2899 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2900
2901 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2902 // the operands to have other uses, but will only perform simplifications that
2903 // involve bypassing some nodes for this user.
2904 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2905 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2906 if (DemandedLHS || DemandedRHS)
2907 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2908 DemandedLHS ? DemandedLHS : LHS,
2909 DemandedRHS ? DemandedRHS : RHS);
2910
2911 // Now try SimplifyDemandedBits which can simplify the nodes used by our
2912 // operands if this node is the only user.
2913 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2914 return SDValue(Node24, 0);
2915 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2916 return SDValue(Node24, 0);
2917
2918 return SDValue();
2919}
2920
2921template <typename IntTy>
2922static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2923 uint32_t Width, const SDLoc &DL) {
2924 if (Width + Offset < 32) {
2925 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2926 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2927 return DAG.getConstant(Result, DL, MVT::i32);
2928 }
2929
2930 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2931}
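A worked example of constantFoldBFE's two shifts (illustration only): extracting the field at Offset = 4, Width = 8 from Src0 = 0xABCD1234.

    // Shl     = 0xABCD1234u << (32 - 4 - 8) == 0x23400000
    // BFE_U32 = uint32_t(0x23400000) >> 24  == 0x23  (zero-extended field)
    // BFE_I32 =  int32_t(0x23400000) >> 24  == 0x23  (sign-extended; this field's
    //                                                 top bit happens to be clear)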
2932
2933static bool hasVolatileUser(SDNode *Val) {
2934 for (SDNode *U : Val->uses()) {
2935 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2936 if (M->isVolatile())
2937 return true;
2938 }
2939 }
2940
2941 return false;
2942}
2943
2944bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2945 // i32 vectors are the canonical memory type.
2946 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2947 return false;
2948
2949 if (!VT.isByteSized())
2950 return false;
2951
2952 unsigned Size = VT.getStoreSize();
2953
2954 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2955 return false;
2956
2957 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2958 return false;
2959
2960 return true;
2961}
2962
2963// Replace load of an illegal type with a store of a bitcast to a friendlier
2964// type.
2965SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2966 DAGCombinerInfo &DCI) const {
2967 if (!DCI.isBeforeLegalize())
2968 return SDValue();
2969
2970 LoadSDNode *LN = cast<LoadSDNode>(N);
2971 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2972 return SDValue();
2973
2974 SDLoc SL(N);
2975 SelectionDAG &DAG = DCI.DAG;
2976 EVT VT = LN->getMemoryVT();
2977
2978 unsigned Size = VT.getStoreSize();
2979 Align Alignment = LN->getAlign();
2980 if (Alignment < Size && isTypeLegal(VT)) {
2981 bool IsFast;
2982 unsigned AS = LN->getAddressSpace();
2983
2984 // Expand unaligned loads earlier than legalization. Due to visitation order
2985 // problems during legalization, the emitted instructions to pack and unpack
2986 // the bytes again are not eliminated in the case of an unaligned copy.
2987 if (!allowsMisalignedMemoryAccesses(
2988 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2989 SDValue Ops[2];
2990
2991 if (VT.isVector())
2992 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
2993 else
2994 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2995
2996 return DAG.getMergeValues(Ops, SDLoc(N));
2997 }
2998
2999 if (!IsFast)
3000 return SDValue();
3001 }
3002
3003 if (!shouldCombineMemoryType(VT))
3004 return SDValue();
3005
3006 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3007
3008 SDValue NewLoad
3009 = DAG.getLoad(NewVT, SL, LN->getChain(),
3010 LN->getBasePtr(), LN->getMemOperand());
3011
3012 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3013 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3014 return SDValue(N, 0);
3015}
3016
3017// Replace store of an illegal type with a store of a bitcast to a friendlier
3018// type.
3019SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3020 DAGCombinerInfo &DCI) const {
3021 if (!DCI.isBeforeLegalize())
3022 return SDValue();
3023
3024 StoreSDNode *SN = cast<StoreSDNode>(N);
3025 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3026 return SDValue();
3027
3028 EVT VT = SN->getMemoryVT();
3029 unsigned Size = VT.getStoreSize();
3030
3031 SDLoc SL(N);
3032 SelectionDAG &DAG = DCI.DAG;
3033 Align Alignment = SN->getAlign();
3034 if (Alignment < Size && isTypeLegal(VT)) {
3035 bool IsFast;
3036 unsigned AS = SN->getAddressSpace();
3037
3038 // Expand unaligned stores earlier than legalization. Due to visitation
3039 // order problems during legalization, the emitted instructions to pack and
3040 // unpack the bytes again are not eliminated in the case of an unaligned
3041 // copy.
3042 if (!allowsMisalignedMemoryAccesses(
3043 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3044 if (VT.isVector())
3045 return scalarizeVectorStore(SN, DAG);
3046
3047 return expandUnalignedStore(SN, DAG);
3048 }
3049
3050 if (!IsFast)
3051 return SDValue();
3052 }
3053
3054 if (!shouldCombineMemoryType(VT))
3055 return SDValue();
3056
3057 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3058 SDValue Val = SN->getValue();
3059
3060 //DCI.AddToWorklist(Val.getNode());
3061
3062 bool OtherUses = !Val.hasOneUse();
3063 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3064 if (OtherUses) {
3065 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3066 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3067 }
3068
3069 return DAG.getStore(SN->getChain(), SL, CastVal,
3070 SN->getBasePtr(), SN->getMemOperand());
3071}
3072
3073// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3074// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3075// issues.
3076SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3077 DAGCombinerInfo &DCI) const {
3078 SelectionDAG &DAG = DCI.DAG;
3079 SDValue N0 = N->getOperand(0);
3080
3081 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3082 // (vt2 (truncate (assertzext vt0:x, vt1)))
3083 if (N0.getOpcode() == ISD::TRUNCATE) {
3084 SDValue N1 = N->getOperand(1);
3085 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3086 SDLoc SL(N);
3087
3088 SDValue Src = N0.getOperand(0);
3089 EVT SrcVT = Src.getValueType();
3090 if (SrcVT.bitsGE(ExtVT)) {
3091 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3092 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3093 }
3094 }
3095
3096 return SDValue();
3097}
3098
3099SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3100 SDNode *N, DAGCombinerInfo &DCI) const {
3101 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3102 switch (IID) {
3103 case Intrinsic::amdgcn_mul_i24:
3104 case Intrinsic::amdgcn_mul_u24:
3105 return simplifyMul24(N, DCI);
3106 case Intrinsic::amdgcn_fract:
3107 case Intrinsic::amdgcn_rsq:
3108 case Intrinsic::amdgcn_rcp_legacy:
3109 case Intrinsic::amdgcn_rsq_legacy:
3110 case Intrinsic::amdgcn_rsq_clamp:
3111 case Intrinsic::amdgcn_ldexp: {
3112 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3113 SDValue Src = N->getOperand(1);
3114 return Src.isUndef() ? Src : SDValue();
3115 }
3116 default:
3117 return SDValue();
3118 }
3119}
3120
3121/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3122/// binary operation \p Opc to it with the corresponding constant operands.
3123SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3124 DAGCombinerInfo &DCI, const SDLoc &SL,
3125 unsigned Opc, SDValue LHS,
3126 uint32_t ValLo, uint32_t ValHi) const {
3127 SelectionDAG &DAG = DCI.DAG;
3128 SDValue Lo, Hi;
3129 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3130
3131 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3132 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3133
3134 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3135 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3136
3137 // Re-visit the ands. It's possible we eliminated one of them and it could
3138 // simplify the vector.
3139 DCI.AddToWorklist(Lo.getNode());
3140 DCI.AddToWorklist(Hi.getNode());
3141
3142 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3143 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3144}
3145
3146SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3147 DAGCombinerInfo &DCI) const {
3148 EVT VT = N->getValueType(0);
3149
3150 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3151 if (!RHS)
3152 return SDValue();
3153
3154 SDValue LHS = N->getOperand(0);
3155 unsigned RHSVal = RHS->getZExtValue();
3156 if (!RHSVal)
3157 return LHS;
3158
3159 SDLoc SL(N);
3160 SelectionDAG &DAG = DCI.DAG;
3161
3162 switch (LHS->getOpcode()) {
3163 default:
3164 break;
3165 case ISD::ZERO_EXTEND:
3166 case ISD::SIGN_EXTEND:
3167 case ISD::ANY_EXTEND: {
3168 SDValue X = LHS->getOperand(0);
3169
3170 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3171 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3172 // Prefer build_vector as the canonical form if packed types are legal.
3173 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3174 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3175 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3176 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3177 }
3178
3179 // shl (ext x) => zext (shl x), if shift does not overflow int
3180 if (VT != MVT::i64)
3181 break;
3182 KnownBits Known = DAG.computeKnownBits(X);
3183 unsigned LZ = Known.countMinLeadingZeros();
3184 if (LZ < RHSVal)
3185 break;
3186 EVT XVT = X.getValueType();
3187 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3188 return DAG.getZExtOrTrunc(Shl, SL, VT);
3189 }
3190 }
3191
3192 if (VT != MVT::i64)
3193 return SDValue();
3194
3195 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3196
3197 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3198 // common case, splitting this into a move and a 32-bit shift is faster and
3199 // the same code size.
3200 if (RHSVal < 32)
3201 return SDValue();
3202
3203 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3204
3205 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3206 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3207
3208 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3209
3210 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3211 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3212}
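A quick check of the i64 shl split (illustration only): for a shift amount of at least 32 the low word is always zero, so only a 32-bit shift is needed for the high word.

    // x = 0x0000000000000005, C = 36
    // x << 36                   == 0x0000005000000000
    // uint32_t(x) << (36 - 32)  == 0x50  -> high word
    // build_vector {0, 0x50} bitcast to i64 gives the same value (element 0 is the low word)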
3213
3214SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3215 DAGCombinerInfo &DCI) const {
3216 if (N->getValueType(0) != MVT::i64)
3217 return SDValue();
3218
3219 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3220 if (!RHS)
3221 return SDValue();
3222
3223 SelectionDAG &DAG = DCI.DAG;
3224 SDLoc SL(N);
3225 unsigned RHSVal = RHS->getZExtValue();
3226
3227 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3228 if (RHSVal == 32) {
3229 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3230 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3231 DAG.getConstant(31, SL, MVT::i32));
3232
3233 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3234 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3235 }
3236
3237 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3238 if (RHSVal == 63) {
3239 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3240 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3241 DAG.getConstant(31, SL, MVT::i32));
3242 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3243 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3244 }
3245
3246 return SDValue();
3247}
3248
3249SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3250 DAGCombinerInfo &DCI) const {
3251 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3252 if (!RHS)
3253 return SDValue();
3254
3255 EVT VT = N->getValueType(0);
3256 SDValue LHS = N->getOperand(0);
3257 unsigned ShiftAmt = RHS->getZExtValue();
3258 SelectionDAG &DAG = DCI.DAG;
3259 SDLoc SL(N);
3260
3261 // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
3262 // this improves the ability to match BFE patterns in isel.
3263 if (LHS.getOpcode() == ISD::AND) {
3264 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3265 if (Mask->getAPIntValue().isShiftedMask() &&
3266 Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3267 return DAG.getNode(
3268 ISD::AND, SL, VT,
3269 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3270 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3271 }
3272 }
3273 }
3274
3275 if (VT != MVT::i64)
3276 return SDValue();
3277
3278 if (ShiftAmt < 32)
3279 return SDValue();
3280
3281 // srl i64:x, C for C >= 32
3282 // =>
3283 // build_pair (srl hi_32(x), C - 32), 0
3284 SDValue One = DAG.getConstant(1, SL, MVT::i32);
3285 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3286
3287 SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3288 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3289
3290 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3291 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3292
3293 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3294
3295 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3296}
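The corresponding check for the srl split (illustration only): for a shift amount of at least 32 the high word of the result is zero and the low word is the old high word shifted by C - 32.

    // x = 0x000000AB00000000, C = 36
    // x >> 36                             == 0xA
    // hi_32(x) == 0xAB, 0xAB >> (36 - 32) == 0xA  -> low word; high word is 0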
3297
3298SDValue AMDGPUTargetLowering::performTruncateCombine(
3299 SDNode *N, DAGCombinerInfo &DCI) const {
3300 SDLoc SL(N);
3301 SelectionDAG &DAG = DCI.DAG;
3302 EVT VT = N->getValueType(0);
3303 SDValue Src = N->getOperand(0);
3304
3305 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3306 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3307 SDValue Vec = Src.getOperand(0);
3308 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3309 SDValue Elt0 = Vec.getOperand(0);
3310 EVT EltVT = Elt0.getValueType();
3311 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3312 if (EltVT.isFloatingPoint()) {
3313 Elt0 = DAG.getNode(ISD::BITCAST, SL,
3314 EltVT.changeTypeToInteger(), Elt0);
3315 }
3316
3317 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3318 }
3319 }
3320 }
3321
3322 // Equivalent of above for accessing the high element of a vector as an
3323 // integer operation.
3324 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3325 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3326 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3327 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3328 SDValue BV = stripBitcast(Src.getOperand(0));
3329 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3330 BV.getValueType().getVectorNumElements() == 2) {
3331 SDValue SrcElt = BV.getOperand(1);
3332 EVT SrcEltVT = SrcElt.getValueType();
3333 if (SrcEltVT.isFloatingPoint()) {
3334 SrcElt = DAG.getNode(ISD::BITCAST, SL,
3335 SrcEltVT.changeTypeToInteger(), SrcElt);
3336 }
3337
3338 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3339 }
3340 }
3341 }
3342 }
3343
3344 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3345 //
3346 // i16 (trunc (srl i64:x, K)), K <= 16 ->
3347 // i16 (trunc (srl (i32 (trunc x), K)))
3348 if (VT.getScalarSizeInBits() < 32) {
3349 EVT SrcVT = Src.getValueType();
3350 if (SrcVT.getScalarSizeInBits() > 32 &&
3351 (Src.getOpcode() == ISD::SRL ||
3352 Src.getOpcode() == ISD::SRA ||
3353 Src.getOpcode() == ISD::SHL)) {
3354 SDValue Amt = Src.getOperand(1);
3355 KnownBits Known = DAG.computeKnownBits(Amt);
3356 unsigned Size = VT.getScalarSizeInBits();
3357 if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3358 (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3359 EVT MidVT = VT.isVector() ?
3360 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3361 VT.getVectorNumElements()) : MVT::i32;
3362
3363 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3364 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3365 Src.getOperand(0));
3366 DCI.AddToWorklist(Trunc.getNode());
3367
3368 if (Amt.getValueType() != NewShiftVT) {
3369 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3370 DCI.AddToWorklist(Amt.getNode());
3371 }
3372
3373 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3374 Trunc, Amt);
3375 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3376 }
3377 }
3378 }
3379
3380 return SDValue();
3381}
3382
3383// We need to specifically handle i64 mul here to avoid unnecessary conversion
3384// instructions. If we only match on the legalized i64 mul expansion,
3385// SimplifyDemandedBits will be unable to remove them because there will be
3386// multiple uses due to the separate mul + mulh[su].
3387static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3388 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3389 if (Size <= 32) {
3390 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3391 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3392 }
3393
3394 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3395 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3396
3397 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3398 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3399
3400 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3401}
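An illustration of why the low/high pair covers the whole product (it assumes MUL_U24 and MULHI_U24 return the low and high 32 bits of the 48-bit product of two 24-bit operands, which is what the BUILD_PAIR above relies on):

    // a = b = 0xFFFFFF                      (largest unsigned 24-bit value)
    // a * b              == 0xFFFFFE000001  (fits in 48 bits)
    // MUL_U24(a, b)      == 0xFE000001      (low 32 bits)
    // MULHI_U24(a, b)    == 0x0000FFFF      (bits 32..47)
    // BUILD_PAIR(lo, hi) reassembles the full value as an i64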
3402
3403SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3404 DAGCombinerInfo &DCI) const {
3405 EVT VT = N->getValueType(0);
3406
3407 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3408 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3409 // unnecessarily). isDivergent() is used as an approximation of whether the
3410 // value is in an SGPR.
3411 if (!N->isDivergent())
3412 return SDValue();
3413
3414 unsigned Size = VT.getSizeInBits();
3415 if (VT.isVector() || Size > 64)
3416 return SDValue();
3417
3418 // There are i16 integer mul/mad.
3419 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3420 return SDValue();
3421
3422 SelectionDAG &DAG = DCI.DAG;
3423 SDLoc DL(N);
3424
3425 SDValue N0 = N->getOperand(0);
3426 SDValue N1 = N->getOperand(1);
3427
3428 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3429 // in the source into any_extends if the result of the mul is truncated. Since
3430 // we can assume the high bits are whatever we want, use the underlying value
3431 // to avoid the unknown high bits from interfering.
3432 if (N0.getOpcode() == ISD::ANY_EXTEND)
3433 N0 = N0.getOperand(0);
3434
3435 if (N1.getOpcode() == ISD::ANY_EXTEND)
3436 N1 = N1.getOperand(0);
3437
3438 SDValue Mul;
3439
3440 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3441 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3442 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3443 Mul = getMul24(DAG, DL, N0, N1, Size, false);
3444 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3445 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3446 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3447 Mul = getMul24(DAG, DL, N0, N1, Size, true);
3448 } else {
3449 return SDValue();
3450 }
3451
3452 // We need to use sext even for MUL_U24, because MUL_U24 is used
3453 // for signed multiply of 8 and 16-bit types.
3454 return DAG.getSExtOrTrunc(Mul, DL, VT);
3455}
3456
3457SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3458 DAGCombinerInfo &DCI) const {
3459 EVT VT = N->getValueType(0);
3460
3461 if (!Subtarget->hasMulI24() || VT.isVector())
3462 return SDValue();
3463
3464 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3465 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3466 // unnecessarily). isDivergent() is used as an approximation of whether the
3467 // value is in an SGPR.
3468 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3469 // valu op anyway)
3470 if (Subtarget->hasSMulHi() && !N->isDivergent())
3471 return SDValue();
3472
3473 SelectionDAG &DAG = DCI.DAG;
3474 SDLoc DL(N);
3475
3476 SDValue N0 = N->getOperand(0);
3477 SDValue N1 = N->getOperand(1);
3478
3479 if (!isI24(N0, DAG) || !isI24(N1, DAG))
3480 return SDValue();
3481
3482 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3483 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3484
3485 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3486 DCI.AddToWorklist(Mulhi.getNode());
3487 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3488}
3489
3490SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3491 DAGCombinerInfo &DCI) const {
3492 EVT VT = N->getValueType(0);
3493
3494 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3495 return SDValue();
3496
3497 // Don't generate 24-bit multiplies on values that are in SGPRs, since
3498 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3499 // unnecessarily). isDivergent() is used as an approximation of whether the
3500 // value is in an SGPR.
3501 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3502 // valu op anyway)
3503 if (Subtarget->hasSMulHi() && !N->isDivergent())
3504 return SDValue();
3505
3506 SelectionDAG &DAG = DCI.DAG;
3507 SDLoc DL(N);
3508
3509 SDValue N0 = N->getOperand(0);
3510 SDValue N1 = N->getOperand(1);
3511
3512 if (!isU24(N0, DAG) || !isU24(N1, DAG))
3513 return SDValue();
3514
3515 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3516 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3517
3518 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3519 DCI.AddToWorklist(Mulhi.getNode());
3520 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3521}
3522
3523static bool isNegativeOne(SDValue Val) {
3524 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3525 return C->isAllOnesValue();
3526 return false;
3527}
3528
3529SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3530 SDValue Op,
3531 const SDLoc &DL,
3532 unsigned Opc) const {
3533 EVT VT = Op.getValueType();
3534 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3535 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3536 LegalVT != MVT::i16))
3537 return SDValue();
3538
3539 if (VT != MVT::i32)
3540 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3541
3542 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3543 if (VT != MVT::i32)
3544 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3545
3546 return FFBX;
3547}
3548
3549// The native instructions return -1 on 0 input. Optimize out a select that
3550// produces -1 on 0.
3551//
3552// TODO: If zero is not undef, we could also do this if the output is compared
3553// against the bitwidth.
3554//
3555// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3556SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3557 SDValue LHS, SDValue RHS,
3558 DAGCombinerInfo &DCI) const {
3559 ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3560 if (!CmpRhs || !CmpRhs->isNullValue())
3561 return SDValue();
3562
3563 SelectionDAG &DAG = DCI.DAG;
3564 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3565 SDValue CmpLHS = Cond.getOperand(0);
3566
3567 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3568 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3569 if (CCOpcode == ISD::SETEQ &&
3570 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3571 RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3572 unsigned Opc =
3573 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3574 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3575 }
3576
3577 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3578 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3579 if (CCOpcode == ISD::SETNE &&
3580 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3581 LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3582 unsigned Opc =
3583 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3584
3585 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3586 }
3587
3588 return SDValue();
3589}
3590
3591static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3592 unsigned Op,
3593 const SDLoc &SL,
3594 SDValue Cond,
3595 SDValue N1,
3596 SDValue N2) {
3597 SelectionDAG &DAG = DCI.DAG;
3598 EVT VT = N1.getValueType();
3599
3600 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3601 N1.getOperand(0), N2.getOperand(0));
3602 DCI.AddToWorklist(NewSelect.getNode());
3603 return DAG.getNode(Op, SL, VT, NewSelect);
3604}
3605
3606// Pull a free FP operation out of a select so it may fold into uses.
3607//
3608// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3609// select c, (fneg x), k -> fneg (select c, x, (fneg k))
3610//
3611// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3612// select c, (fabs x), +k -> fabs (select c, x, k)
3613static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3614 SDValue N) {
3615 SelectionDAG &DAG = DCI.DAG;
3616 SDValue Cond = N.getOperand(0);
3617 SDValue LHS = N.getOperand(1);
3618 SDValue RHS = N.getOperand(2);
3619
3620 EVT VT = N.getValueType();
3621 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3622 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3623 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3624 SDLoc(N), Cond, LHS, RHS);
3625 }
3626
3627 bool Inv = false;
3628 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3629 std::swap(LHS, RHS);
3630 Inv = true;
3631 }
3632
3633 // TODO: Support vector constants.
3634 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3635 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3636 SDLoc SL(N);
3637 // If one side is an fneg/fabs and the other is a constant, we can push the
3638 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3639 SDValue NewLHS = LHS.getOperand(0);
3640 SDValue NewRHS = RHS;
3641
3642 // Careful: if the neg can be folded up, don't try to pull it back down.
3643 bool ShouldFoldNeg = true;
3644
3645 if (NewLHS.hasOneUse()) {
3646 unsigned Opc = NewLHS.getOpcode();
3647 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3648 ShouldFoldNeg = false;
3649 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3650 ShouldFoldNeg = false;
3651 }
3652
3653 if (ShouldFoldNeg) {
3654 if (LHS.getOpcode() == ISD::FNEG)
3655 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3656 else if (CRHS->isNegative())
3657 return SDValue();
3658
3659 if (Inv)
3660 std::swap(NewLHS, NewRHS);
3661
3662 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3663 Cond, NewLHS, NewRHS);
3664 DCI.AddToWorklist(NewSelect.getNode());
3665 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3666 }
3667 }
3668
3669 return SDValue();
3670}
3671
3672
3673SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3674 DAGCombinerInfo &DCI) const {
3675 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3676 return Folded;
3677
3678 SDValue Cond = N->getOperand(0);
3679 if (Cond.getOpcode() != ISD::SETCC)
3680 return SDValue();
3681
3682 EVT VT = N->getValueType(0);
3683 SDValue LHS = Cond.getOperand(0);
3684 SDValue RHS = Cond.getOperand(1);
3685 SDValue CC = Cond.getOperand(2);
3686
3687 SDValue True = N->getOperand(1);
3688 SDValue False = N->getOperand(2);
3689
3690 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3691 SelectionDAG &DAG = DCI.DAG;
3692 if (DAG.isConstantValueOfAnyType(True) &&
3693 !DAG.isConstantValueOfAnyType(False)) {
3694 // Swap cmp + select pair to move constant to false input.
3695 // This will allow using VOPC cndmasks more often.
3696 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3697
3698 SDLoc SL(N);
3699 ISD::CondCode NewCC =
3700 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3701
3702 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3703 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3704 }
3705
3706 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3707 SDValue MinMax
3708 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3709 // Revisit this node so we can catch min3/max3/med3 patterns.
3710 //DCI.AddToWorklist(MinMax.getNode());
3711 return MinMax;
3712 }
3713 }
3714
3715 // There's no reason to not do this if the condition has other uses.
3716 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3717}
3718
3719static bool isInv2Pi(const APFloat &APF) {
3720 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3721 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3722 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3723
3724 return APF.bitwiseIsEqual(KF16) ||
3725 APF.bitwiseIsEqual(KF32) ||
3726 APF.bitwiseIsEqual(KF64);
3727}
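For reference, the three bit patterns compared here are the f16 (0x3118), f32 (0x3e22f983) and f64 (0x3fc45f306dc9c882) encodings of 0.15915494..., i.e. 1/(2*pi), the value that hasInv2PiInlineImm() below treats as an extra inline immediate.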
3728
3729 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
3730// additional cost to negate them.
3731bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3732 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3733 if (C->isZero() && !C->isNegative())
3734 return true;
3735
3736 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3737 return true;
3738 }
3739
3740 return false;
3741}
3742
3743static unsigned inverseMinMax(unsigned Opc) {
3744 switch (Opc) {
3745 case ISD::FMAXNUM:
3746 return ISD::FMINNUM;
3747 case ISD::FMINNUM:
3748 return ISD::FMAXNUM;
3749 case ISD::FMAXNUM_IEEE:
3750 return ISD::FMINNUM_IEEE;
3751 case ISD::FMINNUM_IEEE:
3752 return ISD::FMAXNUM_IEEE;
3753 case AMDGPUISD::FMAX_LEGACY:
3754 return AMDGPUISD::FMIN_LEGACY;
3755 case AMDGPUISD::FMIN_LEGACY:
3756 return AMDGPUISD::FMAX_LEGACY;
3757 default:
3758 llvm_unreachable("invalid min/max opcode");
3759 }
3760}
3761
3762SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3763 DAGCombinerInfo &DCI) const {
3764 SelectionDAG &DAG = DCI.DAG;
3765 SDValue N0 = N->getOperand(0);
3766 EVT VT = N->getValueType(0);
3767
3768 unsigned Opc = N0.getOpcode();
3769
3770 // If the input has multiple uses and we can either fold the negate down, or
3771 // the other uses cannot, give up. This both prevents unprofitable
3772 // transformations and infinite loops: we won't repeatedly try to fold around
3773 // a negate that has no 'good' form.
3774 if (N0.hasOneUse()) {
3775 // This may be able to fold into the source, but at a code size cost. Don't
3776 // fold if the fold into the user is free.
3777 if (allUsesHaveSourceMods(N, 0))
3778 return SDValue();
3779 } else {
3780 if (fnegFoldsIntoOp(Opc) &&
3781 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3782 return SDValue();
3783 }
3784
3785 SDLoc SL(N);
3786 switch (Opc) {
3787 case ISD::FADD: {
3788 if (!mayIgnoreSignedZero(N0))
3789 return SDValue();
3790
3791 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3792 SDValue LHS = N0.getOperand(0);
3793 SDValue RHS = N0.getOperand(1);
3794
3795 if (LHS.getOpcode() != ISD::FNEG)
3796 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3797 else
3798 LHS = LHS.getOperand(0);
3799
3800 if (RHS.getOpcode() != ISD::FNEG)
3801 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3802 else
3803 RHS = RHS.getOperand(0);
3804
3805 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3806 if (Res.getOpcode() != ISD::FADD)
3807 return SDValue(); // Op got folded away.
3808 if (!N0.hasOneUse())
3809 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3810 return Res;
3811 }
3812 case ISD::FMUL:
3813 case AMDGPUISD::FMUL_LEGACY: {
3814 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3815 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3816 SDValue LHS = N0.getOperand(0);
3817 SDValue RHS = N0.getOperand(1);
3818
3819 if (LHS.getOpcode() == ISD::FNEG)
3820 LHS = LHS.getOperand(0);
3821 else if (RHS.getOpcode() == ISD::FNEG)
3822 RHS = RHS.getOperand(0);
3823 else
3824 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3825
3826 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3827 if (Res.getOpcode() != Opc)
3828 return SDValue(); // Op got folded away.
3829 if (!N0.hasOneUse())
3830 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3831 return Res;
3832 }
3833 case ISD::FMA:
3834 case ISD::FMAD: {
3835 // TODO: handle llvm.amdgcn.fma.legacy
3836 if (!mayIgnoreSignedZero(N0))
3837 return SDValue();
3838
3839 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3840 SDValue LHS = N0.getOperand(0);
3841 SDValue MHS = N0.getOperand(1);
3842 SDValue RHS = N0.getOperand(2);
3843
3844 if (LHS.getOpcode() == ISD::FNEG)
3845 LHS = LHS.getOperand(0);
3846 else if (MHS.getOpcode() == ISD::FNEG)
3847 MHS = MHS.getOperand(0);
3848 else
3849 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3850
3851 if (RHS.getOpcode() != ISD::FNEG)
3852 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3853 else
3854 RHS = RHS.getOperand(0);
3855
3856 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3857 if (Res.getOpcode() != Opc)
3858 return SDValue(); // Op got folded away.
3859 if (!N0.hasOneUse())
3860 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3861 return Res;
3862 }
3863 case ISD::FMAXNUM:
3864 case ISD::FMINNUM:
3865 case ISD::FMAXNUM_IEEE:
3866 case ISD::FMINNUM_IEEE:
3867 case AMDGPUISD::FMAX_LEGACY:
3868 case AMDGPUISD::FMIN_LEGACY: {
3869 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3870 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3871 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3872 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3873
3874 SDValue LHS = N0.getOperand(0);
3875 SDValue RHS = N0.getOperand(1);
3876
3877 // 0 doesn't have a negated inline immediate.
3878 // TODO: This constant check should be generalized to other operations.
3879 if (isConstantCostlierToNegate(RHS))
3880 return SDValue();
3881
3882 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3883 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3884 unsigned Opposite = inverseMinMax(Opc);
3885
3886 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3887 if (Res.getOpcode() != Opposite)
3888 return SDValue(); // Op got folded away.
3889 if (!N0.hasOneUse())
3890 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3891 return Res;
3892 }
3893 case AMDGPUISD::FMED3: {
3894 SDValue Ops[3];
3895 for (unsigned I = 0; I < 3; ++I)
3896 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3897
3898 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3899 if (Res.getOpcode() != AMDGPUISD::FMED3)
3900 return SDValue(); // Op got folded away.
3901
3902 if (!N0.hasOneUse()) {
3903 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
3904 DAG.ReplaceAllUsesWith(N0, Neg);
3905
3906 for (SDNode *U : Neg->uses())
3907 DCI.AddToWorklist(U);
3908 }
3909
3910 return Res;
3911 }
3912 case ISD::FP_EXTEND:
3913 case ISD::FTRUNC:
3914 case ISD::FRINT:
3915 case ISD::FNEARBYINT: // XXX - Should fround be handled?
3916 case ISD::FSIN:
3917 case ISD::FCANONICALIZE:
3918 case AMDGPUISD::RCP:
3919 case AMDGPUISD::RCP_LEGACY:
3920 case AMDGPUISD::RCP_IFLAG:
3921 case AMDGPUISD::SIN_HW: {
3922 SDValue CvtSrc = N0.getOperand(0);
3923 if (CvtSrc.getOpcode() == ISD::FNEG) {
3924 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3925 // (fneg (rcp (fneg x))) -> (rcp x)
3926 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3927 }
3928
3929 if (!N0.hasOneUse())
3930 return SDValue();
3931
3932 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3933 // (fneg (rcp x)) -> (rcp (fneg x))
3934 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3935 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3936 }
3937 case ISD::FP_ROUND: {
3938 SDValue CvtSrc = N0.getOperand(0);
3939
3940 if (CvtSrc.getOpcode() == ISD::FNEG) {
3941 // (fneg (fp_round (fneg x))) -> (fp_round x)
3942 return DAG.getNode(ISD::FP_ROUND, SL, VT,
3943 CvtSrc.getOperand(0), N0.getOperand(1));
3944 }
3945
3946 if (!N0.hasOneUse())
3947 return SDValue();
3948
3949 // (fneg (fp_round x)) -> (fp_round (fneg x))
3950 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3951 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3952 }
3953 case ISD::FP16_TO_FP: {
3954 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3955 // f16, but legalization of f16 fneg ends up pulling it out of the source.
3956 // Put the fneg back as a legal source operation that can be matched later.
3957 SDLoc SL(N);
3958
3959 SDValue Src = N0.getOperand(0);
3960 EVT SrcVT = Src.getValueType();
3961
3962 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3963 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3964 DAG.getConstant(0x8000, SL, SrcVT));
3965 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3966 }
3967 default:
3968 return SDValue();
3969 }
3970}
3971
3972SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3973 DAGCombinerInfo &DCI) const {
3974 SelectionDAG &DAG = DCI.DAG;
3975 SDValue N0 = N->getOperand(0);
3976
3977 if (!N0.hasOneUse())
3978 return SDValue();
3979
3980 switch (N0.getOpcode()) {
3981 case ISD::FP16_TO_FP: {
3982 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3983 SDLoc SL(N);
3984 SDValue Src = N0.getOperand(0);
3985 EVT SrcVT = Src.getValueType();
3986
3987 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3988 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3989 DAG.getConstant(0x7fff, SL, SrcVT));
3990 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3991 }
3992 default:
3993 return SDValue();
3994 }
3995}
3996
3997SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
3998 DAGCombinerInfo &DCI) const {
3999 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4000 if (!CFP)
4001 return SDValue();
4002
4003 // XXX - Should this flush denormals?
4004 const APFloat &Val = CFP->getValueAPF();
4005 APFloat One(Val.getSemantics(), "1.0");
4006 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4007}
4008
4009SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4010 DAGCombinerInfo &DCI) const {
4011 SelectionDAG &DAG = DCI.DAG;
4012 SDLoc DL(N);
4013
4014 switch(N->getOpcode()) {
4015 default:
4016 break;
4017 case ISD::BITCAST: {
4018 EVT DestVT = N->getValueType(0);
4019
4020 // Push casts through vector builds. This helps avoid emitting a large
4021 // number of copies when materializing floating point vector constants.
4022 //
4023 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
4024 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
4025 if (DestVT.isVector()) {
4026 SDValue Src = N->getOperand(0);
4027 if (Src.getOpcode() == ISD::BUILD_VECTOR) {
4028 EVT SrcVT = Src.getValueType();
4029 unsigned NElts = DestVT.getVectorNumElements();
4030
4031 if (SrcVT.getVectorNumElements() == NElts) {
4032 EVT DestEltVT = DestVT.getVectorElementType();
4033
4034 SmallVector<SDValue, 8> CastedElts;
4035 SDLoc SL(N);
4036 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
4037 SDValue Elt = Src.getOperand(I);
4038 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
4039 }
4040
4041 return DAG.getBuildVector(DestVT, SL, CastedElts);
4042 }
4043 }
4044 }
4045
4046 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
4047 break;
4048
4049 // Fold bitcasts of constants.
4050 //
4051 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4052 // TODO: Generalize and move to DAGCombiner
4053 SDValue Src = N->getOperand(0);
4054 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4055 SDLoc SL(N);
4056 uint64_t CVal = C->getZExtValue();
4057 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4058 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4059 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4060 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4061 }
4062
4063 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4064 const APInt &Val = C->getValueAPF().bitcastToAPInt();
4065 SDLoc SL(N);
4066 uint64_t CVal = Val.getZExtValue();
4067 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4068 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4069 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4070
4071 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4072 }
4073
4074 break;
4075 }
4076 case ISD::SHL: {
4077 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4078 break;
4079
4080 return performShlCombine(N, DCI);
4081 }
4082 case ISD::SRL: {
4083 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4084 break;
4085
4086 return performSrlCombine(N, DCI);
4087 }
4088 case ISD::SRA: {
4089 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4090 break;
4091
4092 return performSraCombine(N, DCI);
4093 }
4094 case ISD::TRUNCATE:
4095 return performTruncateCombine(N, DCI);
4096 case ISD::MUL:
4097 return performMulCombine(N, DCI);
4098 case ISD::MULHS:
4099 return performMulhsCombine(N, DCI);
4100 case ISD::MULHU:
4101 return performMulhuCombine(N, DCI);
4102 case AMDGPUISD::MUL_I24:
4103 case AMDGPUISD::MUL_U24:
4104 case AMDGPUISD::MULHI_I24:
4105 case AMDGPUISD::MULHI_U24:
4106 return simplifyMul24(N, DCI);
4107 case ISD::SELECT:
4108 return performSelectCombine(N, DCI);
4109 case ISD::FNEG:
4110 return performFNegCombine(N, DCI);
4111 case ISD::FABS:
4112 return performFAbsCombine(N, DCI);
4113 case AMDGPUISD::BFE_I32:
4114 case AMDGPUISD::BFE_U32: {
4115 assert(!N->getValueType(0).isVector() &&
4116 "Vector handling of BFE not implemented");
4117 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4118 if (!Width)
4119 break;
4120
4121 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4122 if (WidthVal == 0)
4123 return DAG.getConstant(0, DL, MVT::i32);
4124
4125 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4126 if (!Offset)
4127 break;
4128
4129 SDValue BitsFrom = N->getOperand(0);
4130 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4131
4132 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4133
4134 if (OffsetVal == 0) {
4135 // This is already sign / zero extended, so try to fold away extra BFEs.
4136 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4137
4138 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4139 if (OpSignBits >= SignBits)
4140 return BitsFrom;
4141
4142 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4143 if (Signed) {
4144 // This is a sign_extend_inreg. Replace it to take advantage of existing
4145 // DAG Combines. If not eliminated, we will match back to BFE during
4146 // selection.
4147
4148 // TODO: The sext_inreg of extended types ends up here, although we could
4149 // handle them in a single BFE.
4150 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4151 DAG.getValueType(SmallVT));
4152 }
4153
4154 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4155 }
4156
4157 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4158 if (Signed) {
4159 return constantFoldBFE<int32_t>(DAG,
4160 CVal->getSExtValue(),
4161 OffsetVal,
4162 WidthVal,
4163 DL);
4164 }
4165
4166 return constantFoldBFE<uint32_t>(DAG,
4167 CVal->getZExtValue(),
4168 OffsetVal,
4169 WidthVal,
4170 DL);
4171 }
4172
4173 if ((OffsetVal + WidthVal) >= 32 &&
4174 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4175 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4176 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4177 BitsFrom, ShiftVal);
4178 }
4179
4180 if (BitsFrom.hasOneUse()) {
4181 APInt Demanded = APInt::getBitsSet(32,
4182 OffsetVal,
4183 OffsetVal + WidthVal);
4184
4185 KnownBits Known;
4186 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4187 !DCI.isBeforeLegalizeOps());
4188 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4189 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4190 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4191 DCI.CommitTargetLoweringOpt(TLO);
4192 }
4193 }
4194
4195 break;
4196 }
4197 case ISD::LOAD:
4198 return performLoadCombine(N, DCI);
4199 case ISD::STORE:
4200 return performStoreCombine(N, DCI);
4201 case AMDGPUISD::RCP:
4202 case AMDGPUISD::RCP_IFLAG:
4203 return performRcpCombine(N, DCI);
4204 case ISD::AssertZext:
4205 case ISD::AssertSext:
4206 return performAssertSZExtCombine(N, DCI);
4207 case ISD::INTRINSIC_WO_CHAIN:
4208 return performIntrinsicWOChainCombine(N, DCI);
4209 }
4210 return SDValue();
4211}
4212
4213//===----------------------------------------------------------------------===//
4214// Helper functions
4215//===----------------------------------------------------------------------===//
4216
4217SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4218 const TargetRegisterClass *RC,
4219 Register Reg, EVT VT,
4220 const SDLoc &SL,
4221 bool RawReg) const {
4222 MachineFunction &MF = DAG.getMachineFunction();
4223 MachineRegisterInfo &MRI = MF.getRegInfo();
4224 Register VReg;
4225
4226 if (!MRI.isLiveIn(Reg)) {
4227 VReg = MRI.createVirtualRegister(RC);
4228 MRI.addLiveIn(Reg, VReg);
4229 } else {
4230 VReg = MRI.getLiveInVirtReg(Reg);
4231 }
4232
4233 if (RawReg)
4234 return DAG.getRegister(VReg, VT);
4235
4236 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4237}
4238
4239// This may be called multiple times, and nothing prevents creating multiple
4240// objects at the same offset. See if we already defined this object.
4241static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4242 int64_t Offset) {
4243 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4244 if (MFI.getObjectOffset(I) == Offset) {
4245 assert(MFI.getObjectSize(I) == Size);
4246 return I;
4247 }
4248 }
4249
4250 return MFI.CreateFixedObject(Size, Offset, true);
4251}
4252
4253SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4254 EVT VT,
4255 const SDLoc &SL,
4256 int64_t Offset) const {
4257 MachineFunction &MF = DAG.getMachineFunction();
4258 MachineFrameInfo &MFI = MF.getFrameInfo();
4259 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4260
4261 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4262 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4263
4264 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4265 MachineMemOperand::MODereferenceable |
4266 MachineMemOperand::MOInvariant);
4267}
4268
4269SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4270 const SDLoc &SL,
4271 SDValue Chain,
4272 SDValue ArgVal,
4273 int64_t Offset) const {
4274 MachineFunction &MF = DAG.getMachineFunction();
4275 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4276 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4277
4278 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4279 // Stores to the argument stack area are relative to the stack pointer.
4280 SDValue SP =
4281 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4282 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4283 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4284 MachineMemOperand::MODereferenceable);
4285 return Store;
4286}
4287
4288SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4289 const TargetRegisterClass *RC,
4290 EVT VT, const SDLoc &SL,
4291 const ArgDescriptor &Arg) const {
4292 assert(Arg && "Attempting to load missing argument")((void)0);
4293
4294 SDValue V = Arg.isRegister() ?
1
'?' condition is true
4295 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4296 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4297
4298 if (!Arg.isMasked())
2
Calling 'ArgDescriptor::isMasked'
5
Returning from 'ArgDescriptor::isMasked'
6
Taking false branch
4299 return V;
4300
4301 unsigned Mask = Arg.getMask();
4302 unsigned Shift = countTrailingZeros<unsigned>(Mask);
7
Calling 'countTrailingZeros<unsigned int>'
14
Returning from 'countTrailingZeros<unsigned int>'
15
'Shift' initialized to 32
4303 V = DAG.getNode(ISD::SRL, SL, VT, V,
4304 DAG.getShiftAmountConstant(Shift, VT, SL));
4305 return DAG.getNode(ISD::AND, SL, VT, V,
4306 DAG.getConstant(Mask >> Shift, SL, VT));
16
The result of the right shift is undefined due to shifting by '32', which is greater or equal to the width of type 'unsigned int'
4307}
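The path above is the whole bug: ArgDescriptor::isMasked() (see AMDGPUArgumentUsageInfo.h below) is true for any mask other than ~0u, so a zero mask passes the check, countTrailingZeros(0u) returns 32 under its default ZB_Width behavior, and 'Mask >> Shift' then shifts a 32-bit unsigned by 32. A minimal standalone sketch of that arithmetic, with a hypothetical guard that is not in the original code:

    #include <cstdint>
    #include <cstdio>

    // Mirrors llvm::countTrailingZeros' default ZB_Width behavior for a
    // 32-bit value: an input of 0 yields the full bit width, 32.
    static unsigned ctzModel(uint32_t Val) {
      return Val == 0 ? 32u : static_cast<unsigned>(__builtin_ctz(Val));
    }

    // Model of the masked-argument path flagged at line 4306. With Mask == 0,
    // Shift == 32 and both 32-bit shifts below are undefined behavior.
    static uint32_t applyMaskModel(uint32_t V, uint32_t Mask) {
      if (Mask == 0)            // hypothetical guard, not present in the original
        return 0;
      unsigned Shift = ctzModel(Mask);
      return (V >> Shift) & (Mask >> Shift);
    }

    int main() {
      std::printf("%#x\n", applyMaskModel(0xABCDu, 0x3FC0u)); // 0xaf, bits [6,13]
    }

In practice a zero mask should never be constructed for a real argument, which is presumably why the source leaves the case unguarded; the analyzer simply cannot prove that.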
4308
4309uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4310 const MachineFunction &MF, const ImplicitParameter Param) const {
4311 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4312 const AMDGPUSubtarget &ST =
4313 AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4314 unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4315 const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4316 uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4317 ExplicitArgOffset;
4318 switch (Param) {
4319 case GRID_DIM:
4320 return ArgOffset;
4321 case GRID_OFFSET:
4322 return ArgOffset + 4;
4323 }
4324 llvm_unreachable("unexpected implicit parameter type")__builtin_unreachable();
4325}
4326
4327#define NODE_NAME_CASE(node)case AMDGPUISD::node: return "node"; case AMDGPUISD::node: return #node;
4328
4329const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4330 switch ((AMDGPUISD::NodeType)Opcode) {
4331 case AMDGPUISD::FIRST_NUMBER: break;
4332 // AMDIL DAG nodes
4333 NODE_NAME_CASE(UMUL)case AMDGPUISD::UMUL: return "UMUL";;
4334 NODE_NAME_CASE(BRANCH_COND)case AMDGPUISD::BRANCH_COND: return "BRANCH_COND";;
4335
4336 // AMDGPU DAG nodes
4337 NODE_NAME_CASE(IF)case AMDGPUISD::IF: return "IF";
4338 NODE_NAME_CASE(ELSE)case AMDGPUISD::ELSE: return "ELSE";
4339 NODE_NAME_CASE(LOOP)case AMDGPUISD::LOOP: return "LOOP";
4340 NODE_NAME_CASE(CALL)case AMDGPUISD::CALL: return "CALL";
4341 NODE_NAME_CASE(TC_RETURN)case AMDGPUISD::TC_RETURN: return "TC_RETURN";
4342 NODE_NAME_CASE(TRAP)case AMDGPUISD::TRAP: return "TRAP";
4343 NODE_NAME_CASE(RET_FLAG)case AMDGPUISD::RET_FLAG: return "RET_FLAG";
4344 NODE_NAME_CASE(RETURN_TO_EPILOG)case AMDGPUISD::RETURN_TO_EPILOG: return "RETURN_TO_EPILOG";
4345 NODE_NAME_CASE(ENDPGM)case AMDGPUISD::ENDPGM: return "ENDPGM";
4346 NODE_NAME_CASE(DWORDADDR)case AMDGPUISD::DWORDADDR: return "DWORDADDR";
4347 NODE_NAME_CASE(FRACT)case AMDGPUISD::FRACT: return "FRACT";
4348 NODE_NAME_CASE(SETCC)case AMDGPUISD::SETCC: return "SETCC";
4349 NODE_NAME_CASE(SETREG)case AMDGPUISD::SETREG: return "SETREG";
4350 NODE_NAME_CASE(DENORM_MODE)case AMDGPUISD::DENORM_MODE: return "DENORM_MODE";
4351 NODE_NAME_CASE(FMA_W_CHAIN)case AMDGPUISD::FMA_W_CHAIN: return "FMA_W_CHAIN";
4352 NODE_NAME_CASE(FMUL_W_CHAIN)case AMDGPUISD::FMUL_W_CHAIN: return "FMUL_W_CHAIN";
4353 NODE_NAME_CASE(CLAMP)case AMDGPUISD::CLAMP: return "CLAMP";
4354 NODE_NAME_CASE(COS_HW)case AMDGPUISD::COS_HW: return "COS_HW";
4355 NODE_NAME_CASE(SIN_HW)case AMDGPUISD::SIN_HW: return "SIN_HW";
4356 NODE_NAME_CASE(FMAX_LEGACY)case AMDGPUISD::FMAX_LEGACY: return "FMAX_LEGACY";
4357 NODE_NAME_CASE(FMIN_LEGACY)case AMDGPUISD::FMIN_LEGACY: return "FMIN_LEGACY";
4358 NODE_NAME_CASE(FMAX3)case AMDGPUISD::FMAX3: return "FMAX3";
4359 NODE_NAME_CASE(SMAX3)case AMDGPUISD::SMAX3: return "SMAX3";
4360 NODE_NAME_CASE(UMAX3)case AMDGPUISD::UMAX3: return "UMAX3";
4361 NODE_NAME_CASE(FMIN3)case AMDGPUISD::FMIN3: return "FMIN3";
4362 NODE_NAME_CASE(SMIN3)case AMDGPUISD::SMIN3: return "SMIN3";
4363 NODE_NAME_CASE(UMIN3)case AMDGPUISD::UMIN3: return "UMIN3";
4364 NODE_NAME_CASE(FMED3)case AMDGPUISD::FMED3: return "FMED3";
4365 NODE_NAME_CASE(SMED3)case AMDGPUISD::SMED3: return "SMED3";
4366 NODE_NAME_CASE(UMED3)case AMDGPUISD::UMED3: return "UMED3";
4367 NODE_NAME_CASE(FDOT2)case AMDGPUISD::FDOT2: return "FDOT2";
4368 NODE_NAME_CASE(URECIP)case AMDGPUISD::URECIP: return "URECIP";
4369 NODE_NAME_CASE(DIV_SCALE)case AMDGPUISD::DIV_SCALE: return "DIV_SCALE";
4370 NODE_NAME_CASE(DIV_FMAS)case AMDGPUISD::DIV_FMAS: return "DIV_FMAS";
4371 NODE_NAME_CASE(DIV_FIXUP)case AMDGPUISD::DIV_FIXUP: return "DIV_FIXUP";
4372 NODE_NAME_CASE(FMAD_FTZ)case AMDGPUISD::FMAD_FTZ: return "FMAD_FTZ";
4373 NODE_NAME_CASE(RCP)case AMDGPUISD::RCP: return "RCP";
4374 NODE_NAME_CASE(RSQ)case AMDGPUISD::RSQ: return "RSQ";
4375 NODE_NAME_CASE(RCP_LEGACY)case AMDGPUISD::RCP_LEGACY: return "RCP_LEGACY";
4376 NODE_NAME_CASE(RCP_IFLAG)case AMDGPUISD::RCP_IFLAG: return "RCP_IFLAG";
4377 NODE_NAME_CASE(FMUL_LEGACY)case AMDGPUISD::FMUL_LEGACY: return "FMUL_LEGACY";
4378 NODE_NAME_CASE(RSQ_CLAMP)case AMDGPUISD::RSQ_CLAMP: return "RSQ_CLAMP";
4379 NODE_NAME_CASE(LDEXP)case AMDGPUISD::LDEXP: return "LDEXP";
4380 NODE_NAME_CASE(FP_CLASS)case AMDGPUISD::FP_CLASS: return "FP_CLASS";
4381 NODE_NAME_CASE(DOT4)case AMDGPUISD::DOT4: return "DOT4";
4382 NODE_NAME_CASE(CARRY)case AMDGPUISD::CARRY: return "CARRY";
4383 NODE_NAME_CASE(BORROW)case AMDGPUISD::BORROW: return "BORROW";
4384 NODE_NAME_CASE(BFE_U32)case AMDGPUISD::BFE_U32: return "BFE_U32";
4385 NODE_NAME_CASE(BFE_I32)case AMDGPUISD::BFE_I32: return "BFE_I32";
4386 NODE_NAME_CASE(BFI)case AMDGPUISD::BFI: return "BFI";
4387 NODE_NAME_CASE(BFM)case AMDGPUISD::BFM: return "BFM";
4388 NODE_NAME_CASE(FFBH_U32)case AMDGPUISD::FFBH_U32: return "FFBH_U32";
4389 NODE_NAME_CASE(FFBH_I32)case AMDGPUISD::FFBH_I32: return "FFBH_I32";
4390 NODE_NAME_CASE(FFBL_B32)case AMDGPUISD::FFBL_B32: return "FFBL_B32";
4391 NODE_NAME_CASE(MUL_U24)case AMDGPUISD::MUL_U24: return "MUL_U24";
4392 NODE_NAME_CASE(MUL_I24)case AMDGPUISD::MUL_I24: return "MUL_I24";
4393 NODE_NAME_CASE(MULHI_U24)case AMDGPUISD::MULHI_U24: return "MULHI_U24";
4394 NODE_NAME_CASE(MULHI_I24)case AMDGPUISD::MULHI_I24: return "MULHI_I24";
4395 NODE_NAME_CASE(MAD_U24)case AMDGPUISD::MAD_U24: return "MAD_U24";
4396 NODE_NAME_CASE(MAD_I24)case AMDGPUISD::MAD_I24: return "MAD_I24";
4397 NODE_NAME_CASE(MAD_I64_I32)case AMDGPUISD::MAD_I64_I32: return "MAD_I64_I32";
4398 NODE_NAME_CASE(MAD_U64_U32)case AMDGPUISD::MAD_U64_U32: return "MAD_U64_U32";
4399 NODE_NAME_CASE(PERM)case AMDGPUISD::PERM: return "PERM";
4400 NODE_NAME_CASE(TEXTURE_FETCH)case AMDGPUISD::TEXTURE_FETCH: return "TEXTURE_FETCH";
4401 NODE_NAME_CASE(R600_EXPORT)case AMDGPUISD::R600_EXPORT: return "R600_EXPORT";
4402 NODE_NAME_CASE(CONST_ADDRESS)case AMDGPUISD::CONST_ADDRESS: return "CONST_ADDRESS";
4403 NODE_NAME_CASE(REGISTER_LOAD)case AMDGPUISD::REGISTER_LOAD: return "REGISTER_LOAD";
4404 NODE_NAME_CASE(REGISTER_STORE)case AMDGPUISD::REGISTER_STORE: return "REGISTER_STORE";
4405 NODE_NAME_CASE(SAMPLE)case AMDGPUISD::SAMPLE: return "SAMPLE";
4406 NODE_NAME_CASE(SAMPLEB)case AMDGPUISD::SAMPLEB: return "SAMPLEB";
4407 NODE_NAME_CASE(SAMPLED)case AMDGPUISD::SAMPLED: return "SAMPLED";
4408 NODE_NAME_CASE(SAMPLEL)case AMDGPUISD::SAMPLEL: return "SAMPLEL";
4409 NODE_NAME_CASE(CVT_F32_UBYTE0)case AMDGPUISD::CVT_F32_UBYTE0: return "CVT_F32_UBYTE0";
4410 NODE_NAME_CASE(CVT_F32_UBYTE1)case AMDGPUISD::CVT_F32_UBYTE1: return "CVT_F32_UBYTE1";
4411 NODE_NAME_CASE(CVT_F32_UBYTE2)case AMDGPUISD::CVT_F32_UBYTE2: return "CVT_F32_UBYTE2";
4412 NODE_NAME_CASE(CVT_F32_UBYTE3)case AMDGPUISD::CVT_F32_UBYTE3: return "CVT_F32_UBYTE3";
4413 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)case AMDGPUISD::CVT_PKRTZ_F16_F32: return "CVT_PKRTZ_F16_F32";
4414 NODE_NAME_CASE(CVT_PKNORM_I16_F32)case AMDGPUISD::CVT_PKNORM_I16_F32: return "CVT_PKNORM_I16_F32";
4415 NODE_NAME_CASE(CVT_PKNORM_U16_F32)case AMDGPUISD::CVT_PKNORM_U16_F32: return "CVT_PKNORM_U16_F32";
4416 NODE_NAME_CASE(CVT_PK_I16_I32)case AMDGPUISD::CVT_PK_I16_I32: return "CVT_PK_I16_I32";
4417 NODE_NAME_CASE(CVT_PK_U16_U32)case AMDGPUISD::CVT_PK_U16_U32: return "CVT_PK_U16_U32";
4418 NODE_NAME_CASE(FP_TO_FP16)case AMDGPUISD::FP_TO_FP16: return "FP_TO_FP16";
4419 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)case AMDGPUISD::BUILD_VERTICAL_VECTOR: return "BUILD_VERTICAL_VECTOR";
4420 NODE_NAME_CASE(CONST_DATA_PTR)case AMDGPUISD::CONST_DATA_PTR: return "CONST_DATA_PTR";
4421 NODE_NAME_CASE(PC_ADD_REL_OFFSET)case AMDGPUISD::PC_ADD_REL_OFFSET: return "PC_ADD_REL_OFFSET";
4422 NODE_NAME_CASE(LDS)case AMDGPUISD::LDS: return "LDS";
4423 NODE_NAME_CASE(DUMMY_CHAIN)case AMDGPUISD::DUMMY_CHAIN: return "DUMMY_CHAIN";
4424 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4425 NODE_NAME_CASE(LOAD_D16_HI)case AMDGPUISD::LOAD_D16_HI: return "LOAD_D16_HI";
4426 NODE_NAME_CASE(LOAD_D16_LO)case AMDGPUISD::LOAD_D16_LO: return "LOAD_D16_LO";
4427 NODE_NAME_CASE(LOAD_D16_HI_I8)case AMDGPUISD::LOAD_D16_HI_I8: return "LOAD_D16_HI_I8";
4428 NODE_NAME_CASE(LOAD_D16_HI_U8)case AMDGPUISD::LOAD_D16_HI_U8: return "LOAD_D16_HI_U8";
4429 NODE_NAME_CASE(LOAD_D16_LO_I8)case AMDGPUISD::LOAD_D16_LO_I8: return "LOAD_D16_LO_I8";
4430 NODE_NAME_CASE(LOAD_D16_LO_U8)case AMDGPUISD::LOAD_D16_LO_U8: return "LOAD_D16_LO_U8";
4431 NODE_NAME_CASE(STORE_MSKOR)case AMDGPUISD::STORE_MSKOR: return "STORE_MSKOR";
4432 NODE_NAME_CASE(LOAD_CONSTANT)case AMDGPUISD::LOAD_CONSTANT: return "LOAD_CONSTANT";
4433 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)case AMDGPUISD::TBUFFER_STORE_FORMAT: return "TBUFFER_STORE_FORMAT";
4434 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)case AMDGPUISD::TBUFFER_STORE_FORMAT_D16: return "TBUFFER_STORE_FORMAT_D16";
4435 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)case AMDGPUISD::TBUFFER_LOAD_FORMAT: return "TBUFFER_LOAD_FORMAT";
4436 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)case AMDGPUISD::TBUFFER_LOAD_FORMAT_D16: return "TBUFFER_LOAD_FORMAT_D16";
4437 NODE_NAME_CASE(DS_ORDERED_COUNT)case AMDGPUISD::DS_ORDERED_COUNT: return "DS_ORDERED_COUNT";
4438 NODE_NAME_CASE(ATOMIC_CMP_SWAP)case AMDGPUISD::ATOMIC_CMP_SWAP: return "ATOMIC_CMP_SWAP";
4439 NODE_NAME_CASE(ATOMIC_INC)case AMDGPUISD::ATOMIC_INC: return "ATOMIC_INC";
4440 NODE_NAME_CASE(ATOMIC_DEC)case AMDGPUISD::ATOMIC_DEC: return "ATOMIC_DEC";
4441 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)case AMDGPUISD::ATOMIC_LOAD_FMIN: return "ATOMIC_LOAD_FMIN";
4442 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)case AMDGPUISD::ATOMIC_LOAD_FMAX: return "ATOMIC_LOAD_FMAX";
4443 NODE_NAME_CASE(BUFFER_LOAD)case AMDGPUISD::BUFFER_LOAD: return "BUFFER_LOAD";
4444 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)case AMDGPUISD::BUFFER_LOAD_UBYTE: return "BUFFER_LOAD_UBYTE";
4445 NODE_NAME_CASE(BUFFER_LOAD_USHORT)case AMDGPUISD::BUFFER_LOAD_USHORT: return "BUFFER_LOAD_USHORT";
4446 NODE_NAME_CASE(BUFFER_LOAD_BYTE)case AMDGPUISD::BUFFER_LOAD_BYTE: return "BUFFER_LOAD_BYTE";
4447 NODE_NAME_CASE(BUFFER_LOAD_SHORT)case AMDGPUISD::BUFFER_LOAD_SHORT: return "BUFFER_LOAD_SHORT";
4448 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)case AMDGPUISD::BUFFER_LOAD_FORMAT: return "BUFFER_LOAD_FORMAT";
4449 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)case AMDGPUISD::BUFFER_LOAD_FORMAT_D16: return "BUFFER_LOAD_FORMAT_D16";
4450 NODE_NAME_CASE(SBUFFER_LOAD)case AMDGPUISD::SBUFFER_LOAD: return "SBUFFER_LOAD";
4451 NODE_NAME_CASE(BUFFER_STORE)case AMDGPUISD::BUFFER_STORE: return "BUFFER_STORE";
4452 NODE_NAME_CASE(BUFFER_STORE_BYTE)case AMDGPUISD::BUFFER_STORE_BYTE: return "BUFFER_STORE_BYTE";
4453 NODE_NAME_CASE(BUFFER_STORE_SHORT)case AMDGPUISD::BUFFER_STORE_SHORT: return "BUFFER_STORE_SHORT";
4454 NODE_NAME_CASE(BUFFER_STORE_FORMAT)case AMDGPUISD::BUFFER_STORE_FORMAT: return "BUFFER_STORE_FORMAT";
4455 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)case AMDGPUISD::BUFFER_STORE_FORMAT_D16: return "BUFFER_STORE_FORMAT_D16";
4456 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)case AMDGPUISD::BUFFER_ATOMIC_SWAP: return "BUFFER_ATOMIC_SWAP";
4457 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)case AMDGPUISD::BUFFER_ATOMIC_ADD: return "BUFFER_ATOMIC_ADD";
4458 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)case AMDGPUISD::BUFFER_ATOMIC_SUB: return "BUFFER_ATOMIC_SUB";
4459 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)case AMDGPUISD::BUFFER_ATOMIC_SMIN: return "BUFFER_ATOMIC_SMIN";
4460 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)case AMDGPUISD::BUFFER_ATOMIC_UMIN: return "BUFFER_ATOMIC_UMIN";
4461 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)case AMDGPUISD::BUFFER_ATOMIC_SMAX: return "BUFFER_ATOMIC_SMAX";
4462 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)case AMDGPUISD::BUFFER_ATOMIC_UMAX: return "BUFFER_ATOMIC_UMAX";
4463 NODE_NAME_CASE(BUFFER_ATOMIC_AND)case AMDGPUISD::BUFFER_ATOMIC_AND: return "BUFFER_ATOMIC_AND";
4464 NODE_NAME_CASE(BUFFER_ATOMIC_OR)case AMDGPUISD::BUFFER_ATOMIC_OR: return "BUFFER_ATOMIC_OR";
4465 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)case AMDGPUISD::BUFFER_ATOMIC_XOR: return "BUFFER_ATOMIC_XOR";
4466 NODE_NAME_CASE(BUFFER_ATOMIC_INC)case AMDGPUISD::BUFFER_ATOMIC_INC: return "BUFFER_ATOMIC_INC";
4467 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)case AMDGPUISD::BUFFER_ATOMIC_DEC: return "BUFFER_ATOMIC_DEC";
4468 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: return "BUFFER_ATOMIC_CMPSWAP";
4469 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)case AMDGPUISD::BUFFER_ATOMIC_CSUB: return "BUFFER_ATOMIC_CSUB";
4470 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)case AMDGPUISD::BUFFER_ATOMIC_FADD: return "BUFFER_ATOMIC_FADD";
4471 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)case AMDGPUISD::BUFFER_ATOMIC_FMIN: return "BUFFER_ATOMIC_FMIN";
4472 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)case AMDGPUISD::BUFFER_ATOMIC_FMAX: return "BUFFER_ATOMIC_FMAX";
4473
4474 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4475 }
4476 return nullptr;
4477}
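NODE_NAME_CASE relies on the preprocessor's stringizing operator: '#node' turns the enumerator spelling into the returned string, which is exactly what the inline expansions shown above indicate. A tiny analogue (illustrative names only, not AMDGPU code):

    #include <cstdio>

    // Each case returns the enumerator's own spelling via #x.
    #define NAME_CASE(x) case x: return #x;

    enum DemoNode { DEMO_ADD, DEMO_MUL };

    static const char *demoNodeName(DemoNode N) {
      switch (N) {
      NAME_CASE(DEMO_ADD)
      NAME_CASE(DEMO_MUL)
      }
      return nullptr; // unknown or unnamed opcode, as in getTargetNodeName
    }

    int main() { std::puts(demoNodeName(DEMO_MUL)); } // prints "DEMO_MUL"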
4478
4479SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4480 SelectionDAG &DAG, int Enabled,
4481 int &RefinementSteps,
4482 bool &UseOneConstNR,
4483 bool Reciprocal) const {
4484 EVT VT = Operand.getValueType();
4485
4486 if (VT == MVT::f32) {
4487 RefinementSteps = 0;
4488 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4489 }
4490
4491 // TODO: There is also f64 rsq instruction, but the documentation is less
4492 // clear on its precision.
4493
4494 return SDValue();
4495}
4496
4497SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4498 SelectionDAG &DAG, int Enabled,
4499 int &RefinementSteps) const {
4500 EVT VT = Operand.getValueType();
4501
4502 if (VT == MVT::f32) {
4503 // Reciprocal, < 1 ulp error.
4504 //
4505 // This reciprocal approximation converges to < 0.5 ulp error with one
4506 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4507
4508 RefinementSteps = 0;
4509 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4510 }
4511
4512 // TODO: There is also f64 rcp instruction, but the documentation is less
4513 // clear on its precision.
4514
4515 return SDValue();
4516}
4517
4518void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4519 const SDValue Op, KnownBits &Known,
4520 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4521
4522 Known.resetAll(); // Don't know anything.
4523
4524 unsigned Opc = Op.getOpcode();
4525
4526 switch (Opc) {
4527 default:
4528 break;
4529 case AMDGPUISD::CARRY:
4530 case AMDGPUISD::BORROW: {
4531 Known.Zero = APInt::getHighBitsSet(32, 31);
4532 break;
4533 }
4534
4535 case AMDGPUISD::BFE_I32:
4536 case AMDGPUISD::BFE_U32: {
4537 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4538 if (!CWidth)
4539 return;
4540
4541 uint32_t Width = CWidth->getZExtValue() & 0x1f;
4542
4543 if (Opc == AMDGPUISD::BFE_U32)
4544 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4545
4546 break;
4547 }
4548 case AMDGPUISD::FP_TO_FP16: {
4549 unsigned BitWidth = Known.getBitWidth();
4550
4551 // High bits are zero.
4552 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4553 break;
4554 }
4555 case AMDGPUISD::MUL_U24:
4556 case AMDGPUISD::MUL_I24: {
4557 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4558 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4559 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4560 RHSKnown.countMinTrailingZeros();
4561 Known.Zero.setLowBits(std::min(TrailZ, 32u));
4562 // Skip extra check if all bits are known zeros.
4563 if (TrailZ >= 32)
4564 break;
4565
4566 // Truncate to 24 bits.
4567 LHSKnown = LHSKnown.trunc(24);
4568 RHSKnown = RHSKnown.trunc(24);
4569
4570 if (Opc == AMDGPUISD::MUL_I24) {
4571 unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
4572 unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
4573 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4574 if (MaxValBits >= 32)
4575 break;
4576 bool LHSNegative = LHSKnown.isNegative();
4577 bool LHSNonNegative = LHSKnown.isNonNegative();
4578 bool LHSPositive = LHSKnown.isStrictlyPositive();
4579 bool RHSNegative = RHSKnown.isNegative();
4580 bool RHSNonNegative = RHSKnown.isNonNegative();
4581 bool RHSPositive = RHSKnown.isStrictlyPositive();
4582
4583 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4584 Known.Zero.setHighBits(32 - MaxValBits);
4585 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4586 Known.One.setHighBits(32 - MaxValBits);
4587 } else {
4588 unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
4589 unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
4590 unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4591 if (MaxValBits >= 32)
4592 break;
4593 Known.Zero.setHighBits(32 - MaxValBits);
4594 }
4595 break;
4596 }
4597 case AMDGPUISD::PERM: {
4598 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4599 if (!CMask)
4600 return;
4601
4602 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4603 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4604 unsigned Sel = CMask->getZExtValue();
4605
4606 for (unsigned I = 0; I < 32; I += 8) {
4607 unsigned SelBits = Sel & 0xff;
4608 if (SelBits < 4) {
4609 SelBits *= 8;
4610 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4611 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4612 } else if (SelBits < 7) {
4613 SelBits = (SelBits & 3) * 8;
4614 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4615 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4616 } else if (SelBits == 0x0c) {
4617 Known.Zero |= 0xFFull << I;
4618 } else if (SelBits > 0x0c) {
4619 Known.One |= 0xFFull << I;
4620 }
4621 Sel >>= 8;
4622 }
4623 break;
4624 }
4625 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4626 Known.Zero.setHighBits(24);
4627 break;
4628 }
4629 case AMDGPUISD::BUFFER_LOAD_USHORT: {
4630 Known.Zero.setHighBits(16);
4631 break;
4632 }
4633 case AMDGPUISD::LDS: {
4634 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4635 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4636
4637 Known.Zero.setHighBits(16);
4638 Known.Zero.setLowBits(Log2(Alignment));
4639 break;
4640 }
4641 case ISD::INTRINSIC_WO_CHAIN: {
4642 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4643 switch (IID) {
4644 case Intrinsic::amdgcn_mbcnt_lo:
4645 case Intrinsic::amdgcn_mbcnt_hi: {
4646 const GCNSubtarget &ST =
4647 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4648 // These return at most the wavefront size - 1.
4649 unsigned Size = Op.getValueType().getSizeInBits();
4650 Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4651 break;
4652 }
4653 default:
4654 break;
4655 }
4656 }
4657 }
4658}
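For the BFE_U32 case above, a constant width W means only the low W bits of the result can be nonzero, so APInt::getHighBitsSet(32, 32 - Width) marks the top 32 - W bits as known zero. The same mask in plain integer arithmetic (a sketch, not the APInt API):

    #include <cstdint>
    #include <cstdio>

    // Known-zero mask for an unsigned bitfield extract of width W (0 <= W <= 31,
    // matching the 'Width & 0x1f' truncation above): the high 32 - W bits are 0.
    static uint32_t knownZeroForBfeU32(unsigned W) {
      return W >= 32 ? 0u : static_cast<uint32_t>(UINT32_MAX << W);
    }

    int main() {
      std::printf("%#010x\n", knownZeroForBfeU32(8)); // 0xffffff00
    }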
4659
4660unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4661 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4662 unsigned Depth) const {
4663 switch (Op.getOpcode()) {
4664 case AMDGPUISD::BFE_I32: {
4665 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4666 if (!Width)
4667 return 1;
4668
4669 unsigned SignBits = 32 - Width->getZExtValue() + 1;
4670 if (!isNullConstant(Op.getOperand(1)))
4671 return SignBits;
4672
4673 // TODO: Could probably figure something out with non-0 offsets.
4674 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4675 return std::max(SignBits, Op0SignBits);
4676 }
4677
4678 case AMDGPUISD::BFE_U32: {
4679 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4680 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4681 }
4682
4683 case AMDGPUISD::CARRY:
4684 case AMDGPUISD::BORROW:
4685 return 31;
4686 case AMDGPUISD::BUFFER_LOAD_BYTE:
4687 return 25;
4688 case AMDGPUISD::BUFFER_LOAD_SHORT:
4689 return 17;
4690 case AMDGPUISD::BUFFER_LOAD_UBYTE:
4691 return 24;
4692 case AMDGPUISD::BUFFER_LOAD_USHORT:
4693 return 16;
4694 case AMDGPUISD::FP_TO_FP16:
4695 return 16;
4696 default:
4697 return 1;
4698 }
4699}
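The constants in this switch are the bit arithmetic written out: a signed bitfield extract of width W at offset 0 replicates the field's sign into the top 32 - W bits, so at least 32 - W + 1 sign bits are known; W = 8 gives 25 and W = 16 gives 17, matching the signed BUFFER_LOAD_BYTE and BUFFER_LOAD_SHORT entries in the same switch. As a worked check:

    #include <cstdio>

    // Sign-bit count for a signed W-bit field extracted at offset 0 of a
    // 32-bit value: the top 32 - W copies of the sign bit, plus the sign bit.
    static unsigned signBitsForBfeI32(unsigned W) {
      return 32u - W + 1u;
    }

    int main() {
      std::printf("%u %u\n", signBitsForBfeI32(8), signBitsForBfeI32(16)); // 25 17
    }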
4700
4701unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4702 GISelKnownBits &Analysis, Register R,
4703 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4704 unsigned Depth) const {
4705 const MachineInstr *MI = MRI.getVRegDef(R);
4706 if (!MI)
4707 return 1;
4708
4709 // TODO: Check range metadata on MMO.
4710 switch (MI->getOpcode()) {
4711 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4712 return 25;
4713 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4714 return 17;
4715 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4716 return 24;
4717 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4718 return 16;
4719 default:
4720 return 1;
4721 }
4722}
4723
4724bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4725 const SelectionDAG &DAG,
4726 bool SNaN,
4727 unsigned Depth) const {
4728 unsigned Opcode = Op.getOpcode();
4729 switch (Opcode) {
4730 case AMDGPUISD::FMIN_LEGACY:
4731 case AMDGPUISD::FMAX_LEGACY: {
4732 if (SNaN)
4733 return true;
4734
4735 // TODO: Can check no nans on one of the operands for each one, but which
4736 // one?
4737 return false;
4738 }
4739 case AMDGPUISD::FMUL_LEGACY:
4740 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4741 if (SNaN)
4742 return true;
4743 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4744 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4745 }
4746 case AMDGPUISD::FMED3:
4747 case AMDGPUISD::FMIN3:
4748 case AMDGPUISD::FMAX3:
4749 case AMDGPUISD::FMAD_FTZ: {
4750 if (SNaN)
4751 return true;
4752 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4753 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4754 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4755 }
4756 case AMDGPUISD::CVT_F32_UBYTE0:
4757 case AMDGPUISD::CVT_F32_UBYTE1:
4758 case AMDGPUISD::CVT_F32_UBYTE2:
4759 case AMDGPUISD::CVT_F32_UBYTE3:
4760 return true;
4761
4762 case AMDGPUISD::RCP:
4763 case AMDGPUISD::RSQ:
4764 case AMDGPUISD::RCP_LEGACY:
4765 case AMDGPUISD::RSQ_CLAMP: {
4766 if (SNaN)
4767 return true;
4768
4769 // TODO: Need is known positive check.
4770 return false;
4771 }
4772 case AMDGPUISD::LDEXP:
4773 case AMDGPUISD::FRACT: {
4774 if (SNaN)
4775 return true;
4776 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4777 }
4778 case AMDGPUISD::DIV_SCALE:
4779 case AMDGPUISD::DIV_FMAS:
4780 case AMDGPUISD::DIV_FIXUP:
4781 // TODO: Refine on operands.
4782 return SNaN;
4783 case AMDGPUISD::SIN_HW:
4784 case AMDGPUISD::COS_HW: {
4785 // TODO: Need check for infinity
4786 return SNaN;
4787 }
4788 case ISD::INTRINSIC_WO_CHAIN: {
4789 unsigned IntrinsicID
4790 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4791 // TODO: Handle more intrinsics
4792 switch (IntrinsicID) {
4793 case Intrinsic::amdgcn_cubeid:
4794 return true;
4795
4796 case Intrinsic::amdgcn_frexp_mant: {
4797 if (SNaN)
4798 return true;
4799 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4800 }
4801 case Intrinsic::amdgcn_cvt_pkrtz: {
4802 if (SNaN)
4803 return true;
4804 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4805 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4806 }
4807 case Intrinsic::amdgcn_rcp:
4808 case Intrinsic::amdgcn_rsq:
4809 case Intrinsic::amdgcn_rcp_legacy:
4810 case Intrinsic::amdgcn_rsq_legacy:
4811 case Intrinsic::amdgcn_rsq_clamp: {
4812 if (SNaN)
4813 return true;
4814
4815 // TODO: Need is known positive check.
4816 return false;
4817 }
4818 case Intrinsic::amdgcn_trig_preop:
4819 case Intrinsic::amdgcn_fdot2:
4820 // TODO: Refine on operand
4821 return SNaN;
4822 case Intrinsic::amdgcn_fma_legacy:
4823 if (SNaN)
4824 return true;
4825 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4826 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
4827 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
4828 default:
4829 return false;
4830 }
4831 }
4832 default:
4833 return false;
4834 }
4835}
4836
4837TargetLowering::AtomicExpansionKind
4838AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4839 switch (RMW->getOperation()) {
4840 case AtomicRMWInst::Nand:
4841 case AtomicRMWInst::FAdd:
4842 case AtomicRMWInst::FSub:
4843 return AtomicExpansionKind::CmpXChg;
4844 default:
4845 return AtomicExpansionKind::None;
4846 }
4847}
4848
4849bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal(
4850 unsigned Opc, LLT Ty1, LLT Ty2) const {
4851 return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
4852}

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

1//==- AMDGPUArgumentUsageInfo.h - Function Arg Usage Info --------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
11
12#include "llvm/CodeGen/Register.h"
13#include "llvm/Pass.h"
14
15namespace llvm {
16
17class Function;
18class LLT;
19class raw_ostream;
20class TargetRegisterClass;
21class TargetRegisterInfo;
22
23struct ArgDescriptor {
24private:
25 friend struct AMDGPUFunctionArgInfo;
26 friend class AMDGPUArgumentUsageInfo;
27
28 union {
29 MCRegister Reg;
30 unsigned StackOffset;
31 };
32
33 // Bitmask to locate argument within the register.
34 unsigned Mask;
35
36 bool IsStack : 1;
37 bool IsSet : 1;
38
39public:
40 constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
41 bool IsStack = false, bool IsSet = false)
42 : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
43
44 static constexpr ArgDescriptor createRegister(Register Reg,
45 unsigned Mask = ~0u) {
46 return ArgDescriptor(Reg, Mask, false, true);
47 }
48
49 static constexpr ArgDescriptor createStack(unsigned Offset,
50 unsigned Mask = ~0u) {
51 return ArgDescriptor(Offset, Mask, true, true);
52 }
53
54 static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
55 unsigned Mask) {
56 return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
57 }
58
59 bool isSet() const {
60 return IsSet;
61 }
62
63 explicit operator bool() const {
64 return isSet();
65 }
66
67 bool isRegister() const {
68 return !IsStack;
69 }
70
71 MCRegister getRegister() const {
72 assert(!IsStack)((void)0);
73 return Reg;
74 }
75
76 unsigned getStackOffset() const {
77 assert(IsStack)((void)0);
78 return StackOffset;
79 }
80
81 unsigned getMask() const {
82 return Mask;
83 }
84
85 bool isMasked() const {
86 return Mask != ~0u;
3
Assuming the condition is true
4
Returning the value 1, which participates in a condition later
87 }
88
89 void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
90};
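Note that isMasked() only compares against the all-ones default, so a descriptor whose mask happens to be 0 still reports itself as masked; that is the state the analyzer assumes on the path into loadInputValue above. A reduced model (not the LLVM type):

    #include <cstdio>

    // Reduced model of the mask check: "masked" just means "not the ~0u default".
    struct ArgDescriptorModel {
      unsigned Mask;
      bool isMasked() const { return Mask != ~0u; }
    };

    int main() {
      ArgDescriptorModel Unmasked{~0u}; // default, reports not masked
      ArgDescriptorModel Zero{0u};      // degenerate mask, still reports masked
      std::printf("%d %d\n", Unmasked.isMasked(), Zero.isMasked()); // 0 1
    }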
91
92inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
93 Arg.print(OS);
94 return OS;
95}
96
97struct AMDGPUFunctionArgInfo {
98 enum PreloadedValue {
99 // SGPRS:
100 PRIVATE_SEGMENT_BUFFER = 0,
101 DISPATCH_PTR = 1,
102 QUEUE_PTR = 2,
103 KERNARG_SEGMENT_PTR = 3,
104 DISPATCH_ID = 4,
105 FLAT_SCRATCH_INIT = 5,
106 WORKGROUP_ID_X = 10,
107 WORKGROUP_ID_Y = 11,
108 WORKGROUP_ID_Z = 12,
109 PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
110 IMPLICIT_BUFFER_PTR = 15,
111 IMPLICIT_ARG_PTR = 16,
112
113 // VGPRS:
114 WORKITEM_ID_X = 17,
115 WORKITEM_ID_Y = 18,
116 WORKITEM_ID_Z = 19,
117 FIRST_VGPR_VALUE = WORKITEM_ID_X
118 };
119
120 // Kernel input registers setup for the HSA ABI in allocation order.
121
122 // User SGPRs in kernels
123 // XXX - Can these require argument spills?
124 ArgDescriptor PrivateSegmentBuffer;
125 ArgDescriptor DispatchPtr;
126 ArgDescriptor QueuePtr;
127 ArgDescriptor KernargSegmentPtr;
128 ArgDescriptor DispatchID;
129 ArgDescriptor FlatScratchInit;
130 ArgDescriptor PrivateSegmentSize;
131
132 // System SGPRs in kernels.
133 ArgDescriptor WorkGroupIDX;
134 ArgDescriptor WorkGroupIDY;
135 ArgDescriptor WorkGroupIDZ;
136 ArgDescriptor WorkGroupInfo;
137 ArgDescriptor PrivateSegmentWaveByteOffset;
138
139 // Pointer with offset from kernargsegmentptr to where special ABI arguments
140 // are passed to callable functions.
141 ArgDescriptor ImplicitArgPtr;
142
143 // Input registers for non-HSA ABI
144 ArgDescriptor ImplicitBufferPtr;
145
146 // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed
147 // into v0, 10 bits per dimension if packed-tid is set.
148 ArgDescriptor WorkItemIDX;
149 ArgDescriptor WorkItemIDY;
150 ArgDescriptor WorkItemIDZ;
151
152 std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
153 getPreloadedValue(PreloadedValue Value) const;
154
155 static constexpr AMDGPUFunctionArgInfo fixedABILayout();
156};
157
158class AMDGPUArgumentUsageInfo : public ImmutablePass {
159private:
160 DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
161
162public:
163 static char ID;
164
165 static const AMDGPUFunctionArgInfo ExternFunctionInfo;
166 static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
167
168 AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
169
170 void getAnalysisUsage(AnalysisUsage &AU) const override {
171 AU.setPreservesAll();
172 }
173
174 bool doInitialization(Module &M) override;
175 bool doFinalization(Module &M) override;
176
177 void print(raw_ostream &OS, const Module *M = nullptr) const override;
178
179 void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) {
180 ArgInfoMap[&F] = ArgInfo;
181 }
182
183 const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
184};
185
186} // end namespace llvm
187
188#endif

/usr/src/gnu/usr.bin/clang/libLLVM/../../../llvm/llvm/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some functions that are useful for math stuff.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_SUPPORT_MATHEXTRAS_H
14#define LLVM_SUPPORT_MATHEXTRAS_H
15
16#include "llvm/Support/Compiler.h"
17#include <cassert>
18#include <climits>
19#include <cmath>
20#include <cstdint>
21#include <cstring>
22#include <limits>
23#include <type_traits>
24
25#ifdef __ANDROID_NDK__
26#include <android/api-level.h>
27#endif
28
29#ifdef _MSC_VER
30// Declare these intrinsics manually rather including intrin.h. It's very
31// expensive, and MathExtras.h is popular.
32// #include <intrin.h>
33extern "C" {
34unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
35unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
36unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
37unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
38}
39#endif
40
41namespace llvm {
42
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53/// Mathematical constants.
54namespace numbers {
55// TODO: Track C++20 std::numbers.
56// TODO: Favor using the hexadecimal FP constants (requires C++17).
57constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113
58 egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620
59 ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162
60 ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392
61 log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0)
62 log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2)
63 pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796
64 inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541
65 sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161
66 inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197
67 sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193
68 inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1)
69 sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194
70 inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1)
71 phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622
72constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113
73 egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620
74 ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162
75 ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392
76 log2ef = 1.44269504F, // (0x1.715476P+0)
77 log10ef = .434294482F, // (0x1.bcb7b2P-2)
78 pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796
79 inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541
80 sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161
81 inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197
82 sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193
83 inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1)
84 sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194
85 inv_sqrt3f = .577350269F, // (0x1.279a74P-1)
86 phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622
87} // namespace numbers
88
89namespace detail {
90template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
91 static unsigned count(T Val, ZeroBehavior) {
92 if (!Val)
93 return std::numeric_limits<T>::digits;
94 if (Val & 0x1)
95 return 0;
96
97 // Bisection method.
98 unsigned ZeroBits = 0;
99 T Shift = std::numeric_limits<T>::digits >> 1;
100 T Mask = std::numeric_limits<T>::max() >> Shift;
101 while (Shift) {
102 if ((Val & Mask) == 0) {
103 Val >>= Shift;
104 ZeroBits |= Shift;
105 }
106 Shift >>= 1;
107 Mask >>= Shift;
108 }
109 return ZeroBits;
110 }
111};
112
113#if defined(__GNUC__4) || defined(_MSC_VER)
114template <typename T> struct TrailingZerosCounter<T, 4> {
115 static unsigned count(T Val, ZeroBehavior ZB) {
116 if (ZB != ZB_Undefined && Val == 0)
8.1
'ZB' is not equal to ZB_Undefined
9
Assuming 'Val' is equal to 0
10
Taking true branch
117 return 32;
11
Returning the value 32
118
119#if __has_builtin(__builtin_ctz)1 || defined(__GNUC__4)
120 return __builtin_ctz(Val);
121#elif defined(_MSC_VER)
122 unsigned long Index;
123 _BitScanForward(&Index, Val);
124 return Index;
125#endif
126 }
127};
128
129#if !defined(_MSC_VER) || defined(_M_X64)
130template <typename T> struct TrailingZerosCounter<T, 8> {
131 static unsigned count(T Val, ZeroBehavior ZB) {
132 if (ZB != ZB_Undefined && Val == 0)
133 return 64;
134
135#if __has_builtin(__builtin_ctzll)1 || defined(__GNUC__4)
136 return __builtin_ctzll(Val);
137#elif defined(_MSC_VER)
138 unsigned long Index;
139 _BitScanForward64(&Index, Val);
140 return Index;
141#endif
142 }
143};
144#endif
145#endif
146} // namespace detail
147
148/// Count number of 0's from the least significant bit to the most
149/// stopping at the first 1.
150///
151/// Only unsigned integral types are allowed.
152///
153/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
154/// valid arguments.
155template <typename T>
156unsigned countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
157 static_assert(std::numeric_limits<T>::is_integer &&
158 !std::numeric_limits<T>::is_signed,
159 "Only unsigned integral types are allowed.");
160 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
8
Calling 'TrailingZerosCounter::count'
12
Returning from 'TrailingZerosCounter::count'
13
Returning the value 32
161}
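With the default ZB_Width argument, the 32-bit specialization above makes countTrailingZeros(0u) a defined call that returns 32; it is that 32, not an undefined ctz, that flows into the shift flagged in loadInputValue. A standalone restatement:

    #include <cstdio>

    enum ZeroBehaviorModel { ZB_UndefinedModel, ZB_WidthModel };

    // Reduced model of TrailingZerosCounter<T, 4>::count: with ZB_Width an
    // input of 0 returns 32; with ZB_Undefined it would fall through to
    // __builtin_ctz(0), which is itself undefined.
    static unsigned ctz32Model(unsigned Val, ZeroBehaviorModel ZB) {
      if (ZB != ZB_UndefinedModel && Val == 0)
        return 32;
      return static_cast<unsigned>(__builtin_ctz(Val));
    }

    int main() {
      std::printf("%u %u\n", ctz32Model(0u, ZB_WidthModel),     // 32
                             ctz32Model(0x50u, ZB_WidthModel)); // 4
    }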
162
163namespace detail {
164template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
165 static unsigned count(T Val, ZeroBehavior) {
166 if (!Val)
167 return std::numeric_limits<T>::digits;
168
169 // Bisection method.
170 unsigned ZeroBits = 0;
171 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
172 T Tmp = Val >> Shift;
173 if (Tmp)
174 Val = Tmp;
175 else
176 ZeroBits |= Shift;
177 }
178 return ZeroBits;
179 }
180};
181
182#if defined(__GNUC__4) || defined(_MSC_VER)
183template <typename T> struct LeadingZerosCounter<T, 4> {
184 static unsigned count(T Val, ZeroBehavior ZB) {
185 if (ZB != ZB_Undefined && Val == 0)
186 return 32;
187
188#if __has_builtin(__builtin_clz)1 || defined(__GNUC__4)
189 return __builtin_clz(Val);
190#elif defined(_MSC_VER)
191 unsigned long Index;
192 _BitScanReverse(&Index, Val);
193 return Index ^ 31;
194#endif
195 }
196};
197
198#if !defined(_MSC_VER) || defined(_M_X64)
199template <typename T> struct LeadingZerosCounter<T, 8> {
200 static unsigned count(T Val, ZeroBehavior ZB) {
201 if (ZB != ZB_Undefined && Val == 0)
202 return 64;
203
204#if __has_builtin(__builtin_clzll)1 || defined(__GNUC__4)
205 return __builtin_clzll(Val);
206#elif defined(_MSC_VER)
207 unsigned long Index;
208 _BitScanReverse64(&Index, Val);
209 return Index ^ 63;
210#endif
211 }
212};
213#endif
214#endif
215} // namespace detail
216
217/// Count number of 0's from the most significant bit to the least
218/// stopping at the first 1.
219///
220/// Only unsigned integral types are allowed.
221///
222/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
223/// valid arguments.
224template <typename T>
225unsigned countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
226 static_assert(std::numeric_limits<T>::is_integer &&
227 !std::numeric_limits<T>::is_signed,
228 "Only unsigned integral types are allowed.");
229 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
230}
231
232/// Get the index of the first set bit starting from the least
233/// significant bit.
234///
235/// Only unsigned integral types are allowed.
236///
237/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
238/// valid arguments.
239template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
240 if (ZB == ZB_Max && Val == 0)
241 return std::numeric_limits<T>::max();
242
243 return countTrailingZeros(Val, ZB_Undefined);
244}
245
246/// Create a bitmask with the N right-most bits set to 1, and all other
247/// bits set to 0. Only unsigned types are allowed.
248template <typename T> T maskTrailingOnes(unsigned N) {
249 static_assert(std::is_unsigned<T>::value, "Invalid type!");
250 const unsigned Bits = CHAR_BIT8 * sizeof(T);
251 assert(N <= Bits && "Invalid bit index")((void)0);
252 return N == 0 ? 0 : (T(-1) >> (Bits - N));
253}
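maskTrailingOnes shows the usual way to keep this kind of shift defined: 'T(-1) >> (Bits - N)' would shift by the full width when N == 0, so that case is peeled off explicitly. A 32-bit-only sketch of the same idea:

    #include <cstdint>
    #include <cstdio>

    // 32-bit version of maskTrailingOnes: the N == 0 branch avoids evaluating
    // 0xFFFFFFFF >> 32, which would be undefined behavior.
    static uint32_t maskTrailingOnes32(unsigned N) {
      return N == 0 ? 0u : static_cast<uint32_t>(UINT32_MAX >> (32u - N));
    }

    int main() {
      std::printf("%#x %#x %#x\n", maskTrailingOnes32(0),   // 0
                                   maskTrailingOnes32(8),   // 0xff
                                   maskTrailingOnes32(32)); // 0xffffffff
    }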
254
255/// Create a bitmask with the N left-most bits set to 1, and all other
256/// bits set to 0. Only unsigned types are allowed.
257template <typename T> T maskLeadingOnes(unsigned N) {
258 return ~maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
259}
260
261/// Create a bitmask with the N right-most bits set to 0, and all other
262/// bits set to 1. Only unsigned types are allowed.
263template <typename T> T maskTrailingZeros(unsigned N) {
264 return maskLeadingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
265}
266
267/// Create a bitmask with the N left-most bits set to 0, and all other
268/// bits set to 1. Only unsigned types are allowed.
269template <typename T> T maskLeadingZeros(unsigned N) {
270 return maskTrailingOnes<T>(CHAR_BIT8 * sizeof(T) - N);
271}
272
273/// Get the index of the last set bit starting from the least
274/// significant bit.
275///
276/// Only unsigned integral types are allowed.
277///
278/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
279/// valid arguments.
280template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
281 if (ZB == ZB_Max && Val == 0)
282 return std::numeric_limits<T>::max();
283
284 // Use ^ instead of - because both gcc and llvm can remove the associated ^
285 // in the __builtin_clz intrinsic on x86.
286 return countLeadingZeros(Val, ZB_Undefined) ^
287 (std::numeric_limits<T>::digits - 1);
288}
289
290/// Macro compressed bit reversal table for 256 bits.
291///
292/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
293static const unsigned char BitReverseTable256[256] = {
294#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
295#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
296#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
297 R6(0), R6(2), R6(1), R6(3)
298#undef R2
299#undef R4
300#undef R6
301};
302
303/// Reverse the bits in \p Val.
304template <typename T>
305T reverseBits(T Val) {
306 unsigned char in[sizeof(Val)];
307 unsigned char out[sizeof(Val)];
308 std::memcpy(in, &Val, sizeof(Val));
309 for (unsigned i = 0; i < sizeof(Val); ++i)
310 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
311 std::memcpy(&Val, out, sizeof(Val));
312 return Val;
313}
314
315#if __has_builtin(__builtin_bitreverse8)1
316template<>
317inline uint8_t reverseBits<uint8_t>(uint8_t Val) {
318 return __builtin_bitreverse8(Val);
319}
320#endif
321
322#if __has_builtin(__builtin_bitreverse16)1
323template<>
324inline uint16_t reverseBits<uint16_t>(uint16_t Val) {
325 return __builtin_bitreverse16(Val);
326}
327#endif
328
329#if __has_builtin(__builtin_bitreverse32)1
330template<>
331inline uint32_t reverseBits<uint32_t>(uint32_t Val) {
332 return __builtin_bitreverse32(Val);
333}
334#endif
335
336#if __has_builtin(__builtin_bitreverse64)1
337template<>
338inline uint64_t reverseBits<uint64_t>(uint64_t Val) {
339 return __builtin_bitreverse64(Val);
340}
341#endif
342
343// NOTE: The following support functions use the _32/_64 extensions instead of
344// type overloading so that signed and unsigned integers can be used without
345// ambiguity.
346
347/// Return the high 32 bits of a 64 bit value.
348constexpr inline uint32_t Hi_32(uint64_t Value) {
349 return static_cast<uint32_t>(Value >> 32);
350}
351
352/// Return the low 32 bits of a 64 bit value.
353constexpr inline uint32_t Lo_32(uint64_t Value) {
354 return static_cast<uint32_t>(Value);
355}
356
357/// Make a 64-bit integer from a high / low pair of 32-bit integers.
358constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
359 return ((uint64_t)High << 32) | (uint64_t)Low;
360}
361
362/// Checks if an integer fits into the given bit width.
363template <unsigned N> constexpr inline bool isInt(int64_t x) {
364 return N >= 64 || (-(INT64_C(1)1LL<<(N-1)) <= x && x < (INT64_C(1)1LL<<(N-1)));
365}
366// Template specializations to get better code for common cases.
367template <> constexpr inline bool isInt<8>(int64_t x) {
368 return static_cast<int8_t>(x) == x;
369}
370template <> constexpr inline bool isInt<16>(int64_t x) {
371 return static_cast<int16_t>(x) == x;
372}
373template <> constexpr inline bool isInt<32>(int64_t x) {
374 return static_cast<int32_t>(x) == x;
375}
376
377/// Checks if a signed integer is an N bit number shifted left by S.
378template <unsigned N, unsigned S>
379constexpr inline bool isShiftedInt(int64_t x) {
380 static_assert(
381 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
382 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
383 return isInt<N + S>(x) && (x % (UINT64_C(1)1ULL << S) == 0);
384}
385
386/// Checks if an unsigned integer fits into the given bit width.
387///
388/// This is written as two functions rather than as simply
389///
390/// return N >= 64 || X < (UINT64_C(1) << N);
391///
392/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
393/// left too many places.
394template <unsigned N>
395constexpr inline std::enable_if_t<(N < 64), bool> isUInt(uint64_t X) {
396 static_assert(N > 0, "isUInt<0> doesn't make sense");
397 return X < (UINT64_C(1)1ULL << (N));
398}
399template <unsigned N>
400constexpr inline std::enable_if_t<N >= 64, bool> isUInt(uint64_t) {
401 return true;
402}
403
404// Template specializations to get better code for common cases.
405template <> constexpr inline bool isUInt<8>(uint64_t x) {
406 return static_cast<uint8_t>(x) == x;
407}
408template <> constexpr inline bool isUInt<16>(uint64_t x) {
409 return static_cast<uint16_t>(x) == x;
410}
411template <> constexpr inline bool isUInt<32>(uint64_t x) {
412 return static_cast<uint32_t>(x) == x;
413}
414
415/// Checks if an unsigned integer is an N bit number shifted left by S.
416template <unsigned N, unsigned S>
417constexpr inline bool isShiftedUInt(uint64_t x) {
418 static_assert(
419 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
420 static_assert(N + S <= 64,
421 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
422 // Per the two static_asserts above, S must be strictly less than 64. So
423 // 1 << S is not undefined behavior.
424 return isUInt<N + S>(x) && (x % (UINT64_C(1)1ULL << S) == 0);
425}
426
427/// Gets the maximum value for a N-bit unsigned integer.
428inline uint64_t maxUIntN(uint64_t N) {
429 assert(N > 0 && N <= 64 && "integer width out of range")((void)0);
430
431 // uint64_t(1) << 64 is undefined behavior, so we can't do
432 // (uint64_t(1) << N) - 1
433 // without checking first that N != 64. But this works and doesn't have a
434 // branch.
435 return UINT64_MAX0xffffffffffffffffULL >> (64 - N);
436}
437
438/// Gets the minimum value for a N-bit signed integer.
439inline int64_t minIntN(int64_t N) {
440 assert(N > 0 && N <= 64 && "integer width out of range")((void)0);
441
442 return UINT64_C(1)1ULL + ~(UINT64_C(1)1ULL << (N - 1));
443}
444
445/// Gets the maximum value for a N-bit signed integer.
446inline int64_t maxIntN(int64_t N) {
447 assert(N > 0 && N <= 64 && "integer width out of range")((void)0);
448
449 // This relies on two's complement wraparound when N == 64, so we convert to
450 // int64_t only at the very end to avoid UB.
451 return (UINT64_C(1)1ULL << (N - 1)) - 1;
452}
453
454/// Checks if an unsigned integer fits into the given (dynamic) bit width.
455inline bool isUIntN(unsigned N, uint64_t x) {
456 return N >= 64 || x <= maxUIntN(N);
457}
458
459/// Checks if a signed integer fits into the given (dynamic) bit width.
460inline bool isIntN(unsigned N, int64_t x) {
461 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
462}
463
464/// Return true if the argument is a non-empty sequence of ones starting at the
465/// least significant bit with the remainder zero (32 bit version).
466/// Ex. isMask_32(0x0000FFFFU) == true.
467constexpr inline bool isMask_32(uint32_t Value) {
468 return Value && ((Value + 1) & Value) == 0;
469}
470
471/// Return true if the argument is a non-empty sequence of ones starting at the
472/// least significant bit with the remainder zero (64 bit version).
473constexpr inline bool isMask_64(uint64_t Value) {
474 return Value && ((Value + 1) & Value) == 0;
475}
476
477/// Return true if the argument contains a non-empty sequence of ones with the
478/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
479constexpr inline bool isShiftedMask_32(uint32_t Value) {
480 return Value && isMask_32((Value - 1) | Value);
481}
482
483/// Return true if the argument contains a non-empty sequence of ones with the
484/// remainder zero (64 bit version.)
485constexpr inline bool isShiftedMask_64(uint64_t Value) {
486 return Value && isMask_64((Value - 1) | Value);
487}
488
489/// Return true if the argument is a power of two > 0.
490/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
491constexpr inline bool isPowerOf2_32(uint32_t Value) {
492 return Value && !(Value & (Value - 1));
493}
494
495/// Return true if the argument is a power of two > 0 (64 bit edition.)
496constexpr inline bool isPowerOf2_64(uint64_t Value) {
497 return Value && !(Value & (Value - 1));
498}
499
500/// Count the number of ones from the most significant bit to the first
501/// zero bit.
502///
503/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
504/// Only unsigned integral types are allowed.
505///
506/// \param ZB the behavior on an input of all ones. Only ZB_Width and
507/// ZB_Undefined are valid arguments.
508template <typename T>
509unsigned countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
510 static_assert(std::numeric_limits<T>::is_integer &&
511 !std::numeric_limits<T>::is_signed,
512 "Only unsigned integral types are allowed.");
513 return countLeadingZeros<T>(~Value, ZB);
514}
515
516/// Count the number of ones from the least significant bit to the first
517/// zero bit.
518///
519/// Ex. countTrailingOnes(0x00FF00FF) == 8.
520/// Only unsigned integral types are allowed.
521///
522/// \param ZB the behavior on an input of all ones. Only ZB_Width and
523/// ZB_Undefined are valid arguments.
524template <typename T>
525unsigned countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
526 static_assert(std::numeric_limits<T>::is_integer &&
527 !std::numeric_limits<T>::is_signed,
528 "Only unsigned integral types are allowed.");
529 return countTrailingZeros<T>(~Value, ZB);
530}
531
532namespace detail {
533template <typename T, std::size_t SizeOfT> struct PopulationCounter {
534 static unsigned count(T Value) {
535 // Generic version, forward to 32 bits.
536 static_assert(SizeOfT <= 4, "Not implemented!");
537#if defined(__GNUC__4)
538 return __builtin_popcount(Value);
539#else
540 uint32_t v = Value;
541 v = v - ((v >> 1) & 0x55555555);
542 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
543 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
544#endif
545 }
546};
547
548template <typename T> struct PopulationCounter<T, 8> {
549 static unsigned count(T Value) {
550#if defined(__GNUC__4)
551 return __builtin_popcountll(Value);
552#else
553 uint64_t v = Value;
554 v = v - ((v >> 1) & 0x5555555555555555ULL);
555 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
556 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
557 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
558#endif
559 }
560};
561} // namespace detail
562
563/// Count the number of set bits in a value.
564/// Ex. countPopulation(0xF000F000) = 8
565/// Returns 0 if the word is zero.
566template <typename T>
567inline unsigned countPopulation(T Value) {
568 static_assert(std::numeric_limits<T>::is_integer &&
569 !std::numeric_limits<T>::is_signed,
570 "Only unsigned integral types are allowed.");
571 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
572}
573
574/// Compile time Log2.
575/// Valid only for positive powers of two.
576template <size_t kValue> constexpr inline size_t CTLog2() {
577 static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
578 "Value is not a valid power of 2");
579 return 1 + CTLog2<kValue / 2>();
580}
581
582template <> constexpr inline size_t CTLog2<1>() { return 0; }
583
584/// Return the log base 2 of the specified value.
585inline double Log2(double Value) {
586#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
587 return __builtin_log(Value) / __builtin_log(2.0);
588#else
589 return log2(Value);
590#endif
591}
592
593/// Return the floor log base 2 of the specified value, -1 if the value is zero.
594/// (32 bit edition.)
595/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
596inline unsigned Log2_32(uint32_t Value) {
597 return 31 - countLeadingZeros(Value);
598}
599
600/// Return the floor log base 2 of the specified value, -1 if the value is zero.
601/// (64 bit edition.)
602inline unsigned Log2_64(uint64_t Value) {
603 return 63 - countLeadingZeros(Value);
604}
605
606/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
607/// (32 bit edition).
608/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
609inline unsigned Log2_32_Ceil(uint32_t Value) {
610 return 32 - countLeadingZeros(Value - 1);
611}
612
613/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
614/// (64 bit edition.)
615inline unsigned Log2_64_Ceil(uint64_t Value) {
616 return 64 - countLeadingZeros(Value - 1);
617}
618
619/// Return the greatest common divisor of the values using Euclid's algorithm.
620template <typename T>
621inline T greatestCommonDivisor(T A, T B) {
622 while (B) {
623 T Tmp = B;
624 B = A % B;
625 A = Tmp;
626 }
627 return A;
628}
629
630inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
631 return greatestCommonDivisor<uint64_t>(A, B);
632}
633
634/// This function takes a 64-bit integer and returns the bit equivalent double.
635inline double BitsToDouble(uint64_t Bits) {
636 double D;
637 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
638 memcpy(&D, &Bits, sizeof(Bits));
639 return D;
640}
641
642/// This function takes a 32-bit integer and returns the bit equivalent float.
643inline float BitsToFloat(uint32_t Bits) {
644 float F;
645 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
646 memcpy(&F, &Bits, sizeof(Bits));
647 return F;
648}
649
650/// This function takes a double and returns the bit equivalent 64-bit integer.
651/// Note that copying doubles around changes the bits of NaNs on some hosts,
652/// notably x86, so this routine cannot be used if these bits are needed.
653inline uint64_t DoubleToBits(double Double) {
654 uint64_t Bits;
655 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
656 memcpy(&Bits, &Double, sizeof(Double));
657 return Bits;
658}
659
660/// This function takes a float and returns the bit equivalent 32-bit integer.
661/// Note that copying floats around changes the bits of NaNs on some hosts,
662/// notably x86, so this routine cannot be used if these bits are needed.
663inline uint32_t FloatToBits(float Float) {
664 uint32_t Bits;
665 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
666 memcpy(&Bits, &Float, sizeof(Float));
667 return Bits;
668}
669
670/// A and B are either alignments or offsets. Return the minimum alignment that
671/// may be assumed after adding the two together.
672constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
673 // The largest power of 2 that divides both A and B.
674 //
675 // Replace "-Value" by "1+~Value" in the following commented code to avoid
676 // MSVC warning C4146
677 // return (A | B) & -(A | B);
678 return (A | B) & (1 + ~(A | B));
679}
680
681/// Returns the next power of two (in 64-bits) that is strictly greater than A.
682/// Returns zero on overflow.
683inline uint64_t NextPowerOf2(uint64_t A) {
684 A |= (A >> 1);
685 A |= (A >> 2);
686 A |= (A >> 4);
687 A |= (A >> 8);
688 A |= (A >> 16);
689 A |= (A >> 32);
690 return A + 1;
691}
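The OR cascade smears the highest set bit into every lower position, turning A into 2^k - 1 for the smallest power of two 2^k greater than A, so A + 1 is that power of two (and 0 on overflow). A quick check:

    #include <cstdint>
    #include <cstdio>

    // Same bit-smearing trick as NextPowerOf2 above.
    static uint64_t nextPow2Model(uint64_t A) {
      A |= (A >> 1);  A |= (A >> 2);  A |= (A >> 4);
      A |= (A >> 8);  A |= (A >> 16); A |= (A >> 32);
      return A + 1;
    }

    int main() {
      std::printf("%llu %llu\n",
                  (unsigned long long)nextPow2Model(5),  // 8
                  (unsigned long long)nextPow2Model(8)); // 16, strictly greater
    }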
692
693/// Returns the power of two which is less than or equal to the given value.
694/// Essentially, it is a floor operation across the domain of powers of two.
695inline uint64_t PowerOf2Floor(uint64_t A) {
696 if (!A) return 0;
697 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
698}
699
700/// Returns the power of two which is greater than or equal to the given value.
701/// Essentially, it is a ceil operation across the domain of powers of two.
702inline uint64_t PowerOf2Ceil(uint64_t A) {
703 if (!A)
704 return 0;
705 return NextPowerOf2(A - 1);
706}
707
708/// Returns the next integer (mod 2**64) that is greater than or equal to
709/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
710///
711/// If non-zero \p Skew is specified, the return value will be a minimal
712/// integer that is greater than or equal to \p Value and equal to
713/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
714/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
715///
716/// Examples:
717/// \code
718/// alignTo(5, 8) = 8
719/// alignTo(17, 8) = 24
720/// alignTo(~0LL, 8) = 0
721/// alignTo(321, 255) = 510
722///
723/// alignTo(5, 8, 7) = 7
724/// alignTo(17, 8, 1) = 17
725/// alignTo(~0LL, 8, 3) = 3
726/// alignTo(321, 255, 42) = 552
727/// \endcode
728inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
729 assert(Align != 0u && "Align can't be 0.");
730 Skew %= Align;
731 return (Value + Align - 1 - Skew) / Align * Align + Skew;
732}
733
734/// Returns the next integer (mod 2**64) that is greater than or equal to
735/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
736template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
737 static_assert(Align != 0u, "Align must be non-zero");
738 return (Value + Align - 1) / Align * Align;
739}
740
741/// Returns the integer ceil(Numerator / Denominator).
742inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
743 return alignTo(Numerator, Denominator) / Denominator;
744}
745
746/// Returns the integer nearest(Numerator / Denominator).
747inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) {
748 return (Numerator + (Denominator / 2)) / Denominator;
749}
750
751/// Returns the largest uint64_t that is less than or equal to \p Value and
752/// is equal to \p Skew mod \p Align. \p Align must be non-zero.
753inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
754 assert(Align != 0u && "Align can't be 0.");
755 Skew %= Align;
756 return (Value - Skew) / Align * Align + Skew;
757}
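
alignDown mirrors the alignTo examples quoted in the comment above but rounds toward zero onto the same Skew-shifted grid, and the two divide helpers reuse the same machinery. A minimal sketch (not from the listed header; the wrapper name is illustrative):

  // Hedged sketch: downward alignment and the rounding division helpers.
  #include <cassert>

  void alignDownExamples() {
    assert(llvm::alignDown(17, 8)    == 16);
    assert(llvm::alignDown(17, 8, 1) == 17);   // 17 == 8 * 2 + 1
    assert(llvm::divideCeil(7, 2)    == 4);
    assert(llvm::divideNearest(7, 2) == 4);    // halfway cases round up
    assert(llvm::divideNearest(9, 4) == 2);    // 2.25 rounds down
  }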
758
759/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
760/// Requires 0 < B <= 32.
761template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
762 static_assert(B > 0, "Bit width can't be 0.");
763 static_assert(B <= 32, "Bit width out of range.");
764 return int32_t(X << (32 - B)) >> (32 - B);
765}
766
767/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
768/// Requires 0 < B <= 32.
769inline int32_t SignExtend32(uint32_t X, unsigned B) {
770 assert(B > 0 && "Bit width can't be 0.");
771 assert(B <= 32 && "Bit width out of range.");
772 return int32_t(X << (32 - B)) >> (32 - B);
773}
774
775/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
776/// Requires 0 < B <= 64.
777template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
778 static_assert(B > 0, "Bit width can't be 0.");
779 static_assert(B <= 64, "Bit width out of range.");
780 return int64_t(x << (64 - B)) >> (64 - B);
781}
782
783/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
784/// Requires 0 < B <= 64.
785inline int64_t SignExtend64(uint64_t X, unsigned B) {
786 assert(B > 0 && "Bit width can't be 0.");
787 assert(B <= 64 && "Bit width out of range.");
788 return int64_t(X << (64 - B)) >> (64 - B);
789}
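
The shift-up-then-arithmetic-shift-down idiom works because the cast to the signed type happens before the right shift, so a copy of bit B-1 propagates into all the upper bits. A minimal sketch (not from the listed header; the wrapper name is illustrative):

  // Hedged sketch: extending 4-, 8- and 9-bit fields to full-width integers.
  #include <cassert>
  #include <cstdint>

  void signExtendExamples() {
    assert(llvm::SignExtend32<4>(0xF) == -1);               // bit 3 set: negative
    assert(llvm::SignExtend32<4>(0x7) == 7);                // bit 3 clear: unchanged
    assert(llvm::SignExtend64(0x80u, 8) == INT64_C(-128));
    assert(llvm::SignExtend32(0xFFu, 9) == 255);            // bit 8 clear: positive
  }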
790
791/// Subtract two unsigned integers, X and Y, of type T and return the absolute
792/// value of the result.
793template <typename T>
794std::enable_if_t<std::is_unsigned<T>::value, T> AbsoluteDifference(T X, T Y) {
795 return X > Y ? (X - Y) : (Y - X);
796}
797
798/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
799/// maximum representable value of T on overflow. ResultOverflowed indicates if
800/// the result is larger than the maximum representable value of type T.
801template <typename T>
802std::enable_if_t<std::is_unsigned<T>::value, T>
803SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
804 bool Dummy;
805 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
806 // Hacker's Delight, p. 29
807 T Z = X + Y;
808 Overflowed = (Z < X || Z < Y);
809 if (Overflowed)
810 return std::numeric_limits<T>::max();
811 else
812 return Z;
813}
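
Because unsigned addition wraps modulo 2^N, a wrapped sum is smaller than either operand, which is exactly what the Z < X || Z < Y test detects. A minimal sketch (not from the listed header; the wrapper name is illustrative):

  // Hedged sketch: clamping to the type maximum and reporting the overflow.
  #include <cassert>
  #include <cstdint>

  void saturatingAddExamples() {
    bool Ov = false;
    assert(llvm::SaturatingAdd<uint8_t>(200, 55, &Ov)  == 255 && !Ov);  // exact fit
    assert(llvm::SaturatingAdd<uint8_t>(200, 100, &Ov) == 255 && Ov);   // clamped
    assert(llvm::SaturatingAdd<uint32_t>(1, 2) == 3);   // out-parameter is optional
  }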
814
815/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
816/// maximum representable value of T on overflow. ResultOverflowed indicates if
817/// the result is larger than the maximum representable value of type T.
818template <typename T>
819std::enable_if_t<std::is_unsigned<T>::value, T>
820SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
821 bool Dummy;
822 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
823
824 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
825 // because it fails for uint16_t (where multiplication can have undefined
826 // behavior due to promotion to int), and requires a division in addition
827 // to the multiplication.
828
829 Overflowed = false;
830
831 // Log2(Z) would be either Log2Z or Log2Z + 1.
832 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
833 // will necessarily be less than Log2Max as desired.
834 int Log2Z = Log2_64(X) + Log2_64(Y);
835 const T Max = std::numeric_limits<T>::max();
836 int Log2Max = Log2_64(Max);
837 if (Log2Z < Log2Max) {
838 return X * Y;
839 }
840 if (Log2Z > Log2Max) {
841 Overflowed = true;
842 return Max;
843 }
844
845 // We're going to use the top bit, and maybe overflow one
846 // bit past it. Multiply all but the bottom bit then add
847 // that on at the end.
848 T Z = (X >> 1) * Y;
849 if (Z & ~(Max >> 1)) {
850 Overflowed = true;
851 return Max;
852 }
853 Z <<= 1;
854 if (X & 1)
855 return SaturatingAdd(Z, Y, ResultOverflowed);
856
857 return Z;
858}
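
The sum-of-logs pre-check settles the clearly-fitting and clearly-overflowing cases; only when Log2Z equals Log2Max does the function fall through to the top-bit path that multiplies all but the low bit of X. A minimal sketch (not from the listed header; the wrapper name is illustrative):

  // Hedged sketch: an exact borderline product, a clamped one, and a wider type.
  #include <cassert>
  #include <cstdint>

  void saturatingMultiplyExamples() {
    bool Ov = false;
    assert(llvm::SaturatingMultiply<uint8_t>(16, 15, &Ov)    == 240   && !Ov);
    assert(llvm::SaturatingMultiply<uint8_t>(16, 16, &Ov)    == 255   && Ov);
    assert(llvm::SaturatingMultiply<uint16_t>(300, 300, &Ov) == 65535 && Ov);
  }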
859
860/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
861/// the product. Clamp the result to the maximum representable value of T on
862/// overflow. ResultOverflowed indicates if the result is larger than the
863/// maximum representable value of type T.
864template <typename T>
865std::enable_if_t<std::is_unsigned<T>::value, T>
866SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
867 bool Dummy;
868 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
869
870 T Product = SaturatingMultiply(X, Y, &Overflowed);
871 if (Overflowed)
872 return Product;
873
874 return SaturatingAdd(A, Product, &Overflowed);
875}
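
SaturatingMultiplyAdd simply chains the two helpers above, returning early once the multiply has already saturated. A minimal sketch (not from the listed header; the wrapper name is illustrative):

  // Hedged sketch: X * Y + A with a single saturating result.
  #include <cassert>
  #include <cstdint>

  void saturatingMultiplyAddExamples() {
    bool Ov = false;
    assert(llvm::SaturatingMultiplyAdd<uint8_t>(10, 10, 100, &Ov) == 200 && !Ov);
    assert(llvm::SaturatingMultiplyAdd<uint8_t>(10, 10, 200, &Ov) == 255 && Ov);
  }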
876
877/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
878extern const float huge_valf;
879
880
881/// Add two signed integers, computing the two's complement truncated result,
882/// returning true if overflow occurred.
883template <typename T>
884std::enable_if_t<std::is_signed<T>::value, T> AddOverflow(T X, T Y, T &Result) {
885#if __has_builtin(__builtin_add_overflow)
886 return __builtin_add_overflow(X, Y, &Result);
887#else
888 // Perform the unsigned addition.
889 using U = std::make_unsigned_t<T>;
890 const U UX = static_cast<U>(X);
891 const U UY = static_cast<U>(Y);
892 const U UResult = UX + UY;
893
894 // Convert to signed.
895 Result = static_cast<T>(UResult);
896
897 // Adding two positive numbers should result in a positive number.
898 if (X > 0 && Y > 0)
899 return Result <= 0;
900 // Adding two negatives should result in a negative number.
901 if (X < 0 && Y < 0)
902 return Result >= 0;
903 return false;
904#endif
905}
906
907/// Subtract two signed integers, computing the two's complement truncated
908/// result, returning true if an overflow occurred.
909template <typename T>
910std::enable_if_t<std::is_signed<T>::value, T> SubOverflow(T X, T Y, T &Result) {
911#if __has_builtin(__builtin_sub_overflow)
912 return __builtin_sub_overflow(X, Y, &Result);
913#else
914 // Perform the unsigned subtraction.
915 using U = std::make_unsigned_t<T>;
916 const U UX = static_cast<U>(X);
917 const U UY = static_cast<U>(Y);
918 const U UResult = UX - UY;
919
920 // Convert to signed.
921 Result = static_cast<T>(UResult);
922
923 // Subtracting a positive number from a negative results in a negative number.
924 if (X <= 0 && Y > 0)
925 return Result >= 0;
926 // Subtracting a negative number from a positive results in a positive number.
927 if (X >= 0 && Y < 0)
928 return Result <= 0;
929 return false;
930#endif
931}
932
933/// Multiply two signed integers, computing the two's complement truncated
934/// result, returning true if an overflow occurred.
935template <typename T>
936std::enable_if_t<std::is_signed<T>::value, T> MulOverflow(T X, T Y, T &Result) {
937 // Perform the unsigned multiplication on absolute values.
938 using U = std::make_unsigned_t<T>;
939 const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
940 const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y);
941 const U UResult = UX * UY;
942
943 // Convert to signed.
944 const bool IsNegative = (X < 0) ^ (Y < 0);
945 Result = IsNegative ? (0 - UResult) : UResult;
946
947 // If any of the args was 0, result is 0 and no overflow occurs.
948 if (UX == 0 || UY == 0)
949 return false;
950
951 // UX and UY are in [1, 2^n], where n is the number of digits.
952 // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for
953 // positive) divided by an argument compares to the other.
954 if (IsNegative)
955 return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
956 else
957 return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
958}
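
The three checked-arithmetic helpers above report overflow through their return value while still storing the two's complement truncated result. A minimal sketch (not from the listed header; the wrapper name is illustrative):

  // Hedged sketch: detecting int8_t overflow on add, subtract and multiply.
  #include <cassert>
  #include <cstdint>

  void checkedArithmeticExamples() {
    int8_t R;
    assert(!llvm::AddOverflow<int8_t>(100, 27, R) && R == 127);
    assert(llvm::AddOverflow<int8_t>(100, 28, R));        // truncates to -128
    assert(llvm::SubOverflow<int8_t>(-100, 100, R));      // -200 does not fit
    assert(!llvm::MulOverflow<int8_t>(-8, 16, R) && R == -128);
    assert(llvm::MulOverflow<int8_t>(8, 16, R));          // +128 does not fit
  }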
959
960} // End llvm namespace
961
962#endif