/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/compiler.h"

#include <algorithm>
#include <any>
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <variant>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/compiled_node.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/fuse_auto_input.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/fuse_inline.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/fuse_inplace.h"
#include "tensorflow/lite/delegates/gpu/gl/compiler/shader_codegen.h"
#include "tensorflow/lite/delegates/gpu/gl/float16_conversions.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {
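// std::visit visitor that reports whether an object size (1D, 2D, or 3D)
// exceeds the GPU's maximum image dimensions.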
struct ExceedSizeChecker {
  bool operator()(uint32_t v) const { return v > max_size.x; }

  bool operator()(const uint2& v) const {
    return v.x > max_size.x || v.y > max_size.y;
  }

  bool operator()(const uint3& v) const {
    return v.x > max_size.x || v.y > max_size.y || v.z > max_z_size;
  }

  int2 max_size;
  int max_z_size;
};

// Returns true if any size dimension exceeds the given limit.
bool ExceedsMaxSize(const Object& object, const GpuInfo& gpu_info) {
  ExceedSizeChecker size_checker;
  size_checker.max_size =
      int2(gpu_info.GetMaxImage2DWidth(), gpu_info.GetMaxImage2DHeight());
  size_checker.max_z_size = gpu_info.GetMaxImage2DArrayLayers();
  return std::visit(size_checker, object.size);
}

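// Textures tend to be the fastest object type on Adreno GPUs; buffers are
// preferred elsewhere.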
ObjectType ChooseFastestObjectType(const GpuInfo& gpu_info) {
  return gpu_info.IsAdreno() ? ObjectType::TEXTURE : ObjectType::BUFFER;
}

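// Selects the object type for reference (graph input/output) objects. On
// Adreno 630 textures are used unconditionally; on other Adreno GPUs they are
// used only when precision loss (F16) is allowed.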
ObjectType ChooseFastestRefObjectType(const GpuInfo& gpu_info,
                                      const CompilationOptions& options) {
  if (!gpu_info.IsAdreno()) {
    return ObjectType::BUFFER;
  }
  if (gpu_info.adreno_info.adreno_gpu == AdrenoGpu::kAdreno630) {
    return ObjectType::TEXTURE;
  } else {
    return options.allow_precision_loss ? ObjectType::TEXTURE
                                        : ObjectType::BUFFER;
  }
}

// Compiler executes the following steps:
// 1. Runs NodeShader for every node in the input graph.
// 2. Creates a compiled graph that mirrors the input graph and keeps
//    GeneratedCode in operation's attributes.
// 3. Fuses nodes in the compiled graph.
// 4. Generates the full shader code using the nodes in the compiled graph.
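//
// Typical usage (a sketch; the lambda body and the `graph` / `gpu_info`
// variables are illustrative, not part of this API):
//
//   auto compiler = NewCompiler(node_shader, &gpu_info, options);
//   RETURN_IF_ERROR(compiler->Compile(
//       graph, tflite_graph_io, [](ShaderCode code) -> absl::Status {
//         // Build and link a GL program from `code` here.
//         return absl::OkStatus();
//       }));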
class CompilerImpl : public Compiler {
 public:
  // We take a const GpuInfo* rather than a reference so that a temporary
  // object cannot be bound to it.
  CompilerImpl(const NodeShader* node_shader, const GpuInfo* gpu_info,
               const CompilationOptions& options)
      : node_shader_(*node_shader), gpu_info_(*gpu_info), options_(options) {
    if (options_.preferred_obj_type == ObjectType::UNKNOWN) {
      options_.preferred_obj_type = ChooseFastestObjectType(*gpu_info);
    }
    if (options_.ref_obj_type == ObjectType::UNKNOWN) {
      options_.ref_obj_type = ChooseFastestRefObjectType(*gpu_info, options);
    }
  }

  absl::Status Compile(
      const GraphFloat32& graph,
      const std::unordered_set<int>& tflite_graph_io,  // NOLINT
      const ShaderCodeCallback& callback) final {
    // It is important that ids in the compiled graph are identical to those
    // in the given graph.
    RETURN_IF_ERROR(graph.MakeExactCopy(&compiled_graph_));

    // Clear out batch dimension for dynamic batch support.
    if (options_.dynamic_batch) {
      for (auto value : compiled_graph_.values()) {
        value->tensor.shape.b = 1;
      }
    }

    // Generate a shader for a node and all input/output objects.
    for (auto node : compiled_graph_.nodes()) {
      CompiledNodeAttributes attr;
      attr.node_indices.push_back(node->id);
      NodeShader::GenerationContext ctx = {&gpu_info_, options_,
                                           node->operation.type,
                                           node->operation.attributes};
      for (const auto& tensor : graph.FindInputs(node->id)) {
        const auto& shape = tensor->tensor.shape;
        ctx.input_shapes.push_back({shape.b, shape.h, shape.w, shape.c});
      }
      for (const auto& tensor : graph.FindOutputs(node->id)) {
        const auto& shape = tensor->tensor.shape;
        ctx.output_shapes.push_back({shape.b, shape.h, shape.w, shape.c});
      }
      RETURN_IF_ERROR(node_shader_.GenerateCode(ctx, &attr.code));
      node->operation.attributes = std::move(attr);
    }

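    // Fuse compatible nodes to reduce the number of shader programs and
    // intermediate objects.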
    ModelTransformer transformer(&compiled_graph_);
    if (options_.fuse_operations) {
      FuseAutoOutputWithInline fuse_inline;
      if (!transformer.Apply("fuse_auto_with_inline", &fuse_inline)) {
        return absl::InternalError("fuse_auto_with_inline failed");
      }
      FuseInplaceUpdate fuse_inplace;
      if (!transformer.Apply("fuse_inplace_update", &fuse_inplace)) {
        return absl::InternalError("fuse_inplace failed");
      }
      if (options_.auto_input_fusion) {
        FuseAutoInput fuse_auto_input;
        if (!transformer.Apply("fuse_auto_input", &fuse_auto_input)) {
          return absl::InternalError("fuse_auto_input failed");
        }
      }
    }
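    // Drop in-place updates that remained unused after fusion.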
    RemoveUnusedInplaceUpdates remove_inplace_updates;
    if (!transformer.Apply("remove_inplace_updates", &remove_inplace_updates)) {
      return absl::InternalError("remove_inplace_updates failed");
    }

    // Prepare internal objects.
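    // Each value is backed by a PHWC4 reference object: an HWC layout whose
    // channel dimension is padded to a multiple of 4, so one vec4 holds four
    // channels.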
    absl::flat_hash_map<ValueId, Object> objects;
    for (auto value : compiled_graph_.values()) {
      Object object = MakePHWC4Ref(value->id, value->tensor.shape);
      object.data_type = value->tensor.type;
      // External references may not be upgraded to f16 nor be represented as
      // textures.
      const bool is_external =
          graph.IsGraphInput(value->id) || graph.IsGraphOutput(value->id) ||
          tflite_graph_io.find(value->tensor.ref) != tflite_graph_io.end();
      if (is_external) {
        object.object_type = ObjectType::BUFFER;
      } else if (options_.allow_precision_loss) {
        MaybeConvertToFloat16(&object);
      }
      objects[value->id] = std::move(object);
    }

    // Prepare readonly objects and check whether object types are supported.
    for (auto node : compiled_graph_.nodes()) {
      auto& attr =
          std::any_cast<CompiledNodeAttributes&>(node->operation.attributes);

      // Set workload explicitly.
      if (attr.code.workload == uint3()) {
        auto outputs = compiled_graph_.FindOutputs(node->id);
        auto shape = outputs[0]->tensor.shape;
        for (auto output : outputs) {
          if (shape != output->tensor.shape) {
            return absl::FailedPreconditionError(
                "Workload uint3() requires all output sizes to match");
          }
        }
        attr.code.workload = uint3(shape.w, shape.h, DivideRoundUp(shape.c, 4));
      }

      int num_textures = 0;
      // Counts the number of used textures and chooses an ObjectType for an
      // object.
      auto set_object_type = [&](Object* object) {
        if (object->object_type == ObjectType::BUFFER) {
          // Don't change from buffer once it is set.
          return;
        }
        bool is_ref = IsRef(*object);
        if (num_textures < gpu_info_.GetMaxImageArguments() &&
            !ExceedsMaxSize(*object, gpu_info_) &&
            (object->object_type == ObjectType::TEXTURE ||
             (is_ref && options_.ref_obj_type == ObjectType::TEXTURE) ||
             (!is_ref && options_.preferred_obj_type == ObjectType::TEXTURE))) {
          object->object_type = ObjectType::TEXTURE;
          num_textures++;
        } else {
          object->object_type = ObjectType::BUFFER;
        }
      };

      for (auto& object : attr.code.objects) {
        // Downgrade read-only objects to F16 if requested.
        if (options_.allow_precision_loss) {
          MaybeConvertToFloat16(&object.second);
        }
        set_object_type(&object.second);
      }

      for (auto ref : compiled_graph_.FindInputs(node->id)) {
        set_object_type(&objects[ref->id]);
      }
      for (auto ref : compiled_graph_.FindOutputs(node->id)) {
        set_object_type(&objects[ref->id]);
      }
    }

    // Generate shaders from the transformed graph.
    ShaderCodegen codegen(options_, gpu_info_);
    for (auto node : compiled_graph_.nodes()) {
      auto& attr =
          std::any_cast<CompiledNodeAttributes&>(node->operation.attributes);
      if (attr.code.source_code.empty()) {
        // noop. Skip this node.
        continue;
      }

      // Declare inputs and outputs explicitly.
      for (auto ref : compiled_graph_.FindInputs(node->id)) {
        auto object = objects[ref->id];
        object.access = AccessType::READ;
        attr.inputs.push_back(object);
      }
      for (auto ref : compiled_graph_.FindOutputs(node->id)) {
        auto object = objects[ref->id];
        object.access = AccessType::WRITE;
        attr.outputs.push_back(object);
      }

      // Allocate bindings. Textures must be bound first.
      uint32_t binding = 0;
      auto set_binding = [&](ObjectType type, Object& object) {
        if (object.object_type == type) {
          object.binding = binding++;
        }
      };
      for (auto& object : attr.inputs) {
        set_binding(ObjectType::TEXTURE, object);
      }
      for (auto& object : attr.outputs) {
        set_binding(ObjectType::TEXTURE, object);
      }
      for (auto& object : attr.code.objects) {
        set_binding(ObjectType::TEXTURE, object.second);
      }
      for (auto& object : attr.inputs) {
        set_binding(ObjectType::BUFFER, object);
      }
      for (auto& object : attr.outputs) {
        set_binding(ObjectType::BUFFER, object);
      }
      for (auto& object : attr.code.objects) {
        set_binding(ObjectType::BUFFER, object.second);
      }

      // Generate source code.
      ShaderCode shader_code;
      RETURN_IF_ERROR(codegen.Build(std::move(attr), &shader_code));
      RETURN_IF_ERROR(callback(std::move(shader_code)));
    }
    return absl::OkStatus();
  }

 private:
  const NodeShader& node_shader_;
  const GpuInfo& gpu_info_;
  CompilationOptions options_;
  GraphFloat32 compiled_graph_;
};

}  // namespace

std::unique_ptr<Compiler> NewCompiler(const NodeShader* node_shader,
                                      const GpuInfo* gpu_info,
                                      const CompilationOptions& options) {
  return std::make_unique<CompilerImpl>(node_shader, gpu_info, options);
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite