1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_API_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_API_H_
18
19 // Usage example:
20 //
21 // // Builder is created from a model using GPU-specific parameters.
22 // std::unique_ptr<InferenceBuilder> builder = ...;
23 //
24 // // input data is coming from a texture
25 // // output data goes to CPU
26 // builder->SetInputObjectDef(0, {DataType::FLOAT16, DataLayout::PHWC4,
27 // ObjectType::OPENGL_TEXTURE, true});
28 // builder->SetOutputObjectDef(0, {DataType::FLOAT32, DataLayout::BHWC,
29 // ObjectType::CPU_MEMORY, false});
30 // std::unique_ptr<InferenceRunner> runner;
31 // RETURN_IF_ERROR(builder->Build(&runner)); // may take significant time.
32 // RETURN_IF_ERROR(
//     runner->SetInputObject(0, OpenGlTexture{texture_id, texture_format}));
34 // RETURN_IF_ERROR(runner->Run());
35
36 #include <cstdint>
37 #include <memory>
38 #include <variant>
39 #include <vector>
40
41 #include "absl/types/span.h"
42 #include "absl/types/variant.h"
43 #include <CL/cl.h>
44 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
45 #include "tensorflow/lite/delegates/gpu/common/status.h"
46 #include "tensorflow/lite/delegates/gpu/common/util.h"
47 #include "vulkan/vulkan.h" // from @vulkan_headers
48
49 #define GL_NO_PROTOTYPES
50 #define EGL_NO_PROTOTYPES
51 #include "tensorflow/lite/delegates/gpu/gl/portable_gl31.h"
52 #undef GL_NO_PROTOTYPES
53 #undef EGL_NO_PROTOTYPES
54
55 namespace tflite {
56 namespace gpu {
57
58 // Common abbreviations:
59 // B - batch
60 // H - height
61 // W - width
62 // C - channels
63 // D - depth := DivideRoundUp(C, 4)
64 // C4 - is the constant = 4.
// Logical memory layout of tensor data. See the abbreviation legend above for
// the meaning of B/H/W/C/D/C4.
enum class DataLayout {
  UNKNOWN,
  // Linear batch/height/width/channels layout.
  BHWC,
  // Channel-sliced layouts: channels are grouped into D slices of C4 (= 4)
  // channels each; the letters give the dimension ordering.
  DHWC4,
  HWDC4,
  HDWC4,
};
72
// Kind of memory object that may back a tensor; see the corresponding wrapper
// structs declared below and the TensorObject variant.
enum class ObjectType {
  UNKNOWN,
  OPENGL_SSBO,
  OPENGL_TEXTURE,
  CPU_MEMORY,
  OPENCL_TEXTURE,
  OPENCL_BUFFER,
  VULKAN_BUFFER,
  VULKAN_TEXTURE
};
83
84 struct OpenGlBuffer {
85 OpenGlBuffer() = default;
OpenGlBufferOpenGlBuffer86 explicit OpenGlBuffer(GLuint new_id) : id(new_id) {}
87
88 GLuint id = GL_INVALID_INDEX;
89 };
90
91 struct OpenGlTexture {
92 OpenGlTexture() = default;
OpenGlTextureOpenGlTexture93 OpenGlTexture(GLuint new_id, GLenum new_format)
94 : id(new_id), format(new_format) {}
95
96 GLuint id = GL_INVALID_INDEX;
97 GLenum format = GL_INVALID_ENUM;
98 };
99
100 struct OpenClBuffer {
101 OpenClBuffer() = default;
OpenClBufferOpenClBuffer102 explicit OpenClBuffer(cl_mem new_memobj) : memobj(new_memobj) {}
103
104 cl_mem memobj = nullptr;
105 };
106
107 struct OpenClTexture {
108 OpenClTexture() = default;
OpenClTextureOpenClTexture109 explicit OpenClTexture(cl_mem new_memobj) : memobj(new_memobj) {}
110
111 cl_mem memobj = nullptr;
112 // TODO(akulik): should it specify texture format?
113 };
114
115 struct VulkanBuffer {
116 VulkanBuffer() = default;
VulkanBufferVulkanBuffer117 explicit VulkanBuffer(VkBuffer buffer_, VkDeviceSize size_,
118 VkDeviceMemory memory_, VkDeviceSize offset_)
119 : buffer(buffer_), size(size_), memory(memory_), offset(offset_) {}
120
121 VkBuffer buffer;
122 VkDeviceSize size;
123 VkDeviceMemory memory;
124 VkDeviceSize offset;
125 };
126
127 struct VulkanTexture {
128 VulkanTexture() = default;
VulkanTextureVulkanTexture129 explicit VulkanTexture(VkDeviceMemory new_memory) : memory(new_memory) {}
130
131 VkImage image;
132 VkImageView image_view;
133 VkFormat format;
134 VkExtent3D extent;
135 VkDeviceMemory memory;
136 VkDeviceSize offset;
137 };
138
139 struct VulkanMemory {
140 VulkanMemory() = default;
VulkanMemoryVulkanMemory141 explicit VulkanMemory(VkDeviceMemory new_memory) : memory(new_memory) {}
142
143 VkDeviceMemory memory;
144 VkDeviceSize size;
145 VkDeviceSize offset;
146 };
147
// Non-owning view of a raw host-memory region.
struct CpuMemory {
  CpuMemory() = default;
  CpuMemory(void* new_data, size_t new_size_bytes)
      : data{new_data}, size_bytes{new_size_bytes} {}

  // Start of the region; not owned by this struct.
  void* data = nullptr;
  // Size of the region in bytes.
  size_t size_bytes = 0;
};
156
157 template <typename T>
MakeCpuMemory(absl::Span<T> t)158 inline CpuMemory MakeCpuMemory(absl::Span<T> t) {
159 CpuMemory m;
160 m.data = t.data();
161 m.size_bytes = t.size() * sizeof(T);
162 return m;
163 }
164
165 template <typename T>
MakeReadableCpuMemory(absl::Span<const T> t)166 inline CpuMemory MakeReadableCpuMemory(absl::Span<const T> t) {
167 CpuMemory m;
168 m.data = const_cast<T*>(t.data());
169 m.size_bytes = t.size() * sizeof(T);
170 return m;
171 }
172
173 // Defines object representation.
174 struct ObjectDef {
175 DataType data_type = DataType::UNKNOWN;
176 DataLayout data_layout = DataLayout::UNKNOWN;
177 ObjectType object_type = ObjectType::UNKNOWN;
178
179 // If true, then object is managed externally and needs to be provided to
180 // InferenceRunner by a user before running inference.
181 //
182 // User-provided objects will not be re-used internally for any purpose to
183 // lower overall memory usage.
184 bool user_provided = false;
185
186 bool operator==(const ObjectDef& other) const {
187 return data_type == other.data_type && data_layout == other.data_layout &&
188 object_type == other.object_type &&
189 user_provided == other.user_provided;
190 }
191 };
192
193 bool IsValid(const ObjectDef& def);
194
195 struct Dimensions {
DimensionsDimensions196 Dimensions() : b(1), h(1), w(1), c(1) {}
197
DimensionsDimensions198 Dimensions(int32_t batch, int32_t height, int32_t width, int32_t channels)
199 : b(batch), h(height), w(width), c(channels) {}
200
dDimensions201 int32_t d() const { return DivideRoundUp(c, 4); }
202
productDimensions203 int32_t product() const { return b * h * w * c; }
204
205 bool operator==(const Dimensions& other) const {
206 return b == other.b && h == other.h && w == other.w && c == other.c;
207 }
208
209 int32_t b;
210 int32_t h;
211 int32_t w;
212 int32_t c;
213 };
214
215 // Connects tensor shape with corresponding object definition.
216 struct TensorObjectDef {
217 // Dimensions semantic is defined by corresponding DataLayout.
218 Dimensions dimensions;
219 ObjectDef object_def;
220
221 bool operator==(const TensorObjectDef& other) const {
222 return dimensions == other.dimensions && object_def == other.object_def;
223 }
224 };
225
226 // @return true if tensor object def is defined.
227 bool IsValid(const TensorObjectDef& def);
228
229 // @return the number of elements in a tensor object.
230 uint32_t NumElements(const TensorObjectDef& def);
231
232 using TensorObject =
233 std::variant<std::monostate, OpenGlBuffer, OpenGlTexture, CpuMemory,
234 OpenClBuffer, OpenClTexture, VulkanBuffer, VulkanTexture>;
235
236 // @return true if object is set and corresponding values are defined.
237 bool IsValid(const TensorObjectDef& def, const TensorObject& object);
238
239 ObjectType GetType(const TensorObject& object);
240
241 // @return true if corresponding object is set for the given type
242 bool IsObjectPresent(ObjectType type, const TensorObject& obj);
243
244 // @return true if corresponding object has already been initialized and
245 // assigned with a specific ObjectType.
246 bool IsObjectInitialized(const TensorObject& obj);
247
248 class InferenceRunner;
249
250 // Allows to inspect and change input and output definitions before a graph is
251 // prepared for the inference.
252 class InferenceBuilder {
253 public:
~InferenceBuilder()254 virtual ~InferenceBuilder() {}
255
256 // Returns inference graph inputs and outputs definitions.
257 virtual std::vector<TensorObjectDef> inputs() const = 0;
258 virtual std::vector<TensorObjectDef> outputs() const = 0;
259
260 // Sets new shape for the input if underlying implementation and graph
261 // structure allows dynamic tensors.
262 virtual absl::Status SetInputShape(int index,
263 const Dimensions& dimensions) = 0;
264
265 // Updates object definitions for the given index. Implementation may allow
266 // to use different layouts and/or data type conversions between objects
267 // defined in a graph and given objects, for example:
268 // input '0' is DataType::FLOAT32, DataLayout::BHWC.
269 // A user, however, has an input in DataType::FLOAT16, DataLayout::PHWC4.
270 // An implementation may allow this transformation to happen automatically
271 // under the hood.
272 virtual absl::Status SetInputObjectDef(int index, ObjectDef def) = 0;
273 virtual absl::Status SetOutputObjectDef(int index, ObjectDef def) = 0;
SetAllInputObjectDefsTo(ObjectDef def)274 virtual absl::Status SetAllInputObjectDefsTo(ObjectDef def) {
275 auto input_defs = inputs();
276 for (int i = 0; i < input_defs.size(); ++i) {
277 RETURN_IF_ERROR(SetInputObjectDef(i, def));
278 }
279 return absl::OkStatus();
280 }
SetAllOutputObjectDefsTo(ObjectDef def)281 virtual absl::Status SetAllOutputObjectDefsTo(ObjectDef def) {
282 auto output_defs = outputs();
283 for (int i = 0; i < output_defs.size(); ++i) {
284 RETURN_IF_ERROR(SetOutputObjectDef(i, def));
285 }
286 return absl::OkStatus();
287 }
288
289 // Creates new instance of the inference runner. InferenceBuilder stays valid
290 // and could be used to create another inference runner if needed.
291 //
292 // This method may take significant time to prepare new inference runner. For
293 // example, it may require to compile OpenGL shaders.
294 virtual absl::Status Build(std::unique_ptr<InferenceRunner>* runner) = 0;
295 };
296
297 // Runs prepared inference. Every object marked as external needs to be set
298 // prior calling Run method.
299 class InferenceRunner {
300 public:
~InferenceRunner()301 virtual ~InferenceRunner() {}
302
303 // Returns inference graph inputs and outputs definitions.
304 virtual std::vector<TensorObjectDef> inputs() const = 0;
305 virtual std::vector<TensorObjectDef> outputs() const = 0;
306
307 // Getters provide access to underlying objects for the given index.
308 // Setters allow to set or change external object for the given index. Note,
309 // object need to match object definition set before in InferenceBuilder.
310
311 virtual absl::Status GetInputObject(int index, TensorObject* object) = 0;
312 virtual absl::Status GetOutputObject(int index, TensorObject* object) = 0;
313 virtual absl::Status SetInputObject(int index, TensorObject object) = 0;
314 virtual absl::Status SetOutputObject(int index, TensorObject object) = 0;
315
316 virtual absl::Status Run() = 0;
317 };
318
319 // Encapsulated compilation/runtime tradeoffs.
enum class InferenceUsage {
  // No usage specified.
  UNKNOWN,

  // InferenceRunner will be used only once. Therefore, it is important to
  // minimize bootstrap time as well.
  FAST_SINGLE_ANSWER,

  // Prefer maximizing the throughput. Same inference runner will be used
  // repeatedly on different inputs.
  SUSTAINED_SPEED,
};
331
332 // Defines aspects to control while instantiating a runner.
enum class InferencePriority {
  // No priority specified.
  UNKNOWN,

  // Let the implementation decide; see the AUTO resolution rules documented
  // on InferenceOptions below.
  AUTO,

  // Prioritize low inference latency.
  MIN_LATENCY,

  // Prioritize numerical precision of the results.
  MAX_PRECISION,

  // Prioritize low memory usage.
  MIN_MEMORY_USAGE,
};
344
struct InferenceOptions {
  // Compilation/runtime tradeoff; see InferenceUsage above.
  InferenceUsage usage = InferenceUsage::SUSTAINED_SPEED;

  // Ordered priorities provide better understanding of desired semantics,
  // where priority(n) is more important than priority(n+1).
  // AUTO priority is needed when a single priority is the most important
  // factor. For example, priority1 = InferencePriority::MIN_LATENCY and leaving
  // everything else to AUTO would result in configuration that achieves maximum
  // performance.
  //
  // AUTO priority can only be used when higher priorities are fully specified.
  // For example:
  //   VALID:   priority1 = MIN_LATENCY, priority2 = AUTO, priority3 = AUTO
  //   VALID:   priority1 = MIN_LATENCY, priority2 = MAX_PRECISION,
  //            priority3 = AUTO
  //   INVALID: priority1 = AUTO, priority2 = MIN_LATENCY, priority3 = AUTO
  //   INVALID: priority1 = MIN_LATENCY, priority2 = AUTO,
  //            priority3 = MAX_PRECISION
  // Invalid priorities will result in error.
  InferencePriority priority1 = InferencePriority::MAX_PRECISION;

  // Second-most-important aspect; AUTO is resolved per the rules above.
  InferencePriority priority2 = InferencePriority::AUTO;

  // Least-important aspect; AUTO is resolved per the rules above.
  InferencePriority priority3 = InferencePriority::AUTO;
};
370
371 // Returns a position number for the priority. If priority is missing,
372 // then it would return 'max num priorities + 1'.
373 int GetPosition(const InferenceOptions& options, InferencePriority p);
374
375 // Return true if options are valid.
376 bool IsValid(const InferenceOptions& options);
377
378 // Resolves AUTO priorities and specifies them explicitly.
379 // Note, no-one should assume that these mappings will not change.
380 // Technically this function is declared here for code re-use purposes and
381 // by no means it should be treated as canonical way to resolve AUTO.
382 void ResolveAutoPriority(InferenceOptions* options);
383
// Relative importance of one InferencePriority versus another within given
// InferenceOptions; computed by GetRelativeImportance below.
enum class PriorityImportance {
  UNKNOWN,
  HIGHER,
  LOWER,
};
389
390 // If both p1 and p2 are not present in options, return UNKNOWN
391 // If p1 is present, but p2 is not, return HIGHER
392 // If p2 is present, but p1 is not, return LOWER
393 // If both are present, and p1 is more important, return HIGHER, otherwise,
394 // LOWER.
395 PriorityImportance GetRelativeImportance(const InferenceOptions& options,
396 InferencePriority p1,
397 InferencePriority p2);
398
399 } // namespace gpu
400 } // namespace tflite
401
402 #endif // TENSORFLOW_LITE_DELEGATES_GPU_API_H_
403