xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASK_GPU_OPERATION_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASK_GPU_OPERATION_H_
18 
19 #include <string>
20 #include <utility>
21 #include <vector>
22 
23 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
24 #include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
25 #include "tensorflow/lite/delegates/gpu/common/kernel_info.h"
26 #include "tensorflow/lite/delegates/gpu/common/precision.h"
27 #include "tensorflow/lite/delegates/gpu/common/status.h"
28 #include "tensorflow/lite/delegates/gpu/common/task/arguments.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/compiler_options.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/gpu_tensor.h"
32 #include "tensorflow/lite/delegates/gpu/common/task/serialization_base_generated.h"
33 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
34 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
35 #include "tensorflow/lite/delegates/gpu/common/task/tuning_type.h"
36 #include "tensorflow/lite/delegates/gpu/common/types.h"
37 
38 namespace tflite {
39 namespace gpu {
// Selects how the dispatch grid is derived from the first destination tensor
// (dst_[0]) of a GPUOperation.
//
// kCustom (default value):
//   No predefined mapping; GPUOperation::GetGridSize must be overridden.
// kWBToX_HDToY_SToZ:
//   grid_x = dst_[0]->Width() * dst_[0]->Batch();
//   grid_y = dst_[0]->Height() * dst_[0]->Depth();
//   grid_z = dst_[0]->Slices();
// kWBToX_HDToY_ZIs1:
//   grid_x = dst_[0]->Width() * dst_[0]->Batch();
//   grid_y = dst_[0]->Height() * dst_[0]->Depth();
//   grid_z = 1;
// kWBToX_HToY_DToZ:
//   grid_x = dst_[0]->Width() * dst_[0]->Batch();
//   grid_y = dst_[0]->Height();
//   grid_z = dst_[0]->Depth();
// kBToX_YIs1_ZIs1:
//   grid_x = dst_[0]->Batch();
//   grid_y = 1;
//   grid_z = 1;
enum class TensorToGrid {
  kCustom,
  kWBToX_HDToY_SToZ,
  kWBToX_HDToY_ZIs1,
  kWBToX_HToY_DToZ,
  kBToX_YIs1_ZIs1
};
65 
66 struct OperationDef {
67   CalculationsPrecision precision;
68   std::vector<TensorDescriptor> src_tensors;
69   std::vector<TensorDescriptor> dst_tensors;
70 
71   // returns FLOAT32 for F32 precision and FLOAT16 for F16 precision
72   DataType GetDataType() const;
73   // Primary means the first src tensor, because first tensor usually defines
74   // the structure of kernel, all other resources(biases) types and etc.
75   DataType GetPrimaryDataType() const;
76   TensorStorageType GetPrimaryStorageType() const;
77   bool IsBatchSupported() const;
78 };
79 
80 struct ElementwiseDescriptor {
81   Arguments args;
82   std::string code;
83 };
84 
85 class GPUOperation {
86  public:
87   GPUOperation() = default;
88   explicit GPUOperation(const OperationDef& definition);
89   virtual ~GPUOperation() = default;
90   // Move only
91   GPUOperation(GPUOperation&& operation);
92   GPUOperation& operator=(GPUOperation&& operation);
93   GPUOperation(const GPUOperation&) = delete;
94   GPUOperation& operator=(const GPUOperation&) = delete;
95 
96   absl::Status AddOperation(const GpuInfo& gpu_info, GPUOperation* operation);
97 
98   //    input       input
99   //      |           |
100   //    elem0         |
101   //      |    -->  elem
102   //    elem1         |
103   //      |           |
104   //    output      output
105   absl::Status FuseSimpleElemWithSimpleElem(const GpuInfo& gpu_info,
106                                             GPUOperation* operation);
107 
108   //      input           input
109   //     /    \             |
110   //  elem0    |            |
111   //     \    /      -->  elem
112   //     elem1              |
113   //       |                |
114   //     output           output
115   absl::Status Fuse2InputElemWithSimpleElemAsFirstInput(
116       const GpuInfo& gpu_info, GPUOperation* operation);
117 
118   //      input           input
119   //     /    \             |
120   //    |    elem0          |
121   //     \    /      -->  elem
122   //     elem1              |
123   //       |                |
124   //     output           output
125   absl::Status Fuse2InputElemWithSimpleElemAsSecondInput(
126       const GpuInfo& gpu_info, GPUOperation* operation);
127 
128   void SetSrc(GpuSpatialTensor* ptr, int index = 0);
129   void SetDst(GpuSpatialTensor* ptr, int index = 0);
130 
131   struct DispatchInfo {
132     int3 work_group_size;
133     int3 work_groups_count;
134   };
135   void GetPossibleDispatches(TuningType tuning_type, const GpuInfo& gpu_info,
136                              const KernelInfo& kernel_info,
137                              std::vector<DispatchInfo>* dispatches) const;
138 
GetSrcTensorsNames()139   const std::vector<std::string>& GetSrcTensorsNames() const {
140     return src_tensors_names_;
141   }
GetDstTensorsNames()142   const std::vector<std::string>& GetDstTensorsNames() const {
143     return dst_tensors_names_;
144   }
GetSrcTensors()145   const std::vector<GpuSpatialTensor*>& GetSrcTensors() const { return src_; }
GetDstTensors()146   const std::vector<GpuSpatialTensor*>& GetDstTensors() const { return dst_; }
GetWorkGroupsCount()147   const int3& GetWorkGroupsCount() const { return work_groups_count_; }
148 
149   absl::Status AssembleCode(const GpuInfo& gpu_info);
150 
PostCompileCheck(const GpuInfo & gpu_info,const KernelInfo & kernel_info)151   virtual absl::Status PostCompileCheck(const GpuInfo& gpu_info,
152                                         const KernelInfo& kernel_info) {
153     return absl::OkStatus();
154   }
155 
GetDefinition()156   const OperationDef& GetDefinition() const { return definition_; }
157 
158   void AddSrcTensor(const std::string& tensor_name,
159                     const TensorDescriptor& desc);
160   void AddSrcBuffer(const std::string& buffer_name,
161                     const BufferDescriptor& desc);
162   void AddDstTensor(const std::string& tensor_name,
163                     const TensorDescriptor& desc);
164 
IsLinkable()165   bool IsLinkable() const { return elementwise_; }
166 
BindArguments(ArgumentsBinder * args)167   virtual absl::Status BindArguments(ArgumentsBinder* args) {
168     return absl::OkStatus();
169   }
RecalculateGridSize()170   void RecalculateGridSize() { grid_size_ = GetGridSize(); }
171   void RecalculateWorkGroupsCount();
172 
173   Arguments args_;
174   std::string code_;
175   int3 work_group_size_ = int3(8, 4, 1);
176   std::vector<CompilerOptions> compiler_options_;
177   // not applicable to elementwise
178   TensorToGrid tensor_to_grid_ = TensorToGrid::kCustom;
179 
180   // for profiling
181   uint64_t flops_ = 0;
182   // size in bytes of constant gpu_objects inside args_
183   uint64_t const_args_size_ = 0;
184 
185   // Must be called before const generic objects in args_ released.
186   void CalculateConstArgsSize();
187 
188  protected:
189   friend flatbuffers::Offset<tflite::gpu::data::GPUOperation> Encode(
190       const GPUOperation& op, flatbuffers::FlatBufferBuilder* builder);
191   friend absl::Status Decode(const tflite::gpu::data::GPUOperation* fb_op,
192                              GPUOperation* op);
193   friend GPUOperation CreateGpuOperation(const OperationDef& definition,
194                                          ElementwiseDescriptor&& descriptor);
195   friend GPUOperation CreateGpuOperation(const OperationDef& definition,
196                                          ElementwiseDescriptor&& descriptor,
197                                          const BHWC& second_shape);
198 
199   friend absl::Status Fuse2InputElemWith2SimpleElem(const GpuInfo& gpu_info,
200                                                     GPUOperation&& elem0,
201                                                     GPUOperation&& elem1,
202                                                     GPUOperation&& elem_root,
203                                                     GPUOperation* result);
204 
205   virtual int3 GetGridSize() const;
206   virtual void GetPossibleKernelWorkGroups(
207       TuningType tuning_type, const GpuInfo& gpu_info,
208       const KernelInfo& kernel_info, std::vector<int3>* work_groups) const;
209 
210   // Defines operation calculation precision and format of src/dst tensors.
211   OperationDef definition_;
212   std::vector<GpuSpatialTensor*> src_;
213   std::vector<GpuSpatialTensor*> dst_;
214   int grid_dimension_ = 3;  // can be 1, 2 or 3
215   int3 work_group_launch_order_ = int3(0, 1, 2);
216   int3 grid_size_ = int3(0, 0, 0);
217   std::vector<std::string> src_tensors_names_;
218   std::vector<std::string> dst_tensors_names_;
219 
220  private:
221   absl::Status GetTensorDescriptor(const std::string& tensor_name,
222                                    TensorDescriptor** resutl);
223   absl::Status ResolveSecondElementwiseInput();
224   int3 work_groups_count_ = int3(0, 0, 0);
225   bool elementwise_ = false;      // temporary, used during op construction
226   int elementwise_inputs_ = 0;    // can be {0, 1, 2}
227   std::string
228       second_elementwise_tensor_name_;  // used with elementwise_inputs_ = 2
229   int linkable_count_ = 0;        // temporary, used during op construction
230   std::string elementwise_code_;  // temporary, used during op construction
231 };
232 
233 GPUOperation CreateGpuOperation(const OperationDef& definition,
234                                 ElementwiseDescriptor&& descriptor);
235 
236 // For creating elementwise operations with 2 runtime inputs
237 GPUOperation CreateGpuOperation(const OperationDef& definition,
238                                 ElementwiseDescriptor&& descriptor,
239                                 const BHWC& second_shape);
240 
241 //      input           input
242 //     /    \             |
243 //  elem0  elem1          |
244 //     \    /      -->  elem
245 //   elem_root            |
246 //       |                |
247 //     output           output
248 absl::Status Fuse2InputElemWith2SimpleElem(const GpuInfo& gpu_info,
249                                            GPUOperation&& elem0,
250                                            GPUOperation&& elem1,
251                                            GPUOperation&& elem_root,
252                                            GPUOperation* result);
253 }  // namespace gpu
254 }  // namespace tflite
255 
256 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASK_GPU_OPERATION_H_
257