1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASK_GPU_OPERATION_H_ 17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASK_GPU_OPERATION_H_ 18 19 #include <string> 20 #include <utility> 21 #include <vector> 22 23 #include "tensorflow/lite/delegates/gpu/common/data_type.h" 24 #include "tensorflow/lite/delegates/gpu/common/gpu_info.h" 25 #include "tensorflow/lite/delegates/gpu/common/kernel_info.h" 26 #include "tensorflow/lite/delegates/gpu/common/precision.h" 27 #include "tensorflow/lite/delegates/gpu/common/status.h" 28 #include "tensorflow/lite/delegates/gpu/common/task/arguments.h" 29 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h" 30 #include "tensorflow/lite/delegates/gpu/common/task/compiler_options.h" 31 #include "tensorflow/lite/delegates/gpu/common/task/gpu_tensor.h" 32 #include "tensorflow/lite/delegates/gpu/common/task/serialization_base_generated.h" 33 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h" 34 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h" 35 #include "tensorflow/lite/delegates/gpu/common/task/tuning_type.h" 36 #include "tensorflow/lite/delegates/gpu/common/types.h" 37 38 namespace tflite { 39 namespace gpu { 40 // kCustom: default value 41 // GPUOperation::GetGridSize must be overloaded 42 // kWBToX_HDToY_SToZ: 43 // grid_x = dst_[0]->Width() * dst_[0]->Batch(); 44 // grid_y = dst_[0]->Height() * dst_[0]->Depth(); 45 // grid_z = dst_[0]->Slices(); 46 // kWBToX_HDToY_ZIs1: 47 // grid_x = dst_[0]->Width() * dst_[0]->Batch(); 48 // grid_y = dst_[0]->Height() * dst_[0]->Depth(); 49 // grid_z = 1; 50 // kWBToX_HToY_DToZ: 51 // grid_x = dst_[0]->Width() * dst_[0]->Batch(); 52 // grid_y = dst_[0]->Height(); 53 // grid_z = dst_[0]->Depth(); 54 // kBToX_YIs1_ZIs1: 55 // grid_x = dst_[0]->Batch(); 56 // grid_y = 1; 57 // grid_z = 1; 58 enum class TensorToGrid { 59 kCustom, 60 kWBToX_HDToY_SToZ, 61 kWBToX_HDToY_ZIs1, 62 kWBToX_HToY_DToZ, 63 kBToX_YIs1_ZIs1 64 }; 65 66 struct OperationDef { 67 CalculationsPrecision precision; 68 std::vector<TensorDescriptor> src_tensors; 69 std::vector<TensorDescriptor> dst_tensors; 70 71 // returns FLOAT32 for F32 precision and FLOAT16 for F16 precision 72 DataType GetDataType() const; 73 // Primary means the first src tensor, because first tensor usually defines 74 // the structure of kernel, all other resources(biases) types and etc. 75 DataType GetPrimaryDataType() const; 76 TensorStorageType GetPrimaryStorageType() const; 77 bool IsBatchSupported() const; 78 }; 79 80 struct ElementwiseDescriptor { 81 Arguments args; 82 std::string code; 83 }; 84 85 class GPUOperation { 86 public: 87 GPUOperation() = default; 88 explicit GPUOperation(const OperationDef& definition); 89 virtual ~GPUOperation() = default; 90 // Move only 91 GPUOperation(GPUOperation&& operation); 92 GPUOperation& operator=(GPUOperation&& operation); 93 GPUOperation(const GPUOperation&) = delete; 94 GPUOperation& operator=(const GPUOperation&) = delete; 95 96 absl::Status AddOperation(const GpuInfo& gpu_info, GPUOperation* operation); 97 98 // input input 99 // | | 100 // elem0 | 101 // | --> elem 102 // elem1 | 103 // | | 104 // output output 105 absl::Status FuseSimpleElemWithSimpleElem(const GpuInfo& gpu_info, 106 GPUOperation* operation); 107 108 // input input 109 // / \ | 110 // elem0 | | 111 // \ / --> elem 112 // elem1 | 113 // | | 114 // output output 115 absl::Status Fuse2InputElemWithSimpleElemAsFirstInput( 116 const GpuInfo& gpu_info, GPUOperation* operation); 117 118 // input input 119 // / \ | 120 // | elem0 | 121 // \ / --> elem 122 // elem1 | 123 // | | 124 // output output 125 absl::Status Fuse2InputElemWithSimpleElemAsSecondInput( 126 const GpuInfo& gpu_info, GPUOperation* operation); 127 128 void SetSrc(GpuSpatialTensor* ptr, int index = 0); 129 void SetDst(GpuSpatialTensor* ptr, int index = 0); 130 131 struct DispatchInfo { 132 int3 work_group_size; 133 int3 work_groups_count; 134 }; 135 void GetPossibleDispatches(TuningType tuning_type, const GpuInfo& gpu_info, 136 const KernelInfo& kernel_info, 137 std::vector<DispatchInfo>* dispatches) const; 138 GetSrcTensorsNames()139 const std::vector<std::string>& GetSrcTensorsNames() const { 140 return src_tensors_names_; 141 } GetDstTensorsNames()142 const std::vector<std::string>& GetDstTensorsNames() const { 143 return dst_tensors_names_; 144 } GetSrcTensors()145 const std::vector<GpuSpatialTensor*>& GetSrcTensors() const { return src_; } GetDstTensors()146 const std::vector<GpuSpatialTensor*>& GetDstTensors() const { return dst_; } GetWorkGroupsCount()147 const int3& GetWorkGroupsCount() const { return work_groups_count_; } 148 149 absl::Status AssembleCode(const GpuInfo& gpu_info); 150 PostCompileCheck(const GpuInfo & gpu_info,const KernelInfo & kernel_info)151 virtual absl::Status PostCompileCheck(const GpuInfo& gpu_info, 152 const KernelInfo& kernel_info) { 153 return absl::OkStatus(); 154 } 155 GetDefinition()156 const OperationDef& GetDefinition() const { return definition_; } 157 158 void AddSrcTensor(const std::string& tensor_name, 159 const TensorDescriptor& desc); 160 void AddSrcBuffer(const std::string& buffer_name, 161 const BufferDescriptor& desc); 162 void AddDstTensor(const std::string& tensor_name, 163 const TensorDescriptor& desc); 164 IsLinkable()165 bool IsLinkable() const { return elementwise_; } 166 BindArguments(ArgumentsBinder * args)167 virtual absl::Status BindArguments(ArgumentsBinder* args) { 168 return absl::OkStatus(); 169 } RecalculateGridSize()170 void RecalculateGridSize() { grid_size_ = GetGridSize(); } 171 void RecalculateWorkGroupsCount(); 172 173 Arguments args_; 174 std::string code_; 175 int3 work_group_size_ = int3(8, 4, 1); 176 std::vector<CompilerOptions> compiler_options_; 177 // not applicable to elementwise 178 TensorToGrid tensor_to_grid_ = TensorToGrid::kCustom; 179 180 // for profiling 181 uint64_t flops_ = 0; 182 // size in bytes of constant gpu_objects inside args_ 183 uint64_t const_args_size_ = 0; 184 185 // Must be called before const generic objects in args_ released. 186 void CalculateConstArgsSize(); 187 188 protected: 189 friend flatbuffers::Offset<tflite::gpu::data::GPUOperation> Encode( 190 const GPUOperation& op, flatbuffers::FlatBufferBuilder* builder); 191 friend absl::Status Decode(const tflite::gpu::data::GPUOperation* fb_op, 192 GPUOperation* op); 193 friend GPUOperation CreateGpuOperation(const OperationDef& definition, 194 ElementwiseDescriptor&& descriptor); 195 friend GPUOperation CreateGpuOperation(const OperationDef& definition, 196 ElementwiseDescriptor&& descriptor, 197 const BHWC& second_shape); 198 199 friend absl::Status Fuse2InputElemWith2SimpleElem(const GpuInfo& gpu_info, 200 GPUOperation&& elem0, 201 GPUOperation&& elem1, 202 GPUOperation&& elem_root, 203 GPUOperation* result); 204 205 virtual int3 GetGridSize() const; 206 virtual void GetPossibleKernelWorkGroups( 207 TuningType tuning_type, const GpuInfo& gpu_info, 208 const KernelInfo& kernel_info, std::vector<int3>* work_groups) const; 209 210 // Defines operation calculation precision and format of src/dst tensors. 211 OperationDef definition_; 212 std::vector<GpuSpatialTensor*> src_; 213 std::vector<GpuSpatialTensor*> dst_; 214 int grid_dimension_ = 3; // can be 1, 2 or 3 215 int3 work_group_launch_order_ = int3(0, 1, 2); 216 int3 grid_size_ = int3(0, 0, 0); 217 std::vector<std::string> src_tensors_names_; 218 std::vector<std::string> dst_tensors_names_; 219 220 private: 221 absl::Status GetTensorDescriptor(const std::string& tensor_name, 222 TensorDescriptor** resutl); 223 absl::Status ResolveSecondElementwiseInput(); 224 int3 work_groups_count_ = int3(0, 0, 0); 225 bool elementwise_ = false; // temporary, used during op construction 226 int elementwise_inputs_ = 0; // can be {0, 1, 2} 227 std::string 228 second_elementwise_tensor_name_; // used with elementwise_inputs_ = 2 229 int linkable_count_ = 0; // temporary, used during op construction 230 std::string elementwise_code_; // temporary, used during op construction 231 }; 232 233 GPUOperation CreateGpuOperation(const OperationDef& definition, 234 ElementwiseDescriptor&& descriptor); 235 236 // For creating elementwise operations with 2 runtime inputs 237 GPUOperation CreateGpuOperation(const OperationDef& definition, 238 ElementwiseDescriptor&& descriptor, 239 const BHWC& second_shape); 240 241 // input input 242 // / \ | 243 // elem0 elem1 | 244 // \ / --> elem 245 // elem_root | 246 // | | 247 // output output 248 absl::Status Fuse2InputElemWith2SimpleElem(const GpuInfo& gpu_info, 249 GPUOperation&& elem0, 250 GPUOperation&& elem1, 251 GPUOperation&& elem_root, 252 GPUOperation* result); 253 } // namespace gpu 254 } // namespace tflite 255 256 #endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASK_GPU_OPERATION_H_ 257