1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 #ifndef TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_ 16 #define TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_ 17 18 #include <memory> 19 #include <string> 20 #include <unordered_map> 21 #include <vector> 22 23 #include "tensorflow/lite/c/common.h" 24 #include "tensorflow/lite/delegates/serialization.h" 25 #include "tensorflow/lite/nnapi/NeuralNetworksTypes.h" 26 #include "tensorflow/lite/nnapi/nnapi_implementation.h" 27 28 struct NnApiSLDriverImplFL5; 29 struct NnapiDelegateVendorPlugin; 30 typedef struct ANeuralNetworksMemory ANeuralNetworksMemory; 31 32 namespace tflite { 33 34 namespace delegate { 35 namespace nnapi { 36 class NNAPIDelegateKernel; 37 } // namespace nnapi 38 } // namespace delegate 39 40 using tflite::delegate::nnapi::NNAPIDelegateKernel; 41 42 // TFliteDelegate to interface with NNAPI. 43 class StatefulNnApiDelegate : public TfLiteDelegate { 44 public: 45 // Encapsulates all options that are specific to NNAPI delegate. 46 struct Options { 47 // Preferred Power/perf trade-off. For more details please see 48 // ANeuralNetworksCompilation_setPreference documentation in : 49 // https://developer.android.com/ndk/reference/group/neural-networks.html 50 enum ExecutionPreference { 51 kUndefined = -1, 52 kLowPower = 0, 53 kFastSingleAnswer = 1, 54 kSustainedSpeed = 2, 55 }; 56 57 // Preferred Power/perf trade-off. 58 ExecutionPreference execution_preference = kUndefined; 59 60 // Selected NNAPI accelerator with nul-terminated name. 61 // Default to nullptr, which implies the NNAPI default behavior: NNAPI 62 // runtime is allowed to use all available accelerators. If the selected 63 // accelerator cannot be found, NNAPI will not be used. 64 // It is the caller's responsibility to ensure the string is valid for the 65 // duration of the Options object lifetime. 66 const char* accelerator_name = nullptr; 67 68 // The nul-terminated cache dir for NNAPI model. 69 // Default to nullptr, which implies the NNAPI will not try caching the 70 // compilation. 71 const char* cache_dir = nullptr; 72 73 // The unique nul-terminated token string for NNAPI model. 74 // Default to nullptr, which implies the NNAPI will not try caching the 75 // compilation. It is the caller's responsibility to ensure there is no 76 // clash of the tokens. 77 // NOTE: when using compilation caching, it is not recommended to use the 78 // same delegate instance for multiple models. 79 const char* model_token = nullptr; 80 81 // Whether to disallow NNAPI CPU usage. Only effective on Android 10 and 82 // above. The NNAPI CPU typically performs less well than built-in TfLite 83 // kernels, but allowing CPU allows partial acceleration of models. If this 84 // is set to true, NNAPI is only used if the whole model is accelerated. 85 bool disallow_nnapi_cpu = true; 86 87 // Specifies the max number of partitions to delegate. A value <= 0 means 88 // no limit. 89 // If the delegation of the full set of supported nodes would generate a 90 // number of partition greater than this parameter, only 91 // <max_number_delegated_partitions> of them will be actually accelerated. 92 // The selection is currently done sorting partitions in decreasing order 93 // of number of nodes and selecting them until the limit is reached. 94 int max_number_delegated_partitions = 3; 95 96 // allow fp32 compuation to be run in fp16. 97 bool allow_fp16 = false; 98 99 // Specifies the relative priority for executions of the model. 100 // Available values are {ANEURALNETWORKS_PRIORITY_LOW, 101 // ANEURALNETWORKS_PRIORITY_MEDIUM, ANEURALNETWORKS_PRIORITY_HIGH, 102 // ANEURALNETWORKS_PRIORITY_DEFAULT}. 103 int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT; 104 105 // Specifies the maximum expected duration in nanosecond for compiling the 106 // model. If the device is not able to complete the compilation within the 107 // specified duration, the compilation may be aborted. If set to 0, the 108 // timeout duration is considered infinite. 109 uint64_t max_compilation_timeout_duration_ns = 0; 110 111 // Specifies the maximum expected duration in nanosecond for executing the 112 // model. If the device is not able to complete the execution within the 113 // specified duration, the execution may be aborted. If set to 0, the 114 // timeout duration is considered infinite. 115 uint64_t max_execution_timeout_duration_ns = 0; 116 117 // Specifies the maximum expected duration in nanosecond for WHILE loops in 118 // the execution. If a WHILE loop condition model does not output false 119 // within the specified duration, the execution will be aborted. If set to 120 // 0, the default timeout for loops will be used. 121 uint64_t max_execution_loop_timeout_duration_ns = 0; 122 123 // Whether to allow dynamic dimension sizes without re-compilation. 124 // A tensor of with dynamic dimension must have a valid dim_signature 125 // defined. 126 // Only supported in NNAPI 1.1 and newer versions. 127 // WARNING: Setting this flag to true may result in model being rejected by 128 // accelerator. This should only be enabled if the target device supports 129 // dynamic dimensions of the model. 130 bool allow_dynamic_dimensions = false; 131 132 // Force using NNAPI Burst mode if supported. 133 // Burst mode allows accelerators to efficiently manage resources, which 134 // would significantly reduce overhead especially if the same delegate 135 // instance is to be used for multiple inferences. 136 // If NNAPI devices are specified and are of NNAPI feature level 5 or 137 // higher, NNAPI delegate will automatically enable burst mode for better 138 // performance. 139 // Default: Disabled for devices with NNAPI feature level 4 or lower. 140 bool use_burst_computation = false; 141 142 // Specifies the max number of NNAPI reusable executions to cache. An 143 // execution can be reused if the input and output tensors are using the 144 // same buffer handles, and all dynamic dimensions are unchanged. Setting 145 // this field to 0 means do not reuse execution. 146 uint32_t max_execution_cache_size = 4; 147 148 // Provides hints about the max size of tensors with dynamic shapes. The key 149 // of the map is the tensor index, and the value is the max size of the 150 // tensor in bytes. If a vendor plugin is supplied, this field is required 151 // for all output tensors with dynamic shapes because the output size cannot 152 // be inferred. Otherwise, this field is optional and any provided 153 // information may be used to guide the memory allocation. This field has no 154 // effect on tensors with static shapes. 155 std::map<int, size_t> tensor_max_size_hints; 156 157 // The optional null-terminated vendor specific compilation hints string. 158 // It is the vendor_plugin's responsibility to parse the hint string and 159 // decide whether the hints should be respected or not. If no vendor_plugin 160 // provided, the hints will be ignored. 161 const char* vendor_compilation_hints = nullptr; 162 163 // The optional null-terminated vendor specific execution hints string. 164 // It is the vendor_plugin's responsibility to parse the hint string and 165 // decide whether the hints should be respected or not. If no vendor_plugin 166 // provided, the hints will be ignored. 167 const char* vendor_execution_hints = nullptr; 168 169 // It is the users responsibility to make sure that 170 // vendor_plugin outlives the delegate instance. 171 // If a vendor plugin is supplied, and the model has dynamic dimensions, the 172 // delegate is not able to propagate tensor shapes. In such a case, the user 173 // must provide max tensor size in the "tensor_max_size_hints" field for all 174 // output tensors with dynamic shapes. 175 NnapiDelegateVendorPlugin* vendor_plugin = nullptr; 176 }; 177 178 // Uses default options. 179 StatefulNnApiDelegate(); 180 181 // The ownership of the NnApi instance is left to the caller of the 182 // StatefulNnApiDelegate constructor; the caller must ensure that the lifetime 183 // of the NnApi instance exceeds the lifetime of the StatefulNnApiDelegate. 184 explicit StatefulNnApiDelegate(const NnApi* nnapi); 185 186 // The constructor that accepts options from user. 187 // This makes a copy of any data that it needs from Options, so 188 // the caller can safely deallocate any storage pointed to by 189 // the 'const char *' members of Options immediately after calling this. 190 explicit StatefulNnApiDelegate(Options options); 191 192 // Constructor that accepts both an NnApi instance and options. 193 // The ownership of the NnApi instance is left to the caller of the 194 // StatefulNnApiDelegate constructor; the caller must ensure that the lifetime 195 // of the NnApi instance exceeds the lifetime of the StatefulNnApiDelegate. 196 // This constructor makes a copy of any data that it needs from Options, so 197 // the caller can safely deallocate any storage pointed to by 198 // the 'const char *' members of Options immediately after calling this. 199 StatefulNnApiDelegate(const NnApi* nnapi, Options options); 200 201 // Constructor that accepts an NnApiSLDriverImplFL5 instance and options. 202 // The ownership of the NnApiSLDriverImplFL5 instance is left to the caller of 203 // the StatefulNnApiDelegate constructor; the caller must ensure that the 204 // lifetime of the NnApiSLDriverImplFL5 instance encompasses all calls to 205 // methods on the StatefulNnApiDelegate instance, other than the destructor. 206 // This constructor makes a copy of any data that it needs from Options, so 207 // the caller can safely deallocate any storage pointed to by 208 // the 'const char *' members of Options immediately after calling this. 209 // 210 // The NN API Support Library Driver must support at least NNAPI Feature Level 211 // 5 (introduced in SDK level 31), but this might point to a compatible struct 212 // that also supports a higher NNAPI Feature Level. These cases can be 213 // distinguished by examining the base.implFeatureLevel field, which should be 214 // set to the supported feature level (which must be >= 215 // ANEURALNETWORKS_FEATURE_LEVEL_5). 216 // 217 // Please note that since NNAPI Support Library doesn't implement some of the 218 // functions (see CreateNnApiFromSupportLibrary implementation and NNAPI SL 219 // documentation for details), the underlying NnApi structure will have 220 // nullptr stored in some of the function pointers. Calling such functions 221 // will result in a crash. 222 // 223 // WARNING: This is an experimental interface that is subject to change. 224 StatefulNnApiDelegate( 225 const NnApiSLDriverImplFL5* nnapi_support_library_driver, 226 Options options); 227 228 ~StatefulNnApiDelegate() = default; 229 230 // Returns the delegate options. 231 // The lifetime of the storage pointed to by the 'const char *' members of the 232 // returned Options object is the same as the lifetime of the supplied 233 // TfLiteDelegate instance. 234 static const Options GetOptions(TfLiteDelegate* delegate); 235 236 // Callback function which copies data from ANeuralNetworksMemory to host 237 // tensor CPU buffer. It is the users responsibility to implement these 238 // callbacks for the specific types of shared memory they intend to use. 239 // WARNING: This is an experimental interface that is subject to change. 240 typedef TfLiteStatus (*CopyToHostTensorFnPtr)(TfLiteTensor* tensor, 241 ANeuralNetworksMemory* memory, 242 size_t memory_offset, 243 size_t byte_size, 244 void* callback_context); 245 246 // Encapsulates all fields related to memory registration for internal 247 // bookkeeping only. 248 struct MemoryRegistration { 249 ANeuralNetworksMemory* memory; 250 CopyToHostTensorFnPtr callback; 251 void* callback_context; 252 // The registeration timestamp. It is unique for each registered memory in 253 // the lifetime of a StatefulNnApiDelegate. 254 uint64_t timestamp; 255 }; 256 257 // Register the ANeuralNetworksMemory handle with the delegate. A 258 // TfLiteBufferHandle will be returned to be used with 259 // Interpreter::SetBufferHandle. The callback_context will be passed to the 260 // callback function when invoked. 261 // Note: the returned TfLiteBufferHandle can only be used with a single 262 // Interpreter instance. However, the caller can register the same memory 263 // multiple times to get different handles to use with difference Interpreter 264 // instances 265 // WARNING: This is an experimental interface that is subject to change. 266 TfLiteBufferHandle RegisterNnapiMemory(ANeuralNetworksMemory* memory, 267 CopyToHostTensorFnPtr callback, 268 void* callback_context); 269 270 // Returns the vector of known ANeuralNetworksMemory handles. 271 // Note: this function is not intended to be called by developers. 272 // WARNING: This is an experimental interface that is subject to change. 273 static const std::vector<MemoryRegistration>& GetTensorMemoryMap( 274 TfLiteDelegate* delegate); 275 276 // Returns ptr to delegates::Serialization, if caching is enabled by user via 277 // cache_dir & model_token. 278 static delegates::Serialization* GetCache(TfLiteDelegate* delegate); 279 280 // Returns the int value of the ResultCode returned by the latest 281 // failed call to NNAPI, if any. Zero only in case of NO failed calls since 282 // the construction of this instance of StatefulNnApiDelegate. 283 // The error code is reset when the delegate is re-initialized 284 // (i.e. when calling interpreter.ModifyGraphWithDelegate(delegate)). 285 int GetNnApiErrno() const; 286 287 private: 288 // Encapsulates all delegate data. 289 struct Data { 290 // Pointer to NNAPI implementation to be used by this delegate as 291 // set when building the StatefulNnApiDelegate instance. 292 // Will generally be the NnApiInstance() singleton but can be overridden 293 // for testing or for users needing to wrap or stub parts of NNAPI. 294 // The ownership of the nnapi instance is left to the caller of 295 // the StatefulNnApiDelegate constructor. 296 const NnApi* nnapi; 297 // Preferred Power/perf trade-off. 298 Options::ExecutionPreference execution_preference; 299 // Selected NNAPI accelerator name. 300 std::string accelerator_name; 301 // The cache dir for NNAPI model. 302 std::string cache_dir; 303 // The unique token string for NNAPI model. 304 std::string model_token; 305 // Whether to disallow NNAPI CPU. 306 bool disallow_nnapi_cpu; 307 // Tensor to ANeuralNetworksMemory mapping. 308 std::vector<MemoryRegistration> tensor_memory_map; 309 // The next timestamp for buffer handle registration. 310 uint64_t next_buffer_handle_timestamp = 1; 311 // Contains a non zero value if any NNAPI method call 312 // operation returned a non zero result code. 313 int nnapi_errno = ANEURALNETWORKS_NO_ERROR; 314 // Cache of kernels already built in StatefulNnApiDelegate::DoPrepare 315 // when trying to understand if all nodes are supported by the target 316 // accelerators. 317 // The key is the index of the first node in the partition. 318 // Couldn't use unique_ptr because of problems building on gcc 319 std::unordered_map<int, NNAPIDelegateKernel*> delegate_state_cache; 320 // Maximum number of NNAPI partition to delegate. Zero or negative means 321 // no limit. Copied from StatefulNnApiDelegate::Options 322 int max_number_delegated_partitions; 323 // allow fp32 computation to be run in fp16. 324 bool allow_fp16; 325 // Specifies the relative priority for executions of the model. 326 int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT; 327 // Specifies the maximum expected duration in nanosecond for compiling the 328 // model. 329 uint64_t max_compilation_timeout_duration_ns = 0; 330 // Specifies the maximum expected duration in nanosecond for executing the 331 // model. 332 uint64_t max_execution_timeout_duration_ns = 0; 333 // Specifies the maximum expected duration in nanosecond for WHILE loops in 334 // the execution 335 uint64_t max_execution_loop_timeout_duration_ns = 0; 336 // Whether to allow dynamic dimension sizes without re-compilation. 337 bool allow_dynamic_dimensions = false; 338 // Whether to use NNAPI Burst mode. 339 bool use_burst_computation = false; 340 // Specifies the max number of NNAPI reusable executions to cache. 341 uint32_t max_execution_cache_size = 4; 342 // Provides hints about the max size of tensors with dynamic shapes. 343 std::map<int, size_t> tensor_max_size_hints; 344 // The null-terminated vendor specific compilation hints string 345 const char* vendor_compilation_hints = nullptr; 346 // The null-terminated vendor specific execution hints string. 347 const char* vendor_execution_hints = nullptr; 348 349 // It is the users responsibility to make sure that 350 // vendor_plugin outlives the delegate instance. 351 NnapiDelegateVendorPlugin* vendor_plugin = nullptr; 352 353 // Smart pointer for automatically cleaning up NnApi structure in case the 354 // delegate was constructed from an NNAPI support library 355 std::unique_ptr<const NnApi> owned_nnapi = nullptr; 356 357 // TFLite Serialization in case caching has been enabled by the user through 358 // Options. 359 std::unique_ptr<delegates::Serialization> cache; 360 361 explicit Data(const NnApi* nnapi); 362 explicit Data(std::unique_ptr<const NnApi> nnapi); 363 ~Data(); 364 365 // Caches an initialised NNAPIDelegateKernel. 366 void CacheDelegateKernel(const TfLiteDelegateParams* delegate_params, 367 NNAPIDelegateKernel* delegate_state); 368 // Returns a cached NNAPIDelegateKernel if available and removes it 369 // from the cache transferring the ownership to the caller. 370 NNAPIDelegateKernel* MaybeGetCachedDelegateKernel( 371 const TfLiteDelegateParams* delegate_params); 372 }; 373 374 // Implements TfLiteDelegate::Prepare. Please refer to TFLiteDelegate 375 // documentation for more info. 376 static TfLiteStatus DoPrepare(TfLiteContext* context, 377 TfLiteDelegate* delegate); 378 379 // Copy the data from delegate buffer handle into raw memory of the given 380 // 'tensor'. The delegate is allowed to allocate the raw 381 // bytes as long as it follows the rules for kTfLiteDynamic tensors. 382 static TfLiteStatus DoCopyFromBufferHandle(TfLiteContext* context, 383 TfLiteDelegate* delegate, 384 TfLiteBufferHandle buffer_handle, 385 TfLiteTensor* tensor); 386 387 // Copy the data from raw memory of the given 'tensor' to delegate buffer 388 // handle. Currently this function is not supported, and calling the function 389 // will result in an error. 390 static TfLiteStatus DoCopyToBufferHandle(TfLiteContext* context, 391 TfLiteDelegate* delegate, 392 TfLiteBufferHandle buffer_handle, 393 TfLiteTensor* tensor); 394 395 // Free the Delegate Buffer Handle. Note: This only frees the handle, but 396 // this doesn't release the underlying resource (e.g. textures). The 397 // resources are either owned by application layer or the delegate. 398 static void DoFreeBufferHandle(TfLiteContext* context, 399 TfLiteDelegate* delegate, 400 TfLiteBufferHandle* handle); 401 402 // Returns the nodes that can be delegated via NNAPI to the accelerator 403 // specified in the delegate options and information about the way the 404 // graph will be partitioned if the supported nodes will be delegated. 405 // Partition information is composed by the number of partitions and 406 // the delegate parameters associated to each partition. 407 // The method also caches in delegate->data the NNApiDelegateKernel instances 408 // that have been created during the device evaluation. 409 // All arguments are expected to be non-null. 410 static TfLiteStatus GetNodesSupportedByAccelerator( 411 TfLiteContext* context, TfLiteDelegate* delegate, const NnApi* nnapi, 412 const std::vector<int>& supported_nodes, 413 std::vector<int>* device_supported_nodes, int* num_partitions, 414 TfLiteDelegateParams** params_array, int* nnapi_errno); 415 416 // Alters the given array of nodes_to_delegate to limit the number of NNAPI 417 // owned partition to be less or equal than num_partitions. If num_partitions 418 // is less or equal to zero the input is left unaltered. 419 // The nodes_to_delegate array is expected to contain at element 0 the number 420 // of nodes to delegate and in remaining elements the set of nodes 421 // that would be delegated to NNAPI if this function wouldn't be 422 // called. It will be altered storing in the first element the count of 423 // nodes to actually delegate and in the remainder of the array the indexes. 424 // The params_array params might be altered during the functions execution. 425 static TfLiteStatus LimitDelegatedPartitions( 426 int max_partitions, 427 std::vector<TfLiteDelegateParams> partition_params_array, 428 std::vector<int>* nodes_to_delegate); 429 430 void StatefulNnApiDelegateConstructorImpl(const Options& options); 431 432 // Delegate data presented through TfLiteDelegate::data_. 433 Data delegate_data_; 434 }; 435 436 // DEPRECATED: Please use StatefulNnApiDelegate class instead. 437 // 438 // Returns a singleton delegate that can be used to use the NN API. 439 // e.g. 440 // TfLiteDelegate* delegate = NnApiDelegate(); 441 // interpreter->ModifyGraphWithDelegate(delegate); 442 // NnApiDelegate() returns a singleton, so you should not free this 443 // pointer or worry about its lifetime. 444 TfLiteDelegate* NnApiDelegate(); 445 446 } // namespace tflite 447 448 #endif // TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_ 449