xref: /aosp_15_r20/external/tensorflow/tensorflow/lite/delegates/nnapi/nnapi_delegate.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
16 #define TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
17 
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/delegates/serialization.h"
#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
#include "tensorflow/lite/nnapi/nnapi_implementation.h"
27 
// Forward declarations of opaque types referenced only by pointer below, so
// this header does not need to pull in the NNAPI Support Library or
// vendor-plugin headers.
struct NnApiSLDriverImplFL5;
struct NnapiDelegateVendorPlugin;
typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
31 
32 namespace tflite {
33 
namespace delegate {
namespace nnapi {
// Forward declaration of the per-partition kernel type cached in
// StatefulNnApiDelegate::Data::delegate_state_cache; defined elsewhere.
class NNAPIDelegateKernel;
}  // namespace nnapi
}  // namespace delegate

using tflite::delegate::nnapi::NNAPIDelegateKernel;
41 
// TfLiteDelegate to interface with NNAPI. "Stateful" because, unlike the
// deprecated NnApiDelegate() singleton below, each instance carries its own
// options, memory registrations, and cached kernels (see Data).
class StatefulNnApiDelegate : public TfLiteDelegate {
 public:
  // Encapsulates all options that are specific to NNAPI delegate.
  struct Options {
    // Preferred Power/perf trade-off. For more details please see
    // ANeuralNetworksCompilation_setPreference documentation in :
    // https://developer.android.com/ndk/reference/group/neural-networks.html
    enum ExecutionPreference {
      kUndefined = -1,
      kLowPower = 0,
      kFastSingleAnswer = 1,
      kSustainedSpeed = 2,
    };

    // Preferred Power/perf trade-off.
    ExecutionPreference execution_preference = kUndefined;

    // Selected NNAPI accelerator with null-terminated name.
    // Default to nullptr, which implies the NNAPI default behavior: NNAPI
    // runtime is allowed to use all available accelerators. If the selected
    // accelerator cannot be found, NNAPI will not be used.
    // It is the caller's responsibility to ensure the string is valid for the
    // duration of the Options object lifetime.
    const char* accelerator_name = nullptr;

    // The null-terminated cache dir for NNAPI model.
    // Default to nullptr, which implies the NNAPI will not try caching the
    // compilation.
    const char* cache_dir = nullptr;

    // The unique null-terminated token string for NNAPI model.
    // Default to nullptr, which implies the NNAPI will not try caching the
    // compilation. It is the caller's responsibility to ensure there is no
    // clash of the tokens.
    // NOTE: when using compilation caching, it is not recommended to use the
    // same delegate instance for multiple models.
    const char* model_token = nullptr;

    // Whether to disallow NNAPI CPU usage. Only effective on Android 10 and
    // above. The NNAPI CPU typically performs less well than built-in TfLite
    // kernels, but allowing CPU allows partial acceleration of models. If this
    // is set to true, NNAPI is only used if the whole model is accelerated.
    bool disallow_nnapi_cpu = true;

    // Specifies the max number of partitions to delegate. A value <= 0 means
    // no limit.
    // If the delegation of the full set of supported nodes would generate a
    // number of partitions greater than this parameter, only
    // <max_number_delegated_partitions> of them will be actually accelerated.
    // The selection is currently done sorting partitions in decreasing order
    // of number of nodes and selecting them until the limit is reached.
    int max_number_delegated_partitions = 3;

    // Allow fp32 computation to be run in fp16.
    bool allow_fp16 = false;

    // Specifies the relative priority for executions of the model.
    // Available values are {ANEURALNETWORKS_PRIORITY_LOW,
    // ANEURALNETWORKS_PRIORITY_MEDIUM, ANEURALNETWORKS_PRIORITY_HIGH,
    // ANEURALNETWORKS_PRIORITY_DEFAULT}.
    int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT;

    // Specifies the maximum expected duration in nanosecond for compiling the
    // model. If the device is not able to complete the compilation within the
    // specified duration, the compilation may be aborted. If set to 0, the
    // timeout duration is considered infinite.
    uint64_t max_compilation_timeout_duration_ns = 0;

    // Specifies the maximum expected duration in nanosecond for executing the
    // model. If the device is not able to complete the execution within the
    // specified duration, the execution may be aborted. If set to 0, the
    // timeout duration is considered infinite.
    uint64_t max_execution_timeout_duration_ns = 0;

    // Specifies the maximum expected duration in nanosecond for WHILE loops in
    // the execution. If a WHILE loop condition model does not output false
    // within the specified duration, the execution will be aborted. If set to
    // 0, the default timeout for loops will be used.
    uint64_t max_execution_loop_timeout_duration_ns = 0;

    // Whether to allow dynamic dimension sizes without re-compilation.
    // A tensor with dynamic dimensions must have a valid dim_signature
    // defined.
    // Only supported in NNAPI 1.1 and newer versions.
    // WARNING: Setting this flag to true may result in model being rejected by
    // accelerator. This should only be enabled if the target device supports
    // dynamic dimensions of the model.
    bool allow_dynamic_dimensions = false;

    // Force using NNAPI Burst mode if supported.
    // Burst mode allows accelerators to efficiently manage resources, which
    // would significantly reduce overhead especially if the same delegate
    // instance is to be used for multiple inferences.
    // If NNAPI devices are specified and are of NNAPI feature level 5 or
    // higher, NNAPI delegate will automatically enable burst mode for better
    // performance.
    // Default: Disabled for devices with NNAPI feature level 4 or lower.
    bool use_burst_computation = false;

    // Specifies the max number of NNAPI reusable executions to cache. An
    // execution can be reused if the input and output tensors are using the
    // same buffer handles, and all dynamic dimensions are unchanged. Setting
    // this field to 0 means do not reuse execution.
    uint32_t max_execution_cache_size = 4;

    // Provides hints about the max size of tensors with dynamic shapes. The key
    // of the map is the tensor index, and the value is the max size of the
    // tensor in bytes. If a vendor plugin is supplied, this field is required
    // for all output tensors with dynamic shapes because the output size cannot
    // be inferred. Otherwise, this field is optional and any provided
    // information may be used to guide the memory allocation. This field has no
    // effect on tensors with static shapes.
    std::map<int, size_t> tensor_max_size_hints;

    // The optional null-terminated vendor specific compilation hints string.
    // It is the vendor_plugin's responsibility to parse the hint string and
    // decide whether the hints should be respected or not. If no vendor_plugin
    // provided, the hints will be ignored.
    const char* vendor_compilation_hints = nullptr;

    // The optional null-terminated vendor specific execution hints string.
    // It is the vendor_plugin's responsibility to parse the hint string and
    // decide whether the hints should be respected or not. If no vendor_plugin
    // provided, the hints will be ignored.
    const char* vendor_execution_hints = nullptr;

    // It is the user's responsibility to make sure that
    // vendor_plugin outlives the delegate instance.
    // If a vendor plugin is supplied, and the model has dynamic dimensions, the
    // delegate is not able to propagate tensor shapes. In such a case, the user
    // must provide max tensor size in the "tensor_max_size_hints" field for all
    // output tensors with dynamic shapes.
    NnapiDelegateVendorPlugin* vendor_plugin = nullptr;
  };

  // Uses default options.
  StatefulNnApiDelegate();

  // The ownership of the NnApi instance is left to the caller of the
  // StatefulNnApiDelegate constructor; the caller must ensure that the lifetime
  // of the NnApi instance exceeds the lifetime of the StatefulNnApiDelegate.
  explicit StatefulNnApiDelegate(const NnApi* nnapi);

  // The constructor that accepts options from user.
  // This makes a copy of any data that it needs from Options, so
  // the caller can safely deallocate any storage pointed to by
  // the 'const char *' members of Options immediately after calling this.
  explicit StatefulNnApiDelegate(Options options);

  // Constructor that accepts both an NnApi instance and options.
  // The ownership of the NnApi instance is left to the caller of the
  // StatefulNnApiDelegate constructor; the caller must ensure that the lifetime
  // of the NnApi instance exceeds the lifetime of the StatefulNnApiDelegate.
  // This constructor makes a copy of any data that it needs from Options, so
  // the caller can safely deallocate any storage pointed to by
  // the 'const char *' members of Options immediately after calling this.
  StatefulNnApiDelegate(const NnApi* nnapi, Options options);

  // Constructor that accepts an NnApiSLDriverImplFL5 instance and options.
  // The ownership of the NnApiSLDriverImplFL5 instance is left to the caller of
  // the StatefulNnApiDelegate constructor; the caller must ensure that the
  // lifetime of the NnApiSLDriverImplFL5 instance encompasses all calls to
  // methods on the StatefulNnApiDelegate instance, other than the destructor.
  // This constructor makes a copy of any data that it needs from Options, so
  // the caller can safely deallocate any storage pointed to by
  // the 'const char *' members of Options immediately after calling this.
  //
  // The NN API Support Library Driver must support at least NNAPI Feature Level
  // 5 (introduced in SDK level 31), but this might point to a compatible struct
  // that also supports a higher NNAPI Feature Level. These cases can be
  // distinguished by examining the base.implFeatureLevel field, which should be
  // set to the supported feature level (which must be >=
  // ANEURALNETWORKS_FEATURE_LEVEL_5).
  //
  // Please note that since NNAPI Support Library doesn't implement some of the
  // functions (see CreateNnApiFromSupportLibrary implementation and NNAPI SL
  // documentation for details), the underlying NnApi structure will have
  // nullptr stored in some of the function pointers. Calling such functions
  // will result in a crash.
  //
  // WARNING: This is an experimental interface that is subject to change.
  StatefulNnApiDelegate(
      const NnApiSLDriverImplFL5* nnapi_support_library_driver,
      Options options);

  ~StatefulNnApiDelegate() = default;

  // Returns the delegate options.
  // The lifetime of the storage pointed to by the 'const char *' members of the
  // returned Options object is the same as the lifetime of the supplied
  // TfLiteDelegate instance.
  static const Options GetOptions(TfLiteDelegate* delegate);

  // Callback function which copies data from ANeuralNetworksMemory to host
  // tensor CPU buffer. It is the user's responsibility to implement these
  // callbacks for the specific types of shared memory they intend to use.
  // WARNING: This is an experimental interface that is subject to change.
  typedef TfLiteStatus (*CopyToHostTensorFnPtr)(TfLiteTensor* tensor,
                                                ANeuralNetworksMemory* memory,
                                                size_t memory_offset,
                                                size_t byte_size,
                                                void* callback_context);

  // Encapsulates all fields related to memory registration for internal
  // bookkeeping only.
  struct MemoryRegistration {
    // The registered shared-memory handle (not owned).
    ANeuralNetworksMemory* memory;
    // Copies data out of `memory` into a host tensor buffer; see
    // CopyToHostTensorFnPtr above.
    CopyToHostTensorFnPtr callback;
    // Opaque pointer passed through to `callback` when it is invoked.
    void* callback_context;
    // The registration timestamp. It is unique for each registered memory in
    // the lifetime of a StatefulNnApiDelegate.
    uint64_t timestamp;
  };

  // Register the ANeuralNetworksMemory handle with the delegate. A
  // TfLiteBufferHandle will be returned to be used with
  // Interpreter::SetBufferHandle. The callback_context will be passed to the
  // callback function when invoked.
  // Note: the returned TfLiteBufferHandle can only be used with a single
  // Interpreter instance. However, the caller can register the same memory
  // multiple times to get different handles to use with different Interpreter
  // instances.
  // WARNING: This is an experimental interface that is subject to change.
  TfLiteBufferHandle RegisterNnapiMemory(ANeuralNetworksMemory* memory,
                                         CopyToHostTensorFnPtr callback,
                                         void* callback_context);

  // Returns the vector of known ANeuralNetworksMemory handles.
  // Note: this function is not intended to be called by developers.
  // WARNING: This is an experimental interface that is subject to change.
  static const std::vector<MemoryRegistration>& GetTensorMemoryMap(
      TfLiteDelegate* delegate);

  // Returns ptr to delegates::Serialization, if caching is enabled by user via
  // cache_dir & model_token.
  static delegates::Serialization* GetCache(TfLiteDelegate* delegate);

  // Returns the int value of the ResultCode returned by the latest
  // failed call to NNAPI, if any. Zero only in case of NO failed calls since
  // the construction of this instance of StatefulNnApiDelegate.
  // The error code is reset when the delegate is re-initialized
  // (i.e. when calling interpreter.ModifyGraphWithDelegate(delegate)).
  int GetNnApiErrno() const;

 private:
  // Encapsulates all delegate data.
  struct Data {
    // Pointer to NNAPI implementation to be used by this delegate as
    // set when building the StatefulNnApiDelegate instance.
    // Will generally be the NnApiInstance() singleton but can be overridden
    // for testing or for users needing to wrap or stub parts of NNAPI.
    // The ownership of the nnapi instance is left to the caller of
    // the StatefulNnApiDelegate constructor.
    const NnApi* nnapi;
    // Preferred Power/perf trade-off.
    Options::ExecutionPreference execution_preference;
    // Selected NNAPI accelerator name.
    std::string accelerator_name;
    // The cache dir for NNAPI model.
    std::string cache_dir;
    // The unique token string for NNAPI model.
    std::string model_token;
    // Whether to disallow NNAPI CPU.
    bool disallow_nnapi_cpu;
    // Tensor to ANeuralNetworksMemory mapping.
    std::vector<MemoryRegistration> tensor_memory_map;
    // The next timestamp for buffer handle registration.
    uint64_t next_buffer_handle_timestamp = 1;
    // Contains a non zero value if any NNAPI method call
    // operation returned a non zero result code.
    int nnapi_errno = ANEURALNETWORKS_NO_ERROR;
    // Cache of kernels already built in StatefulNnApiDelegate::DoPrepare
    // when trying to understand if all nodes are supported by the target
    // accelerators.
    // The key is the index of the first node in the partition.
    // Couldn't use unique_ptr because of problems building on gcc.
    std::unordered_map<int, NNAPIDelegateKernel*> delegate_state_cache;
    // Maximum number of NNAPI partitions to delegate. Zero or negative means
    // no limit. Copied from StatefulNnApiDelegate::Options.
    int max_number_delegated_partitions;
    // Allow fp32 computation to be run in fp16.
    bool allow_fp16;
    // Specifies the relative priority for executions of the model.
    int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT;
    // Specifies the maximum expected duration in nanosecond for compiling the
    // model.
    uint64_t max_compilation_timeout_duration_ns = 0;
    // Specifies the maximum expected duration in nanosecond for executing the
    // model.
    uint64_t max_execution_timeout_duration_ns = 0;
    // Specifies the maximum expected duration in nanosecond for WHILE loops in
    // the execution.
    uint64_t max_execution_loop_timeout_duration_ns = 0;
    // Whether to allow dynamic dimension sizes without re-compilation.
    bool allow_dynamic_dimensions = false;
    // Whether to use NNAPI Burst mode.
    bool use_burst_computation = false;
    // Specifies the max number of NNAPI reusable executions to cache.
    uint32_t max_execution_cache_size = 4;
    // Provides hints about the max size of tensors with dynamic shapes.
    std::map<int, size_t> tensor_max_size_hints;
    // The null-terminated vendor specific compilation hints string.
    const char* vendor_compilation_hints = nullptr;
    // The null-terminated vendor specific execution hints string.
    const char* vendor_execution_hints = nullptr;

    // It is the user's responsibility to make sure that
    // vendor_plugin outlives the delegate instance.
    NnapiDelegateVendorPlugin* vendor_plugin = nullptr;

    // Smart pointer for automatically cleaning up NnApi structure in case the
    // delegate was constructed from an NNAPI support library.
    std::unique_ptr<const NnApi> owned_nnapi = nullptr;

    // TFLite Serialization in case caching has been enabled by the user through
    // Options.
    std::unique_ptr<delegates::Serialization> cache;

    explicit Data(const NnApi* nnapi);
    explicit Data(std::unique_ptr<const NnApi> nnapi);
    ~Data();

    // Caches an initialised NNAPIDelegateKernel.
    void CacheDelegateKernel(const TfLiteDelegateParams* delegate_params,
                             NNAPIDelegateKernel* delegate_state);
    // Returns a cached NNAPIDelegateKernel if available and removes it
    // from the cache transferring the ownership to the caller.
    NNAPIDelegateKernel* MaybeGetCachedDelegateKernel(
        const TfLiteDelegateParams* delegate_params);
  };

  // Implements TfLiteDelegate::Prepare. Please refer to TFLiteDelegate
  // documentation for more info.
  static TfLiteStatus DoPrepare(TfLiteContext* context,
                                TfLiteDelegate* delegate);

  // Copy the data from delegate buffer handle into raw memory of the given
  // 'tensor'. The delegate is allowed to allocate the raw
  // bytes as long as it follows the rules for kTfLiteDynamic tensors.
  static TfLiteStatus DoCopyFromBufferHandle(TfLiteContext* context,
                                             TfLiteDelegate* delegate,
                                             TfLiteBufferHandle buffer_handle,
                                             TfLiteTensor* tensor);

  // Copy the data from raw memory of the given 'tensor' to delegate buffer
  // handle. Currently this function is not supported, and calling the function
  // will result in an error.
  static TfLiteStatus DoCopyToBufferHandle(TfLiteContext* context,
                                           TfLiteDelegate* delegate,
                                           TfLiteBufferHandle buffer_handle,
                                           TfLiteTensor* tensor);

  // Free the Delegate Buffer Handle. Note: This only frees the handle, but
  // this doesn't release the underlying resource (e.g. textures). The
  // resources are either owned by application layer or the delegate.
  static void DoFreeBufferHandle(TfLiteContext* context,
                                 TfLiteDelegate* delegate,
                                 TfLiteBufferHandle* handle);

  // Returns the nodes that can be delegated via NNAPI to the accelerator
  // specified in the delegate options and information about the way the
  // graph will be partitioned if the supported nodes will be delegated.
  // Partition information is composed by the number of partitions and
  // the delegate parameters associated to each partition.
  // The method also caches in delegate->data the NNApiDelegateKernel instances
  // that have been created during the device evaluation.
  // All arguments are expected to be non-null.
  static TfLiteStatus GetNodesSupportedByAccelerator(
      TfLiteContext* context, TfLiteDelegate* delegate, const NnApi* nnapi,
      const std::vector<int>& supported_nodes,
      std::vector<int>* device_supported_nodes, int* num_partitions,
      TfLiteDelegateParams** params_array, int* nnapi_errno);

  // Alters the given array of nodes_to_delegate to limit the number of NNAPI
  // owned partition to be less or equal than num_partitions. If num_partitions
  // is less or equal to zero the input is left unaltered.
  // The nodes_to_delegate array is expected to contain at element 0 the number
  // of nodes to delegate and in remaining elements the set of nodes
  // that would be delegated to NNAPI if this function wouldn't be
  // called. It will be altered storing in the first element the count of
  // nodes to actually delegate and in the remainder of the array the indexes.
  // The params_array params might be altered during the function's execution.
  static TfLiteStatus LimitDelegatedPartitions(
      int max_partitions,
      std::vector<TfLiteDelegateParams> partition_params_array,
      std::vector<int>* nodes_to_delegate);

  // Shared construction logic used by all the constructors above.
  void StatefulNnApiDelegateConstructorImpl(const Options& options);

  // Delegate data presented through TfLiteDelegate::data_.
  Data delegate_data_;
};
435 
// DEPRECATED: Please use the StatefulNnApiDelegate class instead.
//
// Returns a singleton delegate that can be used to delegate execution to the
// NN API, e.g.:
//   TfLiteDelegate* delegate = NnApiDelegate();
//   interpreter->ModifyGraphWithDelegate(delegate);
// Since NnApiDelegate() returns a singleton, you should not free the returned
// pointer or worry about its lifetime.
TfLiteDelegate* NnApiDelegate();
445 
446 }  // namespace tflite
447 
448 #endif  // TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
449