xref: /aosp_15_r20/external/android-nn-driver/ArmnnPreparedModel_1_3.cpp (revision 3e777be0405cee09af5d5785ff37f7cfb5bee59a)
1 //
2 // Copyright © 2020-2023 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 // Note: the ArmnnFencedExecutionCallback and code snippet in the executeFenced() function
6 //       in this file is based on Android code
7 //       under the Apache 2.0 license. See comments below for details.
8 //
9 
10 #define LOG_TAG "ArmnnDriver"
11 
12 #include "ArmnnPreparedModel_1_3.hpp"
13 #include "Utils.hpp"
14 
15 #include <armnn/Types.hpp>
16 
17 #include <Utils.h>
18 #include <android/sync.h>
19 #include <log/log.h>
20 #include <OperationsUtils.h>
21 #include <ExecutionBurstServer.h>
22 #include <ValidateHal.h>
23 
24 #include <chrono>
25 #include <cinttypes>
26 
27 #ifdef ARMNN_ANDROID_S
28 #include <LegacyUtils.h>
29 #endif
30 
31 using namespace android;
32 using namespace android::hardware;
33 
34 namespace {
35 
36 static const V1_2::Timing g_NoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
37 using namespace armnn_driver;
// Monotonic clock sample used for driver-side latency measurements.
using TimePoint = std::chrono::steady_clock::time_point;

// Returns the current instant on the steady (monotonic) clock.
TimePoint Now()
{
    return std::chrono::steady_clock::now();
}

// Elapsed time between two instants, truncated to whole microseconds.
unsigned long MicrosecondsDuration(TimePoint endPoint, TimePoint startPoint)
{
    const auto elapsed = endPoint - startPoint;
    return static_cast<unsigned long>(
        std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count());
}
50 
NotifyCallbackAndCheck(const::android::sp<V1_0::IExecutionCallback> & callback,V1_3::ErrorStatus errorStatus,std::vector<V1_2::OutputShape>,const V1_2::Timing,std::string callingFunction)51 void NotifyCallbackAndCheck(const ::android::sp<V1_0::IExecutionCallback>& callback,
52                             V1_3::ErrorStatus errorStatus,
53                             std::vector<V1_2::OutputShape>,
54                             const V1_2::Timing,
55                             std::string callingFunction)
56 {
57     Return<void> returned = callback->notify(convertToV1_0(errorStatus));
58     // This check is required, if the callback fails and it isn't checked it will bring down the service
59     if (!returned.isOk())
60     {
61         ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
62               callingFunction.c_str(), returned.description().c_str());
63     }
64 }
65 
NotifyCallbackAndCheck(const::android::sp<V1_2::IExecutionCallback> & callback,V1_3::ErrorStatus errorStatus,std::vector<V1_2::OutputShape> outputShapes,const V1_2::Timing timing,std::string callingFunction)66 void NotifyCallbackAndCheck(const ::android::sp<V1_2::IExecutionCallback>& callback,
67                             V1_3::ErrorStatus errorStatus,
68                             std::vector<V1_2::OutputShape> outputShapes,
69                             const V1_2::Timing timing,
70                             std::string callingFunction)
71 {
72     Return<void> returned = callback->notify_1_2(convertToV1_0(errorStatus), outputShapes, timing);
73     // This check is required, if the callback fails and it isn't checked it will bring down the service
74     if (!returned.isOk())
75     {
76         ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
77               callingFunction.c_str(), returned.description().c_str());
78     }
79 }
80 
NotifyCallbackAndCheck(const::android::sp<V1_3::IExecutionCallback> & callback,V1_3::ErrorStatus errorStatus,std::vector<V1_2::OutputShape> outputShapes,const V1_2::Timing timing,std::string callingFunction)81 void NotifyCallbackAndCheck(const ::android::sp<V1_3::IExecutionCallback>& callback,
82                             V1_3::ErrorStatus errorStatus,
83                             std::vector<V1_2::OutputShape> outputShapes,
84                             const V1_2::Timing timing,
85                             std::string callingFunction)
86 {
87     Return<void> returned = callback->notify_1_3(errorStatus, outputShapes, timing);
88     // This check is required, if the callback fails and it isn't checked it will bring down the service
89     if (!returned.isOk())
90     {
91         ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
92               callingFunction.c_str(), returned.description().c_str());
93     }
94 }
95 
ValidateRequestArgument(const V1_0::RequestArgument & requestArg,const armnn::TensorInfo & tensorInfo)96 bool ValidateRequestArgument(const V1_0::RequestArgument& requestArg, const armnn::TensorInfo& tensorInfo)
97 {
98     if (requestArg.dimensions.size() != 0)
99     {
100         if (requestArg.dimensions.size() != tensorInfo.GetNumDimensions())
101         {
102             ALOGE("Mismatched dimensions (request argument: %zu, expected: %u)",
103                   requestArg.dimensions.size(), tensorInfo.GetNumDimensions());
104             return false;
105         }
106 
107         for (unsigned int d = 0; d < tensorInfo.GetNumDimensions(); ++d)
108         {
109             if (requestArg.dimensions[d] != 0 && requestArg.dimensions[d] != tensorInfo.GetShape()[d])
110             {
111                 ALOGE("Mismatched size for dimension %d (request argument: %u, expected %u)",
112                       d, requestArg.dimensions[d], tensorInfo.GetShape()[d]);
113                 return false;
114             }
115         }
116     }
117 
118     return true;
119 }
120 
GetTensorForRequestArgument(const V1_0::RequestArgument & requestArg,const armnn::TensorInfo & tensorInfo,const std::vector<::android::nn::RunTimePoolInfo> & requestPools)121 armnn::Tensor GetTensorForRequestArgument(const V1_0::RequestArgument& requestArg,
122                                           const armnn::TensorInfo& tensorInfo,
123                                           const std::vector<::android::nn::RunTimePoolInfo>& requestPools)
124 {
125     if (!ValidateRequestArgument(requestArg, tensorInfo))
126     {
127         return armnn::Tensor();
128     }
129 
130     return armnn::Tensor(tensorInfo, GetMemoryFromPool(requestArg.location, requestPools));
131 }
132 
// Concatenates a prefix with an index, e.g. ("input", 2) -> "input2".
inline std::string BuildTensorName(const char* tensorNamePrefix, std::size_t index)
{
    std::string name(tensorNamePrefix);
    name += std::to_string(index);
    return name;
}
137 
138 } // anonymous namespace
139 
140 using namespace android::hardware;
141 
142 namespace armnn_driver
143 {
144 
// Request thread shared by all instances of this prepared model (one per
// HalVersion instantiation); used for callback-based asynchronous execution.
template<typename HalVersion>
RequestThread_1_3<ArmnnPreparedModel_1_3, HalVersion, CallbackContext_1_3>
        ArmnnPreparedModel_1_3<HalVersion>::m_RequestThread;

// Threadpool shared across prepared models; starts null and is created
// lazily by the first constructor that runs with async execution enabled.
template<typename HalVersion>
std::unique_ptr<armnn::Threadpool> ArmnnPreparedModel_1_3<HalVersion>::m_Threadpool(nullptr);
151 
152 template<typename HalVersion>
153 template<typename TensorBindingCollection>
DumpTensorsIfRequired(char const * tensorNamePrefix,const TensorBindingCollection & tensorBindings)154 void ArmnnPreparedModel_1_3<HalVersion>::DumpTensorsIfRequired(char const* tensorNamePrefix,
155                                                                const TensorBindingCollection& tensorBindings)
156 {
157     if (!m_RequestInputsAndOutputsDumpDir.empty())
158     {
159         const std::string requestName = std::to_string(m_NetworkId) + "_" + std::to_string(m_RequestCount) + ".dump";
160         for (std::size_t i = 0u; i < tensorBindings.size(); ++i)
161         {
162             DumpTensor(m_RequestInputsAndOutputsDumpDir,
163                        requestName,
164                        BuildTensorName(tensorNamePrefix, i),
165                        tensorBindings[i].second);
166         }
167     }
168 }
169 
// Constructs a prepared model around a network already loaded into the
// runtime. Enables GPU profiling if requested and, when asynchronous
// execution is enabled, creates one working-memory handle per thread and
// hands them to the shared threadpool.
template<typename HalVersion>
ArmnnPreparedModel_1_3<HalVersion>::ArmnnPreparedModel_1_3(armnn::NetworkId networkId,
                                                           armnn::IRuntime* runtime,
                                                           const V1_3::Model& model,
                                                           const std::string& requestInputsAndOutputsDumpDir,
                                                           const bool gpuProfilingEnabled,
                                                           V1_3::Priority priority,
                                                           const bool asyncModelExecutionEnabled,
                                                           const unsigned int numberOfThreads,
                                                           const bool importEnabled,
                                                           const bool exportEnabled)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_Model(model)
    , m_RequestCount(0)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_ModelPriority(priority)
    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
    , m_EnableImport(importEnabled)
    , m_EnableExport(exportEnabled)
    , m_PreparedFromCache(false)
{
    // Enable profiling if required.
    m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);

    if (m_AsyncModelExecutionEnabled)
    {
        // One working-memory handle per execution thread.
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i=0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
        }

        // The threadpool is shared between models: create it the first time,
        // otherwise just register this network's handles with it.
        if (!m_Threadpool)
        {
            m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
        }
        else
        {
            m_Threadpool->LoadMemHandles(memHandles);
        }

        m_WorkingMemHandle = memHandles.back();
    }
}
216 
// Constructs a prepared model restored from the compilation cache. Identical
// to the model-carrying constructor except that no V1_3::Model is available
// (m_Model stays default) and m_PreparedFromCache is caller-controlled, which
// skips request validation/model-summary logging in the execute paths.
template<typename HalVersion>
ArmnnPreparedModel_1_3<HalVersion>::ArmnnPreparedModel_1_3(armnn::NetworkId networkId,
                                                           armnn::IRuntime* runtime,
                                                           const std::string& requestInputsAndOutputsDumpDir,
                                                           const bool gpuProfilingEnabled,
                                                           V1_3::Priority priority,
                                                           const bool asyncModelExecutionEnabled,
                                                           const unsigned int numberOfThreads,
                                                           const bool importEnabled,
                                                           const bool exportEnabled,
                                                           const bool preparedFromCache)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_RequestCount(0)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_ModelPriority(priority)
    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
    , m_EnableImport(importEnabled)
    , m_EnableExport(exportEnabled)
    , m_PreparedFromCache(preparedFromCache)
{
    // Enable profiling if required.
    m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);

    if (m_AsyncModelExecutionEnabled)
    {
        // One working-memory handle per execution thread.
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i=0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
        }

        // The threadpool is shared between models: create it the first time,
        // otherwise just register this network's handles with it.
        if (!m_Threadpool)
        {
            m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
        }
        else
        {
            m_Threadpool->LoadMemHandles(memHandles);
        }

        m_WorkingMemHandle = memHandles.back();
    }
}
262 
// Tears down the prepared model: dumps profiling data if enabled, unloads the
// network from the runtime, and releases this network's working-memory
// handles from the shared threadpool.
template<typename HalVersion>
ArmnnPreparedModel_1_3<HalVersion>::~ArmnnPreparedModel_1_3()
{
    // Get a hold of the profiler used by this model.
    std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkId);
    if (profiler && m_GpuProfilingEnabled)
    {
        // Dump the profiling info to a file if required.
        DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId,
                                    profiler.get());
    }

    // Unload the network associated with this model.
    m_Runtime->UnloadNetwork(m_NetworkId);

    // Unload the network memhandles from the threadpool
    // (m_Threadpool is only ever created when async execution is enabled).
    if (m_AsyncModelExecutionEnabled)
    {
        m_Threadpool->UnloadMemHandles(m_NetworkId);
    }
}
284 
285 template<typename HalVersion>
execute(const V1_0::Request & request,const::android::sp<V1_0::IExecutionCallback> & callback)286 Return <V1_0::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::execute(const V1_0::Request& request,
287         const ::android::sp<V1_0::IExecutionCallback>& callback)
288 {
289     if (callback.get() == nullptr)
290     {
291         ALOGE("ArmnnPreparedModel_1_3::execute invalid callback passed");
292         return V1_0::ErrorStatus::INVALID_ARGUMENT;
293     }
294 
295     auto cb = [callback](V1_3::ErrorStatus errorStatus,
296                          std::vector<V1_2::OutputShape> outputShapes,
297                          const V1_2::Timing& timing,
298                          std::string callingFunction)
299     {
300         NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
301     };
302 
303 
304     return convertToV1_0(Execute(convertToV1_3(request), V1_2::MeasureTiming::NO, cb));
305 }
306 
307 template<typename HalVersion>
execute_1_2(const V1_0::Request & request,V1_2::MeasureTiming measureTiming,const sp<V1_2::IExecutionCallback> & callback)308 Return <V1_0::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::execute_1_2(
309     const V1_0::Request& request,
310     V1_2::MeasureTiming measureTiming,
311     const sp<V1_2::IExecutionCallback>& callback)
312 {
313     if (callback.get() == nullptr)
314     {
315         ALOGE("ArmnnPreparedModel_1_3::execute_1_2 invalid callback passed");
316         return V1_0::ErrorStatus::INVALID_ARGUMENT;
317     }
318 
319     auto cb = [callback](V1_3::ErrorStatus errorStatus,
320                          std::vector<V1_2::OutputShape> outputShapes,
321                          const V1_2::Timing& timing,
322                          std::string callingFunction)
323     {
324         NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
325     };
326 
327     return convertToV1_0(Execute(convertToV1_3(request), measureTiming, cb));
328 }
329 
330 template<typename HalVersion>
execute_1_3(const V1_3::Request & request,V1_2::MeasureTiming measureTiming,const V1_3::OptionalTimePoint &,const V1_3::OptionalTimeoutDuration &,const sp<V1_3::IExecutionCallback> & callback)331 Return <V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::execute_1_3(
332         const V1_3::Request& request,
333         V1_2::MeasureTiming measureTiming,
334         const V1_3::OptionalTimePoint&,
335         const V1_3::OptionalTimeoutDuration&,
336         const sp<V1_3::IExecutionCallback>& callback)
337 {
338     if (callback.get() == nullptr)
339     {
340         ALOGE("ArmnnPreparedModel_1_3::execute_1_3 invalid callback passed");
341         return V1_3::ErrorStatus::INVALID_ARGUMENT;
342     }
343 
344     auto cb = [callback](V1_3::ErrorStatus errorStatus,
345                          std::vector<V1_2::OutputShape> outputShapes,
346                          const V1_2::Timing& timing,
347                          std::string callingFunction)
348     {
349         NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
350     };
351 
352     return Execute(request, measureTiming, cb);
353 }
354 
355 /// This class is inspired by the sample implementation in Android named SampleFencedExecutionCallback.
356 /// The original code is licensed under Apache-2.0 and can be found at the following link:
357 /// https://android.googlesource.com/platform/frameworks/ml/+/master/nn/driver/sample/SampleDriver.h
358 class ArmnnFencedExecutionCallback : public V1_3::IFencedExecutionCallback
359 {
360 public:
ArmnnFencedExecutionCallback(V1_3::ErrorStatus errorStatus,V1_2::Timing timing,V1_2::Timing fenceTiming)361     ArmnnFencedExecutionCallback(V1_3::ErrorStatus errorStatus, V1_2::Timing timing, V1_2::Timing fenceTiming)
362         : m_ErrorStatus(errorStatus), m_Timing(timing), m_FenceTiming(fenceTiming) {}
~ArmnnFencedExecutionCallback()363     ~ArmnnFencedExecutionCallback() {}
364 
getExecutionInfo(getExecutionInfo_cb callback)365     Return<void> getExecutionInfo(getExecutionInfo_cb callback) override
366     {
367         callback(m_ErrorStatus, m_Timing, m_FenceTiming);
368         return Void();
369     }
370 private:
371     V1_3::ErrorStatus m_ErrorStatus;
372     V1_2::Timing m_Timing;
373     V1_2::Timing m_FenceTiming;
374 };
375 
376 template<typename HalVersion>
executeFenced(const V1_3::Request & request,const hidl_vec<hidl_handle> & fenceWaitFor,V1_2::MeasureTiming measureTiming,const V1_3::OptionalTimePoint & deadline,const V1_3::OptionalTimeoutDuration & loopTimeoutDuration,const V1_3::OptionalTimeoutDuration &,executeFenced_cb cb)377 Return<void> ArmnnPreparedModel_1_3<HalVersion>::executeFenced(const V1_3::Request& request,
378                                                                const hidl_vec<hidl_handle>& fenceWaitFor,
379                                                                V1_2::MeasureTiming measureTiming,
380                                                                const V1_3::OptionalTimePoint& deadline,
381                                                                const V1_3::OptionalTimeoutDuration& loopTimeoutDuration,
382                                                                const V1_3::OptionalTimeoutDuration&,
383                                                                executeFenced_cb cb)
384 {
385     ALOGV("ArmnnPreparedModel_1_3::executeFenced(...)");
386     if (cb == nullptr)
387     {
388         ALOGE("ArmnnPreparedModel_1_3::executeFenced invalid callback passed");
389         cb(V1_3::ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
390         return Void();
391     }
392 
393     if (deadline.getDiscriminator() != V1_3::OptionalTimePoint::hidl_discriminator::none)
394     {
395         ALOGW("ArmnnPreparedModel_1_3::executeFenced parameter deadline is set but not supported.");
396     }
397 
398     if (loopTimeoutDuration.getDiscriminator() != V1_3::OptionalTimeoutDuration::hidl_discriminator::none)
399     {
400         ALOGW("ArmnnPreparedModel_1_3::executeFenced parameter loopTimeoutDuration is set but not supported.");
401     }
402 
403     if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model, /*allowUnspecifiedOutput=*/false))
404     {
405         ALOGV("ArmnnPreparedModel_1_3::executeFenced outputs must be specified for fenced execution ");
406         cb(V1_3::ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
407         return Void();
408     }
409 
410     ExecutionContext_1_3 ctx;
411     if (measureTiming == V1_2::MeasureTiming::YES)
412     {
413         ctx.measureTimings = measureTiming;
414         ctx.driverStart = Now();
415     }
416 
417     if (!m_PreparedFromCache)
418     {
419         ALOGV("ArmnnPreparedModel_1_3::executeFenced(): %s", GetModelSummary(m_Model).c_str());
420     }
421     m_RequestCount++;
422 
423     if (!m_RequestInputsAndOutputsDumpDir.empty())
424     {
425         ALOGD("Dumping inputs and outputs for request %" PRIuPTR, reinterpret_cast<std::uintptr_t>(&cb));
426     }
427 
428     // This code snippet is inspired by the sample implementation in Android named SampleDriver::executeFenced()
429     // function. The original code is licensed under Apache-2.0 and can be found at the following link:
430     // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/driver/sample/SampleDriver.cpp
431     const auto fenceSize = fenceWaitFor.size();
432     for (unsigned int index = 0; index < fenceSize; ++index)
433     {
434         auto fenceNativeHandle = fenceWaitFor[index].getNativeHandle();
435         if (!fenceNativeHandle)
436         {
437             cb(V1_3::ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
438             return Void();
439         }
440 
441         if (fenceNativeHandle->numFds != 1)
442         {
443             cb(V1_3::ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
444             return Void();
445         }
446 
447         if (sync_wait(fenceNativeHandle->data[0], -1) < 0)
448         {
449             ALOGE("ArmnnPreparedModel_1_3::executeFenced sync fence failed.");
450             cb(V1_3::ErrorStatus::GENERAL_FAILURE, hidl_handle(nullptr), nullptr);
451             return Void();
452         }
453     }
454 
455     TimePoint fenceExecutionStart;
456     if (measureTiming == V1_2::MeasureTiming::YES)
457     {
458         fenceExecutionStart = Now();
459     }
460 
461     // map the memory pool into shared pointers
462     // use a shared memory pools vector on the heap, as it is passed to the request thread
463     auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();
464 
465     // allocate the tensors on the heap, as they are passed to the request thread
466     auto inputs = std::make_shared<armnn::InputTensors>();
467     auto outputs = std::make_shared<armnn::OutputTensors>();
468 
469     auto [status, outShapes, timings, message] = PrepareMemoryForIO(*inputs, *outputs, *memPools, request);
470     if (status != V1_3::ErrorStatus::NONE)
471     {
472         cb(V1_3::ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
473         return Void();
474     }
475 
476     ALOGV("ArmnnPreparedModel_1_3::executeFenced(...) before ExecuteGraph");
477 
478     // call it with nullCallback for now as we will report the error status from here..
479     auto nullCallback = [](V1_3::ErrorStatus, std::vector<V1_2::OutputShape>, const V1_2::Timing&, std::string) {};
480     CallbackContext_1_3 cbCtx;
481     cbCtx.callback = nullCallback;
482     cbCtx.ctx = ctx;
483 
484     auto errorStatus = ExecuteGraph(memPools, *inputs, *outputs, cbCtx);
485     if (errorStatus != V1_3::ErrorStatus::NONE)
486     {
487         cb(errorStatus, hidl_handle(nullptr), nullptr);
488         return Void();
489     }
490     ALOGV("ArmnnPreparedModel_1_3::executeFenced(...) after ExecuteGraph");
491 
492     V1_2::Timing timing = g_NoTiming;
493     V1_2::Timing fenceTiming = g_NoTiming;
494     if (measureTiming == V1_2::MeasureTiming::YES)
495     {
496         fenceTiming.timeOnDevice = MicrosecondsDuration(ctx.deviceEnd, ctx.deviceStart);
497         fenceTiming.timeInDriver = MicrosecondsDuration(ctx.driverEnd, fenceExecutionStart);
498         ALOGV("ArmnnPreparedModel_1_3::fenceFinishExecutionTiming - Device = %lu Driver = %lu",
499               static_cast<unsigned long>(fenceTiming.timeOnDevice),
500               static_cast<unsigned long>(fenceTiming.timeInDriver));
501     }
502 
503     sp<ArmnnFencedExecutionCallback> armnnFencedExecutionCallback =
504         new ArmnnFencedExecutionCallback(V1_3::ErrorStatus::NONE, timing, fenceTiming);
505     cb(V1_3::ErrorStatus::NONE, hidl_handle(nullptr), armnnFencedExecutionCallback);
506     return Void();
507 }
508 
509 template<typename HalVersion>
PrepareMemoryForInputs(armnn::InputTensors & inputs,const V1_3::Request & request,const std::vector<android::nn::RunTimePoolInfo> & memPools)510 Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::PrepareMemoryForInputs(
511     armnn::InputTensors& inputs,
512     const V1_3::Request& request,
513     const std::vector<android::nn::RunTimePoolInfo>& memPools)
514 {
515     inputs.reserve(request.inputs.size());
516     for (unsigned int i = 0; i < request.inputs.size(); i++)
517     {
518         const auto& inputArg = request.inputs[i];
519         armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
520         // inputs (of type InputTensors) is composed of a vector of ConstTensors.
521         // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
522         inputTensorInfo.SetConstant();
523         auto result = ValidateRequestArgument<V1_3::ErrorStatus, V1_3::Request>(request,
524                                                                                 inputTensorInfo,
525                                                                                 inputArg,
526                                                                                 "input");
527 
528         if (result != V1_3::ErrorStatus::NONE)
529         {
530             return result;
531         }
532 
533         const armnn::Tensor inputTensor = GetTensorForRequestArgument(inputArg, inputTensorInfo, memPools);
534 
535         if (inputTensor.GetMemoryArea() == nullptr)
536         {
537             ALOGE("Cannot execute request. Error converting request input %u to tensor", i);
538             return V1_3::ErrorStatus::GENERAL_FAILURE;
539         }
540 
541         inputs.emplace_back(i, inputTensor);
542     }
543 
544     return V1_3::ErrorStatus::NONE;
545 }
546 
// Validates each output argument of the request, binds the matching runtime
// tensor from the mapped memory pools into 'outputs', and records the
// resulting shape in outputShapes[i]. Returns OUTPUT_INSUFFICIENT_SIZE (with
// isSufficient=false) when a client buffer or backing pool is too small,
// GENERAL_FAILURE when a tensor cannot be mapped.
template<typename HalVersion>
Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::PrepareMemoryForOutputs(
    armnn::OutputTensors& outputs,
    std::vector<V1_2::OutputShape> &outputShapes,
    const V1_3::Request& request,
    const std::vector<android::nn::RunTimePoolInfo>& memPools)
{
    outputs.reserve(request.outputs.size());
    for (unsigned int i = 0; i < request.outputs.size(); i++)
    {
        const auto& outputArg = request.outputs[i];
        armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
        auto result = ValidateRequestArgument<V1_3::ErrorStatus, V1_3::Request>(request,
                                                                                outputTensorInfo,
                                                                                outputArg,
                                                                                "output");

        if (result != V1_3::ErrorStatus::NONE)
        {
            return result;
        }

        const armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, memPools);

        if (outputTensor.GetMemoryArea() == nullptr)
        {
            ALOGE("Cannot execute request. Error converting request output %u to tensor", i);
            return V1_3::ErrorStatus::GENERAL_FAILURE;
        }
        // Byte size is captured before the shape below is overwritten from the request.
        const size_t outputSize = outputTensorInfo.GetNumBytes();

        // Overlay the client-requested dimensions onto the runtime's shape.
        unsigned int count = 0;
        std::for_each(outputArg.dimensions.begin(), outputArg.dimensions.end(), [&](auto dim)
        {
            if (dim != 0)
            {
                outputTensorInfo.GetShape()[count] = dim;
            }
            else
            {
                // NOTE(review): for an unspecified (0) dimension this stores the rank
                // (dimensions.size()) rather than the runtime's value for that
                // dimension — looks suspicious; confirm intended behaviour upstream.
                outputTensorInfo.GetShape()[count] = outputArg.dimensions.size();
            }

            count++;
        });

        outputs.emplace_back(i, outputTensor);
        outputShapes[i] = ComputeShape(outputTensorInfo);

        // The client's output buffer must hold the full tensor.
        if (outputArg.location.length < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_3::Execute failed outputArg.location.length (%s) < outputSize (%s)",
                std::to_string(outputArg.location.length).c_str(), std::to_string(outputSize).c_str());
            outputShapes[i].isSufficient = false;
            return V1_3::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }

        // The backing memory pool must also be large enough; Android S exposes
        // the pool size through a different accessor.
        size_t bufferSize = 0;
#if !defined(ARMNN_ANDROID_S)
        bufferSize = memPools.at(outputArg.location.poolIndex).getHidlMemory().size();
#else
        bufferSize = memPools.at(outputArg.location.poolIndex).getSize();
#endif
        if (bufferSize < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_3::Execute failed bufferSize (%s) < outputSize (%s)",
                  std::to_string(bufferSize).c_str(), std::to_string(outputSize).c_str());
            outputShapes[i].isSufficient = false;
            return V1_3::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }
    }

    return V1_3::ErrorStatus::NONE;
}
621 
// Maps the request's memory pools and prepares both input and output tensors.
// Returns a tuple of {status, output shapes, timing, calling-function label};
// the timing slot is always g_NoTiming here and the label is used by callers
// when notifying the client. Any armnn or std exception during preparation is
// converted into GENERAL_FAILURE rather than propagating to the HAL boundary.
template<typename HalVersion>
std::tuple<V1_3::ErrorStatus, hidl_vec<V1_2::OutputShape>, V1_2::Timing, std::string>
    ArmnnPreparedModel_1_3<HalVersion>::PrepareMemoryForIO(armnn::InputTensors& inputs,
                                                           armnn::OutputTensors& outputs,
                                                           std::vector<android::nn::RunTimePoolInfo>& memPools,
                                                           const V1_3::Request& request)
{
    // Android S changed the pool type, so the request pools need converting first.
#if !defined(ARMNN_ANDROID_S)
    if (!setRunTimePoolInfosFromMemoryPools(&memPools, request.pools))
#else
    if (!setRunTimePoolInfosFromMemoryPools(&memPools, uncheckedConvert(request.pools)))
#endif
    {
        return {V1_3::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
    }

    // add the inputs and outputs with their data
    try
    {
        if (PrepareMemoryForInputs(inputs, request, memPools) != V1_3::ErrorStatus::NONE)
        {
            return {V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
        }

        std::vector<V1_2::OutputShape> outputShapes(request.outputs.size());

        auto errorStatus = PrepareMemoryForOutputs(outputs, outputShapes, request, memPools);
        if (errorStatus != V1_3::ErrorStatus::NONE)
        {
            // Output shapes are propagated even on failure so the client can
            // learn which outputs were insufficient.
            return {errorStatus, outputShapes, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
        }
    }
    catch (armnn::Exception& e)
    {
        ALOGW("armnn::Exception caught while preparing for EnqueueWorkload: %s", e.what());
        return {V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
    }
    catch (std::exception& e)
    {
        ALOGE("std::exception caught while preparing for EnqueueWorkload: %s", e.what());
        return {V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
    }

    return {V1_3::ErrorStatus::NONE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
}
667 
668 template<typename HalVersion>
669 template<typename CallbackContext>
ExecuteSynchronously(const V1_3::Request & request,CallbackContext cbCtx)670 Return<void> ArmnnPreparedModel_1_3<HalVersion>::ExecuteSynchronously(const V1_3::Request& request,
671                                                                       CallbackContext cbCtx)
672 {
673     if (cbCtx.ctx.measureTimings == V1_2::MeasureTiming::YES)
674     {
675         cbCtx.ctx.driverStart = Now();
676     }
677 
678     if (!m_PreparedFromCache && !android::nn::validateRequest(convertToV1_3(request), m_Model))
679     {
680         ALOGE("ArmnnPreparedModel_1_3::ExecuteSynchronously invalid request model");
681         cbCtx.callback(V1_3::ErrorStatus::INVALID_ARGUMENT,
682                        {},
683                        g_NoTiming,
684                        "ArmnnPreparedModel_1_3::ExecuteSynchronously invalid request model");
685         return Void();
686     }
687 
688     if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
689     {
690         ALOGE("ArmnnPreparedModel_1_3::ExecuteSynchronously invalid request model");
691         cbCtx.callback(V1_3::ErrorStatus::INVALID_ARGUMENT,
692                        {},
693                        g_NoTiming,
694                        "ArmnnPreparedModel_1_3::ExecuteSynchronously invalid request model");
695         return Void();
696     }
697 
698 
699     // map the memory pool into shared pointers
700     // use a shared memory pools vector on the heap, as it is passed to the request thread
701     auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();
702 
703     // allocate the tensors on the heap, as they are passed to the request thread
704     auto inputs = std::make_shared<armnn::InputTensors>();
705     auto outputs = std::make_shared<armnn::OutputTensors>();
706 
707     auto [status, outputShapes, timing, message] = PrepareMemoryForIO(*inputs, *outputs, *memPools, request);
708     if (status != V1_3::ErrorStatus::NONE)
709     {
710         cbCtx.callback(status, outputShapes, timing, message);
711         return Void();
712     }
713 
714     ALOGV("ArmnnPreparedModel_1_3::ExecuteSynchronously() before Execution");
715 
716     ExecuteGraph(memPools, *inputs, *outputs, cbCtx);
717     return Void();
718 }
719 
720 template<typename HalVersion>
executeSynchronously(const V1_0::Request & request,V1_2::MeasureTiming measureTiming,executeSynchronously_cb cb)721 Return<void> ArmnnPreparedModel_1_3<HalVersion>::executeSynchronously(const V1_0::Request& request,
722                                                                       V1_2::MeasureTiming measureTiming,
723                                                                       executeSynchronously_cb cb)
724 {
725     if (!m_PreparedFromCache)
726     {
727         ALOGV("ArmnnPreparedModel_1_3::executeSynchronously(): %s", GetModelSummary(m_Model).c_str());
728     }
729     m_RequestCount++;
730 
731     if (cb == nullptr)
732     {
733         ALOGE("ArmnnPreparedModel_1_3::executeSynchronously invalid callback passed");
734         return Void();
735     }
736 
737     auto cbWrapper = [cb](V1_3::ErrorStatus errorStatus,
738                           std::vector<V1_2::OutputShape> outputShapes,
739                           const V1_2::Timing& timing,
740                           std::string)
741     {
742         cb(convertToV1_0(errorStatus), outputShapes, timing);
743     };
744 
745     CallbackContext_1_3 cbCtx;
746     cbCtx.callback = cbWrapper;
747     cbCtx.ctx.measureTimings = measureTiming;
748 
749     ExecuteSynchronously(convertToV1_3(request), cbCtx);
750     return Void();
751 }
752 
753 template<typename HalVersion>
executeSynchronously_1_3(const V1_3::Request & request,V1_2::MeasureTiming measureTiming,const V1_3::OptionalTimePoint & deadline,const V1_3::OptionalTimeoutDuration & loopTimeoutDuration,executeSynchronously_1_3_cb cb)754 Return<void>  ArmnnPreparedModel_1_3<HalVersion>::executeSynchronously_1_3(
755         const V1_3::Request& request,
756         V1_2::MeasureTiming measureTiming,
757         const V1_3::OptionalTimePoint& deadline,
758         const V1_3::OptionalTimeoutDuration& loopTimeoutDuration,
759         executeSynchronously_1_3_cb cb)
760 {
761     if (!m_PreparedFromCache)
762     {
763         ALOGV("ArmnnPreparedModel_1_3::executeSynchronously_1_3(): %s", GetModelSummary(m_Model).c_str());
764     }
765     m_RequestCount++;
766 
767     if (cb == nullptr)
768     {
769         ALOGE("ArmnnPreparedModel_1_3::executeSynchronously_1_3 invalid callback passed");
770         return Void();
771     }
772 
773     if (deadline.getDiscriminator() != V1_3::OptionalTimePoint::hidl_discriminator::none)
774     {
775         ALOGW("ArmnnPreparedModel_1_3::executeSynchronously_1_3 parameter deadline is set but not supported.");
776     }
777 
778     if (loopTimeoutDuration.getDiscriminator() != V1_3::OptionalTimeoutDuration::hidl_discriminator::none)
779     {
780         ALOGW(
781            "ArmnnPreparedModel_1_3::executeSynchronously_1_3 parameter loopTimeoutDuration is set but not supported.");
782     }
783 
784     auto cbWrapper = [cb](V1_3::ErrorStatus errorStatus,
785                           std::vector<V1_2::OutputShape> outputShapes,
786                           const V1_2::Timing& timing,
787                           std::string)
788     {
789         cb(errorStatus, outputShapes, timing);
790     };
791 
792     CallbackContext_1_3 cbCtx;
793     cbCtx.callback = cbWrapper;
794     cbCtx.ctx.measureTimings = measureTiming;
795 
796     ExecuteSynchronously(request, cbCtx);
797     return Void();
798 }
799 
800 template<typename HalVersion>
configureExecutionBurst(const sp<V1_2::IBurstCallback> & callback,const MQDescriptorSync<V1_2::FmqRequestDatum> & requestChannel,const MQDescriptorSync<V1_2::FmqResultDatum> & resultChannel,V1_3::IPreparedModel::configureExecutionBurst_cb cb)801 Return<void> ArmnnPreparedModel_1_3<HalVersion>::configureExecutionBurst(
802         const sp<V1_2::IBurstCallback>& callback,
803         const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel,
804         const MQDescriptorSync<V1_2::FmqResultDatum>& resultChannel,
805         V1_3::IPreparedModel::configureExecutionBurst_cb cb)
806 {
807     ALOGV("ArmnnPreparedModel_1_3::configureExecutionBurst");
808     const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(callback,
809                                                                        requestChannel,
810                                                                        resultChannel,
811                                                                        this);
812 
813     if (burst == nullptr)
814     {
815         cb(V1_0::ErrorStatus::GENERAL_FAILURE, {});
816     }
817     else
818     {
819         cb(V1_0::ErrorStatus::NONE, burst);
820     }
821     return Void();
822 }
823 
824 template<typename HalVersion>
825 template<typename CallbackContext>
ExecuteGraph(std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>> & pMemPools,armnn::InputTensors & inputTensors,armnn::OutputTensors & outputTensors,CallbackContext cb)826 Return <V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::ExecuteGraph(
827     std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
828     armnn::InputTensors& inputTensors,
829     armnn::OutputTensors& outputTensors,
830     CallbackContext cb)
831 {
832     ALOGV("ArmnnPreparedModel_1_3::ExecuteGraph(...)");
833     // Capture the graph execution start time.
834     std::chrono::time_point<std::chrono::system_clock> graphExecutionStart = std::chrono::system_clock::now();
835 
836     DumpTensorsIfRequired("Input", inputTensors);
837 
838     std::vector<V1_2::OutputShape> outputShapes(outputTensors.size());
839     for (unsigned int i = 0; i < outputTensors.size(); i++)
840     {
841         std::pair<int, armnn::Tensor> outputTensorPair = outputTensors[i];
842         const armnn::Tensor outputTensor = outputTensorPair.second;
843         const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();
844 
845         outputShapes[i] = ComputeShape(outputTensorInfo);
846     }
847 
848     // run it
849     try
850     {
851         if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
852         {
853             cb.ctx.deviceStart = Now();
854         }
855         armnn::Status status;
856         if (m_AsyncModelExecutionEnabled)
857         {
858             ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph m_AsyncModelExecutionEnabled true");
859             status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors);
860         }
861         else
862         {
863             ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph m_AsyncModelExecutionEnabled false");
864             // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
865             std::vector<armnn::ImportedInputId> importedInputIds;
866             if (m_EnableImport)
867             {
868                 importedInputIds =  m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
869             }
870             std::vector<armnn::ImportedOutputId> importedOutputIds;
871             if (m_EnableExport)
872             {
873                 importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
874             }
875             status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
876                                                 importedInputIds, importedOutputIds);
877         }
878 
879         if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
880         {
881             cb.ctx.deviceEnd = Now();
882         }
883         if (status != armnn::Status::Success)
884         {
885             ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph EnqueueWorkload failed");
886             cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
887             return V1_3::ErrorStatus::GENERAL_FAILURE;
888         }
889     }
890     catch (armnn::Exception& e)
891     {
892         ALOGW("armnn:Exception caught from EnqueueWorkload: %s", e.what());
893         cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
894         return V1_3::ErrorStatus::GENERAL_FAILURE;
895     }
896     catch (std::exception& e)
897     {
898         ALOGE("std::exception caught from EnqueueWorkload: %s", e.what());
899         cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
900         return V1_3::ErrorStatus::GENERAL_FAILURE;
901     }
902 
903     CommitPools(*pMemPools);
904 
905     DumpTensorsIfRequired("Output", outputTensors);
906 
907     if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
908     {
909         cb.ctx.driverEnd = Now();
910         V1_2::Timing timing;
911         timing.timeOnDevice = MicrosecondsDuration(cb.ctx.deviceEnd, cb.ctx.deviceStart);
912         timing.timeInDriver = MicrosecondsDuration(cb.ctx.driverEnd, cb.ctx.driverStart);
913         ALOGV("ArmnnPreparedModel_1_3::execute timing - Device = %lu Driver = %lu",
914               static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
915         cb.callback(V1_3::ErrorStatus::NONE, outputShapes, timing, "ArmnnPreparedModel_1_3::ExecuteGraph");
916     } else
917     {
918         cb.callback(V1_3::ErrorStatus::NONE, outputShapes, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
919     }
920     // Log the total time in this call. This is a good number to compare to that printed out by
921     // RuntimeImpl::EnqueueWorkload. The difference should be the execution overhead of the driver.
922     ALOGI("ArmnnPreparedModel_1_3::ExecuteGraph Execution time = %lld µs",
923           std::chrono::duration_cast<std::chrono::microseconds>
924           (std::chrono::system_clock::now() - graphExecutionStart).count());
925     return V1_3::ErrorStatus::NONE;
926 }
927 
928 /// Schedule the graph prepared from the request for execution
929 template<typename HalVersion>
930 template<typename CallbackContext>
ScheduleGraphForExecution(std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>> & pMemPools,std::shared_ptr<armnn::InputTensors> & inputTensors,std::shared_ptr<armnn::OutputTensors> & outputTensors,CallbackContext callbackContext,armnn::QosExecPriority priority)931 void ArmnnPreparedModel_1_3<HalVersion>::ScheduleGraphForExecution(
932         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
933         std::shared_ptr<armnn::InputTensors>& inputTensors,
934         std::shared_ptr<armnn::OutputTensors>& outputTensors,
935         CallbackContext callbackContext,
936         armnn::QosExecPriority priority)
937 {
938     ALOGV("ArmnnPreparedModel_1_3::ScheduleGraphForExecution(...)");
939 
940     DumpTensorsIfRequired("Input", *inputTensors);
941 
942     unsigned int outputTensorSize = outputTensors.get()->size();
943     std::vector<V1_2::OutputShape> outputShapes(outputTensorSize);
944     for (unsigned int i = 0; i < outputTensorSize; i++)
945     {
946         std::pair<int, armnn::Tensor> outputTensorPair = outputTensors.get()->at(i);
947         const armnn::Tensor outputTensor = outputTensorPair.second;
948         const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();
949 
950         outputShapes[i] = ComputeShape(outputTensorInfo);
951     }
952 
953     auto tpCb = std::make_shared<
954         ArmnnThreadPoolCallback_1_3<CallbackContext_1_3>>(this,
955                                                           pMemPools,
956                                                           outputShapes,
957                                                           inputTensors,
958                                                           outputTensors,
959                                                           callbackContext);
960 
961     m_Threadpool->Schedule(m_NetworkId,
962                            *tpCb->m_InputTensors,
963                            *tpCb->m_OutputTensors,
964                            priority,
965                            tpCb);
966     ALOGV("ArmnnPreparedModel_1_3::ScheduleGraphForExecution end");
967 }
968 
969 template<typename HalVersion>
ExecuteWithDummyInputs(unsigned int numInputs,unsigned int numOutputs)970 bool ArmnnPreparedModel_1_3<HalVersion>::ExecuteWithDummyInputs(unsigned int numInputs, unsigned int numOutputs)
971 {
972     std::vector<std::vector<char>> storage;
973     armnn::InputTensors inputTensors;
974     for (unsigned int i = 0; i < numInputs; i++)
975     {
976         armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
977         // pInputTensors (of type InputTensors) is composed of a vector of ConstTensors.
978         // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
979         inputTensorInfo.SetConstant();
980 
981         storage.emplace_back(inputTensorInfo.GetNumBytes());
982         const armnn::ConstTensor inputTensor(inputTensorInfo, storage.back().data());
983 
984         inputTensors.emplace_back(i, inputTensor);
985     }
986 
987     armnn::OutputTensors outputTensors;
988     for (unsigned int i = 0; i < numOutputs; i++)
989     {
990         const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
991         storage.emplace_back(outputTensorInfo.GetNumBytes());
992         const armnn::Tensor outputTensor(outputTensorInfo, storage.back().data());
993 
994         outputTensors.emplace_back(i, outputTensor);
995     }
996 
997     auto nullCallback = [](V1_3::ErrorStatus, std::vector<V1_2::OutputShape>, const V1_2::Timing&, std::string) {};
998     CallbackContext_1_3 callbackContext;
999     callbackContext.callback = nullCallback;
1000     callbackContext.ctx.measureTimings = V1_2::MeasureTiming::NO;
1001     auto memPools = std::make_shared<std::vector<::android::nn::RunTimePoolInfo>>();
1002 
1003     auto errorStatus = ExecuteGraph(memPools,
1004                                     inputTensors,
1005                                     outputTensors,
1006                                     callbackContext);
1007     return errorStatus == V1_3::ErrorStatus::NONE;
1008 }
1009 
1010 template<typename HalVersion>
Execute(const V1_3::Request & request,V1_2::MeasureTiming measureTiming,CallbackAsync_1_3 callback)1011 Return <V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::Execute(const V1_3::Request& request,
1012                                                                        V1_2::MeasureTiming measureTiming,
1013                                                                        CallbackAsync_1_3 callback)
1014 {
1015     ExecutionContext_1_3 ctx;
1016     if (measureTiming == V1_2::MeasureTiming::YES)
1017     {
1018         ctx.measureTimings = measureTiming;
1019         ctx.driverStart = Now();
1020     }
1021 
1022     if (!m_PreparedFromCache)
1023     {
1024         ALOGV("ArmnnPreparedModel_1_3::execute(): %s", GetModelSummary(m_Model).c_str());
1025     }
1026     m_RequestCount++;
1027 
1028     if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
1029     {
1030         callback(V1_3::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute");
1031         return V1_3::ErrorStatus::INVALID_ARGUMENT;
1032     }
1033 
1034     if (!m_RequestInputsAndOutputsDumpDir.empty())
1035     {
1036         ALOGD("Dumping inputs and outputs for request %" PRIuPTR, reinterpret_cast<std::uintptr_t>(&callback));
1037     }
1038 
1039     // map the memory pool into shared pointers
1040     // use a shared memory pools vector on the heap, as it is passed to the request thread
1041     auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();
1042 
1043     // allocate the tensors on the heap, as they are passed to the request thread
1044     auto inputTensors = std::make_shared<armnn::InputTensors>();
1045     auto outputTensors = std::make_shared<armnn::OutputTensors>();
1046 
1047     auto [status, outShapes, timing, message] = PrepareMemoryForIO(*inputTensors, *outputTensors,
1048                                                                    *memPools, request);
1049     if (status != V1_3::ErrorStatus::NONE)
1050     {
1051         callback(status, outShapes, timing, message);
1052     }
1053 
1054     switch(status)
1055     {
1056         case V1_3::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE:
1057             return V1_3::ErrorStatus::NONE;
1058         case V1_3::ErrorStatus::GENERAL_FAILURE:
1059             return V1_3::ErrorStatus::GENERAL_FAILURE;
1060         case V1_3::ErrorStatus::INVALID_ARGUMENT:
1061             return V1_3::ErrorStatus::INVALID_ARGUMENT;
1062         default:
1063         {}
1064     }
1065     CallbackContext_1_3 cb;
1066     cb.callback = callback;
1067     cb.ctx = ctx;
1068 
1069 
1070     enum class QosExecPriority
1071     {
1072         Low    = 0,
1073         Medium = 1,
1074         High   = 2
1075     };
1076 
1077 
1078     if (m_AsyncModelExecutionEnabled)
1079     {
1080         armnn::QosExecPriority priority;
1081 
1082         switch (GetModelPriority()) {
1083             case V1_3::Priority::LOW:
1084                 priority = armnn::QosExecPriority::Low;
1085                 break;
1086             case V1_3::Priority::MEDIUM:
1087                 priority = armnn::QosExecPriority::Medium;
1088                 break;
1089             case V1_3::Priority::HIGH:
1090                 priority = armnn::QosExecPriority::High;
1091                 break;
1092             default:
1093                 priority = armnn::QosExecPriority::Medium;
1094 
1095         }
1096 
1097         ALOGV("ArmnnPreparedModel_1_3::execute(...) before ScheduleGraphForExecution");
1098         ScheduleGraphForExecution(memPools, inputTensors, outputTensors, cb, priority);
1099         ALOGV("ArmnnPreparedModel_1_3::execute(...) after ScheduleGraphForExecution");
1100         return V1_3::ErrorStatus::NONE;
1101     }
1102 
1103     ALOGV("ArmnnPreparedModel_1_3::execute(...) before PostMsg");
1104     // post the request for asynchronous execution
1105     m_RequestThread.PostMsg(this, memPools, inputTensors, outputTensors, cb);
1106     ALOGV("ArmnnPreparedModel_1_3::execute(...) after PostMsg");
1107     return V1_3::ErrorStatus::NONE;
1108 }
1109 
// Returns the execution priority this model was prepared with (stored in
// m_ModelPriority); used to pick the thread pool QoS level in Execute().
template<typename HalVersion>
V1_3::Priority ArmnnPreparedModel_1_3<HalVersion>::GetModelPriority()
{
    return m_ModelPriority;
}
1115 
1116 template<typename HalVersion>
1117 template <typename CallbackContext>
Notify(armnn::Status status,armnn::InferenceTimingPair timeTaken)1118 void ArmnnPreparedModel_1_3<HalVersion>::ArmnnThreadPoolCallback_1_3<CallbackContext>::Notify(
1119         armnn::Status status, armnn::InferenceTimingPair timeTaken)
1120 {
1121     ALOGV("ArmnnPreparedModel_1_3::ArmnnThreadPoolCallback_1_3<CallbackContext>::Notify");
1122     CommitPools(*m_MemPools);
1123 
1124      m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors);
1125 
1126     if (status != armnn::Status::Success)
1127     {
1128         ALOGW("ArmnnThreadPoolCallback_1_3::Notify EnqueueWorkload failed");
1129         m_CallbackContext.callback(V1_3::ErrorStatus::GENERAL_FAILURE,
1130                                    {},
1131                                    g_NoTiming,
1132                                    "ArmnnPreparedModel_1_3::ArmnnThreadPoolCallback_1_3");
1133         return;
1134     }
1135 
1136     if (m_CallbackContext.ctx.measureTimings == V1_2::MeasureTiming::YES)
1137     {
1138         m_CallbackContext.ctx.deviceStart = timeTaken.first;
1139         m_CallbackContext.ctx.deviceEnd = timeTaken.second;
1140         m_CallbackContext.ctx.driverEnd = std::chrono::steady_clock::now();
1141         V1_2::Timing timing;
1142         timing.timeOnDevice = MicrosecondsDuration(m_CallbackContext.ctx.deviceEnd, m_CallbackContext.ctx.deviceStart);
1143         timing.timeInDriver = MicrosecondsDuration(m_CallbackContext.ctx.driverEnd, m_CallbackContext.ctx.driverStart);
1144         ALOGV("ArmnnPreparedModel_1_3::execute timing - Device = %lu Driver = %lu",
1145               static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
1146         m_CallbackContext.callback(
1147                 V1_3::ErrorStatus::NONE, m_OutputShapes, timing, "ArmnnPreparedModel_1_3::ExecuteGraph");
1148     } else
1149     {
1150         m_CallbackContext.callback(
1151                 V1_3::ErrorStatus::NONE, m_OutputShapes, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
1152     }
1153     return;
1154 }
1155 
#ifdef ARMNN_ANDROID_NN_V1_3
// Explicit instantiations for the HAL 1.3 policy so the template member
// definitions in this translation unit are emitted for the types the driver
// actually uses (callers link against these symbols).
template class ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>;
template Return <V1_3::ErrorStatus> ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>::ExecuteGraph<CallbackContext_1_3>(
        std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
        armnn::InputTensors& pInputTensors,
        armnn::OutputTensors& pOutputTensors,
        CallbackContext_1_3 cb);

template void ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>::ScheduleGraphForExecution<CallbackContext_1_3>(
                std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
                std::shared_ptr<armnn::InputTensors>& inputTensors,
                std::shared_ptr<armnn::OutputTensors>& outputTensors,
                CallbackContext_1_3 callbackContext,
                armnn::QosExecPriority priority);
#endif
1171 
1172 } // namespace armnn_driver
1173