//
// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#define LOG_TAG "ArmnnDriver"

#include "ArmnnPreparedModel_1_2.hpp"

#include "Utils.hpp"

#include <armnn/Types.hpp>

#include <log/log.h>
#include <OperationsUtils.h>
#include <ExecutionBurstServer.h>
#include <ValidateHal.h>

#include <chrono>
#include <cinttypes>

#ifdef ARMNN_ANDROID_S
#include <LegacyUtils.h>
#endif

using namespace android;
using namespace android::hardware;

namespace {

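// Sentinel timing returned when no measurement was taken; the NNAPI HAL uses UINT64_MAX to mean "time not available".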
static const V1_2::Timing g_NoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
using namespace armnn_driver;
using TimePoint = std::chrono::steady_clock::time_point;

TimePoint Now()
{
    return std::chrono::steady_clock::now();
}

unsigned long MicrosecondsDuration(TimePoint endPoint, TimePoint startPoint)
{
    return static_cast<unsigned long>(std::chrono::duration_cast<std::chrono::microseconds>(
                                      endPoint - startPoint).count());
}

void NotifyCallbackAndCheck(const ::android::sp<V1_0::IExecutionCallback>& callback,
                            V1_0::ErrorStatus errorStatus,
                            std::vector<V1_2::OutputShape>,
                            const V1_2::Timing,
                            std::string callingFunction)
{
    Return<void> returned = callback->notify(errorStatus);
    // This check is required; if the callback fails and it isn't checked, it will bring down the service
    if (!returned.isOk())
    {
        ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
              callingFunction.c_str(), returned.description().c_str());
    }
}

void NotifyCallbackAndCheck(const ::android::sp<V1_2::IExecutionCallback>& callback,
                            V1_0::ErrorStatus errorStatus,
                            std::vector<V1_2::OutputShape> outputShapes,
                            const V1_2::Timing timing,
                            std::string callingFunction)
{
    Return<void> returned = callback->notify_1_2(errorStatus, outputShapes, timing);
    // This check is required; if the callback fails and it isn't checked, it will bring down the service
    if (!returned.isOk())
    {
        ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
              callingFunction.c_str(), returned.description().c_str());
    }
}

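// Checks that any dimensions supplied with a request argument are consistent with the TensorInfo
// registered for that input or output.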
bool ValidateRequestArgument(const V1_0::RequestArgument& requestArg, const armnn::TensorInfo& tensorInfo)
{
    if (requestArg.dimensions.size() != 0)
    {
        if (requestArg.dimensions.size() != tensorInfo.GetNumDimensions())
        {
            ALOGE("Mismatched dimensions (request argument: %zu, expected: %u)",
                  requestArg.dimensions.size(), tensorInfo.GetNumDimensions());
            return false;
        }

        for (unsigned int d = 0; d < tensorInfo.GetNumDimensions(); ++d)
        {
            if (requestArg.dimensions[d] != 0 && requestArg.dimensions[d] != tensorInfo.GetShape()[d])
            {
                ALOGE("Mismatched size for dimension %d (request argument: %u, expected %u)",
                      d, requestArg.dimensions[d], tensorInfo.GetShape()[d]);
                return false;
            }
        }
    }

    return true;
}

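// Wraps the request argument's backing pool memory in an armnn::Tensor; returns an empty Tensor
// (null memory area) if validation fails.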
armnn::Tensor GetTensorForRequestArgument(const V1_0::RequestArgument& requestArg,
                                          const armnn::TensorInfo& tensorInfo,
                                          const std::vector<::android::nn::RunTimePoolInfo>& requestPools)
{
    if (!ValidateRequestArgument(requestArg, tensorInfo))
    {
        return armnn::Tensor();
    }

    return armnn::Tensor(tensorInfo, GetMemoryFromPool(requestArg.location, requestPools));
}

inline std::string BuildTensorName(const char* tensorNamePrefix, std::size_t index)
{
    return tensorNamePrefix + std::to_string(index);
}

} // anonymous namespace

using namespace android::hardware;

namespace armnn_driver
{

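// Static members shared by all instances of this prepared model: a single request thread that serializes
// queued executions, and an optional threadpool used when asynchronous execution is enabled.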
template<typename HalVersion>
RequestThread<ArmnnPreparedModel_1_2, HalVersion, CallbackContext_1_2>
        ArmnnPreparedModel_1_2<HalVersion>::m_RequestThread;

template<typename HalVersion>
std::unique_ptr<armnn::Threadpool> ArmnnPreparedModel_1_2<HalVersion>::m_Threadpool(nullptr);

template<typename HalVersion>
template<typename TensorBindingCollection>
void ArmnnPreparedModel_1_2<HalVersion>::DumpTensorsIfRequired(char const* tensorNamePrefix,
                                                               const TensorBindingCollection& tensorBindings)
{
    if (!m_RequestInputsAndOutputsDumpDir.empty())
    {
        const std::string requestName = std::to_string(m_NetworkId) + "_" + std::to_string(m_RequestCount) + ".dump";
        for (std::size_t i = 0u; i < tensorBindings.size(); ++i)
        {
            DumpTensor(m_RequestInputsAndOutputsDumpDir,
                       requestName,
                       BuildTensorName(tensorNamePrefix, i),
                       tensorBindings[i].second);
        }
    }
}

template<typename HalVersion>
ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId networkId,
                                                           armnn::IRuntime* runtime,
                                                           const V1_2::Model& model,
                                                           const std::string& requestInputsAndOutputsDumpDir,
                                                           const bool gpuProfilingEnabled,
                                                           const bool asyncModelExecutionEnabled,
                                                           const unsigned int numberOfThreads,
                                                           const bool importEnabled,
                                                           const bool exportEnabled)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_Model(model)
    , m_RequestCount(0)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
    , m_EnableImport(importEnabled)
    , m_EnableExport(exportEnabled)
    , m_PreparedFromCache(false)
{
    // Enable profiling if required.
    m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);

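    // For asynchronous execution, create one working memory handle per thread and register the handles
    // with the shared threadpool, creating the threadpool on first use.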
    if (m_AsyncModelExecutionEnabled)
    {
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i=0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
        }

        if (!m_Threadpool)
        {
            m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
        }
        else
        {
            m_Threadpool->LoadMemHandles(memHandles);
        }

        m_WorkingMemHandle = memHandles.back();
    }
}

template<typename HalVersion>
ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId networkId,
                                                           armnn::IRuntime* runtime,
                                                           const std::string& requestInputsAndOutputsDumpDir,
                                                           const bool gpuProfilingEnabled,
                                                           const bool asyncModelExecutionEnabled,
                                                           const unsigned int numberOfThreads,
                                                           const bool importEnabled,
                                                           const bool exportEnabled,
                                                           const bool preparedFromCache)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_RequestCount(0)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
    , m_EnableImport(importEnabled)
    , m_EnableExport(exportEnabled)
    , m_PreparedFromCache(preparedFromCache)
{
    // Enable profiling if required.
    m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);

    if (m_AsyncModelExecutionEnabled)
    {
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i=0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
        }

        if (!m_Threadpool)
        {
            m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
        }
        else
        {
            m_Threadpool->LoadMemHandles(memHandles);
        }

        m_WorkingMemHandle = memHandles.back();
    }
}

template<typename HalVersion>
ArmnnPreparedModel_1_2<HalVersion>::~ArmnnPreparedModel_1_2()
{
    // Get a hold of the profiler used by this model.
    std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkId);
    if (profiler && m_GpuProfilingEnabled)
    {
        // Dump the profiling info to a file if required.
        DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId,
                                    profiler.get());
    }

    // Unload the network associated with this model.
    m_Runtime->UnloadNetwork(m_NetworkId);

    // Unload the network memhandles from the threadpool
    if (m_AsyncModelExecutionEnabled)
    {
        m_Threadpool->UnloadMemHandles(m_NetworkId);
    }
}

template<typename HalVersion>
Return <V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::execute(const V1_0::Request& request,
        const ::android::sp<V1_0::IExecutionCallback>& callback)
{
    if (callback.get() == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_2::execute invalid callback passed");
        return V1_0::ErrorStatus::INVALID_ARGUMENT;
    }

    auto cb = [callback](V1_0::ErrorStatus errorStatus,
                         std::vector<V1_2::OutputShape> outputShapes,
                         const V1_2::Timing& timing,
                         std::string callingFunction)
    {
        NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
    };

    return Execute(request, V1_2::MeasureTiming::NO, cb);
}

template<typename HalVersion>
Return <V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::execute_1_2(
        const V1_0::Request& request,
        V1_2::MeasureTiming measureTiming,
        const sp<V1_2::IExecutionCallback>& callback)
{
    if (callback.get() == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_2::execute_1_2 invalid callback passed");
        return V1_0::ErrorStatus::INVALID_ARGUMENT;
    }

    auto cb = [callback](V1_0::ErrorStatus errorStatus,
                         std::vector<V1_2::OutputShape> outputShapes,
                         const V1_2::Timing& timing,
                         std::string callingFunction)
    {
        NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
    };

    return Execute(request, measureTiming, cb);
}

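// Validates each request input against the network's input TensorInfo and adds a tensor backed by the
// request pool memory to 'inputs'.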
template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForInputs(
    armnn::InputTensors& inputs,
    const V1_0::Request& request,
    const std::vector<android::nn::RunTimePoolInfo>& memPools)
{
    inputs.reserve(request.inputs.size());
    for (unsigned int i = 0; i < request.inputs.size(); i++)
    {
        const auto& inputArg = request.inputs[i];
        armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
        // inputs (of type InputTensors) is composed of a vector of ConstTensors.
        // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
        inputTensorInfo.SetConstant();
        auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request,
                                                                                inputTensorInfo,
                                                                                inputArg,
                                                                                "input");

        if (result != V1_0::ErrorStatus::NONE)
        {
            return result;
        }

        const armnn::Tensor inputTensor = GetTensorForRequestArgument(inputArg, inputTensorInfo, memPools);

        if (inputTensor.GetMemoryArea() == nullptr)
        {
            ALOGE("Cannot execute request. Error converting request input %u to tensor", i);
            return V1_0::ErrorStatus::GENERAL_FAILURE;
        }

        inputs.emplace_back(i, inputTensor);
    }

    return V1_0::ErrorStatus::NONE;
}

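// Validates each request output, checks that the client-provided buffer is large enough for the
// network's output, and records the output shape that will be reported back to the runtime.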
template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForOutputs(
    armnn::OutputTensors& outputs,
    std::vector<V1_2::OutputShape> &outputShapes,
    const V1_0::Request& request,
    const std::vector<android::nn::RunTimePoolInfo>& memPools)
{
    outputs.reserve(request.outputs.size());
    for (unsigned int i = 0; i < request.outputs.size(); i++)
    {
        const auto& outputArg = request.outputs[i];
        armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
        auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request,
                                                                                outputTensorInfo,
                                                                                outputArg,
                                                                                "output");

        if (result != V1_0::ErrorStatus::NONE)
        {
            return result;
        }

        const armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, memPools);
        if (outputTensor.GetMemoryArea() == nullptr)
        {
            ALOGE("Cannot execute request. Error converting request output %u to tensor", i);
            return V1_0::ErrorStatus::GENERAL_FAILURE;
        }

        const size_t outputSize = outputTensorInfo.GetNumBytes();

        if (outputArg.location.length < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_2::Execute failed: outputArg.location.length < outputSize");
            return V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }

#if !defined(ARMNN_ANDROID_S)
        const size_t bufferSize = memPools.at(outputArg.location.poolIndex).getHidlMemory().size();
        if (bufferSize < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_2::Execute failed: bufferSize < outputSize");
            return V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }
#else
        const size_t bufferSize = memPools.at(outputArg.location.poolIndex).getSize();
        if (bufferSize < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_2::Execute failed bufferSize (%s) < outputSize (%s)",
                  std::to_string(bufferSize).c_str(), std::to_string(outputSize).c_str());
            outputShapes[i].isSufficient = false;
            return V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }
#endif
        outputs.emplace_back(i, outputTensor);
        outputShapes[i] = ComputeShape(outputTensorInfo);
    }

    return V1_0::ErrorStatus::NONE;
}

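// Maps the request's memory pools into RunTimePoolInfos and fills the input and output tensor vectors;
// any failure is reported through the supplied callback as well as via the returned error status.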
template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForIO(
                                         armnn::InputTensors& inputs,
                                         armnn::OutputTensors& outputs,
                                         std::vector<android::nn::RunTimePoolInfo>& memPools,
                                         const V1_0::Request& request,
                                         CallbackAsync_1_2 callback)
{
#if !defined(ARMNN_ANDROID_S)
    if (!setRunTimePoolInfosFromHidlMemories(&memPools, request.pools))
#else
    if (!setRunTimePoolInfosFromCanonicalMemories(&memPools, uncheckedConvert(request.pools)))
#endif
    {
        callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
        return V1_0::ErrorStatus::GENERAL_FAILURE;
    }
    // add the inputs and outputs with their data
    try
    {
        if (PrepareMemoryForInputs(inputs, request, memPools) != V1_0::ErrorStatus::NONE)
        {
            callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
            return V1_0::ErrorStatus::GENERAL_FAILURE;
        }

        std::vector<V1_2::OutputShape> outputShapes(request.outputs.size());

        auto errorStatus = PrepareMemoryForOutputs(outputs, outputShapes, request, memPools);
        if (errorStatus != V1_0::ErrorStatus::NONE)
        {
            callback(errorStatus,
                     outputShapes,
                     g_NoTiming,
                     "ArmnnPreparedModel_1_2::Execute");
            return errorStatus;
        }
    }
    catch (armnn::Exception& e)
    {
        ALOGW("armnn::Exception caught while preparing for EnqueueWorkload: %s", e.what());
        callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
        return V1_0::ErrorStatus::GENERAL_FAILURE;
    }
    catch (std::exception& e)
    {
        ALOGE("std::exception caught while preparing for EnqueueWorkload: %s", e.what());
        callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
        return V1_0::ErrorStatus::GENERAL_FAILURE;
    }

    return V1_0::ErrorStatus::NONE;
}

template<typename HalVersion>
Return<void> ArmnnPreparedModel_1_2<HalVersion>::executeSynchronously(const V1_0::Request& request,
                                                                      V1_2::MeasureTiming measureTiming,
                                                                      executeSynchronously_cb cb)
{
    if (!m_PreparedFromCache)
    {
        ALOGV("ArmnnPreparedModel_1_2::executeSynchronously(): %s", GetModelSummary(m_Model).c_str());
    }
    m_RequestCount++;

    if (cb == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_2::executeSynchronously invalid callback passed");
        return Void();
    }

    TimePoint driverStart;

    if (measureTiming == V1_2::MeasureTiming::YES)
    {
        driverStart = Now();
    }

    if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
    {
        ALOGE("ArmnnPreparedModel_1_2::executeSynchronously invalid request model");
        cb(V1_0::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming);
        return Void();
    }

    auto cbWrapper = [cb](V1_0::ErrorStatus errorStatus,
                          std::vector<V1_2::OutputShape> outputShapes,
                          const V1_2::Timing& timing,
                          std::string)
        {
            cb(errorStatus, outputShapes, timing);
        };

    // map the memory pool into shared pointers
    // use a shared memory pools vector on the heap, as it is passed to the request thread
    auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();

    // allocate the tensors on the heap, as they are passed to the request thread
    auto inputs = std::make_shared<armnn::InputTensors>();
    auto outputs = std::make_shared<armnn::OutputTensors>();

    auto prepareStatus = PrepareMemoryForIO(*inputs, *outputs, *memPools, request, cbWrapper);
    if (prepareStatus != V1_0::ErrorStatus::NONE)
    {
        return Void();
    }

    ALOGV("ArmnnPreparedModel_1_2::executeSynchronously() before Execution");

    CallbackContext_1_2 cbCtx;
    cbCtx.callback = cbWrapper;
    cbCtx.ctx.measureTimings = measureTiming;
    cbCtx.ctx.driverStart = driverStart;
    ExecuteGraph(memPools, *inputs, *outputs, cbCtx);

    return Void();
}

template<typename HalVersion>
template<typename CallbackContext>
bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteGraph(
        std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
        armnn::InputTensors& inputTensors,
        armnn::OutputTensors& outputTensors,
        CallbackContext cb)
{
    ALOGV("ArmnnPreparedModel_1_2::ExecuteGraph(...)");

    TimePoint driverEnd, deviceStart, deviceEnd;
    // Capture the graph execution start time.
    std::chrono::time_point<std::chrono::system_clock> graphExecutionStart = std::chrono::system_clock::now();

    DumpTensorsIfRequired("Input", inputTensors);

    std::vector<V1_2::OutputShape> outputShapes(outputTensors.size());
    for (unsigned int i = 0; i < outputTensors.size(); i++)
    {
        std::pair<int, armnn::Tensor> outputTensorPair = outputTensors[i];
        const armnn::Tensor outputTensor = outputTensorPair.second;
        const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();

        outputShapes[i] = ComputeShape(outputTensorInfo);
    }

    // run it
    try
    {
        if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
        {
            deviceStart = Now();
        }

        armnn::Status status;
        if (m_AsyncModelExecutionEnabled)
        {
            ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled true");
            status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors);
        }
        else
        {
            ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled false");

            // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
            std::vector<armnn::ImportedInputId> importedInputIds;
            if (m_EnableImport)
            {
                importedInputIds =  m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
            }
            std::vector<armnn::ImportedOutputId> importedOutputIds;
            if (m_EnableExport)
            {
                importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
            }
            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
                                                importedInputIds, importedOutputIds);
        }

        if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
        {
            deviceEnd = Now();
        }
        if (status != armnn::Status::Success)
        {
            ALOGW("EnqueueWorkload failed");
            cb.callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming,
                    "ArmnnPreparedModel_1_2::ExecuteGraph");
            return false;
        }
    }
    catch (armnn::Exception& e)
    {
595         ALOGW("armnn:Exception caught from EnqueueWorkload: %s", e.what());
        cb.callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
        return false;
    }
    catch (std::exception& e)
    {
        ALOGE("std::exception caught from EnqueueWorkload: %s", e.what());
        cb.callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
        return false;
    }

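    // Commit the memory pools so that outputs written into mapped pool memory are flushed back to the client.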
    CommitPools(*pMemPools);

    DumpTensorsIfRequired("Output", outputTensors);

    if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
    {
        driverEnd = Now();
        V1_2::Timing timing;
        timing.timeOnDevice = MicrosecondsDuration(deviceEnd, deviceStart);
        timing.timeInDriver = MicrosecondsDuration(driverEnd, cb.ctx.driverStart);
        ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu",
              static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
        cb.callback(V1_0::ErrorStatus::NONE, outputShapes, timing, "ArmnnPreparedModel_1_2::ExecuteGraph");
    } else {
        cb.callback(V1_0::ErrorStatus::NONE, outputShapes, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
    }

    // Log the total time in this call. This is a good number to compare to that printed out by
    // RuntimeImpl::EnqueueWorkload. The difference should be the execution overhead of the driver.
    ALOGI("ArmnnPreparedModel_1_2::ExecuteGraph Execution time = %lld µs",
          std::chrono::duration_cast<std::chrono::microseconds>
          (std::chrono::system_clock::now() - graphExecutionStart).count());
    return true;
}

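// Runs a single inference using zero-initialised scratch buffers for every input and output; useful as a
// warm-up or sanity run when no client request is available.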
template<typename HalVersion>
bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteWithDummyInputs(unsigned int numInputs, unsigned int numOutputs)
{
    std::vector<std::vector<char>> storage;
    armnn::InputTensors inputTensors;
    for (unsigned int i = 0; i < numInputs; i++)
    {
        armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
        // inputTensors (of type InputTensors) is composed of a vector of ConstTensors.
        // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
        inputTensorInfo.SetConstant();

        storage.emplace_back(inputTensorInfo.GetNumBytes());
        const armnn::ConstTensor inputTensor(inputTensorInfo, storage.back().data());

        inputTensors.emplace_back(i, inputTensor);
    }

    armnn::OutputTensors outputTensors;
    for (unsigned int i = 0; i < numOutputs; i++)
    {
        const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
        storage.emplace_back(outputTensorInfo.GetNumBytes());
        const armnn::Tensor outputTensor(outputTensorInfo, storage.back().data());

        outputTensors.emplace_back(i, outputTensor);
    }

    auto nullCallback = [](V1_0::ErrorStatus, std::vector<V1_2::OutputShape>, const V1_2::Timing&, std::string) {};
    CallbackContext_1_2 callbackContext;
    callbackContext.callback = nullCallback;
    callbackContext.ctx.measureTimings = V1_2::MeasureTiming::NO;
    auto memPools = std::make_shared<std::vector<::android::nn::RunTimePoolInfo>>();
    return ExecuteGraph(memPools,
                        inputTensors,
                        outputTensors,
                        callbackContext);
}

template<typename HalVersion>
Return <V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::Execute(const V1_0::Request& request,
                                                                       V1_2::MeasureTiming measureTiming,
                                                                       CallbackAsync_1_2 callback)
{
    ExecutionContext_1_2 ctx;
    if (measureTiming == V1_2::MeasureTiming::YES)
    {
        ctx.measureTimings = measureTiming;
        ctx.driverStart = Now();
    }

    if (!m_PreparedFromCache)
    {
        ALOGV("ArmnnPreparedModel_1_2::execute(): %s", GetModelSummary(m_Model).c_str());
    }
    m_RequestCount++;

    if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
    {
        callback(V1_0::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
        return V1_0::ErrorStatus::INVALID_ARGUMENT;
    }

    if (!m_RequestInputsAndOutputsDumpDir.empty())
    {
        ALOGD("Dumping inputs and outputs for request %" PRIuPTR, reinterpret_cast<std::uintptr_t>(&callback));
    }

    // map the memory pool into shared pointers
    // use a shared memory pools vector on the heap, as it is passed to the request thread
    auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();

    // allocate the tensors on the heap, as they are passed to the request thread
    auto inputTensors = std::make_shared<armnn::InputTensors>();
    auto outputTensors = std::make_shared<armnn::OutputTensors>();

    auto prepareStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, request, callback);
    switch(prepareStatus)
    {
        case V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE:
            return V1_0::ErrorStatus::NONE;
        case V1_0::ErrorStatus::GENERAL_FAILURE:
            return V1_0::ErrorStatus::GENERAL_FAILURE;
        default:
        {}
    }

    // post the request for asynchronous execution
    CallbackContext_1_2 cb;
    cb.callback = callback;
    cb.ctx = ctx;

    if (m_AsyncModelExecutionEnabled)
    {
        ALOGV("ArmnnPreparedModel_1_2::execute(...) before ScheduleGraphForExecution");
        ScheduleGraphForExecution(memPools, inputTensors, outputTensors, cb);
        ALOGV("ArmnnPreparedModel_1_2::execute(...) after ScheduleGraphForExecution");
        return V1_0::ErrorStatus::NONE;
    }

    ALOGV("ArmnnPreparedModel_1_2::execute(...) before PostMsg");
    m_RequestThread.PostMsg(this, memPools, inputTensors, outputTensors, cb);
    ALOGV("ArmnnPreparedModel_1_2::execute(...) after PostMsg");
    return V1_0::ErrorStatus::NONE;
}

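// Sets up a burst execution session: the created ExecutionBurstServer drives FMQ-based requests and
// routes each one back into this prepared model for execution.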
template<typename HalVersion>
Return<void> ArmnnPreparedModel_1_2<HalVersion>::configureExecutionBurst(
    const sp<V1_2::IBurstCallback>& callback,
    const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel,
    const MQDescriptorSync<V1_2::FmqResultDatum>& resultChannel,
    V1_2::IPreparedModel::configureExecutionBurst_cb cb)
{
    ALOGV("ArmnnPreparedModel_1_2::configureExecutionBurst");
    const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(callback,
                                                                       requestChannel,
                                                                       resultChannel,
                                                                       this);

    if (burst == nullptr)
    {
        cb(V1_0::ErrorStatus::GENERAL_FAILURE, {});
    }
    else
    {
        cb(V1_0::ErrorStatus::NONE, burst);
    }
    return Void();
}

/// Schedule the graph prepared from the request for execution
template<typename HalVersion>
template<typename CallbackContext>
void ArmnnPreparedModel_1_2<HalVersion>::ScheduleGraphForExecution(
        std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
        std::shared_ptr<armnn::InputTensors>& inputTensors,
        std::shared_ptr<armnn::OutputTensors>& outputTensors,
        CallbackContext callbackContext)
{
    ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution(...)");

    DumpTensorsIfRequired("Input", *inputTensors);

    unsigned int outputTensorSize = outputTensors.get()->size();
    std::vector<V1_2::OutputShape> outputShapes(outputTensorSize);
    for (unsigned int i = 0; i < outputTensorSize; i++)
    {
        std::pair<int, armnn::Tensor> outputTensorPair = outputTensors.get()->at(i);
        const armnn::Tensor outputTensor = outputTensorPair.second;
        const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();

        outputShapes[i] = ComputeShape(outputTensorInfo);
    }

    auto tpCb = std::make_shared<
        ArmnnThreadPoolCallback_1_2<CallbackContext_1_2>>(this,
                                                          pMemPools,
                                                          outputShapes,
                                                          inputTensors,
                                                          outputTensors,
                                                          callbackContext);

    m_Threadpool->Schedule(m_NetworkId,
                           *tpCb->m_InputTensors,
                           *tpCb->m_OutputTensors,
                           armnn::QosExecPriority::Medium,
                           tpCb);
    ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution end");
}

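// Invoked by the Arm NN threadpool when an asynchronous inference finishes: commits the memory pools,
// dumps outputs if requested, and forwards the status, output shapes and timing to the HAL callback.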
template<typename HalVersion>
template <typename CallbackContext>
void ArmnnPreparedModel_1_2<HalVersion>::ArmnnThreadPoolCallback_1_2<CallbackContext>::Notify(
        armnn::Status status, armnn::InferenceTimingPair timeTaken)
{
    ALOGV("ArmnnPreparedModel_1_2::ArmnnThreadPoolCallback_1_2 Notify");

    TimePoint driverEnd;

    CommitPools(*m_MemPools);

    m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors);

    if (status != armnn::Status::Success)
    {
        ALOGW("ArmnnThreadPoolCallback::Notify EnqueueWorkload failed");
        m_CallbackContext.callback(
                V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel::ExecuteGraph");
        return;
    }

    if (m_CallbackContext.ctx.measureTimings == V1_2::MeasureTiming::YES)
    {
        driverEnd = std::chrono::steady_clock::now();
        V1_2::Timing timing;
        timing.timeOnDevice = MicrosecondsDuration(timeTaken.second, timeTaken.first);
        timing.timeInDriver = MicrosecondsDuration(driverEnd, m_CallbackContext.ctx.driverStart);
        ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu",
              static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
        m_CallbackContext.callback(
                V1_0::ErrorStatus::NONE, m_OutputShapes, timing, "ArmnnPreparedModel_1_2::ExecuteGraph");
    } else {
        m_CallbackContext.callback(
                V1_0::ErrorStatus::NONE, m_OutputShapes, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
    }
    return;
}

#if defined(ARMNN_ANDROID_NN_V1_2) || defined(ARMNN_ANDROID_NN_V1_3)
template class ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>;
template bool ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ExecuteGraph<CallbackContext_1_2>(
        std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
        armnn::InputTensors& pInputTensors,
        armnn::OutputTensors& pOutputTensors,
        CallbackContext_1_2 cb);

template void ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ScheduleGraphForExecution<CallbackContext_1_2>(
                std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
                std::shared_ptr<armnn::InputTensors>& inputTensors,
                std::shared_ptr<armnn::OutputTensors>& outputTensors,
                CallbackContext_1_2 callbackContext);
#endif

} // namespace armnn_driver