//
// Copyright © 2020-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
// Note: the ArmnnFencedExecutionCallback and the code snippet in the executeFenced() function
// in this file are based on Android code
// under the Apache 2.0 license. See comments below for details.
//

#define LOG_TAG "ArmnnDriver"

#include "ArmnnPreparedModel_1_3.hpp"
#include "Utils.hpp"

#include <armnn/Types.hpp>

#include <Utils.h>
#include <android/sync.h>
#include <log/log.h>
#include <OperationsUtils.h>
#include <ExecutionBurstServer.h>
#include <ValidateHal.h>

#include <chrono>
#include <cinttypes>

#ifdef ARMNN_ANDROID_S
#include <LegacyUtils.h>
#endif

using namespace android;
using namespace android::hardware;

namespace {

static const V1_2::Timing g_NoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
using namespace armnn_driver;
using TimePoint = std::chrono::steady_clock::time_point;

TimePoint Now()
{
    return std::chrono::steady_clock::now();
}

unsigned long MicrosecondsDuration(TimePoint endPoint, TimePoint startPoint)
{
    return static_cast<unsigned long>(std::chrono::duration_cast<std::chrono::microseconds>(
                                      endPoint - startPoint).count());
}

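// Overloads of NotifyCallbackAndCheck for each HAL callback version. The V1_0 callback cannot
// report output shapes or timing, so those parameters are accepted but intentionally unnamed.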
void NotifyCallbackAndCheck(const ::android::sp<V1_0::IExecutionCallback>& callback,
                            V1_3::ErrorStatus errorStatus,
                            std::vector<V1_2::OutputShape>,
                            const V1_2::Timing,
                            std::string callingFunction)
{
    Return<void> returned = callback->notify(convertToV1_0(errorStatus));
    // This check is required; if the callback fails and isn't checked it will bring down the service.
    if (!returned.isOk())
    {
        ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
              callingFunction.c_str(), returned.description().c_str());
    }
}

void NotifyCallbackAndCheck(const ::android::sp<V1_2::IExecutionCallback>& callback,
                            V1_3::ErrorStatus errorStatus,
                            std::vector<V1_2::OutputShape> outputShapes,
                            const V1_2::Timing timing,
                            std::string callingFunction)
{
    Return<void> returned = callback->notify_1_2(convertToV1_0(errorStatus), outputShapes, timing);
    // This check is required; if the callback fails and isn't checked it will bring down the service.
    if (!returned.isOk())
    {
        ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
              callingFunction.c_str(), returned.description().c_str());
    }
}

void NotifyCallbackAndCheck(const ::android::sp<V1_3::IExecutionCallback>& callback,
                            V1_3::ErrorStatus errorStatus,
                            std::vector<V1_2::OutputShape> outputShapes,
                            const V1_2::Timing timing,
                            std::string callingFunction)
{
    Return<void> returned = callback->notify_1_3(errorStatus, outputShapes, timing);
    // This check is required; if the callback fails and isn't checked it will bring down the service.
    if (!returned.isOk())
    {
        ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
              callingFunction.c_str(), returned.description().c_str());
    }
}

bool ValidateRequestArgument(const V1_0::RequestArgument& requestArg, const armnn::TensorInfo& tensorInfo)
{
    if (requestArg.dimensions.size() != 0)
    {
        if (requestArg.dimensions.size() != tensorInfo.GetNumDimensions())
        {
            ALOGE("Mismatched dimensions (request argument: %zu, expected: %u)",
                  requestArg.dimensions.size(), tensorInfo.GetNumDimensions());
            return false;
        }

        for (unsigned int d = 0; d < tensorInfo.GetNumDimensions(); ++d)
        {
            if (requestArg.dimensions[d] != 0 && requestArg.dimensions[d] != tensorInfo.GetShape()[d])
            {
                ALOGE("Mismatched size for dimension %d (request argument: %u, expected %u)",
                      d, requestArg.dimensions[d], tensorInfo.GetShape()[d]);
                return false;
            }
        }
    }

    return true;
}

armnn::Tensor GetTensorForRequestArgument(const V1_0::RequestArgument& requestArg,
                                          const armnn::TensorInfo& tensorInfo,
                                          const std::vector<::android::nn::RunTimePoolInfo>& requestPools)
{
    if (!ValidateRequestArgument(requestArg, tensorInfo))
    {
        return armnn::Tensor();
    }

    return armnn::Tensor(tensorInfo, GetMemoryFromPool(requestArg.location, requestPools));
}

inline std::string BuildTensorName(const char* tensorNamePrefix, std::size_t index)
{
    return tensorNamePrefix + std::to_string(index);
}

} // anonymous namespace

using namespace android::hardware;

namespace armnn_driver
{

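// These static members are shared by every instance of this template: a single request thread
// services callback-based executions, and a single threadpool is shared between models when
// asynchronous model execution is enabled.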
template<typename HalVersion>
RequestThread_1_3<ArmnnPreparedModel_1_3, HalVersion, CallbackContext_1_3>
    ArmnnPreparedModel_1_3<HalVersion>::m_RequestThread;

template<typename HalVersion>
std::unique_ptr<armnn::Threadpool> ArmnnPreparedModel_1_3<HalVersion>::m_Threadpool(nullptr);

template<typename HalVersion>
template<typename TensorBindingCollection>
void ArmnnPreparedModel_1_3<HalVersion>::DumpTensorsIfRequired(char const* tensorNamePrefix,
                                                               const TensorBindingCollection& tensorBindings)
{
    if (!m_RequestInputsAndOutputsDumpDir.empty())
    {
        const std::string requestName = std::to_string(m_NetworkId) + "_" + std::to_string(m_RequestCount) + ".dump";
        for (std::size_t i = 0u; i < tensorBindings.size(); ++i)
        {
            DumpTensor(m_RequestInputsAndOutputsDumpDir,
                       requestName,
                       BuildTensorName(tensorNamePrefix, i),
                       tensorBindings[i].second);
        }
    }
}

template<typename HalVersion>
ArmnnPreparedModel_1_3<HalVersion>::ArmnnPreparedModel_1_3(armnn::NetworkId networkId,
                                                           armnn::IRuntime* runtime,
                                                           const V1_3::Model& model,
                                                           const std::string& requestInputsAndOutputsDumpDir,
                                                           const bool gpuProfilingEnabled,
                                                           V1_3::Priority priority,
                                                           const bool asyncModelExecutionEnabled,
                                                           const unsigned int numberOfThreads,
                                                           const bool importEnabled,
                                                           const bool exportEnabled)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_Model(model)
    , m_RequestCount(0)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_ModelPriority(priority)
    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
    , m_EnableImport(importEnabled)
    , m_EnableExport(exportEnabled)
    , m_PreparedFromCache(false)
{
    // Enable profiling if required.
    m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);

    if (m_AsyncModelExecutionEnabled)
    {
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i = 0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
        }

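        // Create the shared threadpool on first use; later models just register their
        // working memory handles with the existing pool.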
        if (!m_Threadpool)
        {
            m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
        }
        else
        {
            m_Threadpool->LoadMemHandles(memHandles);
        }

        m_WorkingMemHandle = memHandles.back();
    }
}

template<typename HalVersion>
ArmnnPreparedModel_1_3<HalVersion>::ArmnnPreparedModel_1_3(armnn::NetworkId networkId,
                                                           armnn::IRuntime* runtime,
                                                           const std::string& requestInputsAndOutputsDumpDir,
                                                           const bool gpuProfilingEnabled,
                                                           V1_3::Priority priority,
                                                           const bool asyncModelExecutionEnabled,
                                                           const unsigned int numberOfThreads,
                                                           const bool importEnabled,
                                                           const bool exportEnabled,
                                                           const bool preparedFromCache)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_RequestCount(0)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_ModelPriority(priority)
    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
    , m_EnableImport(importEnabled)
    , m_EnableExport(exportEnabled)
    , m_PreparedFromCache(preparedFromCache)
{
    // Enable profiling if required.
    m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);

    if (m_AsyncModelExecutionEnabled)
    {
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i = 0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
        }

        if (!m_Threadpool)
        {
            m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
        }
        else
        {
            m_Threadpool->LoadMemHandles(memHandles);
        }

        m_WorkingMemHandle = memHandles.back();
    }
}

template<typename HalVersion>
ArmnnPreparedModel_1_3<HalVersion>::~ArmnnPreparedModel_1_3()
{
    // Get a hold of the profiler used by this model.
    std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkId);
    if (profiler && m_GpuProfilingEnabled)
    {
        // Dump the profiling info to a file if required.
        DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId,
                                    profiler.get());
    }

    // Unload the network associated with this model.
    m_Runtime->UnloadNetwork(m_NetworkId);

    // Unload the network memhandles from the threadpool.
    if (m_AsyncModelExecutionEnabled)
    {
        m_Threadpool->UnloadMemHandles(m_NetworkId);
    }
}

template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::execute(
    const V1_0::Request& request,
    const ::android::sp<V1_0::IExecutionCallback>& callback)
{
    if (callback.get() == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_3::execute invalid callback passed");
        return V1_0::ErrorStatus::INVALID_ARGUMENT;
    }

    auto cb = [callback](V1_3::ErrorStatus errorStatus,
                         std::vector<V1_2::OutputShape> outputShapes,
                         const V1_2::Timing& timing,
                         std::string callingFunction)
    {
        NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
    };

    return convertToV1_0(Execute(convertToV1_3(request), V1_2::MeasureTiming::NO, cb));
}

template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::execute_1_2(
    const V1_0::Request& request,
    V1_2::MeasureTiming measureTiming,
    const sp<V1_2::IExecutionCallback>& callback)
{
    if (callback.get() == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_3::execute_1_2 invalid callback passed");
        return V1_0::ErrorStatus::INVALID_ARGUMENT;
    }

    auto cb = [callback](V1_3::ErrorStatus errorStatus,
                         std::vector<V1_2::OutputShape> outputShapes,
                         const V1_2::Timing& timing,
                         std::string callingFunction)
    {
        NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
    };

    return convertToV1_0(Execute(convertToV1_3(request), measureTiming, cb));
}

template<typename HalVersion>
Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::execute_1_3(
    const V1_3::Request& request,
    V1_2::MeasureTiming measureTiming,
    const V1_3::OptionalTimePoint&,
    const V1_3::OptionalTimeoutDuration&,
    const sp<V1_3::IExecutionCallback>& callback)
{
    if (callback.get() == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_3::execute_1_3 invalid callback passed");
        return V1_3::ErrorStatus::INVALID_ARGUMENT;
    }

    auto cb = [callback](V1_3::ErrorStatus errorStatus,
                         std::vector<V1_2::OutputShape> outputShapes,
                         const V1_2::Timing& timing,
                         std::string callingFunction)
    {
        NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
    };

    return Execute(request, measureTiming, cb);
}

/// This class is inspired by the sample implementation in Android named SampleFencedExecutionCallback.
/// The original code is licensed under Apache-2.0 and can be found at the following link:
/// https://android.googlesource.com/platform/frameworks/ml/+/master/nn/driver/sample/SampleDriver.h
class ArmnnFencedExecutionCallback : public V1_3::IFencedExecutionCallback
{
public:
    ArmnnFencedExecutionCallback(V1_3::ErrorStatus errorStatus, V1_2::Timing timing, V1_2::Timing fenceTiming)
        : m_ErrorStatus(errorStatus), m_Timing(timing), m_FenceTiming(fenceTiming) {}
    ~ArmnnFencedExecutionCallback() {}

    Return<void> getExecutionInfo(getExecutionInfo_cb callback) override
    {
        callback(m_ErrorStatus, m_Timing, m_FenceTiming);
        return Void();
    }
private:
    V1_3::ErrorStatus m_ErrorStatus;
    V1_2::Timing m_Timing;
    V1_2::Timing m_FenceTiming;
};

template<typename HalVersion>
Return<void> ArmnnPreparedModel_1_3<HalVersion>::executeFenced(const V1_3::Request& request,
                                                               const hidl_vec<hidl_handle>& fenceWaitFor,
                                                               V1_2::MeasureTiming measureTiming,
                                                               const V1_3::OptionalTimePoint& deadline,
                                                               const V1_3::OptionalTimeoutDuration& loopTimeoutDuration,
                                                               const V1_3::OptionalTimeoutDuration&,
                                                               executeFenced_cb cb)
{
    ALOGV("ArmnnPreparedModel_1_3::executeFenced(...)");
    if (cb == nullptr)
    {
        // Do not invoke a null callback; just log the error and return.
        ALOGE("ArmnnPreparedModel_1_3::executeFenced invalid callback passed");
        return Void();
    }

    if (deadline.getDiscriminator() != V1_3::OptionalTimePoint::hidl_discriminator::none)
    {
        ALOGW("ArmnnPreparedModel_1_3::executeFenced parameter deadline is set but not supported.");
    }

    if (loopTimeoutDuration.getDiscriminator() != V1_3::OptionalTimeoutDuration::hidl_discriminator::none)
    {
        ALOGW("ArmnnPreparedModel_1_3::executeFenced parameter loopTimeoutDuration is set but not supported.");
    }

    if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model, /*allowUnspecifiedOutput=*/false))
    {
        ALOGV("ArmnnPreparedModel_1_3::executeFenced outputs must be specified for fenced execution");
        cb(V1_3::ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
        return Void();
    }

    ExecutionContext_1_3 ctx;
    if (measureTiming == V1_2::MeasureTiming::YES)
    {
        ctx.measureTimings = measureTiming;
        ctx.driverStart = Now();
    }

    if (!m_PreparedFromCache)
    {
        ALOGV("ArmnnPreparedModel_1_3::executeFenced(): %s", GetModelSummary(m_Model).c_str());
    }
    m_RequestCount++;

    if (!m_RequestInputsAndOutputsDumpDir.empty())
    {
        ALOGD("Dumping inputs and outputs for request %" PRIuPTR, reinterpret_cast<std::uintptr_t>(&cb));
    }

    // This code snippet is inspired by the sample implementation in Android named SampleDriver::executeFenced()
    // function. The original code is licensed under Apache-2.0 and can be found at the following link:
    // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/driver/sample/SampleDriver.cpp
    const auto fenceSize = fenceWaitFor.size();
    for (unsigned int index = 0; index < fenceSize; ++index)
    {
        auto fenceNativeHandle = fenceWaitFor[index].getNativeHandle();
        if (!fenceNativeHandle)
        {
            cb(V1_3::ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
            return Void();
        }

        if (fenceNativeHandle->numFds != 1)
        {
            cb(V1_3::ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
            return Void();
        }

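        // Block until this fence signals; a timeout of -1 means wait indefinitely.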
        if (sync_wait(fenceNativeHandle->data[0], -1) < 0)
        {
            ALOGE("ArmnnPreparedModel_1_3::executeFenced sync fence failed.");
            cb(V1_3::ErrorStatus::GENERAL_FAILURE, hidl_handle(nullptr), nullptr);
            return Void();
        }
    }

    TimePoint fenceExecutionStart;
    if (measureTiming == V1_2::MeasureTiming::YES)
    {
        fenceExecutionStart = Now();
    }

    // Map the memory pools into shared pointers.
    // Use a shared memory pools vector on the heap, as it is passed to the request thread.
    auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();

    // Allocate the tensors on the heap, as they are passed to the request thread.
    auto inputs = std::make_shared<armnn::InputTensors>();
    auto outputs = std::make_shared<armnn::OutputTensors>();

    auto [status, outShapes, timings, message] = PrepareMemoryForIO(*inputs, *outputs, *memPools, request);
    if (status != V1_3::ErrorStatus::NONE)
    {
        cb(V1_3::ErrorStatus::INVALID_ARGUMENT, hidl_handle(nullptr), nullptr);
        return Void();
    }

    ALOGV("ArmnnPreparedModel_1_3::executeFenced(...) before ExecuteGraph");

    // Call it with a null callback for now, as we report the error status from here.
    auto nullCallback = [](V1_3::ErrorStatus, std::vector<V1_2::OutputShape>, const V1_2::Timing&, std::string) {};
    CallbackContext_1_3 cbCtx;
    cbCtx.callback = nullCallback;
    cbCtx.ctx = ctx;

    auto errorStatus = ExecuteGraph(memPools, *inputs, *outputs, cbCtx);
    if (errorStatus != V1_3::ErrorStatus::NONE)
    {
        cb(errorStatus, hidl_handle(nullptr), nullptr);
        return Void();
    }
    ALOGV("ArmnnPreparedModel_1_3::executeFenced(...) after ExecuteGraph");

    V1_2::Timing timing = g_NoTiming;
    V1_2::Timing fenceTiming = g_NoTiming;
    if (measureTiming == V1_2::MeasureTiming::YES)
    {
        fenceTiming.timeOnDevice = MicrosecondsDuration(ctx.deviceEnd, ctx.deviceStart);
        fenceTiming.timeInDriver = MicrosecondsDuration(ctx.driverEnd, fenceExecutionStart);
        ALOGV("ArmnnPreparedModel_1_3::fenceFinishExecutionTiming - Device = %lu Driver = %lu",
              static_cast<unsigned long>(fenceTiming.timeOnDevice),
              static_cast<unsigned long>(fenceTiming.timeInDriver));
    }

    sp<ArmnnFencedExecutionCallback> armnnFencedExecutionCallback =
        new ArmnnFencedExecutionCallback(V1_3::ErrorStatus::NONE, timing, fenceTiming);
    cb(V1_3::ErrorStatus::NONE, hidl_handle(nullptr), armnnFencedExecutionCallback);
    return Void();
}

template<typename HalVersion>
Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::PrepareMemoryForInputs(
    armnn::InputTensors& inputs,
    const V1_3::Request& request,
    const std::vector<android::nn::RunTimePoolInfo>& memPools)
{
    inputs.reserve(request.inputs.size());
    for (unsigned int i = 0; i < request.inputs.size(); i++)
    {
        const auto& inputArg = request.inputs[i];
        armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
        // inputs (of type InputTensors) is composed of a vector of ConstTensors.
        // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
        inputTensorInfo.SetConstant();
        auto result = ValidateRequestArgument<V1_3::ErrorStatus, V1_3::Request>(request,
                                                                                inputTensorInfo,
                                                                                inputArg,
                                                                                "input");

        if (result != V1_3::ErrorStatus::NONE)
        {
            return result;
        }

        const armnn::Tensor inputTensor = GetTensorForRequestArgument(inputArg, inputTensorInfo, memPools);

        if (inputTensor.GetMemoryArea() == nullptr)
        {
            ALOGE("Cannot execute request. Error converting request input %u to tensor", i);
            return V1_3::ErrorStatus::GENERAL_FAILURE;
        }

        inputs.emplace_back(i, inputTensor);
    }

    return V1_3::ErrorStatus::NONE;
}

template<typename HalVersion>
Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::PrepareMemoryForOutputs(
    armnn::OutputTensors& outputs,
    std::vector<V1_2::OutputShape>& outputShapes,
    const V1_3::Request& request,
    const std::vector<android::nn::RunTimePoolInfo>& memPools)
{
    outputs.reserve(request.outputs.size());
    for (unsigned int i = 0; i < request.outputs.size(); i++)
    {
        const auto& outputArg = request.outputs[i];
        armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
        auto result = ValidateRequestArgument<V1_3::ErrorStatus, V1_3::Request>(request,
                                                                                outputTensorInfo,
                                                                                outputArg,
                                                                                "output");

        if (result != V1_3::ErrorStatus::NONE)
        {
            return result;
        }

        const armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, memPools);

        if (outputTensor.GetMemoryArea() == nullptr)
        {
            ALOGE("Cannot execute request. Error converting request output %u to tensor", i);
            return V1_3::ErrorStatus::GENERAL_FAILURE;
        }
        const size_t outputSize = outputTensorInfo.GetNumBytes();

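        // Overwrite the tensor shape with any non-zero dimensions from the request so the
        // reported output shape reflects the sizes the caller supplied.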
        unsigned int count = 0;
        std::for_each(outputArg.dimensions.begin(), outputArg.dimensions.end(), [&](auto dim)
        {
            if (dim != 0)
            {
                outputTensorInfo.GetShape()[count] = dim;
            }
            else
            {
                outputTensorInfo.GetShape()[count] = outputArg.dimensions.size();
            }

            count++;
        });

        outputs.emplace_back(i, outputTensor);
        outputShapes[i] = ComputeShape(outputTensorInfo);

        if (outputArg.location.length < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_3::Execute failed outputArg.location.length (%s) < outputSize (%s)",
                  std::to_string(outputArg.location.length).c_str(), std::to_string(outputSize).c_str());
            outputShapes[i].isSufficient = false;
            return V1_3::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }

        size_t bufferSize = 0;
#if !defined(ARMNN_ANDROID_S)
        bufferSize = memPools.at(outputArg.location.poolIndex).getHidlMemory().size();
#else
        bufferSize = memPools.at(outputArg.location.poolIndex).getSize();
#endif
        if (bufferSize < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_3::Execute failed bufferSize (%s) < outputSize (%s)",
                  std::to_string(bufferSize).c_str(), std::to_string(outputSize).c_str());
            outputShapes[i].isSufficient = false;
            return V1_3::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }
    }

    return V1_3::ErrorStatus::NONE;
}

template<typename HalVersion>
std::tuple<V1_3::ErrorStatus, hidl_vec<V1_2::OutputShape>, V1_2::Timing, std::string>
ArmnnPreparedModel_1_3<HalVersion>::PrepareMemoryForIO(armnn::InputTensors& inputs,
                                                       armnn::OutputTensors& outputs,
                                                       std::vector<android::nn::RunTimePoolInfo>& memPools,
                                                       const V1_3::Request& request)
{
#if !defined(ARMNN_ANDROID_S)
    if (!setRunTimePoolInfosFromMemoryPools(&memPools, request.pools))
#else
    if (!setRunTimePoolInfosFromMemoryPools(&memPools, uncheckedConvert(request.pools)))
#endif
    {
        return {V1_3::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
    }

    // Add the inputs and outputs with their data.
    try
    {
        if (PrepareMemoryForInputs(inputs, request, memPools) != V1_3::ErrorStatus::NONE)
        {
            return {V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
        }

        std::vector<V1_2::OutputShape> outputShapes(request.outputs.size());

        auto errorStatus = PrepareMemoryForOutputs(outputs, outputShapes, request, memPools);
        if (errorStatus != V1_3::ErrorStatus::NONE)
        {
            return {errorStatus, outputShapes, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
        }
    }
    catch (armnn::Exception& e)
    {
        ALOGW("armnn::Exception caught while preparing for EnqueueWorkload: %s", e.what());
        return {V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
    }
    catch (std::exception& e)
    {
        ALOGE("std::exception caught while preparing for EnqueueWorkload: %s", e.what());
        return {V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
    }

    return {V1_3::ErrorStatus::NONE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute"};
}

template<typename HalVersion>
template<typename CallbackContext>
Return<void> ArmnnPreparedModel_1_3<HalVersion>::ExecuteSynchronously(const V1_3::Request& request,
                                                                      CallbackContext cbCtx)
{
    if (cbCtx.ctx.measureTimings == V1_2::MeasureTiming::YES)
    {
        cbCtx.ctx.driverStart = Now();
    }

    if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
    {
        ALOGE("ArmnnPreparedModel_1_3::ExecuteSynchronously invalid request model");
        cbCtx.callback(V1_3::ErrorStatus::INVALID_ARGUMENT,
                       {},
                       g_NoTiming,
                       "ArmnnPreparedModel_1_3::ExecuteSynchronously invalid request model");
        return Void();
    }

    // Map the memory pools into shared pointers.
    // Use a shared memory pools vector on the heap, as it is passed to the request thread.
    auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();

    // Allocate the tensors on the heap, as they are passed to the request thread.
    auto inputs = std::make_shared<armnn::InputTensors>();
    auto outputs = std::make_shared<armnn::OutputTensors>();

    auto [status, outputShapes, timing, message] = PrepareMemoryForIO(*inputs, *outputs, *memPools, request);
    if (status != V1_3::ErrorStatus::NONE)
    {
        cbCtx.callback(status, outputShapes, timing, message);
        return Void();
    }

    ALOGV("ArmnnPreparedModel_1_3::ExecuteSynchronously() before Execution");

    ExecuteGraph(memPools, *inputs, *outputs, cbCtx);
    return Void();
}

template<typename HalVersion>
Return<void> ArmnnPreparedModel_1_3<HalVersion>::executeSynchronously(const V1_0::Request& request,
                                                                      V1_2::MeasureTiming measureTiming,
                                                                      executeSynchronously_cb cb)
{
    if (!m_PreparedFromCache)
    {
        ALOGV("ArmnnPreparedModel_1_3::executeSynchronously(): %s", GetModelSummary(m_Model).c_str());
    }
    m_RequestCount++;

    if (cb == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_3::executeSynchronously invalid callback passed");
        return Void();
    }

    auto cbWrapper = [cb](V1_3::ErrorStatus errorStatus,
                          std::vector<V1_2::OutputShape> outputShapes,
                          const V1_2::Timing& timing,
                          std::string)
    {
        cb(convertToV1_0(errorStatus), outputShapes, timing);
    };

    CallbackContext_1_3 cbCtx;
    cbCtx.callback = cbWrapper;
    cbCtx.ctx.measureTimings = measureTiming;

    ExecuteSynchronously(convertToV1_3(request), cbCtx);
    return Void();
}

template<typename HalVersion>
Return<void> ArmnnPreparedModel_1_3<HalVersion>::executeSynchronously_1_3(
    const V1_3::Request& request,
    V1_2::MeasureTiming measureTiming,
    const V1_3::OptionalTimePoint& deadline,
    const V1_3::OptionalTimeoutDuration& loopTimeoutDuration,
    executeSynchronously_1_3_cb cb)
{
    if (!m_PreparedFromCache)
    {
        ALOGV("ArmnnPreparedModel_1_3::executeSynchronously_1_3(): %s", GetModelSummary(m_Model).c_str());
    }
    m_RequestCount++;

    if (cb == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_3::executeSynchronously_1_3 invalid callback passed");
        return Void();
    }

    if (deadline.getDiscriminator() != V1_3::OptionalTimePoint::hidl_discriminator::none)
    {
        ALOGW("ArmnnPreparedModel_1_3::executeSynchronously_1_3 parameter deadline is set but not supported.");
    }

    if (loopTimeoutDuration.getDiscriminator() != V1_3::OptionalTimeoutDuration::hidl_discriminator::none)
    {
        ALOGW(
            "ArmnnPreparedModel_1_3::executeSynchronously_1_3 parameter loopTimeoutDuration is set but not supported.");
    }

    auto cbWrapper = [cb](V1_3::ErrorStatus errorStatus,
                          std::vector<V1_2::OutputShape> outputShapes,
                          const V1_2::Timing& timing,
                          std::string)
    {
        cb(errorStatus, outputShapes, timing);
    };

    CallbackContext_1_3 cbCtx;
    cbCtx.callback = cbWrapper;
    cbCtx.ctx.measureTimings = measureTiming;

    ExecuteSynchronously(request, cbCtx);
    return Void();
}

template<typename HalVersion>
Return<void> ArmnnPreparedModel_1_3<HalVersion>::configureExecutionBurst(
    const sp<V1_2::IBurstCallback>& callback,
    const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel,
    const MQDescriptorSync<V1_2::FmqResultDatum>& resultChannel,
    V1_3::IPreparedModel::configureExecutionBurst_cb cb)
{
    ALOGV("ArmnnPreparedModel_1_3::configureExecutionBurst");
    const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(callback,
                                                                       requestChannel,
                                                                       resultChannel,
                                                                       this);

    if (burst == nullptr)
    {
        cb(V1_0::ErrorStatus::GENERAL_FAILURE, {});
    }
    else
    {
        cb(V1_0::ErrorStatus::NONE, burst);
    }
    return Void();
}

template<typename HalVersion>
template<typename CallbackContext>
Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::ExecuteGraph(
    std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
    armnn::InputTensors& inputTensors,
    armnn::OutputTensors& outputTensors,
    CallbackContext cb)
{
    ALOGV("ArmnnPreparedModel_1_3::ExecuteGraph(...)");
    // Capture the graph execution start time.
    std::chrono::time_point<std::chrono::system_clock> graphExecutionStart = std::chrono::system_clock::now();

    DumpTensorsIfRequired("Input", inputTensors);

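    // Pre-compute the shapes to report for each output from the loaded network's tensor infos.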
    std::vector<V1_2::OutputShape> outputShapes(outputTensors.size());
    for (unsigned int i = 0; i < outputTensors.size(); i++)
    {
        std::pair<int, armnn::Tensor> outputTensorPair = outputTensors[i];
        const armnn::Tensor outputTensor = outputTensorPair.second;
        const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();

        outputShapes[i] = ComputeShape(outputTensorInfo);
    }

    // Run it.
    try
    {
        if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
        {
            cb.ctx.deviceStart = Now();
        }
        armnn::Status status;
        if (m_AsyncModelExecutionEnabled)
        {
            ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph m_AsyncModelExecutionEnabled true");
            status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors);
        }
        else
        {
            ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph m_AsyncModelExecutionEnabled false");
            // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be
            // copied.
            std::vector<armnn::ImportedInputId> importedInputIds;
            if (m_EnableImport)
            {
                importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
            }
            std::vector<armnn::ImportedOutputId> importedOutputIds;
            if (m_EnableExport)
            {
                importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
            }
            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
                                                importedInputIds, importedOutputIds);
        }

        if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
        {
            cb.ctx.deviceEnd = Now();
        }
        if (status != armnn::Status::Success)
        {
            ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph EnqueueWorkload failed");
            cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
            return V1_3::ErrorStatus::GENERAL_FAILURE;
        }
    }
    catch (armnn::Exception& e)
    {
        ALOGW("armnn::Exception caught from EnqueueWorkload: %s", e.what());
        cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
        return V1_3::ErrorStatus::GENERAL_FAILURE;
    }
    catch (std::exception& e)
    {
        ALOGE("std::exception caught from EnqueueWorkload: %s", e.what());
        cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
        return V1_3::ErrorStatus::GENERAL_FAILURE;
    }

    CommitPools(*pMemPools);

    DumpTensorsIfRequired("Output", outputTensors);

    if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
    {
        cb.ctx.driverEnd = Now();
        V1_2::Timing timing;
        timing.timeOnDevice = MicrosecondsDuration(cb.ctx.deviceEnd, cb.ctx.deviceStart);
        timing.timeInDriver = MicrosecondsDuration(cb.ctx.driverEnd, cb.ctx.driverStart);
        ALOGV("ArmnnPreparedModel_1_3::execute timing - Device = %lu Driver = %lu",
              static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
        cb.callback(V1_3::ErrorStatus::NONE, outputShapes, timing, "ArmnnPreparedModel_1_3::ExecuteGraph");
    }
    else
    {
        cb.callback(V1_3::ErrorStatus::NONE, outputShapes, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
    }
    // Log the total time in this call. This is a good number to compare to that printed out by
    // RuntimeImpl::EnqueueWorkload. The difference should be the execution overhead of the driver.
    ALOGI("ArmnnPreparedModel_1_3::ExecuteGraph Execution time = %lld µs",
          std::chrono::duration_cast<std::chrono::microseconds>
          (std::chrono::system_clock::now() - graphExecutionStart).count());
    return V1_3::ErrorStatus::NONE;
}

/// Schedule the graph prepared from the request for execution.
template<typename HalVersion>
template<typename CallbackContext>
void ArmnnPreparedModel_1_3<HalVersion>::ScheduleGraphForExecution(
    std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
    std::shared_ptr<armnn::InputTensors>& inputTensors,
    std::shared_ptr<armnn::OutputTensors>& outputTensors,
    CallbackContext callbackContext,
    armnn::QosExecPriority priority)
{
    ALOGV("ArmnnPreparedModel_1_3::ScheduleGraphForExecution(...)");

    DumpTensorsIfRequired("Input", *inputTensors);

    unsigned int outputTensorSize = outputTensors.get()->size();
    std::vector<V1_2::OutputShape> outputShapes(outputTensorSize);
    for (unsigned int i = 0; i < outputTensorSize; i++)
    {
        std::pair<int, armnn::Tensor> outputTensorPair = outputTensors.get()->at(i);
        const armnn::Tensor outputTensor = outputTensorPair.second;
        const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();

        outputShapes[i] = ComputeShape(outputTensorInfo);
    }

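    // Bundle everything the callback needs to stay alive until the inference completes,
    // then hand the work to the shared threadpool at the requested priority.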
    auto tpCb = std::make_shared<
        ArmnnThreadPoolCallback_1_3<CallbackContext_1_3>>(this,
                                                          pMemPools,
                                                          outputShapes,
                                                          inputTensors,
                                                          outputTensors,
                                                          callbackContext);

    m_Threadpool->Schedule(m_NetworkId,
                           *tpCb->m_InputTensors,
                           *tpCb->m_OutputTensors,
                           priority,
                           tpCb);
    ALOGV("ArmnnPreparedModel_1_3::ScheduleGraphForExecution end");
}

template<typename HalVersion>
bool ArmnnPreparedModel_1_3<HalVersion>::ExecuteWithDummyInputs(unsigned int numInputs, unsigned int numOutputs)
{
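    // Run the network once with dummy, driver-allocated buffers for every input and output,
    // e.g. to sanity-check a network prepared from cache without a real request.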
    std::vector<std::vector<char>> storage;
    armnn::InputTensors inputTensors;
    for (unsigned int i = 0; i < numInputs; i++)
    {
        armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
        // inputTensors (of type InputTensors) is composed of a vector of ConstTensors.
        // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
        inputTensorInfo.SetConstant();

        storage.emplace_back(inputTensorInfo.GetNumBytes());
        const armnn::ConstTensor inputTensor(inputTensorInfo, storage.back().data());

        inputTensors.emplace_back(i, inputTensor);
    }

    armnn::OutputTensors outputTensors;
    for (unsigned int i = 0; i < numOutputs; i++)
    {
        const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
        storage.emplace_back(outputTensorInfo.GetNumBytes());
        const armnn::Tensor outputTensor(outputTensorInfo, storage.back().data());

        outputTensors.emplace_back(i, outputTensor);
    }

    auto nullCallback = [](V1_3::ErrorStatus, std::vector<V1_2::OutputShape>, const V1_2::Timing&, std::string) {};
    CallbackContext_1_3 callbackContext;
    callbackContext.callback = nullCallback;
    callbackContext.ctx.measureTimings = V1_2::MeasureTiming::NO;
    auto memPools = std::make_shared<std::vector<::android::nn::RunTimePoolInfo>>();

    auto errorStatus = ExecuteGraph(memPools,
                                    inputTensors,
                                    outputTensors,
                                    callbackContext);
    return errorStatus == V1_3::ErrorStatus::NONE;
}

template<typename HalVersion>
Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::Execute(const V1_3::Request& request,
                                                                      V1_2::MeasureTiming measureTiming,
                                                                      CallbackAsync_1_3 callback)
{
    ExecutionContext_1_3 ctx;
    if (measureTiming == V1_2::MeasureTiming::YES)
    {
        ctx.measureTimings = measureTiming;
        ctx.driverStart = Now();
    }

    if (!m_PreparedFromCache)
    {
        ALOGV("ArmnnPreparedModel_1_3::execute(): %s", GetModelSummary(m_Model).c_str());
    }
    m_RequestCount++;

    if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
    {
        callback(V1_3::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming, "ArmnnPreparedModel_1_3::execute");
        return V1_3::ErrorStatus::INVALID_ARGUMENT;
    }

    if (!m_RequestInputsAndOutputsDumpDir.empty())
    {
        ALOGD("Dumping inputs and outputs for request %" PRIuPTR, reinterpret_cast<std::uintptr_t>(&callback));
    }

    // Map the memory pools into shared pointers.
    // Use a shared memory pools vector on the heap, as it is passed to the request thread.
    auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();

    // Allocate the tensors on the heap, as they are passed to the request thread.
    auto inputTensors = std::make_shared<armnn::InputTensors>();
    auto outputTensors = std::make_shared<armnn::OutputTensors>();

    auto [status, outShapes, timing, message] = PrepareMemoryForIO(*inputTensors, *outputTensors,
                                                                   *memPools, request);
    if (status != V1_3::ErrorStatus::NONE)
    {
        callback(status, outShapes, timing, message);
    }

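    // Map the preparation status to the launch status returned to the runtime. An insufficient
    // output buffer has already been reported through the callback, so the launch itself succeeds.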
    switch (status)
    {
        case V1_3::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE:
            return V1_3::ErrorStatus::NONE;
        case V1_3::ErrorStatus::GENERAL_FAILURE:
            return V1_3::ErrorStatus::GENERAL_FAILURE;
        case V1_3::ErrorStatus::INVALID_ARGUMENT:
            return V1_3::ErrorStatus::INVALID_ARGUMENT;
        default:
        {}
    }
    CallbackContext_1_3 cb;
    cb.callback = callback;
    cb.ctx = ctx;
    if (m_AsyncModelExecutionEnabled)
    {
        armnn::QosExecPriority priority;

        switch (GetModelPriority())
        {
            case V1_3::Priority::LOW:
                priority = armnn::QosExecPriority::Low;
                break;
            case V1_3::Priority::MEDIUM:
                priority = armnn::QosExecPriority::Medium;
                break;
            case V1_3::Priority::HIGH:
                priority = armnn::QosExecPriority::High;
                break;
            default:
                priority = armnn::QosExecPriority::Medium;
        }

        ALOGV("ArmnnPreparedModel_1_3::execute(...) before ScheduleGraphForExecution");
        ScheduleGraphForExecution(memPools, inputTensors, outputTensors, cb, priority);
        ALOGV("ArmnnPreparedModel_1_3::execute(...) after ScheduleGraphForExecution");
        return V1_3::ErrorStatus::NONE;
    }

    ALOGV("ArmnnPreparedModel_1_3::execute(...) before PostMsg");
    // Post the request for asynchronous execution on the request thread.
    m_RequestThread.PostMsg(this, memPools, inputTensors, outputTensors, cb);
    ALOGV("ArmnnPreparedModel_1_3::execute(...) after PostMsg");
    return V1_3::ErrorStatus::NONE;
}

template<typename HalVersion>
V1_3::Priority ArmnnPreparedModel_1_3<HalVersion>::GetModelPriority()
{
    return m_ModelPriority;
}

template<typename HalVersion>
template<typename CallbackContext>
void ArmnnPreparedModel_1_3<HalVersion>::ArmnnThreadPoolCallback_1_3<CallbackContext>::Notify(
    armnn::Status status, armnn::InferenceTimingPair timeTaken)
{
    ALOGV("ArmnnPreparedModel_1_3::ArmnnThreadPoolCallback_1_3<CallbackContext>::Notify");
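    // Commit the memory pools so the output data is flushed back to the request's shared
    // memory before the runtime is notified.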
    CommitPools(*m_MemPools);

    m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors);

    if (status != armnn::Status::Success)
    {
        ALOGW("ArmnnThreadPoolCallback_1_3::Notify EnqueueWorkload failed");
        m_CallbackContext.callback(V1_3::ErrorStatus::GENERAL_FAILURE,
                                   {},
                                   g_NoTiming,
                                   "ArmnnPreparedModel_1_3::ArmnnThreadPoolCallback_1_3");
        return;
    }

    if (m_CallbackContext.ctx.measureTimings == V1_2::MeasureTiming::YES)
    {
        m_CallbackContext.ctx.deviceStart = timeTaken.first;
        m_CallbackContext.ctx.deviceEnd = timeTaken.second;
        m_CallbackContext.ctx.driverEnd = std::chrono::steady_clock::now();
        V1_2::Timing timing;
        timing.timeOnDevice = MicrosecondsDuration(m_CallbackContext.ctx.deviceEnd, m_CallbackContext.ctx.deviceStart);
        timing.timeInDriver = MicrosecondsDuration(m_CallbackContext.ctx.driverEnd, m_CallbackContext.ctx.driverStart);
        ALOGV("ArmnnPreparedModel_1_3::execute timing - Device = %lu Driver = %lu",
              static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
        m_CallbackContext.callback(
            V1_3::ErrorStatus::NONE, m_OutputShapes, timing, "ArmnnPreparedModel_1_3::ExecuteGraph");
    }
    else
    {
        m_CallbackContext.callback(
            V1_3::ErrorStatus::NONE, m_OutputShapes, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
    }
    return;
}

#ifdef ARMNN_ANDROID_NN_V1_3
template class ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>;
template Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>::ExecuteGraph<CallbackContext_1_3>(
    std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
    armnn::InputTensors& pInputTensors,
    armnn::OutputTensors& pOutputTensors,
    CallbackContext_1_3 cb);

template void ArmnnPreparedModel_1_3<hal_1_3::HalPolicy>::ScheduleGraphForExecution<CallbackContext_1_3>(
    std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
    std::shared_ptr<armnn::InputTensors>& inputTensors,
    std::shared_ptr<armnn::OutputTensors>& outputTensors,
    CallbackContext_1_3 callbackContext,
    armnn::QosExecPriority priority);
#endif

} // namespace armnn_driver