1 //
2 // Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5
6 #include "LoadedNetwork.hpp"
7 #include "Layer.hpp"
8 #include "Graph.hpp"
9 #include "Profiling.hpp"
10 #include "HeapProfiling.hpp"
11 #include "WorkingMemHandle.hpp"
12 #include "ExecutionData.hpp"
13
14 #include <armnn/BackendHelper.hpp>
15 #include <armnn/BackendRegistry.hpp>
16 #include <armnn/Logging.hpp>
17
18 #include <armnn/backends/TensorHandle.hpp>
19 #include <armnn/backends/IBackendInternal.hpp>
20 #include <armnn/backends/IMemoryManager.hpp>
21 #include <armnn/backends/MemCopyWorkload.hpp>
22
23 #include <armnn/profiling/ArmNNProfiling.hpp>
24
25 #include <armnn/utility/Assert.hpp>
26
27 #include <backendsCommon/MemSyncWorkload.hpp>
28
29 #include <common/include/Processes.hpp>
30
31 #include <fmt/format.h>
32
33 namespace armnn
34 {
35
36 using namespace std;
37 using namespace arm::pipe;
38
39 namespace
40 {
41
42 template <typename ExceptionType>
43 std::string ToErrorMessage(const char * prefix, const ExceptionType & error)
44 {
45 std::stringstream ss;
46 ss << prefix << " " << error.what();
47 return ss.str();
48 }
49
50 void AddLayerStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
51 const Layer& layer,
52 ProfilingGuid networkGuid)
53 {
54 // Add layer to the post-optimisation network structure
55 std::string layerName = layer.GetNameStr().empty() ? "<Unnamed>" : layer.GetNameStr();
56 timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(),
57 networkGuid,
58 layerName,
59 LabelsAndEventClasses::LAYER_GUID);
60 for (auto&& input : layer.GetInputSlots())
61 {
62 const IOutputSlot* source = input.GetConnectedOutputSlot();
63 ARMNN_ASSERT(source != NULL);
64 timelineUtils->CreateConnectionRelationship(ProfilingRelationshipType::RetentionLink,
65 source->GetOwningLayerGuid(),
66 layer.GetGuid());
67 }
68 }
69
70 void AddWorkloadStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
71 std::unique_ptr<IWorkload>& workload,
72 const Layer& layer)
73 {
74 // Add workload to the post-optimisation network structure
75 timelineUtils->CreateTypedEntity(workload->GetGuid(), LabelsAndEventClasses::WORKLOAD_GUID);
76 timelineUtils->MarkEntityWithLabel(workload->GetGuid(),
77 layer.GetBackendId().Get(),
78 LabelsAndEventClasses::BACKENDID_GUID);
79
80 // Link the workload to the layer
81 timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
82 layer.GetGuid(),
83 workload->GetGuid(),
84 LabelsAndEventClasses::CHILD_GUID);
85 }
86
87 } // anonymous
88
89 /**
90 * This function performs a sanity check to ensure that the combination of input and output memory source matches the
91 * values for importEnabled and exportEnabled that were specified during optimization. During optimization the tensor
92 * handle factories are chosen based on whether import and export are enabled. If the user then specifies something
93 * incompatible here it can lead to problems.
94 *
95  * @param optimizedOptions The backend options recorded during optimization, including the ImportEnabled and ExportEnabled values.
96  * @param networkProperties The network properties supplied when loading the network, including the input and output memory sources.
97 */
98 void ValidateSourcesMatchOptimizedNetwork(std::vector<BackendOptions> optimizedOptions,
99 const INetworkProperties& networkProperties)
100 {
101 // Find the "Global" backend options. During the optimize phase the values of importEnabled and exportEnabled are
102 // added as backend options.
103 const vector<BackendOptions>::iterator& backendItr =
104 find_if(optimizedOptions.begin(), optimizedOptions.end(), [](const BackendOptions& backend) {
105 if (backend.GetBackendId().Get() == "Global")
106 {
107 return true;
108 }
109 else
110 {
111 return false;
112 }
113 });
114 bool importEnabled = false;
115 bool exportEnabled = false;
116 if (backendItr != optimizedOptions.end())
117 {
118 // Find the importEnabled and exportEnabled values.
119 for (size_t i = 0; i < backendItr->GetOptionCount(); i++)
120 {
121 const BackendOptions::BackendOption& option = backendItr->GetOption(i);
122 if (option.GetName() == "ImportEnabled")
123 {
124 importEnabled = option.GetValue().AsBool();
125 }
126 if (option.GetName() == "ExportEnabled")
127 {
128 exportEnabled = option.GetValue().AsBool();
129 }
130 }
131 }
132
133 // Now that we have values for import and export compare them to the MemorySource variables.
134 // Any value of MemorySource that's not "Undefined" implies that we need to do an import of some kind.
135 if ((networkProperties.m_InputSource == MemorySource::Undefined && importEnabled) ||
136 (networkProperties.m_InputSource != MemorySource::Undefined && !importEnabled))
137 {
138 auto message = fmt::format("The input memory source specified, '{0}',", networkProperties.m_InputSource);
139 if (!importEnabled)
140 {
141 message.append(" requires that memory import be enabled. However, "
142 "it was disabled when this network was optimized.");
143 }
144 else
145 {
146 message.append(" requires that memory import be disabled. However, "
147 "it was enabled when this network was optimized.");
148 }
149 throw InvalidArgumentException(message);
150 }
151
152 if ((networkProperties.m_OutputSource == MemorySource::Undefined && exportEnabled) ||
153 (networkProperties.m_OutputSource != MemorySource::Undefined && !exportEnabled))
154 {
155 auto message = fmt::format("The output memory source specified, '{0}',", networkProperties.m_OutputSource);
156 if (!exportEnabled)
157 {
158 message.append(" requires that memory export be enabled. However, "
159 "it was disabled when this network was optimized.");
160 }
161 else
162 {
163 message.append(" requires that memory export be disabled. However, "
164 "it was enabled when this network was optimized.");
165 }
166 throw InvalidArgumentException(message);
167 }
168 }
169
170 std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
171 std::string& errorMessage,
172 const INetworkProperties& networkProperties,
173 arm::pipe::IProfilingService* profilingService)
174 {
175 std::unique_ptr<LoadedNetwork> loadedNetwork;
176
177 auto Fail = [&](const std::exception& error) -> std::unique_ptr<LoadedNetwork>
178 {
179 errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error);
180 ARMNN_LOG(error) << errorMessage;
181
182 return std::unique_ptr<LoadedNetwork>();
183 };
184
185 try
186 {
187 loadedNetwork.reset(new LoadedNetwork(std::move(net), networkProperties, profilingService));
188 }
189 catch (const armnn::RuntimeException& error)
190 {
191 return Fail(error);
192 }
193 catch (const armnn::Exception& error)
194 {
195 return Fail(error);
196 }
197 catch (const std::runtime_error& error)
198 {
199 return Fail(error);
200 }
201
202 return loadedNetwork;
203 }
204
205 LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
206 const INetworkProperties& networkProperties,
207 arm::pipe::IProfilingService* profilingService) :
208 m_OptimizedNetwork(std::move(net)),
209 m_NetworkProperties(networkProperties),
210 m_TensorHandleFactoryRegistry(),
211 m_ProfilingService(profilingService)
212 {
213 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadedNetwork");
214 // Get the profiler and register it for the current thread.
215 const std::shared_ptr<IProfiler>& profiler = m_OptimizedNetwork->GetProfiler();
216 ProfilerManager::GetInstance().RegisterProfiler(profiler.get());
217
218 profiler->EnableProfiling(networkProperties.m_ProfilingEnabled);
219
220 profiler->EnableNetworkDetailsToStdOut(networkProperties.m_OutputNetworkDetailsMethod);
221
222 // We need to check that the memory sources match up with the values of import and export specified during the
223 // optimize phase. If they don't this will throw an exception.
224 ValidateSourcesMatchOptimizedNetwork(m_OptimizedNetwork.get()->pOptimizedNetworkImpl->GetModelOptions(),
225 m_NetworkProperties);
226
227 // First create tensor handlers, backends and workload factories.
228 // Handlers are created before workloads because workload creation
229 // can modify some of the handlers, for example those belonging to
230 // the splitter and concat layers.
231
232 bool useExternalMemoryManager = false;
233 bool useInternalMemoryManager = false;
234 Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
235 // Ensure Topological order
236 order.SetLayersOutOfOrder();
237 order.TopologicalSort();
238
239 if (!networkProperties.m_AsyncEnabled)
240 {
241 m_IsInputImported = std::vector<bool>(order.GetNumInputs(), false);
242 m_IsOutputImported = std::vector<bool>(order.GetNumOutputs(), false);
243 }
244
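// Create a backend instance and a matching workload factory for each backend referenced in the graph,
// checking the capabilities needed for async execution where applicable.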
245 for (auto&& layer : order)
246 {
247 auto const& backendId = layer->GetBackendId();
248 if (m_Backends.count(backendId) == 0)
249 {
250 auto createBackend = BackendRegistryInstance().GetFactory(backendId);
251 auto it = m_Backends.emplace(std::make_pair(backendId, createBackend()));
252
253 IBackendInternal* backend = it.first->second.get();
254
255 // If we're doing async execution verify that the backend supports it and ExternallyManagedMemory.
256 if (networkProperties.m_AsyncEnabled)
257 {
258 if (!HasCapability(BackendOptions::BackendOption{"AsyncExecution", true}, backend->GetCapabilities()))
259 {
260 std::string er = backend->GetId();
261 er += " does not support AsyncExecution";
262 throw BackendCapabilityException(er);
263 }
264 if (!HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
265 backend->GetCapabilities()))
266 {
267 std::string er = backend->GetId();
268 er += " does not support ExternallyManagedMemory\n";
269 er += "AsyncEnabled networks require all backends to support ExternallyManagedMemory";
270 throw BackendCapabilityException(er);
271 }
272 m_SupportsExternallyManagedMemory[backend->GetId()] = true;
273 useExternalMemoryManager = true;
274 }
275 else
276 {
277 m_SupportsExternallyManagedMemory[backend->GetId()] = false;
278 useInternalMemoryManager = true;
279 }
280
281 IBackendInternal::IWorkloadFactoryPtr workloadFactory;
282 if (backend->SupportsTensorAllocatorAPI())
283 {
284 workloadFactory = backend->CreateWorkloadFactory(
285 m_TensorHandleFactoryRegistry,
286 m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(),
287 static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
288 static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
289 }
290 else
291 {
292 m_BackendMemoryMangers.emplace_back(backend->CreateMemoryManager());
293 workloadFactory = backend->CreateWorkloadFactory(
294 m_BackendMemoryMangers.back(), m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
295 }
296 m_WorkloadFactories[backendId] = std::move(workloadFactory);
297 }
298 }
299
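// For synchronous execution, create the output tensor handles for every layer up front, disabling
// internal memory management where import/export or externally managed memory will be used.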
300 if (!networkProperties.m_AsyncEnabled)
301 {
302 for (auto&& layer : order)
303 {
304 auto& workloadFactory = GetWorkloadFactory(*layer);
305 bool supportsExternalManager = m_SupportsExternallyManagedMemory[layer->GetBackendId()];
306
307 switch (layer->GetType())
308 {
309 case LayerType::Input:
310 case LayerType::MemImport:
311 {
312 // If IsImportEnabled is true then we need to set IsMemoryManaged
313 // to false when creating TensorHandles
314 layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
315 workloadFactory,
316 !supportsExternalManager && !m_NetworkProperties.m_ImportEnabled);
317 break;
318 }
319 case LayerType::Constant:
320 {
321 layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, true);
322 break;
323 }
324 default:
325 {
326 // Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
327 // If Export is enabled disable memory management so we can export, otherwise we do a copy
328 if ((layer->GetNumOutputSlots() == 1) &&
329 (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
330 (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
331 {
332 layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
333 workloadFactory,
334 !supportsExternalManager && !m_NetworkProperties.m_ExportEnabled);
335 }
336 else
337 {
338 layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
339 workloadFactory,
340 !supportsExternalManager);
341 }
342 }
343 }
344 }
345 }
346
347 ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
348 std::unique_ptr<TimelineUtilityMethods> timelineUtils =
349 TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
350 if (timelineUtils)
351 {
352 timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
353 // Mark the network with a start of life event
354 timelineUtils->RecordEvent(networkGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
355 // and with the process ID
356 int processID = arm::pipe::GetCurrentProcessId();
357 std::stringstream ss;
358 ss << processID;
359 timelineUtils->MarkEntityWithLabel(networkGuid, ss.str(), LabelsAndEventClasses::PROCESS_ID_GUID);
360 }
361
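// Constant layer workloads that must be executed once before the first inference (synchronous case);
// populated while the workloads are created below.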
362 std::vector<IWorkload*> ConstWorkloads;
363
364 //Then create workloads.
365 {
366 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_CreateWorkloads");
367 for (auto&& layer: order)
368 {
369 if (timelineUtils)
370 {
371 // Add layer to the post-optimisation network structure
372 AddLayerStructure(timelineUtils, *layer, networkGuid);
373 }
374
375 const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer);
376
377 switch (layer->GetType())
378 {
379 case LayerType::Input:
380 case LayerType::Output:
381 {
382 // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
383 break;
384 }
385 default:
386 {
387 auto workload = layer->CreateWorkload(workloadFactory);
388
389 if (!workload)
390 {
391 const char* const layerName =
392 layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>";
393 throw InvalidArgumentException(
394 fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')",
395 layerName, static_cast<int>(layer->GetType()), layer->GetBackendId().Get()
396 ));
397 }
398
399 if (timelineUtils)
400 {
401 // Add workload to the post-optimisation network structure
402 AddWorkloadStructure(timelineUtils, workload, *layer);
403 }
404
405 // For async networks ConstantWorkloads are managed exclusively by LoadedNetwork
406 // and are separated out from the other workloads
407 if((networkProperties.m_AsyncEnabled || useExternalMemoryManager) &&
408 layer->GetType() == LayerType::Constant)
409 {
410 m_ConstantTensorHandles[layer->GetGuid()] =
411 layer->GetOutputSlot(0).GetOutputHandler().GetData();
412 m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
413 }
414 else
415 {
416 m_WorkloadQueue.push_back(std::move(workload));
417
418 if (layer->GetType() == LayerType::Constant)
419 {
420 // Place the Constant Workloads into a queue so that they can be executed first
421 ConstWorkloads.push_back(m_WorkloadQueue.back().get());
422 }
423 }
424 // release the constant data in the layer.
425 layer->ReleaseConstantData();
426 break;
427 }
428 }
429 }
430 }
431
432 // Gather information about workloads for inputs & outputs
433 if (!networkProperties.m_AsyncEnabled && m_WorkloadQueue.size() != 0)
434 {
435 const int noOfInputs = armnn::numeric_cast<int>(order.GetNumInputs());
436
437 // Get indices of all workloads connected to each input and
438 // check if they support tensor handle replacement
439 for (const BindableLayer* layer: order.GetInputLayers())
440 {
441 const auto bindingId = layer->GetBindingId();
442
443 bool supportsReplacement = true;
444
445 for (const auto inputSlot: layer->GetOutputSlot(0).GetConnections())
446 {
447 auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(inputSlot->GetOwningLayer()));
448 workloadIndex -= noOfInputs;
449
450 m_InputWorkloadSlotPairs[bindingId].emplace_back(WorkloadIndices{
451 armnn::numeric_cast<unsigned int>(workloadIndex), inputSlot->GetSlotIndex()});
452
453 auto workload = m_WorkloadQueue[m_InputWorkloadSlotPairs[bindingId].back().m_WorkloadIndex].get();
454 supportsReplacement &= workload->SupportsTensorHandleReplacement();
455 }
456
457 ITensorHandleFactory::FactoryId factoryId = layer->GetOutputSlot(0).GetTensorHandleFactoryId();
458 // Get matching import factory Id
459 ITensorHandleFactory::FactoryId importFactoryId =
460 m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
461
462 ITensorHandleFactory *importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);
463
464 if (supportsReplacement && importFactory)
465 {
466 m_PreImportedInputHandles.emplace_back(
467 bindingId, importFactory->CreateTensorHandle(layer->GetOutputSlot(0).GetTensorInfo(), false));
468 }
469 else
470 {
471 m_PreImportedInputHandles.emplace_back(bindingId, nullptr);
472 }
473 }
474
475 // Get indices of all workloads connected to each output and
476 // check if they support tensor handle replacement
477 for (const BindableLayer* layer: order.GetOutputLayers())
478 {
479 const auto bindingId = layer->GetBindingId();
480
481 const auto outputSlot = layer->GetInputSlot(0).GetConnectedOutputSlot();
482 auto& indices = m_OutputWorkloadSlotPairs[bindingId];
483
484 auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(outputSlot->GetOwningLayer()));
485 workloadIndex -= noOfInputs;
486
487 indices.m_OutputSlotIndices = WorkloadIndices{numeric_cast<unsigned int>(workloadIndex),
488 outputSlot->CalculateIndexOnOwner()};
489
490 bool supportsReplacement = true;
491 auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
492 supportsReplacement &= outputWorkload->SupportsTensorHandleReplacement();
493
494 for (auto &inputSlot: outputSlot->GetConnections())
495 {
496 if(inputSlot->GetOwningLayer().GetType() != LayerType::Output)
497 {
498 auto inWorkloadIndex = std::distance(order.begin(),
499 order.GetPosInGraph(inputSlot->GetOwningLayer()));
500 inWorkloadIndex -= noOfInputs;
501 indices.m_InputSlotIndices.emplace_back(WorkloadIndices{numeric_cast<unsigned int>(inWorkloadIndex),
502 inputSlot->GetSlotIndex()});
503 auto inputWorkload = m_WorkloadQueue[indices.m_InputSlotIndices.back().m_WorkloadIndex].get();
504 supportsReplacement &= inputWorkload->SupportsTensorHandleReplacement();
505 }
506 }
507
508 ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
509 // Get matching import factory Id
510 ITensorHandleFactory::FactoryId importFactoryId =
511 m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
512 ITensorHandleFactory *importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);
513
514 if (supportsReplacement && importFactory)
515 {
516 m_PreImportedOutputHandles.emplace_back(
517 bindingId, importFactory->CreateTensorHandle(outputSlot->GetTensorInfo(), false));
518 }
519 else
520 {
521 m_PreImportedOutputHandles.emplace_back(bindingId, nullptr);
522 }
523 }
524 }
525
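// Let each workload factory perform any finalisation now that all workloads have been created.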
526 for (auto&& workloadFactory : m_WorkloadFactories)
527 {
528 workloadFactory.second->AfterWorkloadsCreated();
529 }
530
531 if (timelineUtils)
532 {
533 // Commit to send the post-optimisation network structure
534 timelineUtils->Commit();
535 }
536
537 if (useExternalMemoryManager)
538 {
539 if (networkProperties.m_AsyncEnabled)
540 {
541 CreateMemoryProfileAsync();
542 }
543 else
544 {
545 CreateMemoryProfile();
546 }
547
548 auto backendStrategyMap = BackendRegistryInstance().GetMemoryOptimizerStrategies();
549 for (auto& backendMemoryProfile : m_MemBlockMap)
550 {
551 const BackendId& backendId = backendMemoryProfile.first;
552 if (backendStrategyMap.find(backendId) != backendStrategyMap.end())
553 {
554 m_MemBinMap[backendId] = backendStrategyMap[backendId]->Optimize(backendMemoryProfile.second);
555 }
556 else
557 {
558 m_MemBinMap[backendId] = m_ConstantStrategy->Optimize(backendMemoryProfile.second);
559 }
560 }
561
562 if (!networkProperties.m_AsyncEnabled)
563 {
564 m_ExternalMemoryManager = CreateExternalMemoryManger(m_TensorMemory);
565
566             // Sort m_TensorMemory so that its order matches m_Tensorhandles
567 std::sort(m_TensorMemory.begin(), m_TensorMemory.end(),
568 [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
569 const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
570 {
571 return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
572 });
573 }
574 }
575
576 // Now that the intermediate tensor memory has been set-up,
577 // do any post allocation configuration for each workload.
578 if (!networkProperties.m_AsyncEnabled)
579 {
580 if (useInternalMemoryManager)
581 {
582 // Set up memory.
583 m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
584 }
585
586 for (auto &workload : m_WorkloadQueue)
587 {
588 workload->PostAllocationConfigure();
589 }
590 }
591
592 if (useExternalMemoryManager)
593 {
594 if (!networkProperties.m_AsyncEnabled)
595 {
596 AllocateAndExecuteConstantWorkloads();
597 }
598 else
599 {
600 AllocateAndExecuteConstantWorkloadsAsync();
601 }
602 }
603 // If synchronous, execute all constant layer workloads
604 if (!networkProperties.m_AsyncEnabled)
605 {
606 for (auto workload: ConstWorkloads)
607 {
608 workload->Execute();
609 }
610 }
611 }
612
613 void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
614 {
615 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
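// Allocate each constant tensor handle and execute its workload once so the constant data is populated.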
616 for (auto& pair : m_ConstantWorkloads)
617 {
618 auto tensorHandle = m_ConstantTensorHandles[pair.first];
619 tensorHandle->Allocate();
620 pair.second->Execute();
621 }
622 }
623
624 void LoadedNetwork::AllocateAndExecuteConstantWorkloadsAsync()
625 {
626 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
627 Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
628 for (auto&& layer : order)
629 {
630 if (layer->GetType() == LayerType::Constant)
631 {
632 const auto& outSlot = layer->GetOutputSlots()[0];
633 const auto factoryId = outSlot.GetTensorHandleFactoryId();
634 ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
635 auto& workloadFactory = GetWorkloadFactory(*layer);
636
637 layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
638 ITensorHandle* tensorHandle = outSlot.GetOutputHandler().GetData();
639
640 m_ConstantTensorHandles[layer->GetGuid()] = tensorHandle;
641 tensorHandle->Allocate();
642
643 auto& backend = m_Backends.at(layer->GetBackendId());
644
645 WorkingMemDescriptor memDesc;
646 memDesc.m_Outputs.push_back(tensorHandle);
647
648 ExecutionData executionData = backend->CreateExecutionData(memDesc);
649 m_ConstantWorkloads[layer->GetGuid()]->ExecuteAsync(executionData);
650 }
651 }
652 }
653
654 void LoadedNetwork::SendNetworkStructure(arm::pipe::IProfilingService& profilingService)
655 {
656 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_SendNetworkStructure");
657 Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
658 ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
659
660 std::unique_ptr<TimelineUtilityMethods> timelineUtils =
661 TimelineUtilityMethods::GetTimelineUtils(profilingService);
662
663 timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
664
665 for (auto&& layer : order)
666 {
667 // Add layer to the post-optimisation network structure
668 AddLayerStructure(timelineUtils, *layer, networkGuid);
669 switch (layer->GetType())
670 {
671 case LayerType::Input:
672 case LayerType::Output:
673 {
674 // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
675 break;
676 }
677 default:
678 {
679 for (auto& workload : m_WorkloadQueue)
680 {
681 // Add workload to the post-optimisation network structure
682 AddWorkloadStructure(timelineUtils, workload, *layer);
683 }
684 break;
685 }
686 }
687 }
688 // Commit to send the post-optimisation network structure
689 timelineUtils->Commit();
690 }
691
692 ProfilingGuid LoadedNetwork::GetNetworkGuid()
693 {
694 return m_OptimizedNetwork->GetGuid();
695 }
696
697 TensorInfo LoadedNetwork::GetInputTensorInfo(LayerBindingId layerId) const
698 {
699 for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers())
700 {
701 ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot");
702 if (inputLayer->GetBindingId() == layerId)
703 {
704 return inputLayer->GetOutputSlot(0).GetTensorInfo();
705 }
706 }
707
708 throw InvalidArgumentException(fmt::format("No input layer is associated with id {}", layerId));
709 }
710
711 TensorInfo LoadedNetwork::GetOutputTensorInfo(LayerBindingId layerId) const
712 {
713 for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers())
714 {
715 ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot");
716 ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected");
717 if (outputLayer->GetBindingId() == layerId)
718 {
719 return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo();
720 }
721 }
722
723 throw InvalidArgumentException(fmt::format("No output layer is associated with id {}", layerId));
724 }
725
726 const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) const
727 {
728 const IWorkloadFactory* workloadFactory = nullptr;
729
730 auto it = m_WorkloadFactories.find(layer.GetBackendId());
731 if (it == m_WorkloadFactories.end())
732 {
733 throw RuntimeException(fmt::format("No workload factory for {0} to be used for layer: {1}",
734 layer.GetBackendId().Get(),
735 layer.GetNameStr()),
736 CHECK_LOCATION());
737 }
738
739 workloadFactory = it->second.get();
740
741 ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");
742
743 return *workloadFactory;
744 }
745
746 namespace {
747
748 // Non-copyable class owning accelerator-specific tensor data.
749 class TensorPin
750 {
751 public:
752     TensorPin(std::unique_ptr<ITensorHandle> handle, const TensorInfo& info, LayerBindingId id)
753 : m_TensorHandle(std::move(handle))
754 , m_TensorInfo(info)
755 , m_Id(id)
756 {
757 }
758
759     ITensorHandle* GetTensorHandle() const { return m_TensorHandle.get(); }
760     const TensorInfo& GetTensorInfo() const { return m_TensorInfo; }
761     LayerBindingId GetBindingId() const { return m_Id; }
762
763 private:
764 std::unique_ptr<ITensorHandle> m_TensorHandle;
765 TensorInfo m_TensorInfo;
766 LayerBindingId m_Id;
767 };
768
769 static const TensorPin& GetTensorPin(LayerBindingId id,
770 const std::vector<TensorPin>& pins,
771 char const* bindingPointDesc)
772 {
773 auto it = std::find_if(pins.begin(), pins.end(),
774 [id](const TensorPin& pin)
775 {
776 return pin.GetBindingId() == id;
777 });
778
779 if (it != pins.end())
780 {
781 return *it;
782 }
783 else
784 {
785 throw InvalidArgumentException(fmt::format("No tensor supplied for {0} {1}", bindingPointDesc, id));
786 }
787 }
788
789 // Stores data that needs to be kept accessible for the entire execution of a workload.
790 class WorkloadData
791 {
792 public:
793     WorkloadData(const InputTensors& inputTensors, const OutputTensors& outputTensors)
794 {
795 m_InputTensorPins.reserve(inputTensors.size());
796 m_OutputTensorPins.reserve(outputTensors.size());
797
798 for (auto inputTensorPair : inputTensors)
799 {
800 auto inputTensor = inputTensorPair.second;
801
802 std::unique_ptr<ITensorHandle> tensorHandle =
803 std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(),inputTensor.GetMemoryArea());
804 LayerBindingId layerId = inputTensorPair.first;
805
806 m_InputTensorPins.emplace_back(std::move(tensorHandle), inputTensor.GetInfo(), layerId);
807 }
808
809 for (auto outputTensorPair : outputTensors)
810 {
811 auto outputTensor = outputTensorPair.second;
812
813 std::unique_ptr<ITensorHandle> tensorHandle =
814 std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
815 LayerBindingId layerId = outputTensorPair.first;
816
817 m_OutputTensorPins.emplace_back(std::move(tensorHandle), outputTensor.GetInfo(), layerId);
818 }
819 }
820
821     const TensorPin& GetInputTensorPin(LayerBindingId id) const
822 {
823 return GetTensorPin(id, m_InputTensorPins, "input");
824 }
825
826     const TensorPin& GetOutputTensorPin(LayerBindingId id) const
827 {
828 return GetTensorPin(id, m_OutputTensorPins, "output");
829 }
830
831 private:
832
833 std::vector<TensorPin> m_InputTensorPins;
834 std::vector<TensorPin> m_OutputTensorPins;
835 };
836
837 }
838
839 Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
840 const OutputTensors& outputTensors,
841 std::vector<ImportedInputId> preImportedInputIds,
842 std::vector<ImportedOutputId> preImportedOutputIds)
843 {
844 const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
845
846     // A graph needs at least two layers (an input and an output) before it can be executed.
847 if (graph.GetNumLayers() < 2)
848 {
849 ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
850 return Status::Failure;
851 }
852
853 // Data that must be kept alive for the entire execution of the workload.
854 WorkloadData workloadData(inputTensors, outputTensors);
855
856     // Input tensors can be provided as parameters or pre-imported. Either way the number of
857 // tensors should match the number of inputs.
858 if (graph.GetNumInputs() != (inputTensors.size() + preImportedInputIds.size()))
859 {
860 throw InvalidArgumentException("Number of inputs provided does not match network.");
861 }
862
863 // For each input to the network, call EnqueueInput with the data passed by the user.
864 {
865 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
866 m_InputQueue.clear();
867 m_InputQueue.reserve(graph.GetNumInputs());
868
869 unsigned int inputIndex = 0;
870 unsigned int importedInputIdIndex = 0;
871 std::sort(preImportedInputIds.begin(), preImportedInputIds.end());
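// preImportedInputIds is sorted so it can be matched against the input layers in graph order.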
872 for (const BindableLayer* inputLayer : graph.GetInputLayers())
873 {
874 if (importedInputIdIndex < preImportedInputIds.size() &&
875 inputIndex == preImportedInputIds[importedInputIdIndex])
876 {
877                 // Only replace tensor handles if they have not already been replaced
878 if (!m_IsInputImported[inputIndex])
879 {
880 auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();
881
882 for (const auto& workloadInfo: m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
883 {
884 auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
885 workload->ReplaceInputTensorHandle(outputTensorHandle, workloadInfo.m_SlotIndex);
886 }
887 m_IsInputImported[inputIndex] = true;
888 }
889 importedInputIdIndex++;
890 }
891 else
892 {
893 if (m_IsInputImported[inputIndex])
894 {
895 OutputHandler& handler = const_cast<OutputHandler&>(inputLayer->GetOutputHandler(0));
896
897 for (const auto& workloadInfo: m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
898 {
899 auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
900 workload->ReplaceInputTensorHandle(handler.GetData(), workloadInfo.m_SlotIndex);
901 }
902
903 m_IsInputImported[inputIndex] = false;
904 }
905
906                 // InputTensorHandle is not imported yet, proceed to enqueue the input
907 const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
908 EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
909 }
910 inputIndex++;
911 }
912 }
913 // For each output to the network, call EnqueueOutput with the data passed by the user.
914 {
915 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
916 m_OutputQueue.clear();
917 m_OutputQueue.reserve(graph.GetNumOutputs());
918
919 if (preImportedOutputIds.size() > graph.GetNumOutputs())
920 {
921 throw InvalidArgumentException("Invalid number of preImportedOutputIds");
922 }
923
924 unsigned int outputIndex = 0;
925 unsigned int importedOutputIdIndex = 0;
926 std::sort(preImportedOutputIds.begin(), preImportedOutputIds.end());
927 for (const BindableLayer* outputLayer : graph.GetOutputLayers())
928 {
929 if (importedOutputIdIndex < preImportedOutputIds.size() &&
930 outputIndex == preImportedOutputIds[importedOutputIdIndex])
931 {
932                 // Only replace tensor handles if they have not already been replaced
933 ITensorHandle* inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();
934
935 if (!m_IsOutputImported[outputIndex])
936 {
937 const auto bindingId = outputLayer->GetBindingId();
938 const auto& indices = m_OutputWorkloadSlotPairs[bindingId];
939
940 auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
941
942 outputWorkload->ReplaceOutputTensorHandle(inputTensorHandle,
943 indices.m_OutputSlotIndices.m_SlotIndex);
944
945 for (const auto& workloadInfo: indices.m_InputSlotIndices)
946 {
947 auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
948 inputWorkload->ReplaceInputTensorHandle(inputTensorHandle, workloadInfo.m_SlotIndex);
949 }
950 m_IsOutputImported[outputIndex] = true;
951 }
952
953 ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
954 MemSyncQueueDescriptor syncDesc;
955 syncDesc.m_Inputs.push_back(inputTensorHandle);
956 WorkloadInfo info;
957 info.m_InputTensorInfos.push_back(
958 outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo());
959 auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
960 ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
961 m_OutputQueue.push_back(move(syncWorkload));
962 importedOutputIdIndex++;
963 }
964 else
965 {
966 if (m_IsOutputImported[outputIndex])
967 {
968 const auto bindingId = outputLayer->GetBindingId();
969 const auto& indices = m_OutputWorkloadSlotPairs[bindingId];
970
971 auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
972 const OutputHandler& outputHandler =
973 outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOutputHandler();
974
975 outputWorkload->ReplaceOutputTensorHandle(
976 outputHandler.GetData(), indices.m_OutputSlotIndices.m_SlotIndex);
977
978 for (const auto& workloadInfo: indices.m_InputSlotIndices)
979 {
980 auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
981 inputWorkload->ReplaceInputTensorHandle(outputHandler.GetData(), workloadInfo.m_SlotIndex);
982 }
983 m_IsOutputImported[outputIndex] = false;
984 }
985
986 const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
987                 // OutputTensorHandle is not imported yet, proceed to enqueue the output
988 EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
989 }
990 outputIndex++;
991 }
992 }
993
994 std::unique_ptr<TimelineUtilityMethods> timelineUtils =
995 TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
996 ProfilingGuid inferenceGuid = m_ProfilingService->GetNextGuid();
997 if (timelineUtils)
998 {
999 // Add inference timeline trace if profiling is enabled.
1000 ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
1001 timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
1002 timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
1003 networkGuid,
1004 inferenceGuid,
1005 LabelsAndEventClasses::EXECUTION_OF_GUID);
1006 timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
1007 }
1008
1009 bool executionSucceeded = true;
1010
1011 {
1012 if (m_ProfilingService->IsProfilingEnabled())
1013 {
1014 m_ProfilingService->IncrementCounterValue(INFERENCES_RUN);
1015 }
1016 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Execute");
1017 ARMNN_SCOPED_HEAP_PROFILING("Executing");
1018 executionSucceeded = Execute(timelineUtils, inferenceGuid);
1019 }
1020
1021 if (timelineUtils)
1022 {
1023 // Add end of life of the inference timeline if profiling is enabled.
1024 timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
1025 timelineUtils->Commit();
1026 }
1027
1028 return executionSucceeded ? Status::Success : Status::Failure;
1029 }
1030
1031 void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
1032 {
1033 if (layer.GetType() != LayerType::Input)
1034 {
1035 throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
1036 }
1037
1038 if (tensorHandle == nullptr)
1039 {
1040 throw InvalidArgumentException("EnqueueInput: tensorHandle must not be NULL");
1041 }
1042
1043 InputQueueDescriptor inputQueueDescriptor;
1044 WorkloadInfo info;
1045
1046 inputQueueDescriptor.m_Inputs.push_back(tensorHandle);
1047 info.m_InputTensorInfos.push_back(tensorInfo);
1048
1049 ARMNN_ASSERT_MSG(layer.GetNumOutputSlots() == 1, "Can only handle Input Layer with one output");
1050 const OutputHandler& handler = layer.GetOutputHandler();
1051 const TensorInfo& outputTensorInfo = handler.GetTensorInfo();
1052 ITensorHandle* outputTensorHandle = handler.GetData();
1053 ARMNN_ASSERT_MSG(outputTensorHandle != nullptr,
1054 "Data should have been allocated.");
1055 inputQueueDescriptor.m_Outputs.push_back(outputTensorHandle);
1056 info.m_OutputTensorInfos.push_back(outputTensorInfo);
1057
1058 MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
1059 bool needMemCopy = true;
1060     if (m_NetworkProperties.m_ImportEnabled)  // Try to import the input tensor
1061 {
1062 if(CheckFlag(importFlags, m_NetworkProperties.m_InputSource))
1063 {
1064 needMemCopy = false;
1065 // This assumes a CPU Tensor handle
1066 void* mem = tensorHandle->Map(false);
1067 if (outputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
1068 {
1069 tensorHandle->Unmap();
1070 return; // No need for a workload since the import has been done.
1071 }
1072 tensorHandle->Unmap();
1073 throw MemoryImportException("EnqueueInput: Memory Import failed");
1074 }
1075 }
1076 if (needMemCopy)
1077 {
1078 // Create a mem copy workload for input since we did not import
1079 std::unique_ptr<IWorkload> inputWorkload = std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);
1080
1081 ARMNN_ASSERT_MSG(inputWorkload, "No input workload created");
1082
1083 std::unique_ptr<TimelineUtilityMethods> timelineUtils =
1084 TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
1085 if (timelineUtils)
1086 {
1087 // Add Input Workload to the post-optimisation network structure
1088 AddWorkloadStructure(timelineUtils, inputWorkload, layer);
1089 timelineUtils->Commit();
1090 }
1091
1092 m_InputQueue.push_back(move(inputWorkload));
1093 }
1094 }
1095
1096 void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
1097 {
1098 if (layer.GetType() != LayerType::Output)
1099 {
1100 throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
1101 }
1102
1103 if (tensorHandle == nullptr)
1104 {
1105 throw InvalidArgumentException("EnqueueOutput: tensorHandle must not be NULL");
1106 }
1107
1108 OutputQueueDescriptor outputQueueDescriptor;
1109 WorkloadInfo info;
1110
1111 outputQueueDescriptor.m_Outputs.push_back(tensorHandle);
1112 info.m_OutputTensorInfos.push_back(tensorInfo);
1113
1114 ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");
1115
1116 // Gets the output handler from the previous node.
1117 const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();
1118
1119 const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
1120 ITensorHandle* inputTensorHandle = outputHandler.GetData();
1121 ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
1122
1123     // Try to import the output tensor.
1124 // Note: We can only import the output pointer if all of the following hold true:
1125 // a) The imported pointer is aligned sufficiently
1126 // b) The tensor has zero padding
1127 // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
1128 // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
1129 // e) m_IsExportEnabled must be set to true
1130 bool needMemCopy = true;
1131 if (m_NetworkProperties.m_ExportEnabled &&
1132 (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
1133 {
1134 if(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
1135 {
1136 MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
1137 if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
1138 {
1139 needMemCopy = false;
1140 void *mem = tensorHandle->Map(false);
1141 bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
1142 tensorHandle->Unmap();
1143
1144 if (importOk)
1145 {
1146 // Insert synchronization workload
1147 MemSyncQueueDescriptor syncDesc;
1148 syncDesc.m_Inputs.push_back(inputTensorHandle);
1149 info.m_InputTensorInfos.push_back(inputTensorInfo);
1150 auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
1151 ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
1152 m_OutputQueue.push_back(move(syncWorkload));
1153 }
1154 else
1155 {
1156 throw MemoryExportException("EnqueueOutput: Memory Export failed");
1157 }
1158 }
1159 }
1160 }
1161 if (needMemCopy)
1162 {
1163 // If we got here then we didn't export the memory, so add an output workload which performs a memcopy.
1164 outputQueueDescriptor.m_Inputs.push_back(inputTensorHandle);
1165 info.m_InputTensorInfos.push_back(inputTensorInfo);
1166
1167 std::unique_ptr<IWorkload> outputWorkload =
1168 std::make_unique<CopyMemGenericWorkload>(outputQueueDescriptor, info);
1169 ARMNN_ASSERT_MSG(outputWorkload, "No output workload created");
1170
1171 std::unique_ptr<TimelineUtilityMethods> timelineUtils =
1172 TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
1173 if (timelineUtils)
1174 {
1175 // Add Output Workload to the post-optimisation network structure
1176 AddWorkloadStructure(timelineUtils, outputWorkload, layer);
1177 timelineUtils->Commit();
1178 }
1179
1180 m_OutputQueue.push_back(move(outputWorkload));
1181 }
1182 }
1183
1184 void LoadedNetwork::AllocateWorkingMemory(
1185 #if !defined(ARMNN_DISABLE_THREADS)
1186 std::lock_guard<std::mutex>& lock
1187 #endif
1188 )
1189 {
1190 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");
1191
1192 #if !defined(ARMNN_DISABLE_THREADS)
1193     // This otherwise unused parameter ensures the function can only be called while holding a valid lock
1194 IgnoreUnused(lock);
1195 #endif
1196 if (m_IsWorkingMemAllocated)
1197 {
1198 return;
1199 }
1200
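// When an external memory manager is used, allocate its memory and import it into the tensor handles.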
1201 if (m_ExternalMemoryManager)
1202 {
1203 m_ExternalMemoryManager->Allocate();
1204
1205 for (unsigned int i = 0; i < m_TensorMemory.size(); ++i)
1206 {
1207 m_Tensorhandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
1208 }
1209 }
1210
1211 for (auto&& memoryManager : m_BackendMemoryMangers)
1212 {
1213 if (memoryManager)
1214 {
1215 memoryManager->Acquire();
1216 }
1217 }
1218 m_TensorHandleFactoryRegistry.AquireMemory();
1219 m_IsWorkingMemAllocated = true;
1220 }
1221
1222 void LoadedNetwork::FreeWorkingMemory()
1223 {
1224 #if !defined(ARMNN_DISABLE_THREADS)
1225 std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
1226 #endif
1227
1228 if (!m_IsWorkingMemAllocated)
1229 {
1230 return;
1231 }
1232
1233 if (m_ExternalMemoryManager)
1234 {
1235 m_ExternalMemoryManager->Deallocate();
1236 }
1237
1238     // Inform each memory manager to release the memory in its respective memory group
1239 for (auto&& memoryManager : m_BackendMemoryMangers)
1240 {
1241 if (memoryManager)
1242 {
1243 memoryManager->Release();
1244 }
1245 }
1246 m_TensorHandleFactoryRegistry.ReleaseMemory();
1247 m_IsWorkingMemAllocated = false;
1248 }
1249
1250 bool LoadedNetwork::Execute(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
1251 ProfilingGuid inferenceGuid)
1252 {
1253 bool success = true;
1254
1255 auto Fail = [&](const std::exception& error)
1256 {
1257 ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
1258 success = false;
1259 };
1260
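// Allocate working memory if it has not been allocated yet, then run the input, workload and output queues in order.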
1261 try
1262 {
1263 #if !defined(ARMNN_DISABLE_THREADS)
1264 std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
1265 AllocateWorkingMemory(lockGuard);
1266 #else
1267 AllocateWorkingMemory();
1268 #endif
1269
1270 ProfilingDynamicGuid workloadInferenceID(0);
1271 auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
1272 {
1273 for (auto& workload : queue)
1274 {
1275 if(timelineUtils)
1276 {
1277 workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
1278 inferenceGuid);
1279 }
1280 workload->Execute();
1281 if(timelineUtils)
1282 {
1283 timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
1284 }
1285 }
1286 };
1287
1288 ExecuteQueue(m_InputQueue);
1289 ExecuteQueue(m_WorkloadQueue);
1290 ExecuteQueue(m_OutputQueue);
1291 }
1292 catch (const RuntimeException& error)
1293 {
1294 Fail(error);
1295 }
1296 catch (const std::runtime_error& error)
1297 {
1298 Fail(error);
1299 }
1300
1301 return success;
1302 }
1303
1304 void LoadedNetwork::EnqueueInput(const ConstTensor& inputTensor, ITensorHandle* inputTensorHandle)
1305 {
1306     if (m_NetworkProperties.m_ImportEnabled)  // Try to import the input tensor
1307 {
1308 MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
1309 if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource) )
1310 {
1311 std::unique_ptr<ITensorHandle> tensorHandle =
1312 std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(),
1313 inputTensor.GetMemoryArea());
1314 void* mem = tensorHandle->Map(false);
1315
1316 if (inputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
1317 {
1318 tensorHandle->Unmap();
1319 return;
1320 }
1321 tensorHandle->Unmap();
1322 throw MemoryImportException("EnqueueInput: Memory Import failed");
1323 }
1324 else
1325 {
1326 throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import");
1327 }
1328 }
1329 else
1330 {
1331 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyInput");
1332 std::unique_ptr<ITensorHandle> tensorHandle =
1333 std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
1334
1335 auto copyFunc = [](void* dst, const void* src, size_t size)
1336 {
1337 memcpy(dst, src, size);
1338 };
1339
1340 CopyTensorContentsGeneric(tensorHandle.get(), inputTensorHandle, copyFunc);
1341 }
1342 }
1343
1344 // Note: We can only import the output pointer if all of the following hold true:
1345 // a) The imported pointer is aligned sufficiently
1346 // b) The tensor has zero padding
1347 // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
1348 // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
1349 // e) m_IsExportEnabled must be set to true
1350 void LoadedNetwork::ImportOutputTensor(const Tensor& outputTensor, ITensorHandle* outputTensorHandle)
1351 {
1352 ARMNN_ASSERT_MSG(outputTensorHandle != nullptr, "Data should have been allocated.");
1353 MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
1354 if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
1355 {
1356 std::unique_ptr<ITensorHandle> tensorHandle =
1357 std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
1358 outputTensor.GetMemoryArea());
1359
1360 void* mem = tensorHandle->Map(false);
1361 bool importOk = outputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
1362 tensorHandle->Unmap();
1363
1364 if (!importOk)
1365 {
1366 throw MemoryExportException("ImportOutputTensor: Memory Export failed");
1367 }
1368 }
1369 else
1370 {
1371 throw MemoryExportException("ImportOutputTensor: Memory Export failed, attempting to export Input Layer");
1372 }
1373
1374 }
1375
1376 void CopyToOutputTensor(const Tensor& outputTensor, ITensorHandle* outputTensorHandle)
1377 {
1378 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyOutput");
1379 auto copyFunc = [](void* dst, const void* src, size_t size)
1380 {
1381 memcpy(dst, src, size);
1382 };
1383
1384 std::unique_ptr<ITensorHandle> tensorHandle =
1385 std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
1386 outputTensor.GetMemoryArea());
1387
1388 CopyTensorContentsGeneric(outputTensorHandle, tensorHandle.get(), copyFunc);
1389 }
1390
1391
1392 const armnn::ConstTensor GetInputTensor(const LayerBindingId layerId, const InputTensors& inputTensors)
1393 {
1394 for (auto inputTensorPair : inputTensors)
1395 {
1396 LayerBindingId id = inputTensorPair.first;
1397 if (id == layerId)
1398 {
1399 return inputTensorPair.second;
1400 }
1401 }
1402 throw InvalidArgumentException("Input does not exist.");
1403 }
1404
1405 const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTensors& outputTensors)
1406 {
1407 for (auto outputTensorPair : outputTensors)
1408 {
1409 LayerBindingId id = outputTensorPair.first;
1410 if (id == layerId)
1411 {
1412 return outputTensorPair.second;
1413 }
1414 }
1415 throw InvalidArgumentException("Output does not exist.");
1416 }
1417
1418 std::vector<ImportedInputId> LoadedNetwork::ImportInputs(const InputTensors& inputTensors,
1419 MemorySource forceImportMemorySource)
1420 {
1421 if (!m_NetworkProperties.m_AsyncEnabled)
1422 {
1423 // Cannot import if import is not enabled and forceImportMemorySource is undefined
1424 if (forceImportMemorySource == MemorySource::Undefined)
1425 {
1426 throw MemoryImportException("ImportInputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
1427 }
1428 // The number of pre imported tensors should not exceed the number of inputs.
1429 if (inputTensors.size() > m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumInputs())
1430 {
1431 throw MemoryImportException("ImportInputs: The number of tensors provided exceeds the number of inputs.");
1432 }
1433
1434 std::vector<ImportedInputId> importedInputs;
1435 Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
1436 unsigned int inputIndex = 0;
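// Try to import each supplied tensor into its matching pre-imported input handle; tensors that cannot
// be imported are skipped and are not added to the returned list of ImportedInputIds.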
1437 for (const BindableLayer* inputLayer : graph.GetInputLayers())
1438 {
1439 auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();
1440
1441 if (!outputTensorHandle)
1442 {
1443 inputIndex++;
1444 continue;
1445 }
1446
1447 auto layerBindingId = inputLayer->GetBindingId();
1448 auto it = std::find_if(inputTensors.begin(), inputTensors.end(), [=](const auto& inputTensor)
1449 {
1450 return inputTensor.first == layerBindingId;
1451 });
1452
1453 if (it == inputTensors.end())
1454 {
1455 inputIndex++;
1456 continue;
1457 }
1458
1459 const auto& inputTensor = *it;
1460 std::unique_ptr<ITensorHandle> passThroughTensorHandle =
1461 std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
1462 inputTensor.second.GetMemoryArea());
1463
1464 try
1465 {
1466 if (outputTensorHandle->CanBeImported(passThroughTensorHandle->Map(), forceImportMemorySource)
1467 && (outputTensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource)))
1468 {
1469 importedInputs.push_back(inputIndex);
1470 }
1471 passThroughTensorHandle->Unmap();
1472 }
1473 catch(const MemoryImportException& exception)
1474 {
1475 ARMNN_LOG(error) << "An error occurred attempting to import input_"
1476 << inputIndex << " : " << exception.what();
1477 passThroughTensorHandle->Unmap();
1478 }
1479 inputIndex++;
1480 }
1481
1482 return importedInputs;
1483 }
1484 else
1485 {
1486         // Import path used when the network was loaded with async execution enabled
1487 std::vector<ImportedInputId> importedInputs;
1488 Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
1489
1490 for (auto inputTensor : inputTensors)
1491 {
1492 auto layerBindingId = inputTensor.first;
1493 auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
1494 {
1495 return layer->GetBindingId() == layerBindingId;
1496 });
1497
1498 if (it == graph.GetInputLayers().end())
1499 {
1500 throw MemoryImportException(fmt::format(
1501 "ImportInputs: Memory Import failed, unknown LayerBindingId: {}", layerBindingId));
1502 }
1503
1504 const Layer* layer = *it;
1505 if (layer->GetType() != LayerType::Input)
1506 {
1507 throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
1508 }
1509
1510 auto& backend = m_Backends.at(layer->GetBackendId());
1511 if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
1512 {
1513 std::string er = backend->GetId();
1514 er += " does not have PreImportIOTensors capability";
1515 throw BackendCapabilityException(er);
1516 }
1517
1518 const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
1519
1520 ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
1521 const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
1522
1523 ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
1524 ARMNN_ASSERT(handleFactory);
1525
1526 ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
1527 handleFactory->CreateTensorHandle(tensorInfo, false)};
1528
1529 ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
1530
1531 if (!CheckFlag(tensorHandle->GetImportFlags(), forceImportMemorySource))
1532 {
1533 throw MemoryImportException(
1534 fmt::format("ImportInputs: Memory Import failed, backend: "
1535 "{} does not support importing from source {}"
1536                                 , factoryId, forceImportMemorySource));
1537 }
1538
1539 std::unique_ptr<ITensorHandle> passThroughTensorHandle =
1540 std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
1541 inputTensor.second.GetMemoryArea());
1542
1543 if (tensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource))
1544 {
1545 importedInputs.push_back(m_CurImportedInputId++);
1546 passThroughTensorHandle->Unmap();
1547 }
1548 else
1549 {
1550 passThroughTensorHandle->Unmap();
1551 throw MemoryImportException("ImportInputs: Memory Import failed");
1552 }
1553
1554 m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
1555 }
1556 return importedInputs;
1557 }
1558 }
1559
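// ImportOutputs mirrors ImportInputs. When the network was not loaded with pre-importing enabled, it attempts a
// one-off forced import of the caller's output buffers in place and returns the indices that could be imported;
// otherwise it creates pre-imported tensor handles whose ids can later be passed to Execute.
//
// Illustrative sketch only (assumes a LoadedNetwork pointer and outputTensors already exist and the network was
// loaded with importing enabled):
//
//     std::vector<ImportedOutputId> importedOutputIds =
//         loadedNetwork->ImportOutputs(outputTensors, MemorySource::Malloc);
//     // ... execute, then release the imports once they are no longer needed:
//     loadedNetwork->ClearImportedOutputs(importedOutputIds);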
1560 std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors& outputTensors,
1561 MemorySource forceImportMemorySource)
1562 {
1563 if (!m_NetworkProperties.m_AsyncEnabled)
1564 {
1565 // Cannot import if import is not enabled and forceImportMemorySource is undefined
1566 if (forceImportMemorySource == MemorySource::Undefined)
1567 {
1568             throw MemoryImportException("ImportOutputs: Memory Import failed, import is not enabled in the NetworkProperties and forceImportMemorySource is Undefined");
1569 }
1570         // If forceImportMemorySource is defined, try to import provided the memory is suitably aligned
1571 if (outputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumOutputs())
1572 {
1573 throw MemoryImportException("ImportOutputs: Force Import failed, incorrect number of tensors");
1574 }
1575 std::vector<ImportedOutputId> importedOutputs;
1576 Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
1577
1578 unsigned int outputIndex = 0;
1579 for (const BindableLayer* const outputLayer : graph.GetOutputLayers())
1580 {
1581 auto inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();
1582 if (!inputTensorHandle)
1583 {
1584 outputIndex++;
1585 continue;
1586 }
1587
1588 auto layerBindingId = outputLayer->GetBindingId();
1589 auto it = std::find_if(outputTensors.begin(), outputTensors.end(), [=] (const auto& outputTensor)
1590 {
1591 return outputTensor.first == layerBindingId;
1592 });
1593
1594 if (it == outputTensors.end())
1595 {
1596 outputIndex++;
1597 continue;
1598 }
1599
1600 const auto outputTensor = *it;
1601 try
1602 {
1603 // Check if the output memory can be imported
1604 if (inputTensorHandle->CanBeImported(outputTensor.second.GetMemoryArea(), forceImportMemorySource)
1605 && inputTensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
1606 {
1607 importedOutputs.push_back(outputIndex);
1608 }
1609 }
1610 catch(const MemoryImportException& exception)
1611 {
1612 ARMNN_LOG(error) << "An error occurred attempting to import output_"
1613 << outputIndex << " : " << exception.what();
1614 }
1615 outputIndex++;
1616 }
1617 return importedOutputs;
1618 }
1619
1620 std::vector<ImportedOutputId> importedOutputs;
1621 Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
1622
1623 for (const auto& outputTensor : outputTensors)
1624 {
1625 auto layerBindingId = outputTensor.first;
1626 auto it = std::find_if(graph.GetOutputLayers().begin(), graph.GetOutputLayers().end(), [=](auto* layer)
1627 {
1628 return layer->GetBindingId() == layerBindingId;
1629 });
1630
1631 if (it == graph.GetOutputLayers().end())
1632 {
1633 throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, unknown LayerBindingId: {}",
1634 layerBindingId));
1635 }
1636
1637 const Layer* layer = *it;
1638 if (layer->GetType() != LayerType::Output)
1639 {
1640             throw InvalidArgumentException("ImportOutputs: given layer is not an OutputLayer");
1641 }
1642
1643 auto& backend = m_Backends.at(layer->GetBackendId());
1644 if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
1645 {
1646 std::string er = backend->GetId();
1647 er += " does not have PreImportIOTensors capability";
1648 throw BackendCapabilityException(er);
1649 }
1650
1651 const InputSlot& inputSlot = layer->GetInputSlots()[0];
1652 ITensorHandleFactory::FactoryId factoryId = inputSlot.GetConnectedOutputSlot()->GetTensorHandleFactoryId();
1653 const TensorInfo& tensorInfo = inputSlot.GetConnectedOutputSlot()->GetTensorInfo();
1654
1655 ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
1656 ARMNN_ASSERT(handleFactory);
1657
1658 ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
1659 handleFactory->CreateTensorHandle(tensorInfo, false)};
1660
1661 ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
1662
1663 if (!CheckFlag(tensorHandle->GetImportFlags(), forceImportMemorySource))
1664 {
1665             throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, backend: "
1666 "{} does not support importing from source {}"
1667 , factoryId, forceImportMemorySource));
1668 }
1669
1670 if (tensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
1671 {
1672 importedOutputs.push_back(m_CurImportedOutputId++);
1673 }
1674 else
1675 {
1676             throw MemoryImportException("ImportOutputs: Memory Import failed");
1677 }
1678
1679 m_PreImportedOutputHandles.push_back(std::move(importedTensorHandlePin));
1680 }
1681
1682 return importedOutputs;
1683 }
1684
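// ClearImportedInputs / ClearImportedOutputs release the handles created by ImportInputs / ImportOutputs.
// Once cleared, the corresponding ImportedInputId/ImportedOutputId must not be passed to Execute again;
// doing so is reported as an already-deleted import.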
1685 void LoadedNetwork::ClearImportedInputs(const std::vector<ImportedInputId> inputIds)
1686 {
1687 for (auto id : inputIds)
1688 {
1689         if (id >= m_PreImportedInputHandles.size())
1690 {
1691 throw InvalidArgumentException(fmt::format("ClearImportedInputs::Unknown ImportedInputId: {}", id));
1692 }
1693
1694 auto& importedTensorHandle = m_PreImportedInputHandles[id].m_TensorHandle;
1695 if (!importedTensorHandle)
1696 {
1697 throw InvalidArgumentException(
1698 fmt::format("ClearImportedInputs::ImportedInput with id: {} has already been deleted", id));
1699 }
1700 // Call Unimport then destroy the tensorHandle
1701 importedTensorHandle->Unimport();
1702 importedTensorHandle = {};
1703 }
1704 }
1705
1706 void LoadedNetwork::ClearImportedOutputs(const std::vector<ImportedOutputId> outputIds)
1707 {
1708 for (auto id : outputIds)
1709 {
1710         if (id >= m_PreImportedOutputHandles.size())
1711 {
1712 throw InvalidArgumentException(fmt::format("ClearImportedOutputs::Unknown ImportedOutputId: {}", id));
1713 }
1714
1715 auto& importedTensorHandle = m_PreImportedOutputHandles[id].m_TensorHandle;
1716 if (!importedTensorHandle)
1717 {
1718 throw InvalidArgumentException(
1719 fmt::format("ClearImportedOutputs::ImportedOutput with id: {} has already been deleted", id));
1720 }
1721 // Call Unimport then destroy the tensorHandle
1722 importedTensorHandle->Unimport();
1723 importedTensorHandle = {};
1724 }
1725 }
1726
1727 Status LoadedNetwork::Execute(const InputTensors& inputTensors,
1728 const OutputTensors& outputTensors,
1729 IWorkingMemHandle& iWorkingMemHandle,
1730 std::vector<ImportedInputId> preImportedInputs,
1731 std::vector<ImportedOutputId> preImportedOutputs)
1732 {
1733 const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
1734
1735 if (inputTensors.size() + preImportedInputs.size() != graph.GetNumInputs())
1736 {
1737 if (preImportedInputs.empty())
1738 {
1739 throw InvalidArgumentException("LoadedNetwork::Execute: Number of inputs provided does not match network.");
1740 }
1741 else
1742 {
1743 throw InvalidArgumentException("LoadedNetwork::Execute: "
1744 "Number of inputs + preImportedInputs provided does not match network.");
1745 }
1746 }
1747
1748 if (outputTensors.size() + preImportedOutputs.size() != graph.GetNumOutputs())
1749 {
1750 if (preImportedOutputs.empty())
1751 {
1752 throw InvalidArgumentException("LoadedNetwork::Execute: "
1753 "Number of outputs provided does not match network.");
1754 }
1755 else
1756 {
1757 throw InvalidArgumentException("LoadedNetwork::Execute: "
1758 "Number of outputs + preImportedOutputs provided does not match network.");
1759 }
1760 }
1761
1762 WorkingMemHandle& workingMemHandle = dynamic_cast<WorkingMemHandle&>(iWorkingMemHandle);
1763 // Collect all the given LayerBindingIds and check them for duplicates and unknowns.
1764 std::vector<LayerBindingId>& bindingIds = workingMemHandle.GetBindingIdVector();
1765 unsigned int index = 0;
1766 for (auto pair : inputTensors)
1767 {
1768 bindingIds[index++] = pair.first;
1769 }
1770 for (ImportedInputId id : preImportedInputs)
1771 {
1772 bindingIds[index++] = ValidateImportedInputID(id);
1773 }
1774 for (auto pair : outputTensors)
1775 {
1776 bindingIds[index++] = pair.first;
1777 }
1778 for (ImportedOutputId id : preImportedOutputs)
1779 {
1780 bindingIds[index++] = ValidateImportedOutputID(id);
1781 }
1782
1783 workingMemHandle.ValidateBindingIds();
1784
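    // resetMemHandle restores the WorkingMemHandle's own input/output tensor handles in every connection that
    // is temporarily re-pointed at a pre-imported handle below, leaving the handle reusable whether this
    // inference succeeds or throws.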
1785 auto resetMemHandle = [&]()
1786 {
1787 for (ImportedInputId id: preImportedInputs)
1788 {
1789 const LayerBindingId layerBindingId = m_PreImportedInputHandles[id].m_LayerBindingId;
1790
1791 auto inputHandle = workingMemHandle.GetInputHandle(layerBindingId);
1792 auto inputConnections = workingMemHandle.GetInputConnections(layerBindingId);
1793 for (auto it : inputConnections)
1794 {
1795 *it = inputHandle;
1796 }
1797 }
1798
1799 for (ImportedOutputId id: preImportedOutputs)
1800 {
1801 const LayerBindingId layerBindingId = m_PreImportedOutputHandles[id].m_LayerBindingId;
1802
1803 auto outputHandle = workingMemHandle.GetOutputHandle(layerBindingId);
1804 auto outputConnections = workingMemHandle.GetOutputConnection(layerBindingId);
1805
1806 for (auto it : outputConnections)
1807 {
1808 *it = outputHandle;
1809 }
1810 }
1811 };
1812
1813 std::unique_ptr<TimelineUtilityMethods> timelineUtils =
1814 TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
1815 ProfilingGuid inferenceGuid = m_ProfilingService->GetNextGuid();
1816 if (timelineUtils)
1817 {
1818 // Add inference timeline trace if profiling is enabled.
1819 ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
1820         timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
1821 timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
1822 networkGuid,
1823 inferenceGuid,
1824 LabelsAndEventClasses::EXECUTION_OF_GUID);
1825         timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
1826 }
1827
1828 bool executionSucceeded = true;
1829
1830 if (timelineUtils)
1831 {
1832 // Add end of life of the inference timeline if profiling is enabled.
1833         timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
1834 timelineUtils->Commit();
1835 }
1836
1837 if (!workingMemHandle.IsAllocated())
1838 {
1839 workingMemHandle.Allocate();
1840 }
1841
1842 {
1843 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
1844 for (auto pair : inputTensors)
1845 {
1846 EnqueueInput(pair.second, workingMemHandle.GetInputHandle(pair.first));
1847 }
1848
1849 // Swap in the pre-imported inputs if any
1850 for (ImportedInputId id : preImportedInputs)
1851 {
1852 const ImportedTensorHandlePin& importedInputPin = m_PreImportedInputHandles[id];
1853 const LayerBindingId layerBindingId = m_PreImportedInputHandles[id].m_LayerBindingId;
1854 const auto& preimportedHandle = importedInputPin.m_TensorHandle;
1855
1856 auto inputConnections = workingMemHandle.GetInputConnections(layerBindingId);
1857 for (auto it : inputConnections)
1858 {
1859 *it = preimportedHandle.get();
1860 }
1861 }
1862 }
1863 {
1864 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
1865 if (m_NetworkProperties.m_ExportEnabled)
1866 {
1867 for (auto pair: outputTensors)
1868 {
1869 ImportOutputTensor(pair.second, workingMemHandle.GetOutputHandle(pair.first));
1870 }
1871 }
1872
1873 for (ImportedOutputId id : preImportedOutputs)
1874 {
1875 const ImportedTensorHandlePin& importedOutputPin = m_PreImportedOutputHandles[id];
1876 const LayerBindingId layerBindingId = m_PreImportedOutputHandles[id].m_LayerBindingId;
1877 const auto& preimportedHandle = importedOutputPin.m_TensorHandle;
1878
1879 auto outputConnections = workingMemHandle.GetOutputConnection(layerBindingId);
1880 for (auto it : outputConnections)
1881 {
1882 *it = preimportedHandle.get();
1883 }
1884 }
1885 }
1886
1887 auto Fail = [&](const std::exception& error)
1888 {
1889 ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
1890 executionSucceeded = false;
1891 };
1892 ProfilingDynamicGuid workloadInferenceID(0);
1893
1894 try
1895 {
1896 for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i)
1897 {
1898 auto& workload = m_WorkloadQueue[i];
1899 if (timelineUtils)
1900 {
1901 workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
1902 inferenceGuid);
1903 }
1904
1905 workload->ExecuteAsync(workingMemHandle.GetExecutionDataAt(i).second);
1906
1907 if (timelineUtils)
1908 {
1909 timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
1910 }
1911 }
1912 }
1913 catch (const RuntimeException& error)
1914 {
1915 resetMemHandle();
1916 Fail(error);
1917 }
1918 catch (const std::runtime_error& error)
1919 {
1920 resetMemHandle();
1921 Fail(error);
1922 }
1923 catch (...)
1924 {
1925 resetMemHandle();
1926 throw;
1927 }
1928
1929 if (!m_NetworkProperties.m_ExportEnabled)
1930 {
1931 for (auto pair: outputTensors)
1932 {
1933 CopyToOutputTensor(pair.second, workingMemHandle.GetOutputHandle(pair.first));
1934 }
1935 }
1936 else
1937 {
1938 ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute");
1939 workingMemHandle.MemSyncOutputs();
1940 }
1941
1942 resetMemHandle();
1943
1944 return executionSucceeded ? Status::Success : Status::Failure;
1945 }
1946
1947 /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
1948 /// overlapped Execution by calling this function from different threads.
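// Illustrative async-execution sketch (assumes loadedNetwork, networkId, inputTensors and outputTensors already
// exist and the network was loaded with asynchronous execution enabled):
//
//     std::unique_ptr<IWorkingMemHandle> workingMemHandle = loadedNetwork->CreateWorkingMemHandle(networkId);
//     loadedNetwork->Execute(inputTensors, outputTensors, *workingMemHandle, {}, {});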
1949 std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
1950 {
1951 Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
1952
1953 // Tensors that will need to be allocated internally within armnn
1954 std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles;
1955 // Tensors that will be allocated externally by the user
1956 std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles;
1957
1958 std::vector<WorkingMemDescriptor> workingMemDescriptors;
1959 std::vector<std::pair<BackendId, ExecutionData>> executionDataVec;
1960
1961 auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot)
1962 {
1963 ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
1964 const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
1965
1966 if (factoryId == ITensorHandleFactory::LegacyFactoryId)
1967 {
1968 BackendId id = layer->GetBackendId();
1969 ARMNN_NO_DEPRECATE_WARN_BEGIN
1970 return m_WorkloadFactories.at(id)->CreateTensorHandle(tensorInfo, false);
1971 ARMNN_NO_DEPRECATE_WARN_END
1972 }
1973 else
1974 {
1975 ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
1976 ARMNN_ASSERT(handleFactory);
1977 return handleFactory->CreateTensorHandle(tensorInfo, false);
1978 }
1979 };
1980
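    // HandleInfo records, for each OutputSlot, the ITensorHandle backing it together with where that handle
    // appears in the working-memory descriptors, so handles tied to Input/Output layers can be swapped for
    // imported ones at execution time.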
1981 struct HandleInfo
1982 {
1983 ITensorHandle* m_TensorHandle;
1984
1985 bool m_IsInputLayerHandle = false;
1986 bool m_IsOutputLayerHandle = false;
1987
1988 WorkingMemHandle::InputMemDescriptorCoords m_InputMemDescriptorCoords;
1989 WorkingMemHandle::OutputMemDescriptorCoords m_OutputMemDescriptorCoords;
1990 };
1991
1992 std::unordered_map<const OutputSlot*, HandleInfo> outputToHandleInfoMap;
1993
1994 unsigned int layerIndex = 0;
1995 for (auto&& layer : order)
1996 {
1997 // Constant layers execution and management is handled during loaded network construction
1998 if (layer->GetType() == LayerType::Constant)
1999 {
2000 continue;
2001 }
2002
2003 WorkingMemDescriptor workingMemDescriptor;
2004
2005 bool isMemoryManaged = true;
2006 bool isInputLayer = false;
2007 bool isOutputLayer = false;
2008 bool isConnectedToOutputLayer = false;
2009
2010 if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::MemImport)
2011 {
2012             // Input layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors.
2013             // However, we still need to manage the tensorHandle.
2014 isInputLayer = true;
2015 isMemoryManaged = !m_NetworkProperties.m_ImportEnabled;
2016 }
2017 else if (layer->GetType() == LayerType::Output)
2018 {
2019 isOutputLayer = true;
2020 }
2021
2022 unsigned int slotIndex = 0;
2023 // Create a tensor handle for each output slot of a layer
2024 // Once we create it, we start managing its lifetime
2025 for (auto& slot : layer->GetOutputSlots())
2026 {
2027 for (unsigned int i = 0; i < slot.GetNumConnections(); ++i)
2028 {
2029 if ((slot.GetConnection(i)->GetOwningLayer().GetType() == LayerType::Output))
2030 {
2031 if (!isConnectedToOutputLayer)
2032 {
2033 isConnectedToOutputLayer = true;
2034                         // If Export is enabled, disable memory management so we can export; otherwise we do a copy
2035 isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
2036 }
2037 else
2038 {
2039 // Importing in this case would likely cause unexpected behaviour, so we disallow it.
2040 ARMNN_LOG(warning) <<
2041 fmt::format("Layer name: '{0}' guid: '{1}' has two or more OutputLayers connected to it. "
2042 "This will prevent importing on the connected OutputLayers.",
2043 layer->GetName(), layer->GetGuid());
2044 isMemoryManaged = true;
2045 }
2046 }
2047 }
2048
2049 ITensorHandle* tensorHandle;
2050 if (isMemoryManaged)
2051 {
2052 managedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
2053 tensorHandle = managedTensorHandles.back().get();
2054 }
2055 else
2056 {
2057 unmanagedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
2058 tensorHandle = unmanagedTensorHandles.back().get();
2059 }
2060
2061 workingMemDescriptor.m_Outputs.push_back(tensorHandle);
2062
2063 HandleInfo& handleInfo = outputToHandleInfoMap[&slot];
2064 handleInfo.m_TensorHandle = tensorHandle;
2065
2066 // Store the coordinates of the current layer's OutputSlot that is connected to the OutputLayer
2067 if (isConnectedToOutputLayer)
2068 {
2069 handleInfo.m_IsOutputLayerHandle = true;
2070 handleInfo.m_OutputMemDescriptorCoords.m_OutputSlotCoords = {layerIndex, slotIndex};
2071 }
2072 // Store the LayerBindingId of the InputLayer
2073 if (isInputLayer)
2074 {
2075 handleInfo.m_IsInputLayerHandle = true;
2076 LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
2077 handleInfo.m_InputMemDescriptorCoords.m_LayerBindingId = bindingId;
2078 }
2079 slotIndex++;
2080 }
2081         // Loop through the input slots in the same layer and decrement the reference counter associated
2082         // with each tensor handle we encounter.
2083         // Once it reaches zero, the lifetime of the tensor handle has ended, and we mark its memory as available
2084         // so that the next tensor handle with a non-overlapping lifetime can share its memory.
2085 for (auto& slot : layer->GetInputSlots())
2086 {
2087 ARMNN_ASSERT(slot.GetConnection());
2088 auto outputSlot = slot.GetConnectedOutputSlot();
2089 auto key = outputSlot->GetOwningLayer().GetGuid();
2090
2091 // Constant layers execution and management is handled during loaded network construction
2092 auto found = m_ConstantTensorHandles.find(key);
2093 if (found != m_ConstantTensorHandles.end())
2094 {
2095 ITensorHandle* tensorHandle = found->second;
2096 workingMemDescriptor.m_Inputs.push_back(tensorHandle);
2097
2098 // Odd case where a constant layer is connected to an output layer
2099 // We will need to create a HandleInfo to track it
2100 if (isOutputLayer)
2101 {
2102 LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
2103
2104 HandleInfo& handleInfo = outputToHandleInfoMap[outputSlot];
2105 handleInfo.m_TensorHandle = tensorHandle;
2106 handleInfo.m_IsOutputLayerHandle = true;
2107 handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId);
2108 handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0});
2109 }
2110 continue;
2111 }
2112
2113 HandleInfo& handleInfo = outputToHandleInfoMap.at(outputSlot);
2114
2115 ITensorHandle* inputTensorHandle = handleInfo.m_TensorHandle;
2116 workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
2117
2118 // Store the LayerBindingId of the OutputLayer
2119 if (isOutputLayer)
2120 {
2121 LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
2122 handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId);
2123 handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0});
2124 }
2125 // In this case the layer is not an Output Layer but shares its input tensorhandle with an OutputLayer
2126 // It will need to be updated as well, if we swap out the tensorhandle
2127 else if (handleInfo.m_IsOutputLayerHandle)
2128 {
2129 handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, slot.GetSlotIndex()});
2130 }
2131
2132 // Store the coordinates of the InputSlots connected to the InputLayer
2133 // There can be more than one InputSlot connected to an InputLayer, so we use a vector
2134 if (handleInfo.m_IsInputLayerHandle)
2135 {
2136 std::pair<LayerGuid, unsigned int> connectionLocation{layerIndex, slot.GetSlotIndex()};
2137 handleInfo.m_InputMemDescriptorCoords.m_InputSlotCoords.emplace_back(connectionLocation);
2138 }
2139 }
2140
2141 // Input/Output layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors
2142         // However, we still need to manage the tensorHandle.
2143 if (!isInputLayer)
2144 {
2145             // Default-initialise ExecutionData here, so it is added only for the layers that require execution.
2146 // The memory and data will be allocated/assigned for the void* in WorkingMemHandle::Allocate.
2147 std::pair<BackendId, ExecutionData> dataPair;
2148 dataPair.first = layer->GetBackendId();
2149
2150 executionDataVec.push_back(dataPair);
2151 workingMemDescriptors.push_back(workingMemDescriptor);
2152
2153 layerIndex++;
2154 }
2155 }
2156
2157 std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory;
2158
2159 auto externalMemoryManager = CreateExternalMemoryManger(tensorMemory);
2160
2161     // Sort tensorMemory so its order matches the outputSlot order
2162 std::sort(tensorMemory.begin(), tensorMemory.end(),
2163 [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
2164 const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
2165 {
2166 return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
2167 });
2168
2169 std::vector<WorkingMemHandle::InputMemDescriptorCoords> inputConnectionsInfo;
2170 std::vector<WorkingMemHandle::OutputMemDescriptorCoords> outputConnectionsInfo;
2171
2172 for (const auto& handleInfo: outputToHandleInfoMap)
2173 {
2174 if (handleInfo.second.m_IsOutputLayerHandle)
2175 {
2176 outputConnectionsInfo.emplace_back(handleInfo.second.m_OutputMemDescriptorCoords);
2177 }
2178
2179 if (handleInfo.second.m_IsInputLayerHandle)
2180 {
2181 inputConnectionsInfo.emplace_back(handleInfo.second.m_InputMemDescriptorCoords);
2182 }
2183 }
2184
2185 return std::make_unique<WorkingMemHandle>(networkId,
2186 inputConnectionsInfo,
2187 outputConnectionsInfo,
2188 workingMemDescriptors,
2189 std::move(externalMemoryManager),
2190 std::move(tensorMemory),
2191 std::move(managedTensorHandles),
2192 std::move(unmanagedTensorHandles),
2193 executionDataVec,
2194 &m_Backends);
2195 }
2196
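// Registers the given debug callback on every workload in the queue so intermediate tensors can be inspected
// layer by layer during execution.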
2197 void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
2198 {
2199 for (auto&& workloadPtr: m_WorkloadQueue)
2200 {
2201 workloadPtr.get()->RegisterDebugCallback(func);
2202 }
2203 }
2204
2205
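// CreateMemoryProfileAsync computes, per backend, the lifetime (first and last use, measured in topological
// timesteps) and aligned size of every intermediate tensor for the asynchronous execution path, keyed on
// OutputSlots. CreateMemoryProfile below does the same for the default path, keyed on the ITensorHandles
// already attached to the graph.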
2206 void LoadedNetwork::CreateMemoryProfileAsync()
2207 {
2208 struct PartialBlock
2209 {
2210 unsigned int m_StartOfLife;
2211 unsigned int m_Lifetime;
2212
2213 size_t m_MemSize;
2214 unsigned int m_Index;
2215
2216 BackendId m_BackendId;
2217 };
2218
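    // Round a size in bytes up to the next multiple of sizeof(float); memory blocks are planned on this
    // alignment.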
2219 auto align = [](size_t numToAlign)
2220 {
2221 const size_t alignment = sizeof(float);
2222 return ((numToAlign + alignment - 1) / alignment) * alignment;
2223 };
2224
2225 std::unordered_map<const OutputSlot*, PartialBlock> memBlockTrackerMap;
2226
2227 const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
2228 const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
2229
2230 unsigned int timestep = 0;
2231 unsigned int outputIndex = 0;
2232 Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
2233
2234 for (auto&& layer : order)
2235 {
2236 const LayerType& layerType = layer->GetType();
2237 // Don't manage memory if importing.
2238 if (layerType == LayerType::Input && inputImportingEnabled)
2239 {
2240 continue;
2241 }
2242 // Don't manage memory if importing.
2243 if (layerType == LayerType::Output && outputImportingEnabled
2244 && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
2245 {
2246 continue;
2247 }
2248         // Because Constant Layer memory cannot be shared, it must persist for the lifetime of execution;
2249         // its management is handled separately.
2250 if (layerType == LayerType::Constant)
2251 {
2252 continue;
2253 }
2254
2255 BackendId backendId = layer->GetBackendId();
2256 for (auto& outputSlot : layer->GetOutputSlots())
2257 {
2258 if (!m_SupportsExternallyManagedMemory[backendId])
2259 {
2260 continue;
2261 }
2262
2263 PartialBlock partialBlock;
2264
2265 partialBlock.m_StartOfLife = timestep;
2266
2267 size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
2268 partialBlock.m_MemSize = alignedSize;
2269 partialBlock.m_Index = outputIndex++;
2270 partialBlock.m_Lifetime = outputSlot.GetNumConnections();
2271 partialBlock.m_BackendId = backendId;
2272
2273 if (partialBlock.m_Lifetime == 0)
2274 {
2275 m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
2276 partialBlock.m_StartOfLife,
2277 partialBlock.m_MemSize,
2278 0,
2279 partialBlock.m_Index);
2280 }
2281 else
2282 {
2283 memBlockTrackerMap[&outputSlot] = partialBlock;
2284 }
2285 }
2286
2287 for (auto& inputSlot : layer->GetInputSlots())
2288 {
2289 const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
2290 const LayerType& owningLayerType = connectedInputLayer.GetType();
2291
2292 if (owningLayerType == LayerType::Constant)
2293 {
2294 continue;
2295 }
2296 if (inputImportingEnabled && owningLayerType == LayerType::Input)
2297 {
2298 continue;
2299 }
2300
2301 auto outputSlot = inputSlot.GetConnectedOutputSlot();
2302
2303 PartialBlock& partialBlock = memBlockTrackerMap.at(outputSlot);
2304
2305 auto& lifetime = partialBlock.m_Lifetime;
2306 --lifetime;
2307
2308 if (lifetime == 0)
2309 {
2310 m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
2311 timestep,
2312 partialBlock.m_MemSize,
2313 0,
2314 partialBlock.m_Index);
2315 }
2316 }
2317 ++timestep;
2318 }
2319 }
2320
2321 void LoadedNetwork::CreateMemoryProfile()
2322 {
2323 // Finds the first TensorHandle ancestor of a SubTensorHandle. If the ITensorHandle provided
2324 // is a TensorHandle, the function just returns it
2325 auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle)
2326 {
2327 ITensorHandle* ancestor = subTensorHandle;
2328 while (ancestor && ancestor->GetParent())
2329 {
2330 ancestor = ancestor->GetParent();
2331 }
2332 return ancestor;
2333 };
2334
2335 struct PartialBlock
2336 {
2337 unsigned int m_StartOfLife;
2338 unsigned int m_Lifetime;
2339
2340 size_t m_MemSize;
2341 unsigned int m_Index;
2342
2343 BackendId m_BackendId;
2344 };
2345
2346 auto align = [](size_t numToAlign)
2347 {
2348 const size_t alignment = sizeof(float);
2349 return ((numToAlign + alignment - 1) / alignment) * alignment;
2350 };
2351
2352 std::unordered_map<ITensorHandle*, PartialBlock> memBlockTrackerMap;
2353
2354 const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
2355 const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
2356
2357 unsigned int timestep = 0;
2358 unsigned int outputIndex = 0;
2359 Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
2360
2361 for (auto&& layer : order)
2362 {
2363 const LayerType& layerType = layer->GetType();
2364 // Don't manage memory if importing.
2365 if (layerType == LayerType::Input && inputImportingEnabled)
2366 {
2367 continue;
2368 }
2369 // Don't manage memory if importing.
2370 if (layerType == LayerType::Output && outputImportingEnabled
2371 && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
2372 {
2373 continue;
2374 }
2375         // Because Constant Layer memory cannot be shared, it must persist for the lifetime of execution;
2376         // its management is handled separately.
2377 if (layerType == LayerType::Constant)
2378 {
2379 continue;
2380 }
2381
2382 BackendId backendId = layer->GetBackendId();
2383 for (auto& outputSlot : layer->GetOutputSlots())
2384 {
2385 if (!m_SupportsExternallyManagedMemory[backendId])
2386 {
2387 continue;
2388 }
2389
2390 ITensorHandle* tensorHandle = outputSlot.GetOutputHandler().GetData();
2391 tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
2392
2393 if (memBlockTrackerMap.find(tensorHandle) == memBlockTrackerMap.end())
2394 {
2395 PartialBlock partialBlock;
2396
2397 partialBlock.m_StartOfLife = timestep;
2398
2399 size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
2400 partialBlock.m_MemSize = alignedSize;
2401 partialBlock.m_Index = outputIndex++;
2402 partialBlock.m_Lifetime = outputSlot.GetNumConnections();
2403 partialBlock.m_BackendId = backendId;
2404
2405 if (partialBlock.m_Lifetime == 0)
2406 {
2407 m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
2408 partialBlock.m_StartOfLife,
2409 partialBlock.m_MemSize,
2410 0,
2411 partialBlock.m_Index);
2412 }
2413 else
2414 {
2415 memBlockTrackerMap[tensorHandle] = partialBlock;
2416 }
2417 m_Tensorhandles.push_back(tensorHandle);
2418
2419 }
2420 else
2421 {
2422 memBlockTrackerMap.at(tensorHandle).m_Lifetime += outputSlot.GetNumConnections();
2423 }
2424 }
2425
2426 for (auto& inputSlot : layer->GetInputSlots())
2427 {
2428 const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
2429 const LayerType& owningLayerType = connectedInputLayer.GetType();
2430
2431 if (owningLayerType == LayerType::Constant)
2432 {
2433 continue;
2434 }
2435 if (inputImportingEnabled && owningLayerType == LayerType::Input)
2436 {
2437 continue;
2438 }
2439 if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()])
2440 {
2441 continue;
2442 }
2443
2444 auto outputSlot = inputSlot.GetConnectedOutputSlot();
2445
2446 ITensorHandle* tensorHandle = outputSlot->GetOutputHandler().GetData();
2447 tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
2448
2449 PartialBlock& partialBlock = memBlockTrackerMap.at(tensorHandle);
2450
2451 auto& lifetime = partialBlock.m_Lifetime;
2452 --lifetime;
2453
2454 if (lifetime == 0)
2455 {
2456 m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
2457 timestep,
2458 partialBlock.m_MemSize,
2459 0,
2460 partialBlock.m_Index);
2461 }
2462 }
2463 ++timestep;
2464 }
2465
2466 }
2467
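// Builds a MemoryManager from the memory bins computed by the memory profile: each backend's bins become
// BufferStorage entries, allocated through that backend's registered custom allocator when one exists (its
// default allocator otherwise), and per-block offsets are returned through tensorMemoryVec for the
// WorkingMemHandle.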
2468 std::unique_ptr<MemoryManager> LoadedNetwork::CreateExternalMemoryManger(
2469 std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>>& tensorMemoryVec)
2470 {
2471 std::unique_ptr<MemoryManager> memoryManager = std::make_unique<MemoryManager>();
2472 auto allocatorMap = BackendRegistryInstance().GetAllocators();
2473
2474 for (auto& backend : m_MemBinMap)
2475 {
2476 std::vector<BufferStorage> bufferStorageVec;
2477
2478 std::shared_ptr<ICustomAllocator> backendAllocator;
2479 if (allocatorMap.find(backend.first) != allocatorMap.end())
2480 {
2481 backendAllocator = allocatorMap[backend.first];
2482 }
2483 else
2484 {
2485 backendAllocator = m_Backends[backend.first]->GetDefaultAllocator();
2486 }
2487
2488 for (auto& memBin : backend.second)
2489 {
2490 BufferStorage bufferStorage;
2491 bufferStorage.m_BufferSize = memBin.m_MemSize;
2492 bufferStorage.m_TensorMemoryVector.reserve(memBin.m_MemBlocks.size());
2493
2494 for (auto& memBlock : memBin.m_MemBlocks)
2495 {
2496 auto tensorMemory = std::make_shared<TensorMemory>(TensorMemory{memBlock.m_Offset, memBlock.m_Index});
2497
2498 tensorMemoryVec.emplace_back(tensorMemory, backendAllocator->GetMemorySourceType());
2499 bufferStorage.m_TensorMemoryVector.emplace_back(tensorMemory);
2500 }
2501
2502 bufferStorageVec.emplace_back(std::move(bufferStorage));
2503 }
2504
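        // The trailing argument is the type alignment (in bytes) forwarded to the memory manager; 4 is assumed
        // to match the sizeof(float) alignment used when the blocks were sized.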
2505 memoryManager->StoreMemToAllocate(bufferStorageVec, backendAllocator, 4);
2506 }
2507
2508 return memoryManager;
2509 }
2510
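// ValidateImportedInputID / ValidateImportedOutputID translate an imported id back to its LayerBindingId,
// throwing if the id is unknown or its handle has already been cleared.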
2511 LayerBindingId LoadedNetwork::ValidateImportedInputID(ImportedInputId id)
2512 {
2513 try
2514 {
2515 const auto& importedTensorHandlePin = m_PreImportedInputHandles.at(id);
2516 if (!importedTensorHandlePin.m_TensorHandle)
2517 {
2518 throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute:"
2519 "PreImportedInput: {} has been deleted", id));
2520 }
2521 return importedTensorHandlePin.m_LayerBindingId;
2522 }
2523 catch (const std::out_of_range&)
2524 {
2525 throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: Unknown ImportedInputId: {}", id));
2526 }
2527 }
2528
2529 LayerBindingId LoadedNetwork::ValidateImportedOutputID(ImportedOutputId id)
2530 {
2531 try
2532 {
2533 const auto& importedTensorHandlePin = m_PreImportedOutputHandles.at(id);
2534 if (!importedTensorHandlePin.m_TensorHandle)
2535 {
2536 throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: "
2537 "PreImportedOutput: {} has been deleted", id));
2538 }
2539 return importedTensorHandlePin.m_LayerBindingId;
2540 }
2541 catch (const std::out_of_range&)
2542 {
2543 throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: Unknown ImportedOutputId: {}", id));
2544 }
2545 }
2546
2547 }
2548