// xref: /aosp_15_r20/external/armnn/src/armnn/LoadedNetwork.cpp (revision 89c4ff92f2867872bb9e2354d150bf0c8c502810)
//
// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "LoadedNetwork.hpp"
#include "Layer.hpp"
#include "Graph.hpp"
#include "Profiling.hpp"
#include "HeapProfiling.hpp"
#include "WorkingMemHandle.hpp"
#include "ExecutionData.hpp"

#include <armnn/BackendHelper.hpp>
#include <armnn/BackendRegistry.hpp>
#include <armnn/Logging.hpp>

#include <armnn/backends/TensorHandle.hpp>
#include <armnn/backends/IBackendInternal.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <armnn/backends/MemCopyWorkload.hpp>

#include <armnn/profiling/ArmNNProfiling.hpp>

#include <armnn/utility/Assert.hpp>

#include <backendsCommon/MemSyncWorkload.hpp>

#include <common/include/Processes.hpp>

#include <fmt/format.h>

namespace armnn
{

using namespace std;
using namespace arm::pipe;

namespace
{

template <typename ExceptionType>
std::string ToErrorMessage(const char* prefix, const ExceptionType& error)
{
    std::stringstream ss;
    ss << prefix << " " << error.what();
    return ss.str();
}

void AddLayerStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                       const Layer& layer,
                       ProfilingGuid networkGuid)
{
    // Add layer to the post-optimisation network structure
    std::string layerName = layer.GetNameStr().empty() ? "<Unnamed>" : layer.GetNameStr();
    timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(),
                                               networkGuid,
                                               layerName,
                                               LabelsAndEventClasses::LAYER_GUID);
    for (auto&& input : layer.GetInputSlots())
    {
        const IOutputSlot* source = input.GetConnectedOutputSlot();
        ARMNN_ASSERT(source != nullptr);
        timelineUtils->CreateConnectionRelationship(ProfilingRelationshipType::RetentionLink,
                                                    source->GetOwningLayerGuid(),
                                                    layer.GetGuid());
    }
}

void AddWorkloadStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                          std::unique_ptr<IWorkload>& workload,
                          const Layer& layer)
{
    // Add workload to the post-optimisation network structure
    timelineUtils->CreateTypedEntity(workload->GetGuid(), LabelsAndEventClasses::WORKLOAD_GUID);
    timelineUtils->MarkEntityWithLabel(workload->GetGuid(),
                                       layer.GetBackendId().Get(),
                                       LabelsAndEventClasses::BACKENDID_GUID);

    // Link the workload to the layer
    timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                      layer.GetGuid(),
                                      workload->GetGuid(),
                                      LabelsAndEventClasses::CHILD_GUID);
}

} // anonymous

/**
 * Performs a sanity check to ensure that the combination of input and output memory sources matches the
 * values for importEnabled and exportEnabled that were specified during optimization. During optimization the tensor
 * handle factories are chosen based on whether import and export are enabled. If the user then specifies something
 * incompatible here it can lead to problems.
 *
 * @param optimizedOptions The model options recorded on the network during the optimize phase.
 * @param networkProperties The network properties supplied when loading the network.
 */
void ValidateSourcesMatchOptimizedNetwork(std::vector<BackendOptions> optimizedOptions,
                                          const INetworkProperties& networkProperties)
{
    // Find the "Global" backend options. During the optimize phase the values of importEnabled and exportEnabled are
    // added as backend options.
    const vector<BackendOptions>::iterator& backendItr =
        find_if(optimizedOptions.begin(), optimizedOptions.end(), [](const BackendOptions& backend) {
            return backend.GetBackendId().Get() == "Global";
        });
    bool importEnabled = false;
    bool exportEnabled = false;
    if (backendItr != optimizedOptions.end())
    {
        // Find the importEnabled and exportEnabled values.
        for (size_t i = 0; i < backendItr->GetOptionCount(); i++)
        {
            const BackendOptions::BackendOption& option = backendItr->GetOption(i);
            if (option.GetName() == "ImportEnabled")
            {
                importEnabled = option.GetValue().AsBool();
            }
            if (option.GetName() == "ExportEnabled")
            {
                exportEnabled = option.GetValue().AsBool();
            }
        }
    }

    // Now that we have values for import and export compare them to the MemorySource variables.
    // Any value of MemorySource that's not "Undefined" implies that we need to do an import of some kind.
    if ((networkProperties.m_InputSource == MemorySource::Undefined && importEnabled) ||
        (networkProperties.m_InputSource != MemorySource::Undefined && !importEnabled))
    {
        auto message = fmt::format("The input memory source specified, '{0}',", networkProperties.m_InputSource);
        if (!importEnabled)
        {
            message.append(" requires that memory import be enabled. However, "
                           "it was disabled when this network was optimized.");
        }
        else
        {
            message.append(" requires that memory import be disabled. However, "
                           "it was enabled when this network was optimized.");
        }
        throw InvalidArgumentException(message);
    }

    if ((networkProperties.m_OutputSource == MemorySource::Undefined && exportEnabled) ||
        (networkProperties.m_OutputSource != MemorySource::Undefined && !exportEnabled))
    {
        auto message = fmt::format("The output memory source specified, '{0}',", networkProperties.m_OutputSource);
        if (!exportEnabled)
        {
            message.append(" requires that memory export be enabled. However, "
                           "it was disabled when this network was optimized.");
        }
        else
        {
            message.append(" requires that memory export be disabled. However, "
                           "it was enabled when this network was optimized.");
        }
        throw InvalidArgumentException(message);
    }
}
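
// Illustrative example (not part of this file; the OptimizerOptions members
// shown are assumptions and may differ between ArmNN versions): a network
// optimized with import/export enabled must later be loaded with matching,
// non-Undefined memory sources.
//
//     armnn::OptimizerOptions optimizerOptions;
//     optimizerOptions.m_ImportEnabled = true;   // recorded as "ImportEnabled"
//     optimizerOptions.m_ExportEnabled = true;   // recorded as "ExportEnabled"
//     ...
//     INetworkProperties properties(/*asyncEnabled=*/false,
//                                   MemorySource::Malloc,   // m_InputSource
//                                   MemorySource::Malloc);  // m_OutputSource
//
// Loading the same network with MemorySource::Undefined (or vice versa) makes
// ValidateSourcesMatchOptimizedNetwork throw an InvalidArgumentException.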

std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                                                                std::string& errorMessage,
                                                                const INetworkProperties& networkProperties,
                                                                arm::pipe::IProfilingService* profilingService)
{
    std::unique_ptr<LoadedNetwork> loadedNetwork;

    auto Fail = [&](const std::exception& error) -> std::unique_ptr<LoadedNetwork>
    {
        errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error);
        ARMNN_LOG(error) << errorMessage;

        return std::unique_ptr<LoadedNetwork>();
    };

    try
    {
        loadedNetwork.reset(new LoadedNetwork(std::move(net), networkProperties, profilingService));
    }
    catch (const armnn::RuntimeException& error)
    {
        return Fail(error);
    }
    catch (const armnn::Exception& error)
    {
        return Fail(error);
    }
    catch (const std::runtime_error& error)
    {
        return Fail(error);
    }

    return loadedNetwork;
}
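
// Sketch of a typical call site (illustrative only; in practice this factory
// is invoked from the runtime's LoadNetwork path rather than called directly):
//
//     std::string errorMessage;
//     INetworkProperties properties(false, MemorySource::Undefined, MemorySource::Undefined);
//     std::unique_ptr<LoadedNetwork> loaded =
//         LoadedNetwork::MakeLoadedNetwork(std::move(optNet), errorMessage,
//                                          properties, &profilingService);
//     if (!loaded)
//     {
//         ARMNN_LOG(error) << errorMessage; // construction failed; reason captured above
//     }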

LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                             const INetworkProperties& networkProperties,
                             arm::pipe::IProfilingService* profilingService) :
                             m_OptimizedNetwork(std::move(net)),
                             m_NetworkProperties(networkProperties),
                             m_TensorHandleFactoryRegistry(),
                             m_ProfilingService(profilingService)
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadedNetwork");
    // Get the profiler and register it for the current thread.
    const std::shared_ptr<IProfiler>& profiler = m_OptimizedNetwork->GetProfiler();
    ProfilerManager::GetInstance().RegisterProfiler(profiler.get());

    profiler->EnableProfiling(networkProperties.m_ProfilingEnabled);

    profiler->EnableNetworkDetailsToStdOut(networkProperties.m_OutputNetworkDetailsMethod);

    // We need to check that the memory sources match up with the values of import and export specified during the
    // optimize phase. If they don't this will throw an exception.
    ValidateSourcesMatchOptimizedNetwork(m_OptimizedNetwork.get()->pOptimizedNetworkImpl->GetModelOptions(),
                                         m_NetworkProperties);

    // First create tensor handlers, backends and workload factories.
    // Handlers are created before workloads because workload creation can
    // modify some of the handlers (for example the splitter and concat layers).

    bool useExternalMemoryManager = false;
    bool useInternalMemoryManager = false;
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
    // Ensure topological order
    order.SetLayersOutOfOrder();
    order.TopologicalSort();

    if (!networkProperties.m_AsyncEnabled)
    {
        m_IsInputImported = std::vector<bool>(order.GetNumInputs(), false);
        m_IsOutputImported = std::vector<bool>(order.GetNumOutputs(), false);
    }

    for (auto&& layer : order)
    {
        auto const& backendId = layer->GetBackendId();
        if (m_Backends.count(backendId) == 0)
        {
            auto createBackend = BackendRegistryInstance().GetFactory(backendId);
            auto it = m_Backends.emplace(std::make_pair(backendId, createBackend()));

            IBackendInternal* backend = it.first->second.get();

            // If we're doing async execution verify that the backend supports it and ExternallyManagedMemory.
            if (networkProperties.m_AsyncEnabled)
            {
                if (!HasCapability(BackendOptions::BackendOption{"AsyncExecution", true}, backend->GetCapabilities()))
                {
                    std::string er = backend->GetId();
                    er += " does not support AsyncExecution";
                    throw BackendCapabilityException(er);
                }
                if (!HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
                                   backend->GetCapabilities()))
                {
                    std::string er = backend->GetId();
                    er += " does not support ExternallyManagedMemory\n";
                    er += "AsyncEnabled networks require all backends to support ExternallyManagedMemory";
                    throw BackendCapabilityException(er);
                }
                m_SupportsExternallyManagedMemory[backend->GetId()] = true;
                useExternalMemoryManager = true;
            }
            else
            {
                m_SupportsExternallyManagedMemory[backend->GetId()] = false;
                useInternalMemoryManager = true;
            }

            IBackendInternal::IWorkloadFactoryPtr workloadFactory;
            if (backend->SupportsTensorAllocatorAPI())
            {
                workloadFactory = backend->CreateWorkloadFactory(
                    m_TensorHandleFactoryRegistry,
                    m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(),
                    static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
                    static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
            }
            else
            {
                m_BackendMemoryMangers.emplace_back(backend->CreateMemoryManager());
                workloadFactory = backend->CreateWorkloadFactory(
                        m_BackendMemoryMangers.back(), m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
            }
            m_WorkloadFactories[backendId] = std::move(workloadFactory);
        }
    }

    if (!networkProperties.m_AsyncEnabled)
    {
        for (auto&& layer : order)
        {
            auto& workloadFactory = GetWorkloadFactory(*layer);
            bool supportsExternalManager = m_SupportsExternallyManagedMemory[layer->GetBackendId()];

            switch (layer->GetType())
            {
                case LayerType::Input:
                case LayerType::MemImport:
                {
                    // If IsImportEnabled is true then we need to set IsMemoryManaged
                    // to false when creating TensorHandles
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                               workloadFactory,
                                               !supportsExternalManager && !m_NetworkProperties.m_ImportEnabled);
                    break;
                }
                case LayerType::Constant:
                {
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, true);
                    break;
                }
                default:
                {
                    // Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer.
                    // If Export is enabled disable memory management so we can export, otherwise we do a copy.
                    if ((layer->GetNumOutputSlots() == 1) &&
                       (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
                       (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
                    {
                        layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                                   workloadFactory,
                                                   !supportsExternalManager && !m_NetworkProperties.m_ExportEnabled);
                    }
                    else
                    {
                        layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                                   workloadFactory,
                                                   !supportsExternalManager);
                    }
                }
            }
        }
    }

    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
        TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
    if (timelineUtils)
    {
        timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
        // Mark the network with a start of life event
        timelineUtils->RecordEvent(networkGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
        // and with the process ID
        int processID = arm::pipe::GetCurrentProcessId();
        std::stringstream ss;
        ss << processID;
        timelineUtils->MarkEntityWithLabel(networkGuid, ss.str(), LabelsAndEventClasses::PROCESS_ID_GUID);
    }

    std::vector<IWorkload*> ConstWorkloads;

    // Then create workloads.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_CreateWorkloads");
        for (auto&& layer : order)
        {
            if (timelineUtils)
            {
                // Add layer to the post-optimisation network structure
                AddLayerStructure(timelineUtils, *layer, networkGuid);
            }

            const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer);

            switch (layer->GetType())
            {
                case LayerType::Input:
                case LayerType::Output:
                {
                    // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
                    break;
                }
                default:
                {
                    auto workload = layer->CreateWorkload(workloadFactory);

                    if (!workload)
                    {
                        const char* const layerName =
                                layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>";
                        throw InvalidArgumentException(
                                fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')",
                                            layerName, static_cast<int>(layer->GetType()), layer->GetBackendId().Get()
                                ));
                    }

                    if (timelineUtils)
                    {
                        // Add workload to the post-optimisation network structure
                        AddWorkloadStructure(timelineUtils, workload, *layer);
                    }

                    // For async networks ConstantWorkloads are managed exclusively by LoadedNetwork
                    // and are separated out from the other workloads
                    if ((networkProperties.m_AsyncEnabled || useExternalMemoryManager) &&
                        layer->GetType() == LayerType::Constant)
                    {
                        m_ConstantTensorHandles[layer->GetGuid()] =
                                layer->GetOutputSlot(0).GetOutputHandler().GetData();
                        m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
                    }
                    else
                    {
                        m_WorkloadQueue.push_back(std::move(workload));

                        if (layer->GetType() == LayerType::Constant)
                        {
                            // Place the Constant Workloads into a queue so that they can be executed first
                            ConstWorkloads.push_back(m_WorkloadQueue.back().get());
                        }
                    }
                    // Release the constant data in the layer.
                    layer->ReleaseConstantData();
                    break;
                }
            }
        }
    }

    // Gather information about workloads for inputs & outputs
    if (!networkProperties.m_AsyncEnabled && m_WorkloadQueue.size() != 0)
    {
        const int noOfInputs = armnn::numeric_cast<int>(order.GetNumInputs());

        // Get indices of all workloads connected to each input and
        // check if they support tensor handle replacement
        for (const BindableLayer* layer : order.GetInputLayers())
        {
            const auto bindingId = layer->GetBindingId();

            bool supportsReplacement = true;

            for (const auto inputSlot : layer->GetOutputSlot(0).GetConnections())
            {
                auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(inputSlot->GetOwningLayer()));
                workloadIndex -= noOfInputs;

                m_InputWorkloadSlotPairs[bindingId].emplace_back(WorkloadIndices{
                        armnn::numeric_cast<unsigned int>(workloadIndex), inputSlot->GetSlotIndex()});

                auto workload = m_WorkloadQueue[m_InputWorkloadSlotPairs[bindingId].back().m_WorkloadIndex].get();
                supportsReplacement &= workload->SupportsTensorHandleReplacement();
            }

            ITensorHandleFactory::FactoryId factoryId = layer->GetOutputSlot(0).GetTensorHandleFactoryId();
            // Get matching import factory Id
            ITensorHandleFactory::FactoryId importFactoryId =
                    m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);

            ITensorHandleFactory* importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);

            if (supportsReplacement && importFactory)
            {
                m_PreImportedInputHandles.emplace_back(
                        bindingId, importFactory->CreateTensorHandle(layer->GetOutputSlot(0).GetTensorInfo(), false));
            }
            else
            {
                m_PreImportedInputHandles.emplace_back(bindingId, nullptr);
            }
        }

        // Get indices of all workloads connected to each output and
        // check if they support tensor handle replacement
        for (const BindableLayer* layer : order.GetOutputLayers())
        {
            const auto bindingId = layer->GetBindingId();

            const auto outputSlot = layer->GetInputSlot(0).GetConnectedOutputSlot();
            auto& indices = m_OutputWorkloadSlotPairs[bindingId];

            auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(outputSlot->GetOwningLayer()));
            workloadIndex -= noOfInputs;

            indices.m_OutputSlotIndices = WorkloadIndices{numeric_cast<unsigned int>(workloadIndex),
                                                          outputSlot->CalculateIndexOnOwner()};

            bool supportsReplacement = true;
            auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
            supportsReplacement &= outputWorkload->SupportsTensorHandleReplacement();

            for (auto& inputSlot : outputSlot->GetConnections())
            {
                if (inputSlot->GetOwningLayer().GetType() != LayerType::Output)
                {
                    auto inWorkloadIndex = std::distance(order.begin(),
                                                         order.GetPosInGraph(inputSlot->GetOwningLayer()));
                    inWorkloadIndex -= noOfInputs;
                    indices.m_InputSlotIndices.emplace_back(WorkloadIndices{numeric_cast<unsigned int>(inWorkloadIndex),
                                                            inputSlot->GetSlotIndex()});
                    auto inputWorkload = m_WorkloadQueue[indices.m_InputSlotIndices.back().m_WorkloadIndex].get();
                    supportsReplacement &= inputWorkload->SupportsTensorHandleReplacement();
                }
            }

            ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
            // Get matching import factory Id
            ITensorHandleFactory::FactoryId importFactoryId =
                    m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
            ITensorHandleFactory* importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);

            if (supportsReplacement && importFactory)
            {
                m_PreImportedOutputHandles.emplace_back(
                        bindingId, importFactory->CreateTensorHandle(outputSlot->GetTensorInfo(), false));
            }
            else
            {
                m_PreImportedOutputHandles.emplace_back(bindingId, nullptr);
            }
        }
    }

    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        workloadFactory.second->AfterWorkloadsCreated();
    }

    if (timelineUtils)
    {
        // Commit to send the post-optimisation network structure
        timelineUtils->Commit();
    }

    if (useExternalMemoryManager)
    {
        if (networkProperties.m_AsyncEnabled)
        {
            CreateMemoryProfileAsync();
        }
        else
        {
            CreateMemoryProfile();
        }

        auto backendStrategyMap = BackendRegistryInstance().GetMemoryOptimizerStrategies();
        for (auto& backendMemoryProfile : m_MemBlockMap)
        {
            const BackendId& backendId = backendMemoryProfile.first;
            if (backendStrategyMap.find(backendId) != backendStrategyMap.end())
            {
                m_MemBinMap[backendId] = backendStrategyMap[backendId]->Optimize(backendMemoryProfile.second);
            }
            else
            {
                m_MemBinMap[backendId] = m_ConstantStrategy->Optimize(backendMemoryProfile.second);
            }
        }

        if (!networkProperties.m_AsyncEnabled)
        {
            m_ExternalMemoryManager = CreateExternalMemoryManger(m_TensorMemory);

            // Sort m_TensorMemory, so its order matches m_Tensorhandles
            std::sort(m_TensorMemory.begin(), m_TensorMemory.end(),
                      [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
                         const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
                      {
                          return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
                      });
        }
    }

    // Now that the intermediate tensor memory has been set up,
    // do any post allocation configuration for each workload.
    if (!networkProperties.m_AsyncEnabled)
    {
        if (useInternalMemoryManager)
        {
            // Set up memory.
            m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
        }

        for (auto& workload : m_WorkloadQueue)
        {
            workload->PostAllocationConfigure();
        }
    }

    if (useExternalMemoryManager)
    {
        if (!networkProperties.m_AsyncEnabled)
        {
            AllocateAndExecuteConstantWorkloads();
        }
        else
        {
            AllocateAndExecuteConstantWorkloadsAsync();
        }
    }
    // If synchronous, execute all constant layer workloads
    if (!networkProperties.m_AsyncEnabled)
    {
        for (auto workload : ConstWorkloads)
        {
            workload->Execute();
        }
    }
}
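
// Construction summary: the constructor above (1) creates backends and workload
// factories, (2) creates tensor handles for every layer, (3) creates the
// workloads themselves, (4) records which workloads can have their tensor
// handles replaced later for pre-imported inputs/outputs, and (5) sets up
// memory (internally or externally managed) and runs constant workloads once.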

void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
    for (auto& pair : m_ConstantWorkloads)
    {
        auto tensorHandle = m_ConstantTensorHandles[pair.first];
        tensorHandle->Allocate();
        pair.second->Execute();
    }
}

void LoadedNetwork::AllocateAndExecuteConstantWorkloadsAsync()
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
    for (auto&& layer : order)
    {
        if (layer->GetType() == LayerType::Constant)
        {
            const auto& outSlot = layer->GetOutputSlots()[0];
            const auto factoryId = outSlot.GetTensorHandleFactoryId();
            ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
            auto& workloadFactory = GetWorkloadFactory(*layer);

            layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
            ITensorHandle* tensorHandle = outSlot.GetOutputHandler().GetData();

            m_ConstantTensorHandles[layer->GetGuid()] = tensorHandle;
            tensorHandle->Allocate();

            auto& backend = m_Backends.at(layer->GetBackendId());

            WorkingMemDescriptor memDesc;
            memDesc.m_Outputs.push_back(tensorHandle);

            ExecutionData executionData = backend->CreateExecutionData(memDesc);
            m_ConstantWorkloads[layer->GetGuid()]->ExecuteAsync(executionData);
        }
    }
}

void LoadedNetwork::SendNetworkStructure(arm::pipe::IProfilingService& profilingService)
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_SendNetworkStructure");
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
        TimelineUtilityMethods::GetTimelineUtils(profilingService);

    timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);

    for (auto&& layer : order)
    {
        // Add layer to the post-optimisation network structure
        AddLayerStructure(timelineUtils, *layer, networkGuid);
        switch (layer->GetType())
        {
            case LayerType::Input:
            case LayerType::Output:
            {
                // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
                break;
            }
            default:
            {
                for (auto& workload : m_WorkloadQueue)
                {
                    // Add workload to the post-optimisation network structure
                    AddWorkloadStructure(timelineUtils, workload, *layer);
                }
                break;
            }
        }
    }
    // Commit to send the post-optimisation network structure
    timelineUtils->Commit();
}

ProfilingGuid LoadedNetwork::GetNetworkGuid()
{
    return m_OptimizedNetwork->GetGuid();
}

TensorInfo LoadedNetwork::GetInputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers())
    {
        ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot");
        if (inputLayer->GetBindingId() == layerId)
        {
            return inputLayer->GetOutputSlot(0).GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No input layer is associated with id {}", layerId));
}

TensorInfo LoadedNetwork::GetOutputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers())
    {
        ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot");
        ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected");
        if (outputLayer->GetBindingId() == layerId)
        {
            return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No output layer is associated with id {}", layerId));
}
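
// Note: both lookups above are linear in the number of bound layers. The
// LayerBindingId is the id the application supplied when binding the input or
// output layer at network construction time, e.g. (illustrative):
//
//     armnn::TensorInfo info = loadedNetwork->GetInputTensorInfo(/*layerId=*/0);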

const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) const
{
    const IWorkloadFactory* workloadFactory = nullptr;

    auto it = m_WorkloadFactories.find(layer.GetBackendId());
    if (it == m_WorkloadFactories.end())
    {
        throw RuntimeException(fmt::format("No workload factory for {0} to be used for layer: {1}",
                                           layer.GetBackendId().Get(),
                                           layer.GetNameStr()),
                               CHECK_LOCATION());
    }

    workloadFactory = it->second.get();

    ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");

    return *workloadFactory;
}

namespace {

// Non-copyable class owning accelerator-specific tensor data.
class TensorPin
{
public:
    TensorPin(std::unique_ptr<ITensorHandle> handle, const TensorInfo& info, LayerBindingId id)
        : m_TensorHandle(std::move(handle))
        , m_TensorInfo(info)
        , m_Id(id)
    {
    }

    ITensorHandle* GetTensorHandle() const { return m_TensorHandle.get(); }
    const TensorInfo& GetTensorInfo() const { return m_TensorInfo; }
    LayerBindingId GetBindingId() const { return m_Id; }

private:
    std::unique_ptr<ITensorHandle> m_TensorHandle;
    TensorInfo m_TensorInfo;
    LayerBindingId m_Id;
};

static const TensorPin& GetTensorPin(LayerBindingId id,
                                     const std::vector<TensorPin>& pins,
                                     char const* bindingPointDesc)
{
    auto it = std::find_if(pins.begin(), pins.end(),
        [id](const TensorPin& pin)
    {
        return pin.GetBindingId() == id;
    });

    if (it != pins.end())
    {
        return *it;
    }
    else
    {
        throw InvalidArgumentException(fmt::format("No tensor supplied for {0} {1}", bindingPointDesc, id));
    }
}

// Stores data that needs to be kept accessible for the entire execution of a workload.
class WorkloadData
{
public:
    WorkloadData(const InputTensors& inputTensors, const OutputTensors& outputTensors)
    {
        m_InputTensorPins.reserve(inputTensors.size());
        m_OutputTensorPins.reserve(outputTensors.size());

        for (auto inputTensorPair : inputTensors)
        {
            auto inputTensor = inputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
            LayerBindingId layerId = inputTensorPair.first;

            m_InputTensorPins.emplace_back(std::move(tensorHandle), inputTensor.GetInfo(), layerId);
        }

        for (auto outputTensorPair : outputTensors)
        {
            auto outputTensor = outputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
            LayerBindingId layerId = outputTensorPair.first;

            m_OutputTensorPins.emplace_back(std::move(tensorHandle), outputTensor.GetInfo(), layerId);
        }
    }

    const TensorPin& GetInputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_InputTensorPins, "input");
    }

    const TensorPin& GetOutputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_OutputTensorPins, "output");
    }

private:

    std::vector<TensorPin> m_InputTensorPins;
    std::vector<TensorPin> m_OutputTensorPins;
};

} // anonymous namespace
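
// WorkloadData wraps the caller's buffers in pass-through tensor handles so
// they stay valid for the whole inference. A minimal sketch of the pairing,
// assuming a hypothetical binding id 0 and pre-existing TensorInfo/data
// buffers (not part of this file):
//
//     InputTensors inputs{{0, ConstTensor(inputInfo, inputData)}};
//     OutputTensors outputs{{0, Tensor(outputInfo, outputData)}};
//     WorkloadData data(inputs, outputs);
//     const TensorPin& pin = data.GetInputTensorPin(0); // throws if id 0 is unknown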

Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
                                      const OutputTensors& outputTensors,
                                      std::vector<ImportedInputId> preImportedInputIds,
                                      std::vector<ImportedOutputId> preImportedOutputIds)
{
    const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();

    // Walk graph to determine the order of execution.
    if (graph.GetNumLayers() < 2)
    {
        ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
        return Status::Failure;
    }

    // Data that must be kept alive for the entire execution of the workload.
    WorkloadData workloadData(inputTensors, outputTensors);

    // Input tensors can be provided as parameters or pre-imported. Either way the number of
    // tensors should match the number of inputs.
    if (graph.GetNumInputs() != (inputTensors.size() + preImportedInputIds.size()))
    {
        throw InvalidArgumentException("Number of inputs provided does not match network.");
    }

    // For each input to the network, call EnqueueInput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
        m_InputQueue.clear();
        m_InputQueue.reserve(graph.GetNumInputs());

        unsigned int inputIndex = 0;
        unsigned int importedInputIdIndex = 0;
        std::sort(preImportedInputIds.begin(), preImportedInputIds.end());
        for (const BindableLayer* inputLayer : graph.GetInputLayers())
        {
            if (importedInputIdIndex < preImportedInputIds.size() &&
                inputIndex == preImportedInputIds[importedInputIdIndex])
            {
                // Only replace tensor handles if they have not already been replaced
                if (!m_IsInputImported[inputIndex])
                {
                    auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();

                    for (const auto& workloadInfo : m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
                    {
                        auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
                        workload->ReplaceInputTensorHandle(outputTensorHandle, workloadInfo.m_SlotIndex);
                    }
                    m_IsInputImported[inputIndex] = true;
                }
                importedInputIdIndex++;
            }
            else
            {
                if (m_IsInputImported[inputIndex])
                {
                    OutputHandler& handler = const_cast<OutputHandler&>(inputLayer->GetOutputHandler(0));

                    for (const auto& workloadInfo : m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
                    {
                        auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
                        workload->ReplaceInputTensorHandle(handler.GetData(), workloadInfo.m_SlotIndex);
                    }

                    m_IsInputImported[inputIndex] = false;
                }

                // InputTensorHandle is not imported yet, proceed to enqueue input
                const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
                EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
            }
            inputIndex++;
        }
    }
    // For each output of the network, call EnqueueOutput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
        m_OutputQueue.clear();
        m_OutputQueue.reserve(graph.GetNumOutputs());

        if (preImportedOutputIds.size() > graph.GetNumOutputs())
        {
            throw InvalidArgumentException("Invalid number of preImportedOutputIds");
        }

        unsigned int outputIndex = 0;
        unsigned int importedOutputIdIndex = 0;
        std::sort(preImportedOutputIds.begin(), preImportedOutputIds.end());
        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
        {
            if (importedOutputIdIndex < preImportedOutputIds.size() &&
                outputIndex == preImportedOutputIds[importedOutputIdIndex])
            {
                // Only replace tensor handles if they have not already been replaced
                ITensorHandle* inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();

                if (!m_IsOutputImported[outputIndex])
                {
                    const auto bindingId = outputLayer->GetBindingId();
                    const auto& indices = m_OutputWorkloadSlotPairs[bindingId];

                    auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();

                    outputWorkload->ReplaceOutputTensorHandle(inputTensorHandle,
                                                              indices.m_OutputSlotIndices.m_SlotIndex);

                    for (const auto& workloadInfo : indices.m_InputSlotIndices)
                    {
                        auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
                        inputWorkload->ReplaceInputTensorHandle(inputTensorHandle, workloadInfo.m_SlotIndex);
                    }
                    m_IsOutputImported[outputIndex] = true;
                }

                ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
                MemSyncQueueDescriptor syncDesc;
                syncDesc.m_Inputs.push_back(inputTensorHandle);
                WorkloadInfo info;
                info.m_InputTensorInfos.push_back(
                        outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo());
                auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
                ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
                m_OutputQueue.push_back(std::move(syncWorkload));
                importedOutputIdIndex++;
            }
            else
            {
                if (m_IsOutputImported[outputIndex])
                {
                    const auto bindingId = outputLayer->GetBindingId();
                    const auto& indices = m_OutputWorkloadSlotPairs[bindingId];

                    auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
                    const OutputHandler& outputHandler =
                            outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOutputHandler();

                    outputWorkload->ReplaceOutputTensorHandle(
                            outputHandler.GetData(), indices.m_OutputSlotIndices.m_SlotIndex);

                    for (const auto& workloadInfo : indices.m_InputSlotIndices)
                    {
                        auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
                        inputWorkload->ReplaceInputTensorHandle(outputHandler.GetData(), workloadInfo.m_SlotIndex);
                    }
                    m_IsOutputImported[outputIndex] = false;
                }

                const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
                // OutputTensorHandle is not imported yet, proceed to enqueue output
                EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
            }
            outputIndex++;
        }
    }

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                        TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
    ProfilingGuid inferenceGuid = m_ProfilingService->GetNextGuid();
    if (timelineUtils)
    {
        // Add inference timeline trace if profiling is enabled.
        ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
        timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
        timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                          networkGuid,
                                          inferenceGuid,
                                          LabelsAndEventClasses::EXECUTION_OF_GUID);
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
    }

    bool executionSucceeded = true;

    {
        if (m_ProfilingService->IsProfilingEnabled())
        {
            m_ProfilingService->IncrementCounterValue(INFERENCES_RUN);
        }
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Execute");
        ARMNN_SCOPED_HEAP_PROFILING("Executing");
        executionSucceeded = Execute(timelineUtils, inferenceGuid);
    }

    if (timelineUtils)
    {
        // Add end of life of the inference timeline if profiling is enabled.
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
        timelineUtils->Commit();
    }

    return executionSucceeded ? Status::Success : Status::Failure;
}
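
// Applications usually reach this method through IRuntime::EnqueueWorkload. A
// minimal sketch with one input and one output bound at id 0 (the buffers and
// TensorInfos are assumed to exist; they are not part of this file):
//
//     armnn::InputTensors inputs{{0, armnn::ConstTensor(inputInfo, inputData)}};
//     armnn::OutputTensors outputs{{0, armnn::Tensor(outputInfo, outputData)}};
//     runtime->EnqueueWorkload(networkId, inputs, outputs);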

void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Input)
    {
        throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueInput: tensorHandle must not be NULL");
    }

    InputQueueDescriptor inputQueueDescriptor;
    WorkloadInfo info;

    inputQueueDescriptor.m_Inputs.push_back(tensorHandle);
    info.m_InputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumOutputSlots() == 1, "Can only handle Input Layer with one output");
    const OutputHandler& handler = layer.GetOutputHandler();
    const TensorInfo& outputTensorInfo = handler.GetTensorInfo();
    ITensorHandle* outputTensorHandle = handler.GetData();
    ARMNN_ASSERT_MSG(outputTensorHandle != nullptr,
                     "Data should have been allocated.");
    inputQueueDescriptor.m_Outputs.push_back(outputTensorHandle);
    info.m_OutputTensorInfos.push_back(outputTensorInfo);

    MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
    bool needMemCopy = true;
    if (m_NetworkProperties.m_ImportEnabled)  // Try to import the input tensor
    {
        if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource))
        {
            needMemCopy = false;
            // This assumes a CPU Tensor handle
            void* mem = tensorHandle->Map(false);
            if (outputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
            {
                tensorHandle->Unmap();
                return; // No need for a workload since the import has been done.
            }
            tensorHandle->Unmap();
            throw MemoryImportException("EnqueueInput: Memory Import failed");
        }
    }
    if (needMemCopy)
    {
        // Create a mem copy workload for input since we did not import
        std::unique_ptr<IWorkload> inputWorkload = std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);

        ARMNN_ASSERT_MSG(inputWorkload, "No input workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
                            TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
        if (timelineUtils)
        {
            // Add Input Workload to the post-optimisation network structure
            AddWorkloadStructure(timelineUtils, inputWorkload, layer);
            timelineUtils->Commit();
        }

        m_InputQueue.push_back(std::move(inputWorkload));
    }
}

void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Output)
    {
        throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueOutput: tensorHandle must not be NULL");
    }

    OutputQueueDescriptor outputQueueDescriptor;
    WorkloadInfo info;

    outputQueueDescriptor.m_Outputs.push_back(tensorHandle);
    info.m_OutputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");

    // Gets the output handler from the previous node.
    const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();

    const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
    ITensorHandle* inputTensorHandle = outputHandler.GetData();
    ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");

    // Try to import the output tensor.
    // Note: We can only import the output pointer if all of the following hold true:
    // a) The imported pointer is aligned sufficiently
    // b) The tensor has zero padding
    // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
    // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
    // e) m_IsExportEnabled must be set to true
    bool needMemCopy = true;
    if (m_NetworkProperties.m_ExportEnabled &&
        (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
    {
        if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
        {
            MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
            if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
            {
                needMemCopy = false;
                void* mem = tensorHandle->Map(false);
                bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
                tensorHandle->Unmap();

                if (importOk)
                {
                    // Insert synchronization workload
                    MemSyncQueueDescriptor syncDesc;
                    syncDesc.m_Inputs.push_back(inputTensorHandle);
                    info.m_InputTensorInfos.push_back(inputTensorInfo);
                    auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
                    ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
                    m_OutputQueue.push_back(std::move(syncWorkload));
                }
                else
                {
                    throw MemoryExportException("EnqueueOutput: Memory Export failed");
                }
            }
        }
    }
    if (needMemCopy)
    {
        // If we got here then we didn't export the memory, so add an output workload which performs a memcopy.
        outputQueueDescriptor.m_Inputs.push_back(inputTensorHandle);
        info.m_InputTensorInfos.push_back(inputTensorInfo);

        std::unique_ptr<IWorkload> outputWorkload =
            std::make_unique<CopyMemGenericWorkload>(outputQueueDescriptor, info);
        ARMNN_ASSERT_MSG(outputWorkload, "No output workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
            TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
        if (timelineUtils)
        {
            // Add Output Workload to the post-optimisation network structure
            AddWorkloadStructure(timelineUtils, outputWorkload, layer);
            timelineUtils->Commit();
        }

        m_OutputQueue.push_back(std::move(outputWorkload));
    }
}
1183 
AllocateWorkingMemory(std::lock_guard<std::mutex> & lock)1184 void LoadedNetwork::AllocateWorkingMemory(
1185 #if !defined(ARMNN_DISABLE_THREADS)
1186      std::lock_guard<std::mutex>& lock
1187 #endif
1188     )
1189 {
1190     ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");
1191 
1192 #if !defined(ARMNN_DISABLE_THREADS)
1193     // this unused parameter makes sure we can only call this function with a valid lock
1194     IgnoreUnused(lock);
1195 #endif
1196     if (m_IsWorkingMemAllocated)
1197     {
1198         return;
1199     }
1200 
1201     if (m_ExternalMemoryManager)
1202     {
1203         m_ExternalMemoryManager->Allocate();
1204 
1205         for (unsigned int i = 0; i < m_TensorMemory.size(); ++i)
1206         {
1207             m_Tensorhandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
1208         }
1209     }
1210 
1211     for (auto&& memoryManager : m_BackendMemoryMangers)
1212     {
1213         if (memoryManager)
1214         {
1215             memoryManager->Acquire();
1216         }
1217     }
1218     m_TensorHandleFactoryRegistry.AquireMemory();
1219     m_IsWorkingMemAllocated = true;
1220 }
1221 
FreeWorkingMemory()1222 void LoadedNetwork::FreeWorkingMemory()
1223 {
1224 #if !defined(ARMNN_DISABLE_THREADS)
1225     std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
1226 #endif
1227 
1228     if (!m_IsWorkingMemAllocated)
1229     {
1230         return;
1231     }
1232 
1233     if (m_ExternalMemoryManager)
1234     {
1235         m_ExternalMemoryManager->Deallocate();
1236     }
1237 
1238     // Inform the memory managers to release memory in their respective memory groups.
1239     for (auto&& memoryManager : m_BackendMemoryMangers)
1240     {
1241         if (memoryManager)
1242         {
1243             memoryManager->Release();
1244         }
1245     }
1246     m_TensorHandleFactoryRegistry.ReleaseMemory();
1247     m_IsWorkingMemAllocated = false;
1248 }
1249 
1250 bool LoadedNetwork::Execute(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
1251                            ProfilingGuid inferenceGuid)
1252 {
1253     bool success = true;
1254 
1255     auto Fail = [&](const std::exception& error)
1256     {
1257         ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
1258         success = false;
1259     };
1260 
1261     try
1262     {
1263 #if !defined(ARMNN_DISABLE_THREADS)
1264         std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
1265         AllocateWorkingMemory(lockGuard);
1266 #else
1267         AllocateWorkingMemory();
1268 #endif
1269 
1270         ProfilingDynamicGuid workloadInferenceID(0);
1271         auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
1272         {
1273             for (auto& workload : queue)
1274             {
1275                 if (timelineUtils)
1276                 {
1277                     workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
1278                                                                                                     inferenceGuid);
1279                 }
1280                 workload->Execute();
1281                 if (timelineUtils)
1282                 {
1283                     timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
1284                 }
1285             }
1286         };
1287 
1288         ExecuteQueue(m_InputQueue);
1289         ExecuteQueue(m_WorkloadQueue);
1290         ExecuteQueue(m_OutputQueue);
1291     }
1292     catch (const RuntimeException& error)
1293     {
1294         Fail(error);
1295     }
1296     catch (const std::runtime_error& error)
1297     {
1298         Fail(error);
1299     }
1300 
1301     return success;
1302 }
1303 
1304 void LoadedNetwork::EnqueueInput(const ConstTensor& inputTensor, ITensorHandle* inputTensorHandle)
1305 {
1306     if (m_NetworkProperties.m_ImportEnabled)  // Try to import the input tensor
1307     {
1308         MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
1309         if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource))
1310         {
1311             std::unique_ptr<ITensorHandle> tensorHandle =
1312                     std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(),
1313                                                                    inputTensor.GetMemoryArea());
1314             void* mem = tensorHandle->Map(false);
1315 
1316             if (inputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
1317             {
1318                 tensorHandle->Unmap();
1319                 return;
1320             }
1321             tensorHandle->Unmap();
1322             throw MemoryImportException("EnqueueInput: Memory Import failed");
1323         }
1324         else
1325         {
1326             throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import");
1327         }
1328     }
1329     else
1330     {
1331         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyInput");
1332         std::unique_ptr<ITensorHandle> tensorHandle =
1333                 std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
1334 
1335         auto copyFunc = [](void* dst, const void* src, size_t size)
1336         {
1337             memcpy(dst, src, size);
1338         };
1339 
1340         CopyTensorContentsGeneric(tensorHandle.get(), inputTensorHandle, copyFunc);
1341     }
1342 }
1343 
1344 // Note: We can only import the output pointer if all of the following hold true:
1345 // a) The imported pointer is aligned sufficiently
1346 // b) The tensor has zero padding
1347 // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
1348 // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
1349 // e) m_IsExportEnabled must be set to true
1350 void LoadedNetwork::ImportOutputTensor(const Tensor& outputTensor, ITensorHandle* outputTensorHandle)
1351 {
1352     ARMNN_ASSERT_MSG(outputTensorHandle != nullptr, "Data should have been allocated.");
1353     MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
1354     if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
1355     {
1356         std::unique_ptr<ITensorHandle> tensorHandle =
1357                 std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
1358                                                           outputTensor.GetMemoryArea());
1359 
1360         void* mem = tensorHandle->Map(false);
1361         bool importOk = outputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
1362         tensorHandle->Unmap();
1363 
1364         if (!importOk)
1365         {
1366             throw MemoryExportException("ImportOutputTensor: Memory Export failed");
1367         }
1368     }
1369     else
1370     {
1371         throw MemoryExportException("ImportOutputTensor: Memory Export failed, attempting to export Input Layer");
1372     }
1373 
1374 }
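// Caller-side sketch of satisfying the conditions listed above (a minimal example,
// assuming the public IRuntime API of this release; 'runtime', 'netId' and the
// binding id 0 are hypothetical):
//
//     // e) Export must be enabled when the network is loaded:
//     armnn::INetworkProperties props(/*asyncEnabled=*/false,
//                                     armnn::MemorySource::Undefined,
//                                     /*outputSource=*/armnn::MemorySource::Malloc);
//
//     // a), b), d) The output buffer is malloc'd, so it is suitably aligned and
//     // padding-free for a dense tensor:
//     armnn::TensorInfo outputInfo = runtime->GetOutputTensorInfo(netId, 0);
//     void* outputData = std::malloc(outputInfo.GetNumBytes());
//     armnn::Tensor outputTensor(outputInfo, outputData);
//
//     // ImportOutputTensor can now wrap outputData instead of copying into it.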
1375 
1376 void CopyToOutputTensor(const Tensor& outputTensor, ITensorHandle* outputTensorHandle)
1377 {
1378     ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyOutput");
1379     auto copyFunc = [](void* dst, const void* src, size_t size)
1380     {
1381         memcpy(dst, src, size);
1382     };
1383 
1384     std::unique_ptr<ITensorHandle> tensorHandle =
1385             std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
1386                                                       outputTensor.GetMemoryArea());
1387 
1388     CopyTensorContentsGeneric(outputTensorHandle, tensorHandle.get(), copyFunc);
1389 }
1390 
1391 
1392 const armnn::ConstTensor GetInputTensor(const LayerBindingId layerId, const InputTensors& inputTensors)
1393 {
1394     for (auto inputTensorPair : inputTensors)
1395     {
1396         LayerBindingId id = inputTensorPair.first;
1397         if (id == layerId)
1398         {
1399             return inputTensorPair.second;
1400         }
1401     }
1402     throw InvalidArgumentException("Input does not exist.");
1403 }
1404 
1405 const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTensors& outputTensors)
1406 {
1407     for (auto outputTensorPair : outputTensors)
1408     {
1409         LayerBindingId id = outputTensorPair.first;
1410         if (id == layerId)
1411         {
1412             return outputTensorPair.second;
1413         }
1414     }
1415     throw InvalidArgumentException("Output does not exist.");
1416 }
1417 
1418 std::vector<ImportedInputId> LoadedNetwork::ImportInputs(const InputTensors& inputTensors,
1419                                                          MemorySource forceImportMemorySource)
1420 {
1421     if (!m_NetworkProperties.m_AsyncEnabled)
1422     {
1423         // Cannot import if import is not enabled and forceImportMemorySource is undefined
1424         if (forceImportMemorySource == MemorySource::Undefined)
1425         {
1426             throw MemoryImportException("ImportInputs: Memory Import failed, import is not enabled and forceImportMemorySource is Undefined");
1427         }
1428         // The number of pre imported tensors should not exceed the number of inputs.
1429         if (inputTensors.size() > m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumInputs())
1430         {
1431             throw MemoryImportException("ImportInputs: The number of tensors provided exceeds the number of inputs.");
1432         }
1433 
1434         std::vector<ImportedInputId> importedInputs;
1435         Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
1436         unsigned int inputIndex = 0;
1437         for (const BindableLayer* inputLayer : graph.GetInputLayers())
1438         {
1439             auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();
1440 
1441             if (!outputTensorHandle)
1442             {
1443                 inputIndex++;
1444                 continue;
1445             }
1446 
1447             auto layerBindingId = inputLayer->GetBindingId();
1448             auto it = std::find_if(inputTensors.begin(), inputTensors.end(), [=](const auto& inputTensor)
1449             {
1450                 return inputTensor.first == layerBindingId;
1451             });
1452 
1453             if (it == inputTensors.end())
1454             {
1455                 inputIndex++;
1456                 continue;
1457             }
1458 
1459             const auto& inputTensor = *it;
1460             std::unique_ptr<ITensorHandle> passThroughTensorHandle =
1461                     std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
1462                                                                    inputTensor.second.GetMemoryArea());
1463 
1464             try
1465             {
1466                 if (outputTensorHandle->CanBeImported(passThroughTensorHandle->Map(), forceImportMemorySource)
1467                     && (outputTensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource)))
1468                 {
1469                     importedInputs.push_back(inputIndex);
1470                 }
1471                 passThroughTensorHandle->Unmap();
1472             }
1473             catch(const MemoryImportException& exception)
1474             {
1475                 ARMNN_LOG(error) << "An error occurred attempting to import input_"
1476                                            << inputIndex << " : " << exception.what();
1477                 passThroughTensorHandle->Unmap();
1478             }
1479             inputIndex++;
1480         }
1481 
1482         return importedInputs;
1483     }
1484     else
1485     {
1486         // Import path taken when the network was loaded with async enabled
1487         std::vector<ImportedInputId> importedInputs;
1488         Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
1489 
1490         for (auto inputTensor : inputTensors)
1491         {
1492             auto layerBindingId = inputTensor.first;
1493             auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
1494             {
1495                 return layer->GetBindingId() == layerBindingId;
1496             });
1497 
1498             if (it == graph.GetInputLayers().end())
1499             {
1500                 throw MemoryImportException(fmt::format(
1501                     "ImportInputs: Memory Import failed, unknown LayerBindingId: {}", layerBindingId));
1502             }
1503 
1504             const Layer* layer = *it;
1505             if (layer->GetType() != LayerType::Input)
1506             {
1507                 throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
1508             }
1509 
1510             auto& backend = m_Backends.at(layer->GetBackendId());
1511             if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
1512             {
1513                 std::string er = backend->GetId();
1514                 er += " does not have PreImportIOTensors capability";
1515                 throw BackendCapabilityException(er);
1516             }
1517 
1518             const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
1519 
1520             ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
1521             const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
1522 
1523             ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
1524             ARMNN_ASSERT(handleFactory);
1525 
1526             ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
1527                                                             handleFactory->CreateTensorHandle(tensorInfo, false)};
1528 
1529             ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
1530 
1531             if (!CheckFlag(tensorHandle->GetImportFlags(), forceImportMemorySource))
1532             {
1533                 throw MemoryImportException(
1534                     fmt::format("ImportInputs: Memory Import failed, backend: "
1535                                 "{} does not support importing from source {}",
1536                                 factoryId, forceImportMemorySource));
1537             }
1538 
1539             std::unique_ptr<ITensorHandle> passThroughTensorHandle =
1540                     std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
1541                                                                    inputTensor.second.GetMemoryArea());
1542 
1543             if (tensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource))
1544             {
1545                 importedInputs.push_back(m_CurImportedInputId++);
1546                 passThroughTensorHandle->Unmap();
1547             }
1548             else
1549             {
1550                 passThroughTensorHandle->Unmap();
1551                 throw MemoryImportException("ImportInputs: Memory Import failed");
1552             }
1553 
1554             m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
1555         }
1556         return importedInputs;
1557     }
1558 }
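// Usage sketch for the pre-import path (hedged: assumes the IRuntime wrappers
// ImportInputs/EnqueueWorkload of this release; 'runtime', 'netId', the tensors
// and binding id 0 are hypothetical):
//
//     armnn::InputTensors inputs{{0, armnn::ConstTensor(inputInfo, inputData)}};
//
//     // Import once; the returned ids stay valid until cleared.
//     std::vector<armnn::ImportedInputId> importedIds =
//         runtime->ImportInputs(netId, inputs, armnn::MemorySource::Malloc);
//
//     // Later inferences pass the ids instead of the tensors themselves:
//     runtime->EnqueueWorkload(netId, {}, outputs, importedIds, {});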
1559 
1560 std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors& outputTensors,
1561                                                            MemorySource forceImportMemorySource)
1562 {
1563     if (!m_NetworkProperties.m_AsyncEnabled)
1564     {
1565         // Cannot import if import is not enabled and forceImportMemorySource is undefined
1566         if (forceImportMemorySource == MemorySource::Undefined)
1567         {
1568             throw MemoryImportException("ImportOutputs: Memory Import failed, import is not enabled and forceImportMemorySource is Undefined");
1569         }
1570         // If forceImportMemorySource is defined, try to import if the memory is suitably aligned
1571         if (outputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumOutputs())
1572         {
1573             throw MemoryImportException("ImportOutputs: Force Import failed, incorrect number of tensors");
1574         }
1575         std::vector<ImportedOutputId> importedOutputs;
1576         Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
1577 
1578         unsigned int outputIndex = 0;
1579         for (const BindableLayer* const outputLayer : graph.GetOutputLayers())
1580         {
1581             auto inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();
1582             if (!inputTensorHandle)
1583             {
1584                 outputIndex++;
1585                 continue;
1586             }
1587 
1588             auto layerBindingId = outputLayer->GetBindingId();
1589             auto it = std::find_if(outputTensors.begin(), outputTensors.end(), [=] (const auto& outputTensor)
1590             {
1591                 return outputTensor.first == layerBindingId;
1592             });
1593 
1594             if (it == outputTensors.end())
1595             {
1596                 outputIndex++;
1597                 continue;
1598             }
1599 
1600             const auto outputTensor = *it;
1601             try
1602             {
1603                 // Check if the output memory can be imported
1604                 if (inputTensorHandle->CanBeImported(outputTensor.second.GetMemoryArea(), forceImportMemorySource)
1605                     && inputTensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
1606                 {
1607                     importedOutputs.push_back(outputIndex);
1608                 }
1609             }
1610             catch(const MemoryImportException& exception)
1611             {
1612                 ARMNN_LOG(error) << "An error occurred attempting to import output_"
1613                                  << outputIndex << " : " << exception.what();
1614             }
1615             outputIndex++;
1616         }
1617         return importedOutputs;
1618     }
1619 
1620     std::vector<ImportedOutputId> importedOutputs;
1621     Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
1622 
1623     for (const auto& outputTensor : outputTensors)
1624     {
1625         auto layerBindingId = outputTensor.first;
1626         auto it = std::find_if(graph.GetOutputLayers().begin(), graph.GetOutputLayers().end(), [=](auto* layer)
1627         {
1628             return layer->GetBindingId() == layerBindingId;
1629         });
1630 
1631         if (it == graph.GetOutputLayers().end())
1632         {
1633             throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, unknown LayerBindingId: {}",
1634                                                      layerBindingId));
1635         }
1636 
1637         const Layer* layer = *it;
1638         if (layer->GetType() != LayerType::Output)
1639         {
1640             throw InvalidArgumentException("ImportOutputs: given layer not an OutputLayer");
1641         }
1642 
1643         auto& backend = m_Backends.at(layer->GetBackendId());
1644         if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
1645         {
1646             std::string er = backend->GetId();
1647             er += " does not have PreImportIOTensors capability";
1648             throw BackendCapabilityException(er);
1649         }
1650 
1651         const InputSlot& inputSlot = layer->GetInputSlots()[0];
1652         ITensorHandleFactory::FactoryId factoryId = inputSlot.GetConnectedOutputSlot()->GetTensorHandleFactoryId();
1653         const TensorInfo& tensorInfo = inputSlot.GetConnectedOutputSlot()->GetTensorInfo();
1654 
1655         ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
1656         ARMNN_ASSERT(handleFactory);
1657 
1658         ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
1659                                                         handleFactory->CreateTensorHandle(tensorInfo, false)};
1660 
1661         ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
1662 
1663         if (!CheckFlag(tensorHandle->GetImportFlags(), forceImportMemorySource))
1664         {
1665             throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, backend: "
1666                                                     "{} does not support importing from source {}",
1667                                                     factoryId, forceImportMemorySource));
1668         }
1669 
1670         if (tensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
1671         {
1672             importedOutputs.push_back(m_CurImportedOutputId++);
1673         }
1674         else
1675         {
1676             throw MemoryImportException("ImportOutputs: Memory Import failed");
1677         }
1678 
1679         m_PreImportedOutputHandles.push_back(std::move(importedTensorHandlePin));
1680     }
1681 
1682     return importedOutputs;
1683 }
1684 
1685 void LoadedNetwork::ClearImportedInputs(const std::vector<ImportedInputId> inputIds)
1686 {
1687     for (auto id : inputIds)
1688     {
1689         if (id >= m_PreImportedInputHandles.size())
1690         {
1691             throw InvalidArgumentException(fmt::format("ClearImportedInputs::Unknown ImportedInputId: {}", id));
1692         }
1693 
1694         auto& importedTensorHandle = m_PreImportedInputHandles[id].m_TensorHandle;
1695         if (!importedTensorHandle)
1696         {
1697             throw InvalidArgumentException(
1698                     fmt::format("ClearImportedInputs::ImportedInput with id: {} has already been deleted", id));
1699         }
1700         // Call Unimport then destroy the tensorHandle
1701         importedTensorHandle->Unimport();
1702         importedTensorHandle = {};
1703     }
1704 }
1705 
1706 void LoadedNetwork::ClearImportedOutputs(const std::vector<ImportedOutputId> outputIds)
1707 {
1708     for (auto id : outputIds)
1709     {
1710         if (id >= m_PreImportedOutputHandles.size())
1711         {
1712             throw InvalidArgumentException(fmt::format("ClearImportedOutputs::Unknown ImportedOutputId: {}", id));
1713         }
1714 
1715         auto& importedTensorHandle = m_PreImportedOutputHandles[id].m_TensorHandle;
1716         if (!importedTensorHandle)
1717         {
1718             throw InvalidArgumentException(
1719                     fmt::format("ClearImportedOutputs::ImportedOutput with id: {} has already been deleted", id));
1720         }
1721         // Call Unimport then destroy the tensorHandle
1722         importedTensorHandle->Unimport();
1723         importedTensorHandle = {};
1724     }
1725 }
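// Pre-imported handles keep referencing the user's buffers until cleared.
// Sketch (hedged; the ids come from earlier ImportInputs/ImportOutputs calls):
//
//     runtime->ClearImportedInputs(netId, importedInputIds);
//     runtime->ClearImportedOutputs(netId, importedOutputIds);
//     // Reusing a cleared id in Execute now throws InvalidArgumentException.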
1726 
1727 Status LoadedNetwork::Execute(const InputTensors& inputTensors,
1728                               const OutputTensors& outputTensors,
1729                               IWorkingMemHandle& iWorkingMemHandle,
1730                               std::vector<ImportedInputId> preImportedInputs,
1731                               std::vector<ImportedOutputId> preImportedOutputs)
1732 {
1733     const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
1734 
1735     if (inputTensors.size() + preImportedInputs.size() != graph.GetNumInputs())
1736     {
1737         if (preImportedInputs.empty())
1738         {
1739             throw InvalidArgumentException("LoadedNetwork::Execute: Number of inputs provided does not match network.");
1740         }
1741         else
1742         {
1743             throw InvalidArgumentException("LoadedNetwork::Execute: "
1744                                            "Number of inputs + preImportedInputs provided does not match network.");
1745         }
1746     }
1747 
1748     if (outputTensors.size() + preImportedOutputs.size() != graph.GetNumOutputs())
1749     {
1750         if (preImportedOutputs.empty())
1751         {
1752             throw InvalidArgumentException("LoadedNetwork::Execute: "
1753                                            "Number of outputs provided does not match network.");
1754         }
1755         else
1756         {
1757             throw InvalidArgumentException("LoadedNetwork::Execute: "
1758                                            "Number of outputs + preImportedOutputs provided does not match network.");
1759         }
1760     }
1761 
1762     WorkingMemHandle& workingMemHandle = dynamic_cast<WorkingMemHandle&>(iWorkingMemHandle);
1763     // Collect all the given LayerBindingIds and check them for duplicates and unknowns.
1764     std::vector<LayerBindingId>& bindingIds = workingMemHandle.GetBindingIdVector();
1765     unsigned int index = 0;
1766     for (auto pair : inputTensors)
1767     {
1768         bindingIds[index++] = pair.first;
1769     }
1770     for (ImportedInputId id : preImportedInputs)
1771     {
1772         bindingIds[index++] = ValidateImportedInputID(id);
1773     }
1774     for (auto pair : outputTensors)
1775     {
1776         bindingIds[index++] = pair.first;
1777     }
1778     for (ImportedOutputId id : preImportedOutputs)
1779     {
1780         bindingIds[index++] = ValidateImportedOutputID(id);
1781     }
1782 
1783     workingMemHandle.ValidateBindingIds();
1784 
1785     auto resetMemHandle = [&]()
1786     {
1787         for (ImportedInputId id: preImportedInputs)
1788         {
1789             const LayerBindingId layerBindingId = m_PreImportedInputHandles[id].m_LayerBindingId;
1790 
1791             auto inputHandle = workingMemHandle.GetInputHandle(layerBindingId);
1792             auto inputConnections = workingMemHandle.GetInputConnections(layerBindingId);
1793             for (auto it : inputConnections)
1794             {
1795                 *it = inputHandle;
1796             }
1797         }
1798 
1799         for (ImportedOutputId id: preImportedOutputs)
1800         {
1801             const LayerBindingId layerBindingId = m_PreImportedOutputHandles[id].m_LayerBindingId;
1802 
1803             auto outputHandle = workingMemHandle.GetOutputHandle(layerBindingId);
1804             auto outputConnections = workingMemHandle.GetOutputConnection(layerBindingId);
1805 
1806             for (auto it : outputConnections)
1807             {
1808                 *it = outputHandle;
1809             }
1810         }
1811     };
1812 
1813     std::unique_ptr<TimelineUtilityMethods> timelineUtils =
1814            TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
1815     ProfilingGuid inferenceGuid = m_ProfilingService->GetNextGuid();
1816     if (timelineUtils)
1817     {
1818         // Add inference timeline trace if profiling is enabled.
1819         ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
1820         timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
1821         timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
1822                                           networkGuid,
1823                                           inferenceGuid,
1824                                           LabelsAndEventClasses::EXECUTION_OF_GUID);
1825         timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
1826     }
1827 
1828     bool executionSucceeded = true;
1829 
1830     if (timelineUtils)
1831     {
1832         // Add end of life of the inference timeline if profiling is enabled.
1833         timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
1834         timelineUtils->Commit();
1835     }
1836 
1837     if (!workingMemHandle.IsAllocated())
1838     {
1839         workingMemHandle.Allocate();
1840     }
1841 
1842     {
1843         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
1844         for (auto pair : inputTensors)
1845         {
1846             EnqueueInput(pair.second, workingMemHandle.GetInputHandle(pair.first));
1847         }
1848 
1849         // Swap in the pre-imported inputs if any
1850         for (ImportedInputId id : preImportedInputs)
1851         {
1852             const ImportedTensorHandlePin& importedInputPin = m_PreImportedInputHandles[id];
1853             const LayerBindingId layerBindingId = m_PreImportedInputHandles[id].m_LayerBindingId;
1854             const auto& preimportedHandle = importedInputPin.m_TensorHandle;
1855 
1856             auto inputConnections = workingMemHandle.GetInputConnections(layerBindingId);
1857             for (auto it : inputConnections)
1858             {
1859                 *it = preimportedHandle.get();
1860             }
1861         }
1862     }
1863     {
1864         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
1865         if (m_NetworkProperties.m_ExportEnabled)
1866         {
1867             for (auto pair: outputTensors)
1868             {
1869                 ImportOutputTensor(pair.second, workingMemHandle.GetOutputHandle(pair.first));
1870             }
1871         }
1872 
1873         for (ImportedOutputId id : preImportedOutputs)
1874         {
1875             const ImportedTensorHandlePin& importedOutputPin = m_PreImportedOutputHandles[id];
1876             const LayerBindingId layerBindingId = m_PreImportedOutputHandles[id].m_LayerBindingId;
1877             const auto& preimportedHandle = importedOutputPin.m_TensorHandle;
1878 
1879             auto outputConnections = workingMemHandle.GetOutputConnection(layerBindingId);
1880             for (auto it : outputConnections)
1881             {
1882                 *it = preimportedHandle.get();
1883             }
1884         }
1885     }
1886 
1887     auto Fail = [&](const std::exception& error)
1888     {
1889         ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
1890         executionSucceeded = false;
1891     };
1892     ProfilingDynamicGuid workloadInferenceID(0);
1893 
1894     try
1895     {
1896         for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i)
1897         {
1898             auto& workload = m_WorkloadQueue[i];
1899             if (timelineUtils)
1900             {
1901                 workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
1902                                                                                                 inferenceGuid);
1903             }
1904 
1905             workload->ExecuteAsync(workingMemHandle.GetExecutionDataAt(i).second);
1906 
1907             if (timelineUtils)
1908             {
1909                 timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
1910             }
1911         }
1912     }
1913     catch (const RuntimeException& error)
1914     {
1915         resetMemHandle();
1916         Fail(error);
1917     }
1918     catch (const std::runtime_error& error)
1919     {
1920         resetMemHandle();
1921         Fail(error);
1922     }
1923     catch (...)
1924     {
1925         resetMemHandle();
1926         throw;
1927     }
1928 
1929     if (!m_NetworkProperties.m_ExportEnabled)
1930     {
1931         for (auto pair: outputTensors)
1932         {
1933             CopyToOutputTensor(pair.second, workingMemHandle.GetOutputHandle(pair.first));
1934         }
1935     }
1936     else
1937     {
1938         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute");
1939         workingMemHandle.MemSyncOutputs();
1940     }
1941 
1942     resetMemHandle();
1943 
1944     return executionSucceeded ? Status::Success : Status::Failure;
1945 }
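// End-to-end sketch of the asynchronous path this function implements (hedged:
// assumes the experimental IRuntime API and a network loaded with async enabled;
// all names are hypothetical):
//
//     std::unique_ptr<armnn::experimental::IWorkingMemHandle> handle =
//         runtime->CreateWorkingMemHandle(netId);
//
//     armnn::InputTensors  inputs{{0, armnn::ConstTensor(inputInfo, inputData)}};
//     armnn::OutputTensors outputs{{1, armnn::Tensor(outputInfo, outputData)}};
//
//     // Internally forwards to LoadedNetwork::Execute(inputs, outputs, *handle, {}, {}).
//     armnn::Status status = runtime->Execute(*handle, inputs, outputs);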
1946 
1947 /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
1948 /// overlapped Execution by calling this function from different threads.
1949 std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
1950 {
1951     Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
1952 
1953     // Tensors that will need to be allocated internally within armnn
1954     std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles;
1955     // Tensors that will be allocated externally by the user
1956     std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles;
1957 
1958     std::vector<WorkingMemDescriptor> workingMemDescriptors;
1959     std::vector<std::pair<BackendId, ExecutionData>> executionDataVec;
1960 
1961     auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot)
1962     {
1963         ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
1964         const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
1965 
1966         if (factoryId == ITensorHandleFactory::LegacyFactoryId)
1967         {
1968             BackendId id = layer->GetBackendId();
1969             ARMNN_NO_DEPRECATE_WARN_BEGIN
1970             return m_WorkloadFactories.at(id)->CreateTensorHandle(tensorInfo, false);
1971             ARMNN_NO_DEPRECATE_WARN_END
1972         }
1973         else
1974         {
1975             ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
1976             ARMNN_ASSERT(handleFactory);
1977             return handleFactory->CreateTensorHandle(tensorInfo, false);
1978         }
1979     };
1980 
1981     struct HandleInfo
1982     {
1983         ITensorHandle* m_TensorHandle;
1984 
1985         bool m_IsInputLayerHandle = false;
1986         bool m_IsOutputLayerHandle = false;
1987 
1988         WorkingMemHandle::InputMemDescriptorCoords m_InputMemDescriptorCoords;
1989         WorkingMemHandle::OutputMemDescriptorCoords m_OutputMemDescriptorCoords;
1990     };
1991 
1992     std::unordered_map<const OutputSlot*, HandleInfo> outputToHandleInfoMap;
1993 
1994     unsigned int layerIndex = 0;
1995     for (auto&& layer : order)
1996     {
1997         // Constant layers execution and management is handled during loaded network construction
1998         if (layer->GetType() == LayerType::Constant)
1999         {
2000             continue;
2001         }
2002 
2003         WorkingMemDescriptor workingMemDescriptor;
2004 
2005         bool isMemoryManaged = true;
2006         bool isInputLayer = false;
2007         bool isOutputLayer = false;
2008         bool isConnectedToOutputLayer = false;
2009 
2010         if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::MemImport)
2011         {
2012             // Input layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors.
2013             // However, we will still need to manage the tensorHandle.
2014             isInputLayer = true;
2015             isMemoryManaged = !m_NetworkProperties.m_ImportEnabled;
2016         }
2017         else if (layer->GetType() == LayerType::Output)
2018         {
2019             isOutputLayer = true;
2020         }
2021 
2022         unsigned int slotIndex = 0;
2023         // Create a tensor handle for each output slot of a layer
2024         // Once we create it, we start managing its lifetime
2025         for (auto& slot : layer->GetOutputSlots())
2026         {
2027             for (unsigned int i = 0; i < slot.GetNumConnections(); ++i)
2028             {
2029                 if (slot.GetConnection(i)->GetOwningLayer().GetType() == LayerType::Output)
2030                 {
2031                     if (!isConnectedToOutputLayer)
2032                     {
2033                         isConnectedToOutputLayer = true;
2034                         // If Export is enabled, disable memory management so we can export; otherwise we do a copy.
2035                         isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
2036                     }
2037                     else
2038                     {
2039                         // Importing in this case would likely cause unexpected behaviour, so we disallow it.
2040                         ARMNN_LOG(warning) <<
2041                            fmt::format("Layer name: '{0}' guid: '{1}' has two or more OutputLayers connected to it. "
2042                                        "This will prevent importing on the connected OutputLayers.",
2043                                         layer->GetName(), layer->GetGuid());
2044                         isMemoryManaged = true;
2045                     }
2046                 }
2047             }
2048 
2049             ITensorHandle* tensorHandle;
2050             if (isMemoryManaged)
2051             {
2052                 managedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
2053                 tensorHandle = managedTensorHandles.back().get();
2054             }
2055             else
2056             {
2057                 unmanagedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
2058                 tensorHandle = unmanagedTensorHandles.back().get();
2059             }
2060 
2061             workingMemDescriptor.m_Outputs.push_back(tensorHandle);
2062 
2063             HandleInfo& handleInfo = outputToHandleInfoMap[&slot];
2064             handleInfo.m_TensorHandle = tensorHandle;
2065 
2066             // Store the coordinates of the current layer's OutputSlot that is connected to the OutputLayer
2067             if (isConnectedToOutputLayer)
2068             {
2069                 handleInfo.m_IsOutputLayerHandle = true;
2070                 handleInfo.m_OutputMemDescriptorCoords.m_OutputSlotCoords = {layerIndex, slotIndex};
2071             }
2072             // Store the LayerBindingId of the InputLayer
2073             if (isInputLayer)
2074             {
2075                 handleInfo.m_IsInputLayerHandle = true;
2076                 LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
2077                 handleInfo.m_InputMemDescriptorCoords.m_LayerBindingId = bindingId;
2078             }
2079             slotIndex++;
2080         }
2081         // Loop through the input slots in the same layer and decrement the reference counter associated
2082         // to each tensor handle we encounter.
2083         // Once it reaches zero, the lifetime of the tensor handle has ended, and we mark its memory as available
2084         // so that the next tensor handle with a non overlapping lifetime can share its memory.
2085         for (auto& slot : layer->GetInputSlots())
2086         {
2087             ARMNN_ASSERT(slot.GetConnection());
2088             auto outputSlot = slot.GetConnectedOutputSlot();
2089             auto key = outputSlot->GetOwningLayer().GetGuid();
2090 
2091             // Constant layers execution and management is handled during loaded network construction
2092             auto found = m_ConstantTensorHandles.find(key);
2093             if (found != m_ConstantTensorHandles.end())
2094             {
2095                 ITensorHandle* tensorHandle = found->second;
2096                 workingMemDescriptor.m_Inputs.push_back(tensorHandle);
2097 
2098                 // Odd case where a constant layer is connected to an output layer
2099                 // We will need to create a HandleInfo to track it
2100                 if (isOutputLayer)
2101                 {
2102                     LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
2103 
2104                     HandleInfo& handleInfo = outputToHandleInfoMap[outputSlot];
2105                     handleInfo.m_TensorHandle = tensorHandle;
2106                     handleInfo.m_IsOutputLayerHandle = true;
2107                     handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId);
2108                     handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0});
2109                 }
2110                 continue;
2111             }
2112 
2113             HandleInfo& handleInfo = outputToHandleInfoMap.at(outputSlot);
2114 
2115             ITensorHandle* inputTensorHandle = handleInfo.m_TensorHandle;
2116             workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
2117 
2118             // Store the LayerBindingId of the OutputLayer
2119             if (isOutputLayer)
2120             {
2121                 LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
2122                 handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId);
2123                 handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0});
2124             }
2125             // In this case the layer is not an Output Layer but shares its input tensorhandle with an OutputLayer
2126             // It will need to be updated as well, if we swap out the tensorhandle
2127             else if (handleInfo.m_IsOutputLayerHandle)
2128             {
2129                 handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, slot.GetSlotIndex()});
2130             }
2131 
2132             // Store the coordinates of the InputSlots connected to the InputLayer
2133             // There can be more than one InputSlot connected to an InputLayer, so we use a vector
2134             if (handleInfo.m_IsInputLayerHandle)
2135             {
2136                 std::pair<LayerGuid, unsigned int> connectionLocation{layerIndex, slot.GetSlotIndex()};
2137                 handleInfo.m_InputMemDescriptorCoords.m_InputSlotCoords.emplace_back(connectionLocation);
2138             }
2139         }
2140 
2141         // Input/Output layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors.
2142         // However, we will still need to manage the tensorHandle.
2143         if (!isInputLayer)
2144         {
2145             // Simply auto-initialise ExecutionData here, so it's added only for the layers that require execution.
2146             // The memory and data will be allocated/assigned for the void* in WorkingMemHandle::Allocate.
2147             std::pair<BackendId, ExecutionData> dataPair;
2148             dataPair.first = layer->GetBackendId();
2149 
2150             executionDataVec.push_back(dataPair);
2151             workingMemDescriptors.push_back(workingMemDescriptor);
2152 
2153             layerIndex++;
2154         }
2155     }
2156 
2157     std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory;
2158 
2159     auto externalMemoryManager = CreateExternalMemoryManger(tensorMemory);
2160 
2161     // Sort tensorMemory so its order matches the outputSlot order.
2162     std::sort(tensorMemory.begin(), tensorMemory.end(),
2163               [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
2164                  const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
2165               {
2166                   return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
2167               });
2168 
2169     std::vector<WorkingMemHandle::InputMemDescriptorCoords> inputConnectionsInfo;
2170     std::vector<WorkingMemHandle::OutputMemDescriptorCoords> outputConnectionsInfo;
2171 
2172     for (const auto& handleInfo: outputToHandleInfoMap)
2173     {
2174         if (handleInfo.second.m_IsOutputLayerHandle)
2175         {
2176             outputConnectionsInfo.emplace_back(handleInfo.second.m_OutputMemDescriptorCoords);
2177         }
2178 
2179         if (handleInfo.second.m_IsInputLayerHandle)
2180         {
2181             inputConnectionsInfo.emplace_back(handleInfo.second.m_InputMemDescriptorCoords);
2182         }
2183     }
2184 
2185     return std::make_unique<WorkingMemHandle>(networkId,
2186                                               inputConnectionsInfo,
2187                                               outputConnectionsInfo,
2188                                               workingMemDescriptors,
2189                                               std::move(externalMemoryManager),
2190                                               std::move(tensorMemory),
2191                                               std::move(managedTensorHandles),
2192                                               std::move(unmanagedTensorHandles),
2193                                               executionDataVec,
2194                                               &m_Backends);
2195 }
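// As the comment above notes, overlapped execution needs one handle per thread.
// Sketch (hedged; assumes concurrent IRuntime::Execute calls are safe as long as
// each thread uses its own IWorkingMemHandle):
//
//     auto handleA = runtime->CreateWorkingMemHandle(netId);
//     auto handleB = runtime->CreateWorkingMemHandle(netId);
//     std::thread t1([&] { runtime->Execute(*handleA, inputsA, outputsA); });
//     std::thread t2([&] { runtime->Execute(*handleB, inputsB, outputsB); });
//     t1.join();
//     t2.join();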
2196 
2197 void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
2198 {
2199     for (auto&& workloadPtr: m_WorkloadQueue)
2200     {
2201         workloadPtr->RegisterDebugCallback(func);
2202     }
2203 }
2204 
2205 
2206 void LoadedNetwork::CreateMemoryProfileAsync()
2207 {
2208     struct PartialBlock
2209     {
2210         unsigned int m_StartOfLife;
2211         unsigned int m_Lifetime;
2212 
2213         size_t m_MemSize;
2214         unsigned int m_Index;
2215 
2216         BackendId m_BackendId;
2217     };
2218 
2219     auto align = [](size_t numToAlign)
2220     {
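        // Round numToAlign up to the next multiple of the alignment;
        // with alignment == sizeof(float) == 4: align(10) == 12, align(8) == 8.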
2221         const size_t alignment = sizeof(float);
2222         return ((numToAlign + alignment - 1) / alignment) * alignment;
2223     };
2224 
2225     std::unordered_map<const OutputSlot*, PartialBlock> memBlockTrackerMap;
2226 
2227     const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
2228     const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
2229 
2230     unsigned int timestep = 0;
2231     unsigned int outputIndex = 0;
2232     Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
2233 
2234     for (auto&& layer : order)
2235     {
2236         const LayerType& layerType = layer->GetType();
2237         // Don't manage memory if importing.
2238         if (layerType == LayerType::Input && inputImportingEnabled)
2239         {
2240             continue;
2241         }
2242         // Don't manage memory if importing.
2243         if (layerType == LayerType::Output && outputImportingEnabled
2244             && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
2245         {
2246             continue;
2247         }
2248         // Because Constant Layer memory cannot be shared, it must persist for the lifetime of execution;
2249         // its management is done separately.
2250         if (layerType == LayerType::Constant)
2251         {
2252             continue;
2253         }
2254 
2255         BackendId backendId = layer->GetBackendId();
2256         for (auto& outputSlot : layer->GetOutputSlots())
2257         {
2258             if (!m_SupportsExternallyManagedMemory[backendId])
2259             {
2260                 continue;
2261             }
2262 
2263             PartialBlock partialBlock;
2264 
2265             partialBlock.m_StartOfLife = timestep;
2266 
2267             size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
2268             partialBlock.m_MemSize = alignedSize;
2269             partialBlock.m_Index = outputIndex++;
2270             partialBlock.m_Lifetime = outputSlot.GetNumConnections();
2271             partialBlock.m_BackendId = backendId;
2272 
2273             if (partialBlock.m_Lifetime == 0)
2274             {
2275                 m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
2276                                                                      partialBlock.m_StartOfLife,
2277                                                                      partialBlock.m_MemSize,
2278                                                                      0,
2279                                                                      partialBlock.m_Index);
2280             }
2281             else
2282             {
2283                 memBlockTrackerMap[&outputSlot] = partialBlock;
2284             }
2285         }
2286 
2287         for (auto& inputSlot : layer->GetInputSlots())
2288         {
2289             const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
2290             const LayerType& owningLayerType = connectedInputLayer.GetType();
2291 
2292             if (owningLayerType == LayerType::Constant)
2293             {
2294                 continue;
2295             }
2296             if (inputImportingEnabled && owningLayerType == LayerType::Input)
2297             {
2298                 continue;
2299             }
2300 
            // Mirror the check in CreateMemoryProfile: skip producers whose backend does not
            // use externally managed memory, otherwise the .at() lookup below would throw.
            if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()])
            {
                continue;
            }
2301             auto outputSlot = inputSlot.GetConnectedOutputSlot();
2302 
2303             PartialBlock& partialBlock = memBlockTrackerMap.at(outputSlot);
2304 
2305             auto& lifetime = partialBlock.m_Lifetime;
2306             --lifetime;
2307 
2308             if (lifetime == 0)
2309             {
2310                 m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
2311                                                                      timestep,
2312                                                                      partialBlock.m_MemSize,
2313                                                                      0,
2314                                                                      partialBlock.m_Index);
2315             }
2316         }
2317         ++timestep;
2318     }
2319 }
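// Worked example of the bookkeeping above: an OutputSlot produced at timestep 2
// with two readers starts as { m_StartOfLife = 2, m_Lifetime = 2 }. Each consuming
// InputSlot decrements m_Lifetime; when the second consumer runs at, say,
// timestep 5, the lifetime reaches 0 and the block [2, 5] is emitted into
// m_MemBlockMap, so its memory can be reused by later, non-overlapping blocks.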
2320 
2321 void LoadedNetwork::CreateMemoryProfile()
2322 {
2323     // Finds the first TensorHandle ancestor of a SubTensorHandle. If the ITensorHandle provided
2324     // is a TensorHandle, the function just returns it
2325     auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle)
2326     {
2327         ITensorHandle* ancestor = subTensorHandle;
2328         while (ancestor && ancestor->GetParent())
2329         {
2330             ancestor = ancestor->GetParent();
2331         }
2332         return ancestor;
2333     };
2334 
2335     struct PartialBlock
2336     {
2337         unsigned int m_StartOfLife;
2338         unsigned int m_Lifetime;
2339 
2340         size_t m_MemSize;
2341         unsigned int m_Index;
2342 
2343         BackendId m_BackendId;
2344     };
2345 
2346     auto align = [](size_t numToAlign)
2347     {
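        // Same rounding helper as in CreateMemoryProfileAsync:
        // align(n) rounds n up to the next multiple of sizeof(float).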
2348         const size_t alignment = sizeof(float);
2349         return ((numToAlign + alignment - 1) / alignment) * alignment;
2350     };
2351 
2352     std::unordered_map<ITensorHandle*, PartialBlock> memBlockTrackerMap;
2353 
2354     const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
2355     const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
2356 
2357     unsigned int timestep = 0;
2358     unsigned int outputIndex = 0;
2359     Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
2360 
2361     for (auto&& layer : order)
2362     {
2363         const LayerType& layerType = layer->GetType();
2364         // Don't manage memory if importing.
2365         if (layerType == LayerType::Input && inputImportingEnabled)
2366         {
2367             continue;
2368         }
2369         // Don't manage memory if importing.
2370         if (layerType == LayerType::Output && outputImportingEnabled
2371             && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
2372         {
2373             continue;
2374         }
2375         // Because Constant Layer memory cannot be shared, it must persist for the lifetime of execution;
2376         // its management is done separately.
2377         if (layerType == LayerType::Constant)
2378         {
2379             continue;
2380         }
2381 
2382         BackendId backendId = layer->GetBackendId();
2383         for (auto& outputSlot : layer->GetOutputSlots())
2384         {
2385             if (!m_SupportsExternallyManagedMemory[backendId])
2386             {
2387                 continue;
2388             }
2389 
2390             ITensorHandle* tensorHandle = outputSlot.GetOutputHandler().GetData();
2391             tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
2392 
2393             if (memBlockTrackerMap.find(tensorHandle) == memBlockTrackerMap.end())
2394             {
2395                 PartialBlock partialBlock;
2396 
2397                 partialBlock.m_StartOfLife = timestep;
2398 
2399                 size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
2400                 partialBlock.m_MemSize = alignedSize;
2401                 partialBlock.m_Index = outputIndex++;
2402                 partialBlock.m_Lifetime = outputSlot.GetNumConnections();
2403                 partialBlock.m_BackendId = backendId;
2404 
2405                 if (partialBlock.m_Lifetime == 0)
2406                 {
2407                     m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
2408                                                                          partialBlock.m_StartOfLife,
2409                                                                          partialBlock.m_MemSize,
2410                                                                          0,
2411                                                                          partialBlock.m_Index);
2412                 }
2413                 else
2414                 {
2415                     memBlockTrackerMap[tensorHandle] = partialBlock;
2416                 }
2417                 m_Tensorhandles.push_back(tensorHandle);
2418 
2419             }
2420             else
2421             {
2422                 memBlockTrackerMap.at(tensorHandle).m_Lifetime += outputSlot.GetNumConnections();
2423             }
2424         }
2425 
2426         for (auto& inputSlot : layer->GetInputSlots())
2427         {
2428             const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
2429             const LayerType& owningLayerType = connectedInputLayer.GetType();
2430 
2431             if (owningLayerType == LayerType::Constant)
2432             {
2433                 continue;
2434             }
2435             if (inputImportingEnabled && owningLayerType == LayerType::Input)
2436             {
2437                 continue;
2438             }
2439             if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()])
2440             {
2441                 continue;
2442             }
2443 
2444             auto outputSlot = inputSlot.GetConnectedOutputSlot();
2445 
2446             ITensorHandle* tensorHandle = outputSlot->GetOutputHandler().GetData();
2447             tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
2448 
2449             PartialBlock& partialBlock = memBlockTrackerMap.at(tensorHandle);
2450 
2451             auto& lifetime = partialBlock.m_Lifetime;
2452             --lifetime;
2453 
2454             if (lifetime == 0)
2455             {
2456                 m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
2457                                                                      timestep,
2458                                                                      partialBlock.m_MemSize,
2459                                                                      0,
2460                                                                      partialBlock.m_Index);
2461             }
2462         }
2463         ++timestep;
2464     }
2465 
2466 }
2467 
2468 std::unique_ptr<MemoryManager> LoadedNetwork::CreateExternalMemoryManger(
2469         std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>>& tensorMemoryVec)
2470 {
2471     std::unique_ptr<MemoryManager> memoryManager = std::make_unique<MemoryManager>();
2472     auto allocatorMap = BackendRegistryInstance().GetAllocators();
2473 
2474     for (auto& backend : m_MemBinMap)
2475     {
2476         std::vector<BufferStorage> bufferStorageVec;
2477 
2478         std::shared_ptr<ICustomAllocator> backendAllocator;
2479         if (allocatorMap.find(backend.first) != allocatorMap.end())
2480         {
2481             backendAllocator = allocatorMap[backend.first];
2482         }
2483         else
2484         {
2485             backendAllocator = m_Backends[backend.first]->GetDefaultAllocator();
2486         }
2487 
2488         for (auto& memBin : backend.second)
2489         {
2490             BufferStorage bufferStorage;
2491             bufferStorage.m_BufferSize = memBin.m_MemSize;
2492             bufferStorage.m_TensorMemoryVector.reserve(memBin.m_MemBlocks.size());
2493 
2494             for (auto& memBlock : memBin.m_MemBlocks)
2495             {
2496                 auto tensorMemory = std::make_shared<TensorMemory>(TensorMemory{memBlock.m_Offset, memBlock.m_Index});
2497 
2498                 tensorMemoryVec.emplace_back(tensorMemory, backendAllocator->GetMemorySourceType());
2499                 bufferStorage.m_TensorMemoryVector.emplace_back(tensorMemory);
2500             }
2501 
2502             bufferStorageVec.emplace_back(std::move(bufferStorage));
2503         }
2504 
2505         memoryManager->StoreMemToAllocate(bufferStorageVec, backendAllocator, 4);
2506     }
2507 
2508     return memoryManager;
2509 }
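// Illustrative shape of what this builds (hedged: field names follow the
// TensorMemory/BufferStorage structs as used above; the offsets are made up):
// each MemBin becomes one BufferStorage of m_BufferSize bytes, and every contained
// TensorMemory records its block's offset within that buffer, e.g.
//
//     BufferStorage { m_BufferSize = 1024,
//                     m_TensorMemoryVector = { { m_Offset = 0,   m_OutputSlotId = 3 },
//                                              { m_Offset = 512, m_OutputSlotId = 7 } } }
//
// The trailing '4' passed to StoreMemToAllocate is taken here to be the type
// alignment in bytes, matching the sizeof(float) used by the align() helpers.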
2510 
2511 LayerBindingId LoadedNetwork::ValidateImportedInputID(ImportedInputId id)
2512 {
2513     try
2514     {
2515         const auto& importedTensorHandlePin = m_PreImportedInputHandles.at(id);
2516         if (!importedTensorHandlePin.m_TensorHandle)
2517         {
2518             throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: "
2519                                                        "PreImportedInput: {} has been deleted", id));
2520         }
2521         return importedTensorHandlePin.m_LayerBindingId;
2522     }
2523     catch (const std::out_of_range&)
2524     {
2525         throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: Unknown ImportedInputId: {}", id));
2526     }
2527 }
2528 
2529 LayerBindingId LoadedNetwork::ValidateImportedOutputID(ImportedOutputId id)
2530 {
2531     try
2532     {
2533         const auto& importedTensorHandlePin = m_PreImportedOutputHandles.at(id);
2534         if (!importedTensorHandlePin.m_TensorHandle)
2535         {
2536             throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: "
2537                                                        "PreImportedOutput: {} has been deleted", id));
2538         }
2539         return importedTensorHandlePin.m_LayerBindingId;
2540     }
2541     catch (const std::out_of_range&)
2542     {
2543         throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: Unknown ImportedOutputId: {}", id));
2544     }
2545 }
2546 
2547 }
2548