//
// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "NeonBackend.hpp"
#include "NeonBackendId.hpp"
#include "NeonBackendModelContext.hpp"
#include "NeonWorkloadFactory.hpp"
#include "NeonLayerSupport.hpp"
#include "NeonTensorHandleFactory.hpp"

#include <armnn/BackendRegistry.hpp>
#include <armnn/Descriptors.hpp>

#include <aclCommon/ArmComputeSubgraphUtils.hpp>
#include <aclCommon/ArmComputeUtils.hpp>
#include <aclCommon/BaseMemoryManager.hpp>

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>

#include <armnn/utility/PolymorphicDowncast.hpp>

#include <neon/workloads/NeonAdditionWorkload.hpp>
#include <neon/workloads/NeonBatchNormalizationWorkload.hpp>
#include <neon/workloads/NeonConvolution2dWorkload.hpp>
#include <neon/workloads/NeonDepthwiseConvolutionWorkload.hpp>
#include <neon/workloads/NeonDivisionWorkload.hpp>
#include <neon/workloads/NeonFullyConnectedWorkload.hpp>
#include <neon/workloads/NeonMultiplicationWorkload.hpp>
#include <neon/workloads/NeonReduceWorkload.hpp>
#include <neon/workloads/NeonSubtractionWorkload.hpp>
#include <backendsCommon/DefaultAllocator.hpp>

#include <Optimizer.hpp>

#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/Allocator.h>

namespace armnn
{

const BackendId& NeonBackend::GetIdStatic()
{
    static const BackendId s_Id{NeonBackendId()};
    return s_Id;
}

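// The Neon memory manager is backed by Arm Compute Library's CPU allocator and uses offset-based memory affinity.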
IBackendInternal::IMemoryManagerUniquePtr NeonBackend::CreateMemoryManager() const
{
    return std::make_unique<NeonMemoryManager>(std::make_unique<arm_compute::Allocator>(),
        BaseMemoryManager::MemoryAffinity::Offset);
}

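// CreateWorkloadFactory overloads: the factory is built either from an existing shared memory manager or from a
// tensor handle factory registry, in each case with or without model options.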
IBackendInternal::IWorkloadFactoryPtr NeonBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<NeonWorkloadFactory>(
        PolymorphicPointerDowncast<NeonMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr NeonBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager, const ModelOptions& modelOptions) const
{
    return std::make_unique<NeonWorkloadFactory>(
        PolymorphicPointerDowncast<NeonMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
}

IBackendInternal::IWorkloadFactoryPtr NeonBackend::CreateWorkloadFactory(
    class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry) const
{
    auto memoryManager = std::make_shared<NeonMemoryManager>(std::make_unique<arm_compute::Allocator>(),
        BaseMemoryManager::MemoryAffinity::Offset);

    tensorHandleFactoryRegistry.RegisterMemoryManager(memoryManager);

    auto factory = std::make_unique<NeonTensorHandleFactory>(memoryManager);
    // Register copy and import factory pair
    tensorHandleFactoryRegistry.RegisterCopyAndImportFactoryPair(factory->GetId(), factory->GetId());
    // Register the factory
    tensorHandleFactoryRegistry.RegisterFactory(std::move(factory));

    return std::make_unique<NeonWorkloadFactory>(
        PolymorphicPointerDowncast<NeonMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr NeonBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& tensorHandleFactoryRegistry, const ModelOptions& modelOptions) const
{
    auto memoryManager = std::make_shared<NeonMemoryManager>(std::make_unique<arm_compute::Allocator>(),
        BaseMemoryManager::MemoryAffinity::Offset);

    tensorHandleFactoryRegistry.RegisterMemoryManager(memoryManager);

    auto factory = std::make_unique<NeonTensorHandleFactory>(memoryManager);
    // Register copy and import factory pair
    tensorHandleFactoryRegistry.RegisterCopyAndImportFactoryPair(factory->GetId(), factory->GetId());
    // Register the factory
    tensorHandleFactoryRegistry.RegisterFactory(std::move(factory));

    return std::make_unique<NeonWorkloadFactory>(
        PolymorphicPointerDowncast<NeonMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
}

IBackendInternal::IBackendContextPtr NeonBackend::CreateBackendContext(const IRuntime::CreationOptions&) const
{
    return IBackendContextPtr{};
}

IBackendInternal::IBackendProfilingContextPtr NeonBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::IBackendSpecificModelContextPtr NeonBackend::CreateBackendSpecificModelContext(
    const ModelOptions& modelOptions) const
{
    return IBackendSpecificModelContextPtr{new NeonBackendModelContext{modelOptions}};
}

IBackendInternal::ILayerSupportSharedPtr NeonBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport
    {
        new NeonLayerSupport(IBackendInternal::IBackendSpecificModelContextPtr{})
    };
    return layerSupport;
}

IBackendInternal::ILayerSupportSharedPtr NeonBackend::GetLayerSupport(const ModelOptions& modelOptions) const
{
    static ILayerSupportSharedPtr layerSupport
    {
        new NeonLayerSupport(CreateBackendSpecificModelContext(modelOptions))
    };
    return layerSupport;
}

OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
    const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

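    // Record every layer in the subgraph up front; fused layers are erased from this map below,
    // so the remainder can be reported back as untouched.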
    auto it = subgraph.endIConnectable();
    std::map<LayerGuid, Layer*> untouched;

    while (it != subgraph.beginIConnectable())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

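    // Walk the subgraph from back to front, fusing Activation layers into supported preceding layers
    // and splitting multi-axis Reduce layers.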
    it = subgraph.endIConnectable();
    while (it != subgraph.beginIConnectable())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));

        // Fuse activation into previous layer if supported by backend
        if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d
            || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected
            || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication
            || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division
            || base.GetType() == LayerType::ElementwiseBinary)
            && (base.GetAdditionalInformation<ActivationDescriptor>() == nullptr))
        {
            for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output)
            {
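                // An Activation layer can only be fused when it is the sole consumer of this output.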
                if (output->GetNumConnections() == 1)
                {
                    for (auto&& childInput : output->GetConnections())
                    {
                        if ((childInput->GetOwningLayer().GetType() == LayerType::Activation) &&
                            (checkDataTypeInputandOutput(childInput->GetOwningLayer())))
                        {
                            Layer& child = childInput->GetOwningLayer();

                            auto* activationLayer = PolymorphicDowncast<ActivationLayer*>(&child);

                            const std::string name = std::string("fused-") + child.GetName() + std::string("-into-") +
                                base.GetName();

                            // Get params from activation layer
                            ActivationDescriptor activationDesc = activationLayer->GetParameters();

                            if (base.GetType() == LayerType::Convolution2d)
                            {
                                Convolution2dLayer* baseLayer = PolymorphicDowncast<Convolution2dLayer*>(&base);

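                                // As bias is optional only try to get TensorInfo from input if bias is enabled.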
                                Optional<TensorInfo> biases;

                                if (baseLayer->GetParameters().m_BiasEnabled)
                                {
                                    biases = baseLayer->GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                                }

                                arm_compute::Status status = NeonConvolution2dWorkloadValidate(
                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    baseLayer->GetParameters(),
                                    baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                    biases,
                                    false,
                                    &activationDesc);

                                if (status)
                                {
                                    FuseConvolution2dLayer<Convolution2dLayer>(optimizationViews,
                                        baseLayer,
                                        activationLayer,
                                        activationDesc,
                                        name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::DepthwiseConvolution2d)
                            {
                                DepthwiseConvolution2dLayer* baseLayer =
                                    PolymorphicDowncast<DepthwiseConvolution2dLayer*>(&base);

                                Optional<TensorInfo> biases;

                                if (baseLayer->GetParameters().m_BiasEnabled)
                                {
                                    biases = baseLayer->GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                                }

                                arm_compute::Status status = NeonDepthwiseConvolutionWorkloadValidate(
                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    baseLayer->GetParameters(),
                                    baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                    biases,
                                    &activationDesc);

                                if (status)
                                {
                                    FuseDepthwiseConvolution2dLayer<DepthwiseConvolution2dLayer>(optimizationViews,
                                        baseLayer,
                                        activationLayer,
                                        activationDesc,
                                        name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::FullyConnected)
                            {
                                FullyConnectedLayer* baseLayer = PolymorphicDowncast<FullyConnectedLayer*>(&base);
                                FullyConnectedDescriptor descriptor = baseLayer->GetParameters();

                                // As bias is optional only try to get TensorInfo from input if bias is enabled.
                                Optional<TensorInfo> biases;
                                if (descriptor.m_BiasEnabled)
                                {
                                    biases = baseLayer->GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                                }

                                arm_compute::Status status = NeonFullyConnectedWorkloadValidate(
                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                    biases,
                                    baseLayer->GetParameters(),
                                    &activationDesc);

                                if (status)
                                {
                                    FuseFullyConnectedLayer<FullyConnectedLayer>(optimizationViews,
                                        baseLayer,
                                        activationLayer,
                                        activationDesc,
                                        name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::BatchNormalization)
                            {
                                BatchNormalizationLayer* baseLayer =
                                    PolymorphicDowncast<BatchNormalizationLayer*>(&base);

                                arm_compute::Status status = NeonBatchNormalizationValidate(
                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    baseLayer->m_Mean->GetTensorInfo(),
                                    baseLayer->m_Variance->GetTensorInfo(),
                                    baseLayer->m_Beta->GetTensorInfo(),
                                    baseLayer->m_Gamma->GetTensorInfo(),
                                    baseLayer->GetParameters(),
                                    &activationDesc);

                                if (status)
                                {
                                    BatchNormalizationLayer* replacementLayer =
                                        FuseBatchNormalizationLayer<BatchNormalizationLayer>(optimizationViews,
                                            baseLayer,
                                            activationLayer,
                                            activationDesc,
                                            name);

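                                    // Hand the batch norm constant tensors over to the fused replacement layer.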
                                    replacementLayer->m_Beta = std::move(baseLayer->m_Beta);
                                    replacementLayer->m_Gamma = std::move(baseLayer->m_Gamma);
                                    replacementLayer->m_Mean = std::move(baseLayer->m_Mean);
                                    replacementLayer->m_Variance = std::move(baseLayer->m_Variance);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Addition)
                            {
                                AdditionLayer* baseLayer = PolymorphicDowncast<AdditionLayer*>(&base);

                                arm_compute::Status status = NeonAdditionWorkloadValidate(
                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                    activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    &activationDesc);

                                if (status)
                                {
                                    FuseAdditionLayer<AdditionLayer>(optimizationViews,
                                        baseLayer,
                                        activationLayer,
                                        activationDesc,
                                        name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Division)
                            {
                                DivisionLayer* baseLayer = PolymorphicDowncast<DivisionLayer*>(&base);

                                arm_compute::Status status = NeonDivisionWorkloadValidate(
                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                    activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    &activationDesc);

                                if (status)
                                {
                                    FuseDivisionLayer<DivisionLayer>(optimizationViews,
                                        baseLayer,
                                        activationLayer,
                                        activationDesc,
                                        name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Multiplication)
                            {
                                MultiplicationLayer* baseLayer = PolymorphicDowncast<MultiplicationLayer*>(&base);

                                arm_compute::Status status = NeonMultiplicationWorkloadValidate(
                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                    activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    &activationDesc);

                                if (status)
                                {
                                    FuseMultiplicationLayer<MultiplicationLayer>(optimizationViews,
                                        baseLayer,
                                        activationLayer,
                                        activationDesc,
                                        name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Subtraction)
                            {
                                SubtractionLayer* baseLayer = PolymorphicDowncast<SubtractionLayer*>(&base);

                                arm_compute::Status status = NeonSubtractionWorkloadValidate(
                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                    activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                    &activationDesc);

                                if (status)
                                {
                                    FuseSubtractionLayer<SubtractionLayer>(optimizationViews,
                                        baseLayer,
                                        activationLayer,
                                        activationDesc,
                                        name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::ElementwiseBinary)
                            {
                                ElementwiseBinaryLayer* baseLayer = PolymorphicDowncast<ElementwiseBinaryLayer*>(&base);

                                if (baseLayer->GetParameters().m_Operation == BinaryOperation::Add)
                                {
                                    arm_compute::Status status = NeonAdditionWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                    if (status)
                                    {
                                        FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
                                            baseLayer,
                                            activationLayer,
                                            activationDesc,
                                            BinaryOperation::Add,
                                            name);
                                        untouched.erase(baseLayer->GetGuid());
                                        untouched.erase(activationLayer->GetGuid());
                                    }
                                }
                                else if (baseLayer->GetParameters().m_Operation == BinaryOperation::Div)
                                {
                                    arm_compute::Status status = NeonDivisionWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                    if (status)
                                    {
                                        FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
                                            baseLayer,
                                            activationLayer,
                                            activationDesc,
                                            BinaryOperation::Div,
                                            name);
                                        untouched.erase(baseLayer->GetGuid());
                                        untouched.erase(activationLayer->GetGuid());
                                    }
                                }
                                else if (baseLayer->GetParameters().m_Operation == BinaryOperation::Mul)
                                {
                                    arm_compute::Status status = NeonMultiplicationWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                    if (status)
                                    {
                                        FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
                                            baseLayer,
                                            activationLayer,
                                            activationDesc,
                                            BinaryOperation::Mul,
                                            name);
                                        untouched.erase(baseLayer->GetGuid());
                                        untouched.erase(activationLayer->GetGuid());
                                    }
                                }
                                else if (baseLayer->GetParameters().m_Operation == BinaryOperation::Sub)
                                {
                                    arm_compute::Status status = NeonSubtractionWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                    if (status)
                                    {
                                        FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
                                            baseLayer,
                                            activationLayer,
                                            activationDesc,
                                            BinaryOperation::Sub,
                                            name);
                                        untouched.erase(baseLayer->GetGuid());
                                        untouched.erase(activationLayer->GetGuid());
                                    }
                                }
                                // No fusion available for other BinaryOperations
                            }
                        }
                    }
                }
            }
        }

        // Separate reduce layer with multiple axes into multiple reduce layers with 1 axis.
        if (base.GetType() == LayerType::Reduce)
        {
            ReduceLayer* baseLayer = PolymorphicDowncast<ReduceLayer*>(&base);
            ReduceDescriptor reduceDescriptor = baseLayer->GetParameters();

            if (!reduceDescriptor.m_vAxis.empty() && reduceDescriptor.m_vAxis.size() > 1)
            {
                // Add new layers to the graph and connect them.
                std::vector<IConnectableLayer*> layers = ChainReduceLayers<ReduceLayer>(optimizationViews,
                    baseLayer,
                    reduceDescriptor);

                // Replace existing baselayer with new subgraph.
                ReplaceLayers<ReduceLayer>(optimizationViews, baseLayer, layers);
                untouched.erase(baseLayer->GetGuid());
            }
        }
    }

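    // If nothing was fused, hand the whole subgraph back untouched; otherwise report the layers
    // that were not part of any substitution.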
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

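// The Neon backend offers a single tensor handle factory, so the preference list has exactly one entry.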
std::vector<ITensorHandleFactory::FactoryId> NeonBackend::GetHandleFactoryPreferences() const
{
    return { NeonTensorHandleFactory::GetIdStatic() };
}

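// Register this backend's memory manager and tensor handle factory with the registry; the same
// factory serves as both the copy and the import factory.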
void NeonBackend::RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry)
{
    auto memoryManager = std::make_shared<NeonMemoryManager>(std::make_unique<arm_compute::Allocator>(),
        BaseMemoryManager::MemoryAffinity::Offset);

    registry.RegisterMemoryManager(memoryManager);

    auto factory = std::make_unique<NeonTensorHandleFactory>(memoryManager);
    // Register copy and import factory pair
    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), factory->GetId());
    // Register the factory
    registry.RegisterFactory(std::move(factory));
}

std::unique_ptr<ICustomAllocator> NeonBackend::GetDefaultAllocator() const
{
    return std::make_unique<DefaultAllocator>();
}

} // namespace armnn