//
// Copyright © 2017-2023 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "ClBackend.hpp"
#include "ClBackendContext.hpp"
#include "ClBackendDefaultAllocator.hpp"
#include "ClBackendId.hpp"
#include "ClBackendModelContext.hpp"
#include "ClImportTensorHandleFactory.hpp"
#include "ClLayerSupport.hpp"
#include "ClTensorHandleFactory.hpp"
#include "ClWorkloadFactory.hpp"

#include <armnn/BackendRegistry.hpp>
#include <armnn/Descriptors.hpp>

#include <aclCommon/ArmComputeSubgraphUtils.hpp>
#include <aclCommon/ArmComputeUtils.hpp>
#include <aclCommon/BaseMemoryManager.hpp>

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <armnn/utility/PolymorphicDowncast.hpp>

#include "workloads/ClAdditionWorkload.hpp"
#include "workloads/ClBatchNormalizationFloatWorkload.hpp"
#include "workloads/ClConvolution2dWorkload.hpp"
#include "workloads/ClDepthwiseConvolutionWorkload.hpp"
#include "workloads/ClDivisionWorkload.hpp"
#include "workloads/ClFullyConnectedWorkload.hpp"
#include "workloads/ClMultiplicationWorkload.hpp"
#include "workloads/ClReduceWorkload.hpp"
#include "workloads/ClSubtractionWorkload.hpp"

#include <Optimizer.hpp>

#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

namespace armnn
{

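// Returns the identifier under which this backend is registered with the BackendRegistry.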
const BackendId& ClBackend::GetIdStatic()
{
    static const BackendId s_Id{ClBackendId()};
    return s_Id;
}

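// Creates the backend's memory manager, backed either by the user-provided custom allocator
// or by Arm Compute Library's default CLBufferAllocator.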
IBackendInternal::IMemoryManagerUniquePtr ClBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<ClMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

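// The CreateWorkloadFactory overloads below all produce a ClWorkloadFactory; they differ in
// whether a memory manager is supplied, whether backend-specific model options are attached,
// and whether tensor handle factories are registered as a side effect.
//
// A minimal usage sketch (hypothetical caller code, not part of this file):
//
//     armnn::ClBackend backend;
//     armnn::IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend.CreateMemoryManager();
//     armnn::IBackendInternal::IWorkloadFactoryPtr factory = backend.CreateWorkloadFactory(memoryManager);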
IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager, const ModelOptions& modelOptions) const
{
    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
}

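// Registry-based overloads: build the memory manager, pair a copy factory
// (ClTensorHandleFactory) with an import factory (ClImportTensorHandleFactory) in both
// directions, register everything with the registry, then return the workload factory.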
IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc));

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));

    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry, const ModelOptions& modelOptions) const
{
    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc));

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));

    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
}

IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions& modelOptions,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // If inputFlags/outputFlags are Undefined, default them to Malloc so that force import
    // remains possible.
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        inputFlags, outputFlags);

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));

    return std::make_unique<ClWorkloadFactory>(
        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
}

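// Preference order for tensor handle factories: the standard copy factory first, then the
// import factory.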
std::vector<ITensorHandleFactory::FactoryId> ClBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> {ClTensorHandleFactory::GetIdStatic(),
                                                         ClImportTensorHandleFactory::GetIdStatic()};
}

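// Registers the same copy/import factory pair as the registry-based CreateWorkloadFactory
// overloads, for callers that need the factories without creating a workload factory.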
void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc));

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));
}

void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                              MemorySourceFlags inputFlags,
                                              MemorySourceFlags outputFlags)
{
    // If inputFlags/outputFlags are Undefined, default them to Malloc so that force import
    // remains possible.
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    std::shared_ptr<ClMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<ClTensorHandleFactory>(memoryManager);
    std::unique_ptr<ITensorHandleFactory> importFactory = std::make_unique<ClImportTensorHandleFactory>(
        inputFlags, outputFlags);

    registry.RegisterCopyAndImportFactoryPair(factory->GetId(), importFactory->GetId());
    registry.RegisterCopyAndImportFactoryPair(importFactory->GetId(), factory->GetId());

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
    registry.RegisterFactory(std::move(importFactory));
}

IBackendInternal::IBackendContextPtr ClBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new ClBackendContext{options}};
}

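// The CL backend does not provide a profiling context; an empty pointer is returned.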
IBackendInternal::IBackendProfilingContextPtr ClBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::IBackendSpecificModelContextPtr ClBackend::CreateBackendSpecificModelContext(
    const ModelOptions& modelOptions) const
{
    return IBackendSpecificModelContextPtr{new ClBackendModelContext{modelOptions}};
}

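// Note: both GetLayerSupport overloads cache their ClLayerSupport object in a function-local
// static, so only the arguments of the first call take effect.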
IBackendInternal::ILayerSupportSharedPtr ClBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport
    {
        new ClLayerSupport(IBackendInternal::IBackendSpecificModelContextPtr{})
    };
    return layerSupport;
}

IBackendInternal::ILayerSupportSharedPtr ClBackend::GetLayerSupport(const ModelOptions& modelOptions) const
{
    static ILayerSupportSharedPtr layerSupport
    {
        new ClLayerSupport(CreateBackendSpecificModelContext(modelOptions))
    };
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> ClBackend::GetDefaultAllocator() const
{
    return std::make_unique<ClBackendDefaultAllocator>();
}

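// Backend-specific optimization pass. Every layer in the subgraph is first recorded in
// 'untouched'; entries are erased as layers are fused or replaced, and whatever remains is
// reported back so the optimizer keeps those layers as they are. Three rewrites are applied:
//  * fuse a following Activation layer into supported compute layers, validating each
//    candidate fusion against the corresponding CL workload first;
//  * split a Reduce layer with multiple axes into a chain of single-axis Reduce layers;
//  * fold a preceding Pad layer into AveragePooling2d where supported.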
OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                  const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    auto it = subgraph.endIConnectable();
    bool isFastMathEnabled = false;
    std::map<LayerGuid, Layer*> untouched;

    while (it != subgraph.beginIConnectable())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    it = subgraph.endIConnectable();
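    // Fast math is a CL-specific model option, so it can only be queried when the backend is
    // built with Arm Compute Library support.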
#if defined(ARMCOMPUTECL_ENABLED)
    IBackendInternal::IBackendSpecificModelContextPtr modelContextPtr = CreateBackendSpecificModelContext(modelOptions);

    if (modelContextPtr)
    {
        auto clModelOptions = dynamic_cast<ClBackendModelContext*>(modelContextPtr.get());
        if (clModelOptions)
        {
            isFastMathEnabled = clModelOptions->IsFastMathEnabled();
        }
    }
#endif
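    // Walk the subgraph from the last layer back to the first, attempting the rewrites above.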
    while (it != subgraph.beginIConnectable())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));

        // Fuse activation into previous layer if supported by backend
        if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d
            || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected
            || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication
            || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division
            || base.GetType() == LayerType::ElementwiseBinary)
            && (base.GetAdditionalInformation<ActivationDescriptor>() == nullptr))
        {
            for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output)
            {
                if (output->GetNumConnections() == 1)
                {
                    for (auto&& childInput : output->GetConnections())
                    {
                        if ((childInput->GetOwningLayer().GetType() == LayerType::Activation) &&
                            (checkDataTypeInputandOutput(childInput->GetOwningLayer())))
                        {
                            Layer& child = childInput->GetOwningLayer();

                            auto* activationLayer = PolymorphicDowncast<ActivationLayer*>(&child);

                            const std::string name = std::string("fused-") + child.GetName() + std::string("-into-") +
                                                     base.GetName();

                            // Get params from activation layer
                            ActivationDescriptor activationDesc = activationLayer->GetParameters();

                            if (base.GetType() == LayerType::Convolution2d)
                            {
                                Convolution2dLayer* baseLayer = PolymorphicDowncast<Convolution2dLayer*>(&base);

                                Optional<TensorInfo> biases;

                                if (baseLayer->GetParameters().m_BiasEnabled)
                                {
                                    biases = baseLayer->GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                                }

                                arm_compute::Status status = ClConvolution2dWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetParameters(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        biases,
                                        isFastMathEnabled,
                                        &activationDesc);

                                if (status)
                                {
                                    FuseConvolution2dLayer<Convolution2dLayer>(optimizationViews,
                                                                               baseLayer,
                                                                               activationLayer,
                                                                               activationDesc,
                                                                               name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::DepthwiseConvolution2d)
                            {
                                DepthwiseConvolution2dLayer* baseLayer =
                                        PolymorphicDowncast<DepthwiseConvolution2dLayer*>(&base);

                                Optional<TensorInfo> biases;

                                if (baseLayer->GetParameters().m_BiasEnabled)
                                {
                                    biases = baseLayer->GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                                }

                                arm_compute::Status status = ClDepthwiseConvolutionWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetParameters(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        biases,
                                        &activationDesc);

                                if (status)
                                {
                                    FuseDepthwiseConvolution2dLayer<DepthwiseConvolution2dLayer>(optimizationViews,
                                                                                                 baseLayer,
                                                                                                 activationLayer,
                                                                                                 activationDesc,
                                                                                                 name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::FullyConnected)
                            {
                                FullyConnectedLayer* baseLayer = PolymorphicDowncast<FullyConnectedLayer*>(&base);
                                FullyConnectedDescriptor descriptor = baseLayer->GetParameters();
                                // Bias is optional, so only fetch the TensorInfo from the input when it is enabled.
                                Optional<TensorInfo> biases;
                                if (descriptor.m_BiasEnabled)
                                {
                                    biases = baseLayer->GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                                }

                                arm_compute::Status status = ClFullyConnectedWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        biases,
                                        baseLayer->GetParameters(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseFullyConnectedLayer<FullyConnectedLayer>(optimizationViews,
                                                                                 baseLayer,
                                                                                 activationLayer,
                                                                                 activationDesc,
                                                                                 name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::BatchNormalization)
                            {
                                BatchNormalizationLayer* baseLayer =
                                        PolymorphicDowncast<BatchNormalizationLayer*>(&base);

                                arm_compute::Status status = ClBatchNormalizationValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->m_Mean->GetTensorInfo(),
                                        baseLayer->m_Variance->GetTensorInfo(),
                                        baseLayer->m_Beta->GetTensorInfo(),
                                        baseLayer->m_Gamma->GetTensorInfo(),
                                        baseLayer->GetParameters(),
                                        &activationDesc);

                                if (status)
                                {
                                    BatchNormalizationLayer* replacementLayer =
                                        FuseBatchNormalizationLayer<BatchNormalizationLayer>(optimizationViews,
                                                                                             baseLayer,
                                                                                             activationLayer,
                                                                                             activationDesc,
                                                                                             name);

                                    replacementLayer->m_Beta     = std::move(baseLayer->m_Beta);
                                    replacementLayer->m_Gamma    = std::move(baseLayer->m_Gamma);
                                    replacementLayer->m_Mean     = std::move(baseLayer->m_Mean);
                                    replacementLayer->m_Variance = std::move(baseLayer->m_Variance);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Addition)
                            {
                                AdditionLayer* baseLayer = PolymorphicDowncast<AdditionLayer*>(&base);

                                arm_compute::Status status = ClAdditionValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseAdditionLayer<AdditionLayer>(optimizationViews,
                                                                     baseLayer,
                                                                     activationLayer,
                                                                     activationDesc,
                                                                     name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Division)
                            {
                                DivisionLayer* baseLayer = PolymorphicDowncast<DivisionLayer*>(&base);

                                arm_compute::Status status = ClDivisionWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseDivisionLayer<DivisionLayer>(optimizationViews,
                                                                     baseLayer,
                                                                     activationLayer,
                                                                     activationDesc,
                                                                     name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Multiplication)
                            {
                                MultiplicationLayer* baseLayer = PolymorphicDowncast<MultiplicationLayer*>(&base);

                                arm_compute::Status status = ClMultiplicationWorkloadValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseMultiplicationLayer<MultiplicationLayer>(optimizationViews,
                                                                                 baseLayer,
                                                                                 activationLayer,
                                                                                 activationDesc,
                                                                                 name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::Subtraction)
                            {
                                SubtractionLayer* baseLayer = PolymorphicDowncast<SubtractionLayer*>(&base);

                                arm_compute::Status status = ClSubtractionValidate(
                                        baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                        activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                        &activationDesc);

                                if (status)
                                {
                                    FuseSubtractionLayer<SubtractionLayer>(optimizationViews,
                                                                           baseLayer,
                                                                           activationLayer,
                                                                           activationDesc,
                                                                           name);
                                    untouched.erase(baseLayer->GetGuid());
                                    untouched.erase(activationLayer->GetGuid());
                                }
                            }
                            else if (base.GetType() == LayerType::ElementwiseBinary)
                            {
                                ElementwiseBinaryLayer* baseLayer = PolymorphicDowncast<ElementwiseBinaryLayer*>(&base);

                                if (baseLayer->GetParameters().m_Operation == BinaryOperation::Add)
                                {
                                    arm_compute::Status status = ClAdditionValidate(
                                            baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                            baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                            activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                            &activationDesc);

                                    if (status)
                                    {
                                        FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
                                                                                           baseLayer,
                                                                                           activationLayer,
                                                                                           activationDesc,
                                                                                           BinaryOperation::Add,
                                                                                           name);
                                        untouched.erase(baseLayer->GetGuid());
                                        untouched.erase(activationLayer->GetGuid());
                                    }
                                }
                                else if (baseLayer->GetParameters().m_Operation == BinaryOperation::Div)
                                {
                                    arm_compute::Status status = ClDivisionWorkloadValidate(
                                            baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                            baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                            activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                            &activationDesc);

                                    if (status)
                                    {
                                        FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
                                                                                           baseLayer,
                                                                                           activationLayer,
                                                                                           activationDesc,
                                                                                           BinaryOperation::Div,
                                                                                           name);
                                        untouched.erase(baseLayer->GetGuid());
                                        untouched.erase(activationLayer->GetGuid());
                                    }
                                }
                                else if (baseLayer->GetParameters().m_Operation == BinaryOperation::Mul)
                                {
                                    arm_compute::Status status = ClMultiplicationWorkloadValidate(
                                            baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                            baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                            activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                            &activationDesc);

                                    if (status)
                                    {
                                        FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
                                                                                           baseLayer,
                                                                                           activationLayer,
                                                                                           activationDesc,
                                                                                           BinaryOperation::Mul,
                                                                                           name);
                                        untouched.erase(baseLayer->GetGuid());
                                        untouched.erase(activationLayer->GetGuid());
                                    }
                                }
                                else if (baseLayer->GetParameters().m_Operation == BinaryOperation::Sub)
                                {
                                    arm_compute::Status status = ClSubtractionValidate(
                                            baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                            baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(),
                                            activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
                                            &activationDesc);

                                    if (status)
                                    {
                                        FuseElementwiseBinaryLayer<ElementwiseBinaryLayer>(optimizationViews,
                                                                                           baseLayer,
                                                                                           activationLayer,
                                                                                           activationDesc,
                                                                                           BinaryOperation::Sub,
                                                                                           name);
                                        untouched.erase(baseLayer->GetGuid());
                                        untouched.erase(activationLayer->GetGuid());
                                    }
                                }
                                // No fusion available for other BinaryOperations
                            }
                        }
                    }
                }
            }
        }

        // Split a Reduce layer that has multiple axes into multiple Reduce layers with one axis each.
        if (base.GetType() == LayerType::Reduce)
        {
            ReduceLayer* baseLayer            = PolymorphicDowncast<ReduceLayer*>(&base);
            ReduceDescriptor reduceDescriptor = baseLayer->GetParameters();

            if (reduceDescriptor.m_vAxis.size() > 1)
            {
                // Add new layers to the graph and connect them.
                std::vector<IConnectableLayer*> layers = ChainReduceLayers<ReduceLayer>(optimizationViews,
                                                                                        baseLayer,
                                                                                        reduceDescriptor);

                // Replace the existing base layer with the new subgraph.
                ReplaceLayers<ReduceLayer>(optimizationViews, baseLayer, layers);
                untouched.erase(baseLayer->GetGuid());
            }
        }

        // Special case: fuse padding into AveragePooling2d for quantized data types.
        // This has to be done as a backend-specific optimization, as Neon does not support
        // this special case.
        if (base.GetType() == LayerType::Pooling2d)
        {
            Pooling2dLayer* baseLayer = PolymorphicDowncast<Pooling2dLayer*>(&base);
            Pooling2dDescriptor poolingDescriptor = baseLayer->GetParameters();

            if (baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayer().GetType() == LayerType::Pad)
            {
                PadLayer* padLayer = PolymorphicDowncast<PadLayer*>(
                    &baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayer());
                if (padLayer->GetOutputSlot(0).GetNumConnections() == 1 &&
                    optimizations::pad_fold::TryFoldPadIntoLayer2d(padLayer->GetParameters(),
                                                                   poolingDescriptor,
                                                                   padLayer->GetOutputSlot().GetTensorInfo(),
                                                                   true))
                {
                    FoldPadIntoAveragePool2d<Pooling2dLayer>(optimizationViews, baseLayer,
                                                             poolingDescriptor, padLayer);
                    untouched.erase(baseLayer->GetGuid());
                    untouched.erase(padLayer->GetGuid());
                }
            }
        }
    }

    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn