//
// Copyright © 2021, 2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <armnn/ArmNN.hpp>
#include <armnn/backends/ICustomAllocator.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLScheduler.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <unordered_map>

/** Sample implementation of ICustomAllocator for use with the ClBackend.
 *  Note: any memory allocated must be host addressable with write access
 *  in order for ArmNN to be able to properly use it. */
class SampleClBackendCustomAllocator : public armnn::ICustomAllocator
{
public:
    SampleClBackendCustomAllocator() = default;

    void* allocate(size_t size, size_t alignment) override
    {
        // If alignment is 0, just use CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE for alignment
        if (alignment == 0)
        {
            alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
        }
        // Over-allocate so that an aligned pointer can always be found within the block
        size_t space = size + alignment;
        void* allocatedMemPtr = std::malloc(space);
        void* alignedPtr = allocatedMemPtr;

        if (std::align(alignment, size, alignedPtr, space) == nullptr)
        {
            std::free(allocatedMemPtr);
            throw armnn::Exception("SampleClBackendCustomAllocator::Alignment failed");
        }
        // Record the pointer returned by malloc so that free() can release the
        // whole block; calling std::free on the aligned pointer directly would
        // be undefined behaviour.
        m_Allocations[alignedPtr] = allocatedMemPtr;
        return alignedPtr;
    }

    void free(void* ptr) override
    {
        auto it = m_Allocations.find(ptr);
        if (it != m_Allocations.end())
        {
            std::free(it->second);
            m_Allocations.erase(it);
        }
    }

    armnn::MemorySource GetMemorySourceType() override
    {
        return armnn::MemorySource::Malloc;
    }

private:
    // Maps each aligned pointer handed out by allocate() to the pointer
    // originally returned by std::malloc
    std::unordered_map<void*, void*> m_Allocations;
};
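
// For illustration only: the allocator can also be exercised on its own,
// outside of the ArmNN flow below (the size here is arbitrary):
//
//     SampleClBackendCustomAllocator allocator;
//     void* p = allocator.allocate(64, 0); // alignment 0 => device cacheline size
//     allocator.free(p);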

// A simple example application to show the usage of a custom memory allocator. In this sample, the user's
// single input number is multiplied by 1.0f using a fully connected layer with a single neuron to produce
// an output number that is the same as the input. All memory required to execute this mini network is
// allocated with the provided custom allocator.
//
// Using a Custom Allocator is required for use with Protected Mode and Protected Memory.
// This example uses only unprotected malloc, as Protected Memory is platform
// and implementation specific.
//
// Note: This example is similar to the SimpleSample application that can also be found in armnn/samples.
//       The differences are the use of a custom allocator, the GpuAcc backend, and the inputs/outputs
//       being imported instead of copied. (Import must be enabled when using a Custom Allocator.)
//       You might find the comparison useful.
int main()
{
    using namespace armnn;

    float number;
    std::cout << "Please enter a number: " << std::endl;
    std::cin >> number;

    // Turn on logging to standard output so that users of this sample can
    // see more about what is going on.
    ConfigureLogging(true, false, LogSeverity::Info);

    // Construct ArmNN network
    NetworkId networkIdentifier;
    INetworkPtr network = INetwork::Create();
    FullyConnectedDescriptor fullyConnectedDesc;
    float weightsData[] = {1.0f}; // Identity
    TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32, 0.0f, 0, true);
    weightsInfo.SetConstant(true);
    ConstTensor weights(weightsInfo, weightsData);

    IConnectableLayer* inputLayer   = network->AddInputLayer(0);
    IConnectableLayer* weightsLayer = network->AddConstantLayer(weights, "Weights");
    IConnectableLayer* fullyConnectedLayer =
            network->AddFullyConnectedLayer(fullyConnectedDesc, "fully connected");
    IConnectableLayer* outputLayer  = network->AddOutputLayer(0);

    inputLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0));
    weightsLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(1));
    fullyConnectedLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
    weightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);

    // Create ArmNN runtime:
    //
    // This is the interesting bit when executing a model with a custom allocator.
    // You can have different allocators for different backends. To support this,
    // the runtime creation option has a map that takes a BackendId and the
    // corresponding allocator that should be used for that backend.
    // Only GpuAcc supports a Custom Allocator for now.
    //
    // Note: This is not covered in this example, but if you want to run a model
    //       on protected memory, a custom allocator needs to be provided that
    //       supports protected memory allocations, and the MemorySource of that
    //       allocator must be set to MemorySource::DmaBufProtected.
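    //
    //       As a sketch only (the allocation calls for protected memory are
    //       platform specific and deliberately left out), such an allocator
    //       would differ mainly in where its memory comes from and in the
    //       MemorySource it reports:
    //
    //           class SampleProtectedAllocator : public armnn::ICustomAllocator
    //           {
    //               void* allocate(size_t size, size_t alignment) override
    //               { /* allocate from a protected DMA-buf heap */ }
    //               void free(void* ptr) override
    //               { /* release the protected buffer */ }
    //               armnn::MemorySource GetMemorySourceType() override
    //               { return armnn::MemorySource::DmaBufProtected; }
    //           };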
    IRuntime::CreationOptions options;
    auto customAllocator = std::make_shared<SampleClBackendCustomAllocator>();
    options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}};
    IRuntimePtr runtime = IRuntime::Create(options);

    // Set the tensors in the network.
    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);

    unsigned int numElements = inputTensorInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);

    TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32);
    fullyConnectedLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    // Optimise ArmNN network
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
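    // Import must be enabled when using a custom allocator: it lets the runtime
    // use the user-allocated buffers directly instead of copying them into
    // backend-managed memory.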
    IOptimizedNetworkPtr optNet =
                Optimize(*network, {"GpuAcc"}, runtime->GetDeviceSpec(), optOptions);
    if (!optNet)
    {
        // This shouldn't happen for this simple sample with the GpuAcc backend.
        // But in general usage Optimize could fail if the backend at runtime cannot
        // support the model that has been provided.
        std::cerr << "Error: Failed to optimise the input network." << std::endl;
        return 1;
    }

    // Load graph into runtime
    std::string ignoredErrorMessage;
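    // The input and output MemorySource values below should match what the
    // custom allocator reports via GetMemorySourceType() (MemorySource::Malloc
    // in this sample), so the imported buffers can be used directly.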
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Create structures for input & output
    const size_t alignment =
            arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();

    void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);

    // Fill the input buffer with the user's number
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    std::fill_n(inputPtr, numElements, number);

    void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);

    // Pre-fill the output buffer with a negative sentinel value so that stale
    // data is obvious if the network does not write to it
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    inputTensorInfo = runtime->GetInputTensorInfo(networkIdentifier, 0);
    inputTensorInfo.SetConstant(true);
    InputTensors inputTensors
    {
        {0, ConstTensor(inputTensorInfo, alignedInputPtr)},
    };
    OutputTensors outputTensors
    {
        {0, Tensor(runtime->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)}
    };

    // Execute network
    runtime->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors);

    // Tell the CLBackend to sync memory so we can read the output.
    arm_compute::CLScheduler::get().sync();
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    std::cout << "Your number was " << outputResult[0] << std::endl;

    // Clean up: unload the network, then release the imported buffers through
    // the same allocator that created them.
    runtime->UnloadNetwork(networkIdentifier);
    options.m_CustomAllocatorMap["GpuAcc"]->free(alignedInputPtr);
    options.m_CustomAllocatorMap["GpuAcc"]->free(alignedOutputPtr);
    return 0;
}