//
//  Copyright (c) 2023 Apple Inc. All rights reserved.
//  Provided subject to the LICENSE file in the top level directory.
//

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/backends/apple/mps/schema_generated.h>
#include <executorch/backends/apple/mps/runtime/MPSExecutor.h>
#import <Foundation/Foundation.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#import <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>

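// Redeclare the MPSGraphExecutable accessors used below (they are not part
// of the public header) so the compiler accepts the calls that query the
// compiled graph's input/output shapes and data types.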
@interface MPSGraphExecutable()
-(NSArray<MPSGraphShapedType *> *) getInputShapes;
-(NSArray<MPSGraphShapedType *> *) getOutputShapes;
@end

namespace executorch {
namespace backends {
namespace mps {
namespace delegate {

using executorch::runtime::Error;
using executorch::aten::Tensor;

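// Decide at construction time whether the CPU and GPU can share memory:
// shared mode is disabled on the simulator, on x86_64, and on OS versions
// below macOS 14.0. Without it, data is staged through preallocated GPU
// buffers via blit copies (see initDataBuffers/updateDataBuffers).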
MPSExecutor::MPSExecutor() {
  _use_shared_mem = true;
  _buffers_initialized = false;

#if TARGET_OS_SIMULATOR || defined(__x86_64__)
  _use_shared_mem = false;
#endif
  if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) {
    _use_shared_mem = false;
  }

  _inputsArray = [[NSMutableArray<MPSGraphTensorData *> alloc] initWithCapacity:getNumInputs()];
  _outputsArray = [[NSMutableArray<MPSGraphTensorData *> alloc] initWithCapacity:getNumOutputs()];
}

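// Bind the caller's input/output tensors to the executable by wrapping the
// corresponding MTLBuffers in MPSGraphTensorData objects, in the order the
// executable expects them.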
ET_NODISCARD Error
MPSExecutor::set_inputs_outputs(std::vector<const Tensor*>& inputs, std::vector<const Tensor*>& outputs) {
  ET_CHECK_OR_RETURN_ERROR(inputs.size() == getNumInputs(), Internal, "Inputs mismatch");
  ET_CHECK_OR_RETURN_ERROR(outputs.size() == getNumOutputs(), Internal, "Outputs mismatch");
  // updateDataBuffers is a no-op for devices with shared memory.
  // On devices without shared memory, it blits the contents to a private GPU buffer.
  updateDataBuffers(inputs, outputs);
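  // feedTensors may be ordered differently from the caller's input list, so
  // map each feed tensor back to its input index before binding its buffer.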
  for (MPSGraphTensor *tensor in [_executable feedTensors]) {
    int i = _mpsGraphTensorToId[tensor];
    MPSGraphTensorData* tensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer:_inputGPUBuffers[i]
                                                                              shape:[_inputShapes[i] shape]
                                                                           dataType:[_inputShapes[i] dataType]] autorelease];
    _inputsArray[i] = tensorData;
  }

  for (int i = 0; i < outputs.size(); i++) {
    MPSGraphTensorData* tensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer:_outputGPUBuffers[i]
                                                                              shape:[_outputShapes[i] shape]
                                                                           dataType:[_outputShapes[i] dataType]] autorelease];
    _outputsArray[i] = tensorData;
  }
  return Error::Ok;
}

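// Run one inference: encode the graph onto the stream's command buffer (or
// submit it directly to the command queue), then synchronize the outputs
// back to the caller's tensors.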
ET_NODISCARD Error MPSExecutor::forward(std::vector<const Tensor*>& outputs) {
  Error err = Error::Ok;
  MPSStream* mpsStream = getDefaultMPSStream();
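  // If commit-and-continue is enabled or a command buffer is already live,
  // encode onto that buffer; otherwise let MPSGraph manage submission on the
  // stream's command queue.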
  if (mpsStream->commitAndContinueEnabled() || mpsStream->hasLiveCommandBuffer()) {
    id<MTLCommandBuffer> commandBuffer = mpsStream->commandBuffer();
    [_executable encodeToCommandBuffer:commandBuffer
                           inputsArray:_inputsArray
                          resultsArray:_outputsArray
                   executionDescriptor:nil];
  } else {
    [_executable runWithMTLCommandQueue:mpsStream->commandQueue()
                            inputsArray:_inputsArray
                           resultsArray:_outputsArray
                    executionDescriptor:nil];
  }
  syncOutputBuffers(outputs);

  // On the simulator, the buffers are synchronized during `syncOutputBuffers`.
#if !TARGET_OS_SIMULATOR
  if (mpsStream->commitAndContinueEnabled()) {
    err = mpsStream->synchronize(SyncType::COMMIT_AND_CONTINUE);
  } else {
    err = mpsStream->synchronize(SyncType::COMMIT_AND_WAIT);
  }

  ET_CHECK_OR_RETURN_ERROR(
    err == Error::Ok,
    Internal,
    "Could not synchronize on the MPSStream");
#endif

  return Error::Ok;
}

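// One-time buffer setup: cache the executable's input/output shapes and,
// when shared memory is unavailable, preallocate the GPU buffers that are
// reused across inference runs.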
Error
MPSExecutor::initDataBuffers() {
  Error error = Error::Ok;

  _inputShapes = [[_executable getInputShapes] retain];
  _outputShapes = [[_executable getOutputShapes] retain];

  int nInputs = getNumInputs();
  int nOutputs = getNumOutputs();

  _inputGPUBuffers.resize(nInputs);
  _outputGPUBuffers.resize(nOutputs);

  if (!_use_shared_mem) {
    _inputCPUBuffers.resize(nInputs);
    _outputCPUBuffers.resize(nOutputs);
  }

  // With shared memory, the CPU raw buffer is used directly as an MTLBuffer.
  // Without shared memory, initialize the data buffers once and keep reusing
  // them across inference runs.
  auto getDataBuffer = [] (MPSShape* shape, MPSDataType mpsDataType) {
    __block int64_t length = 1;
    [shape enumerateObjectsUsingBlock:^(NSNumber * _Nonnull obj, NSUInteger idx, BOOL * _Nonnull stop) {
        length *= obj.intValue;
    }];
    // Total size in bytes: the low 16 bits of MPSDataType encode the element
    // bit width, so shifting right by 3 converts bits to bytes.
    length *= ((mpsDataType & 0xFFFF) >> 3);
    MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared;
    return [MPSDevice::getInstance()->device() newBufferWithLength:length
                                                           options:options];
  };

  // Preallocate at init time the GPU buffers used to run
  // the model when shared memory is not being used.
  if (!_use_shared_mem) {
    for (int i = 0; i < nInputs; i++) {
      _inputGPUBuffers[i] = getDataBuffer([_inputShapes[i] shape], [_inputShapes[i] dataType]);
    }
    for (int i = 0; i < nOutputs; i++) {
      _outputGPUBuffers[i] = getDataBuffer([_outputShapes[i] shape], [_outputShapes[i] dataType]);
    }
  }

  return error;
}

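// Refresh the per-run buffer bindings. With shared memory, the tensors' own
// MTLBuffers are used directly; otherwise the CPU data is wrapped (or, on
// the simulator, memcpy'd) and blitted into the preallocated GPU buffers.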
Error
MPSExecutor::updateDataBuffers(
  std::vector<const Tensor*>& inputs, std::vector<const Tensor*>& outputs
) {
  for (int i = 0; i < inputs.size(); i++) {
    const Tensor& tensor = *inputs[i];
    void* host_src = tensor.mutable_data_ptr<void*>();
    if (_use_shared_mem) {
      // Use the CPU buffer directly when using shared memory.
      _inputGPUBuffers[i] = getMTLBufferStorage(tensor);
    } else {
      _inputCPUBuffers[i].flags = 0;
#if TARGET_OS_SIMULATOR
      // The simulator crashes when using newBufferWithBytesNoCopy.
      // Use memcpy directly instead of a blit to copy the CPU
      // data into the GPU buffer.
      _inputCPUBuffers[i].srcOffset = 0;
      _inputCPUBuffers[i].srcBuffer = host_src;
      _inputCPUBuffers[i].srcCpu = 1;
#else
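      // Wrap the host pointer in a no-copy MTLBuffer. Metal requires the
      // pointer and length to be page-aligned, so wrap the enclosing aligned
      // block and record the tensor's offset into it.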
      MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared;
      NSUInteger alignedLength = 0;
      void* alignedPtr = pageAlignedBlockPtr(host_src, (NSUInteger)tensor.nbytes(), &alignedLength);
      _inputCPUBuffers[i].srcOffset = uintptr_t(host_src) - uintptr_t(alignedPtr);
      _inputCPUBuffers[i].srcBuffer = [MPSDevice::getInstance()->device() newBufferWithBytesNoCopy:alignedPtr
                                                                                            length:alignedLength
                                                                                           options:options
                                                                                       deallocator:nil];
#endif
      _inputCPUBuffers[i].dstBuffer = _inputGPUBuffers[i];
      _inputCPUBuffers[i].dstOffset = 0;
      _inputCPUBuffers[i].length = tensor.nbytes();
    }
  }

  if (_use_shared_mem) {
    for (int i = 0; i < outputs.size(); i++) {
      _outputGPUBuffers[i] = getMTLBufferStorage(*outputs[i]);
    }
  }

  if (!_use_shared_mem) {
    MPSStream* mpsStream = getDefaultMPSStream();
    mpsStream->copy_and_sync(
      _inputCPUBuffers, /*non_blocking=*/true);
  }

  return Error::Ok;
}

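// Copy the results back to the caller's output tensors. This is a no-op for
// shared memory; otherwise the output GPU buffers are blitted (or memcpy'd
// on the simulator) into CPU-visible buffers that are set up once on the
// first run and reused afterwards.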
Error
MPSExecutor::syncOutputBuffers(
  std::vector<const Tensor*>& outputs) {
  if (!_use_shared_mem) {
    MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared;
    NSUInteger alignedLength = 0;
    MPSStream* mpsStream = getDefaultMPSStream();

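    // On the first run, wrap each output tensor's host pointer so it can be
    // the destination of the copy; subsequent runs reuse these wrappers.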
    if (!_buffers_initialized) {
      for (int i = 0; i < outputs.size(); i++) {
        const Tensor& tensor = *outputs[i];
        void* host_dst = tensor.mutable_data_ptr<void*>();
        _outputCPUBuffers[i].flags = 0;
#if TARGET_OS_SIMULATOR
        _outputCPUBuffers[i].dstOffset = 0;
        _outputCPUBuffers[i].dstBuffer = host_dst;
        _outputCPUBuffers[i].dstCpu = 1;
#else
        void* alignedPtr = pageAlignedBlockPtr(host_dst, (NSUInteger)tensor.nbytes(), &alignedLength);
        _outputCPUBuffers[i].dstOffset = (uintptr_t(host_dst) - uintptr_t(alignedPtr));
        // Blits on macOS require 4-byte alignment.
        ET_CHECK_MSG(_outputCPUBuffers[i].dstOffset % 4 == 0, "Unaligned blit request");
        _outputCPUBuffers[i].dstBuffer = [MPSDevice::getInstance()->device() newBufferWithBytesNoCopy:alignedPtr
                                                                                               length:alignedLength
                                                                                              options:options
                                                                                          deallocator:nil];
#endif
        _outputCPUBuffers[i].srcBuffer = _outputGPUBuffers[i];
        _outputCPUBuffers[i].srcOffset = 0;
        _outputCPUBuffers[i].length = tensor.nbytes();
      }
    }

    mpsStream->copy_and_sync(
      _outputCPUBuffers, /*non_blocking=*/true
    );
  }

  _buffers_initialized = true;
  return Error::Ok;
}

} // namespace delegate
} // namespace mps
} // namespace backends
} // namespace executorch