//
//  Copyright (c) 2023 Apple Inc. All rights reserved.
//  Provided subject to the LICENSE file in the top level directory.
//

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/backends/apple/mps/schema_generated.h>
#include <executorch/backends/apple/mps/runtime/MPSExecutor.h>
#import <Foundation/Foundation.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#import <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>

// Expose the private MPSGraphExecutable accessors used below to query the
// compiled executable's input/output shaped types (shape + MPSDataType).
// NOTE(review): these selectors are not public MPSGraph API — confirm they
// remain available on every OS version this backend targets.
@interface MPSGraphExecutable()
-(NSArray<MPSGraphShapedType *> *) getInputShapes;
-(NSArray<MPSGraphShapedType *> *) getOutputShapes;
@end


namespace executorch {
namespace backends {
namespace mps {
namespace delegate {

using executorch::runtime::Error;
using executorch::aten::Tensor;

// NOTE(review): this file uses explicit retain/autorelease, so it is
// presumably compiled without ARC (-fno-objc-arc) — confirm in the build
// settings before adding any ownership-transferring calls.

// Constructor: decide whether CPU tensor memory can be wrapped directly as
// shared MTLBuffers (zero-copy, `_use_shared_mem`) or whether staged
// CPU<->GPU copies are needed, and preallocate the feed/result arrays
// sized to the model's input/output counts.
MPSExecutor::MPSExecutor() {
  _use_shared_mem = true;
  _buffers_initialized = false;

  // Zero-copy wrapping is disabled on the simulator and on x86_64 hosts.
#if TARGET_OS_SIMULATOR or defined(__x86_64__)
  _use_shared_mem = false;
#endif
  // It also requires the OS tier corresponding to MACOS_VER_14_0_PLUS.
  if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) {
    _use_shared_mem = false;
  }

  _inputsArray = [[NSMutableArray<MPSGraphTensorData *> alloc] initWithCapacity:getNumInputs()];
  _outputsArray = [[NSMutableArray<MPSGraphTensorData *> alloc] initWithCapacity:getNumOutputs()];
}

// Binds the caller's input/output tensors for the next forward() call by
// wrapping the backing MTLBuffers in (autoreleased) MPSGraphTensorData.
//
// Inputs are ordered to match [_executable feedTensors] through the
// _mpsGraphTensorToId lookup; outputs are taken in vector order. Returns
// Error::Internal when tensor counts do not match the compiled model.
ET_NODISCARD Error
MPSExecutor::set_inputs_outputs(std::vector<const Tensor*>& inputs, std::vector<const Tensor*>& outputs) {
  ET_CHECK_OR_RETURN_ERROR(inputs.size() == getNumInputs(), Internal, "Inputs mismatch");
  ET_CHECK_OR_RETURN_ERROR(outputs.size() == getNumOutputs(), Internal, "Outputs mismatch");
  // updateDataBuffers is a no-op for devices with shared memory.
  // In case of devices with non-shared memory, it will blit the contents to a private GPU buffer.
  updateDataBuffers(inputs, outputs);
  for (MPSGraphTensor *tensor in [_executable feedTensors]) {
    // Map the executable's feed tensor back to our positional input index.
    int i = _mpsGraphTensorToId[tensor];
    MPSGraphTensorData* tensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer:_inputGPUBuffers[i]
                                                                              shape:[_inputShapes[i] shape]
                                                                           dataType:[_inputShapes[i] dataType]] autorelease];
    _inputsArray[i] = tensorData;
  }

  for (int i = 0; i < outputs.size(); i++) {
    MPSGraphTensorData* tensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer:_outputGPUBuffers[i]
                                                                              shape:[_outputShapes[i] shape]
                                                                           dataType:[_outputShapes[i] dataType]] autorelease];
    _outputsArray[i] = tensorData;
  }
  return Error::Ok;
}

// Runs one inference over the tensor data bound by set_inputs_outputs().
//
// If commit-and-continue is enabled, or the stream already has a live
// command buffer, the work is encoded onto that command buffer; otherwise
// the executable is run directly against the stream's command queue. Off
// simulator, the stream is then synchronized (commit-and-continue when
// enabled, commit-and-wait otherwise).
ET_NODISCARD Error MPSExecutor::forward(std::vector<const Tensor*>& outputs) {
  Error err = Error::Ok;
  MPSStream* mpsStream = getDefaultMPSStream();
  if (mpsStream->commitAndContinueEnabled() || mpsStream->hasLiveCommandBuffer()) {
    id<MTLCommandBuffer> commandBuffer = mpsStream->commandBuffer();
    [_executable encodeToCommandBuffer:commandBuffer
                           inputsArray:_inputsArray
                          resultsArray:_outputsArray
                   executionDescriptor:nil];
  } else {
    [_executable runWithMTLCommandQueue:mpsStream->commandQueue()
                            inputsArray:_inputsArray
                           resultsArray:_outputsArray
                    executionDescriptor:nil];
  }
  // Copy results back into the caller's CPU tensors when shared memory is
  // not available (no-op otherwise).
  syncOutputBuffers(outputs);

  // On simulator, the buffers are synchronized during `syncOutputBuffer`
#if !TARGET_OS_SIMULATOR
  if (mpsStream->commitAndContinueEnabled()) {
    err = mpsStream->synchronize(SyncType::COMMIT_AND_CONTINUE);
  } else {
    err = mpsStream->synchronize(SyncType::COMMIT_AND_WAIT);
  }

  ET_CHECK_OR_RETURN_ERROR(
    err == Error::Ok,
    Internal,
    "Could not synchronize on the MPSStream");
#endif

  return Error::Ok;
}

// One-time setup of the executor's buffer bookkeeping.
//
// Retains the executable's input/output shaped types and, when shared
// memory is unavailable, preallocates one GPU-side MTLBuffer per input and
// output (sized from the static shapes) for reuse across inference runs.
Error
MPSExecutor::initDataBuffers() {
  Error error = Error::Ok;

  // Manually retained (non-ARC file). NOTE(review): confirm a matching
  // release exists in the destructor/teardown path.
  _inputShapes = [[_executable getInputShapes] retain];
  _outputShapes = [[_executable getOutputShapes] retain];

  int nInputs = getNumInputs();
  int nOutputs = getNumOutputs();

  _inputGPUBuffers.resize(nInputs);
  _outputGPUBuffers.resize(nOutputs);

  // CPU-side staging descriptors are only needed when blitting.
  if (!_use_shared_mem) {
    _inputCPUBuffers.resize(nInputs);
    _outputCPUBuffers.resize(nOutputs);
  }

  // In case of shared memory, the CPU raw buffer is used directly as an MTLBuffer.
  // In case of not being able to use shared memory, initialize the data buffers once
  // and keep reusing them across inference runs.
  auto getDataBuffer = [] (MPSShape* shape, MPSDataType mpsDataType) {
    // Element count = product of all dimensions.
    __block int64_t length = 1;
    [shape enumerateObjectsUsingBlock:^(NSNumber * _Nonnull obj, NSUInteger idx, BOOL * _Nonnull stop) {
      length *= obj.intValue;
    }];
    // Get total size in bytes: the low 16 bits of an MPSDataType raw value
    // encode the element bit width, so shift by 3 to convert bits to bytes.
    length *= ((mpsDataType & 0xFFFF) >> 3);
    MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared;
    return [MPSDevice::getInstance()->device() newBufferWithLength:length
                                                           options:options];

  };

  // Preallocate at init time the GPU buffers used to run
  // the model in case shared memory is not being used.
  if (!_use_shared_mem) {
    for (int i = 0; i < nInputs; i++) {
      _inputGPUBuffers[i] = getDataBuffer([_inputShapes[i] shape], [_inputShapes[i] dataType]);
    }
    for (int i = 0; i < nOutputs; i++) {
      _outputGPUBuffers[i] = getDataBuffer([_outputShapes[i] shape], [_outputShapes[i] dataType]);
    }
  }

  return error;
}

// Points the executor's buffers at the current input/output tensors.
//
// Shared-memory path: the tensors' existing MTLBuffer storage is used
// directly (zero copy). Otherwise each input is staged:
//  - simulator: recorded as a raw CPU pointer (srcCpu flag) and memcpy'd
//    by the stream, since newBufferWithBytesNoCopy crashes there;
//  - device: wrapped page-aligned in a no-copy shared MTLBuffer, then
//    blitted into the preallocated GPU buffer via copy_and_sync.
Error
MPSExecutor::updateDataBuffers(
  std::vector<const Tensor*>& inputs, std::vector<const Tensor*>& outputs
) {
  for (int i = 0; i < inputs.size(); i++) {
    const Tensor& tensor = *inputs[i];
    void* host_src = tensor.mutable_data_ptr<void*>();
    if (_use_shared_mem) {
      // Use directly the CPU buffer when using shared memory.
      _inputGPUBuffers[i] = getMTLBufferStorage(tensor);
    } else {
      _inputCPUBuffers[i].flags = 0;
#if TARGET_OS_SIMULATOR
      // Simulator crashes when using newBufferWithBytesNoCopy.
      // Use memcpy directly instead of using blit to copy the CPU
      // data into the GPU buffer.
      _inputCPUBuffers[i].srcOffset = 0;
      _inputCPUBuffers[i].srcBuffer = host_src;
      _inputCPUBuffers[i].srcCpu = 1;
#else
      MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared;
      NSUInteger alignedLength = 0;
      // newBufferWithBytesNoCopy requires a page-aligned base pointer;
      // srcOffset recovers the tensor's true start within the wrapper.
      void* alignedPtr = pageAlignedBlockPtr(host_src, (NSUInteger)tensor.nbytes(), &alignedLength);
      _inputCPUBuffers[i].srcOffset = uintptr_t(host_src) - uintptr_t(alignedPtr);
      // NOTE(review): a fresh no-copy buffer is created on every call and
      // the previous srcBuffer is never released; in a non-ARC file this
      // looks like a per-inference leak — verify ownership of srcBuffer.
      _inputCPUBuffers[i].srcBuffer = [MPSDevice::getInstance()->device() newBufferWithBytesNoCopy:alignedPtr
                                                                                            length:alignedLength
                                                                                           options:options
                                                                                       deallocator:nil];

#endif
      _inputCPUBuffers[i].dstBuffer = _inputGPUBuffers[i];
      _inputCPUBuffers[i].dstOffset = 0;
      _inputCPUBuffers[i].length = tensor.nbytes();
    }
  }

  if (_use_shared_mem) {
    for (int i = 0; i < outputs.size(); i++) {
      _outputGPUBuffers[i] = getMTLBufferStorage(*outputs[i]);
    }
  }

  if (!_use_shared_mem) {
    // Enqueue the CPU->GPU copies for all staged inputs.
    MPSStream* mpsStream = getDefaultMPSStream();
    mpsStream->copy_and_sync(
      _inputCPUBuffers, /*non_blocking=*/true);
  }

  return Error::Ok;
}

// Copies inference results back into the caller's CPU tensors when shared
// memory is not in use (no-op otherwise — outputs were written in place).
//
// The per-output staging descriptors are built only on the first call
// (guarded by _buffers_initialized) and reused afterwards, so output
// tensor storage is assumed stable across inference runs —
// NOTE(review): confirm callers never rebind output memory.
Error
MPSExecutor::syncOutputBuffers(
  std::vector<const Tensor*>& outputs) {
  if (!_use_shared_mem) {
    MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache | MTLResourceStorageModeShared;
    NSUInteger alignedLength = 0;
    MPSStream* mpsStream = getDefaultMPSStream();

    if (!_buffers_initialized) {
      for (int i = 0; i < outputs.size(); i++) {
        const Tensor& tensor = *outputs[i];
        void* host_dst = tensor.mutable_data_ptr<void*>();
        _outputCPUBuffers[i].flags = 0;
#if TARGET_OS_SIMULATOR
        // Raw CPU destination pointer; the stream memcpy's into it.
        _outputCPUBuffers[i].dstOffset = 0;
        _outputCPUBuffers[i].dstBuffer = host_dst;
        _outputCPUBuffers[i].dstCpu = 1;
#else
        void* alignedPtr = pageAlignedBlockPtr(host_dst, (NSUInteger)tensor.nbytes(), &alignedLength);
        _outputCPUBuffers[i].dstOffset = (uintptr_t(host_dst) - uintptr_t(alignedPtr));
        // 4 bytes alignment required on MacOS for blits.
        ET_CHECK_MSG(_outputCPUBuffers[i].dstOffset % 4 == 0, "Unaligned blit request");
        _outputCPUBuffers[i].dstBuffer = [MPSDevice::getInstance()->device() newBufferWithBytesNoCopy:alignedPtr
                                                                                               length:alignedLength
                                                                                              options:options
                                                                                          deallocator:nil];
#endif
        _outputCPUBuffers[i].srcBuffer = _outputGPUBuffers[i];
        _outputCPUBuffers[i].srcOffset = 0;
        _outputCPUBuffers[i].length = tensor.nbytes();
      }
    }

    // Enqueue the GPU->CPU copies (per forward(), on the simulator this
    // call is also where synchronization happens).
    mpsStream->copy_and_sync(
      _outputCPUBuffers, /*non_blocking=*/true
    );
  }

  _buffers_initialized = true;
  return Error::Ok;
}


} // namespace delegate
} // namespace mps
} // namespace backends
} // namespace executorch