/*
 * Copyright (c) 2024 MediaTek Inc.
 *
 * Licensed under the BSD License (the "License"); you may not use this file
 * except in compliance with the License. See the license file in the root
 * directory of this source tree for more details.
 */

#include "ModelChunk.h"

#include <chrono>
#include <cstring>
#include <sstream>

#include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h"

#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>
#include <executorch/runtime/platform/profiler.h>
#include <executorch/runtime/platform/runtime.h>

#define ENSURE_INIT \
  ET_CHECK_MSG(Initialized(), "Error: Model chunk not initialized.");

namespace example {

using executorch::aten::Tensor;
using executorch::aten::TensorImpl;
using executorch::extension::FileDataLoader;
using executorch::runtime::Error;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Method;
using executorch::runtime::MethodMeta;
using executorch::runtime::Program;
using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::Tag;

static constexpr size_t kMethodAllocatorPoolSize = 4 * 1024U * 1024U; // 4MB

// ExecuTorch model instance
// The member ordering affects the order of destruction.
struct ModelInstance {
  std::unique_ptr<Program> program;

  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers;
  std::vector<Span<uint8_t>> planned_spans;

  std::vector<uint8_t> method_allocator_pool;
  std::unique_ptr<MemoryAllocator> method_allocator;
  std::unique_ptr<HierarchicalAllocator> planned_memory;
  std::unique_ptr<MemoryManager> memory_manager;

  std::unique_ptr<Method> method;
};

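// Initialization: load the underlying model program(s), collect the method's
// IO metadata, allocate (or reuse preassigned) backing buffers, then bind
// them to the method as its inputs and outputs.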
void ModelChunk::Initialize() {
  LoadModels();
  GetModelIoInfo();
  AllocateIoBuffers();
  SetBackendInputs();
  SetBackendOutputs();
  mIsInitialized = true;
}

bool ModelChunk::Initialized() {
  return mIsInitialized;
}

void ModelChunk::Release() {
  ENSURE_INIT
  ReleaseModels();
  ReleaseIoBuffers();
}

void ModelChunk::Run() {
  ENSURE_INIT
  auto beforeExec = std::chrono::high_resolution_clock::now();
  Error status = Error::Ok;
  status = GetModelMethod().execute();
  auto afterExec = std::chrono::high_resolution_clock::now();
  const double elapsedTime =
      std::chrono::duration_cast<std::chrono::microseconds>(
          afterExec - beforeExec)
          .count();
  ET_LOG(Debug, "Inference took %f ms", elapsedTime / 1000.0);
  ET_CHECK_MSG(
      status == Error::Ok,
      "Execution of method failed with status 0x%" PRIx32,
      status);
  ET_LOG(Debug, "Model executed successfully.");
}

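// Swap this chunk to the model compiled for a different token batch size.
// On success, the IO metadata is refreshed and the existing buffers are
// re-bound to the newly selected method.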
bool ModelChunk::HotSwapModel(const size_t tokenBatchSize) {
  ENSURE_INIT
  // Save old values
  const auto oldInstanceBatchSize = GetModelId();
  const auto oldTokenBatchSize = mTokenBatchSize;

  if (!HasModel(tokenBatchSize)) {
    ET_LOG(
        Error,
        "Model swap: No model with batchSize=%zu is available",
        tokenBatchSize);
    return false;
  }

  if (oldInstanceBatchSize == tokenBatchSize) {
    ET_LOG(Info, "Model swap skipped: already using batchSize=%zu", tokenBatchSize);
    return true;
  }

  SelectModel(tokenBatchSize);

  const auto newInstanceBatchSize = GetModelId();
  if (oldInstanceBatchSize == newInstanceBatchSize) {
    ET_LOG(
        Error,
        "Failed to switch to model with batchSize=%zu. The model remains at batchSize=%zu",
        tokenBatchSize,
        oldTokenBatchSize);
    return false;
  }

  // Update model variables
  // Mask length = cache length + number of input tokens (token batch size)
  mTokenBatchSize = tokenBatchSize;

  UpdateModelIoInfo();
  SetBackendInputs();
  SetBackendOutputs();
  return true;
}

void ModelChunk::SetInputBuffer(
    const void* data,
    const size_t size,
    const size_t index) {
  ENSURE_INIT
  auto& targetBufInfo = mInputBufferInfos[index];
  ET_CHECK_MSG(
      targetBufInfo.nbytes >= size,
      "Error: Input[%zu] has only %zu bytes allocated but needs to hold %zu bytes",
      index,
      targetBufInfo.nbytes,
      size);
  std::memcpy(targetBufInfo.data, data, size);
}

void ModelChunk::SetInputBuffer(
    const BufferInfo& bufferInfo,
    const size_t index) {
  // Allow calling this method before initialization so that preallocated
  // buffers can be assigned.
  if (index >= mInputBufferInfos.size()) {
    mInputBufferInfos.resize(index + 1);
  }
  // If the existing buffer has already been allocated, copy the content into
  // it. Otherwise, share the given buffer info.
  auto& targetBufInfo = mInputBufferInfos[index];
  if (targetBufInfo.data != nullptr) {
    // Already allocated, do memcpy.
    SetInputBuffer(bufferInfo.data, bufferInfo.nbytesUsed, index);
  } else {
    // Share the buffer info.
    targetBufInfo = bufferInfo;
  }
}

BufferInfo ModelChunk::GetInputBuffer(const size_t index) {
  ENSURE_INIT
  ET_CHECK_MSG(
      index < mInputBufferInfos.size(),
      "Error: Index out of range: %zu",
      index);
  return mInputBufferInfos[index];
}

BufferInfo ModelChunk::GetOutputBuffer(const size_t index) {
  ENSURE_INIT
  ET_CHECK_MSG(
      index < mOutputBufferInfos.size(),
      "Error: Index out of range: %zu",
      index);
  return mOutputBufferInfos[index];
}

void ModelChunk::LogIoSummary() {
  ENSURE_INIT
  const auto& method = GetModelMethod();
  const auto method_meta = method.method_meta();

  auto getShapeStr = [](const auto shape) {
    std::ostringstream ss;
    ss << "(";
    for (size_t i = 0; i < shape.size(); i++) {
      ss << shape[i];
      if (i < shape.size() - 1)
        ss << ", ";
    }
    ss << ")";
    return ss.str();
  };

  ET_LOG(Info, "Model Chunk IO Summary:");

  const size_t input_size = method.inputs_size();
  const size_t output_size = method.outputs_size();

  for (size_t i = 0; i < input_size; i++) {
    if (*method_meta.input_tag(i) != Tag::Tensor) {
      ET_LOG(Info, " Input %zu: Non-Tensor", i);
      continue;
    }
    const auto nbytes = method_meta.input_tensor_meta(i)->nbytes();
    const auto shape = getShapeStr(method_meta.input_tensor_meta(i)->sizes());
    const auto type =
        static_cast<int>(method_meta.input_tensor_meta(i)->scalar_type());
    ET_LOG(
        Info,
        " Input %zu: Shape: %s, Size: %zu bytes, Type: %d",
        i,
        shape.c_str(),
        nbytes,
        type);
  }

  for (size_t i = 0; i < output_size; i++) {
    if (*method_meta.output_tag(i) != Tag::Tensor) {
      ET_LOG(Info, " Output %zu: Non-Tensor", i);
      continue;
    }
    const auto nbytes = method_meta.output_tensor_meta(i)->nbytes();
    const auto shape = getShapeStr(method_meta.output_tensor_meta(i)->sizes());
    const auto type =
        static_cast<int>(method_meta.output_tensor_meta(i)->scalar_type());
    ET_LOG(
        Info,
        " Output %zu: Shape: %s, Size: %zu bytes, Type: %d",
        i,
        shape.c_str(),
        nbytes,
        type);
  }
}

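// Collect the input/output buffer sizes from the method's metadata. Buffers
// that were preallocated (e.g. assigned via SetInputBuffer before
// initialization) keep their allocation; only the size used by the model is
// recorded for them.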
void ModelChunk::GetModelIoInfo() {
  const auto& method = GetModelMethod();
  const auto method_meta = method.method_meta();

  const size_t input_size = method.inputs_size();
  const size_t output_size = method.outputs_size();

  mInputBufferInfos.resize(input_size);
  for (size_t i = 0; i < input_size; i++) {
    if (*method_meta.input_tag(i) != Tag::Tensor) {
      ET_LOG(Info, "Input %zu is not a tensor, skipping", i);
      continue;
    }
    auto& bufInfo = mInputBufferInfos[i];
    const auto nbytes = method_meta.input_tensor_meta(i)->nbytes();
    if (bufInfo.data != nullptr) {
      // Already preallocated, so just update the size used by the model.
      ET_CHECK_MSG(
          bufInfo.nbytes >= nbytes,
          "Error: Model input[%zu] requires size=%zu but only preallocated size=%zu",
          i,
          nbytes,
          bufInfo.nbytes);
      bufInfo.nbytesUsed = nbytes;
      continue;
    }
    bufInfo.nbytes = nbytes;
    bufInfo.nbytesUsed = nbytes;
  }

  mOutputBufferInfos.resize(output_size);
  for (size_t i = 0; i < output_size; i++) {
    if (*method_meta.output_tag(i) != Tag::Tensor) {
      ET_LOG(Info, "Output %zu is not a tensor, skipping", i);
      continue;
    }
    auto& bufInfo = mOutputBufferInfos[i];
    const auto nbytes = method_meta.output_tensor_meta(i)->nbytes();
    if (bufInfo.data != nullptr) {
      // Already preallocated, so just update the size used by the model.
      ET_CHECK_MSG(
          bufInfo.nbytes >= nbytes,
          "Error: Model output[%zu] requires size=%zu but only preallocated size=%zu",
          i,
          nbytes,
          bufInfo.nbytes);
      bufInfo.nbytesUsed = nbytes;
      continue;
    }
    bufInfo.nbytes = nbytes;
    bufInfo.nbytesUsed = nbytes;
  }
}

// Update the IO sizes actually used by the model.
void ModelChunk::UpdateModelIoInfo() {
  const auto& method = GetModelMethod();
  const auto method_meta = method.method_meta();

  const size_t numModelInputs = method.inputs_size();
  const size_t numModelOutputs = method.outputs_size();

  const size_t numInputBuffers = mInputBufferInfos.size();
  const size_t numOutputBuffers = mOutputBufferInfos.size();

  if (numInputBuffers != numModelInputs) {
    ET_LOG(
        Info,
        "Existing num inputs (%zu) != new num inputs (%zu)",
        numInputBuffers,
        numModelInputs);
  }
  if (numOutputBuffers != numModelOutputs) {
    ET_LOG(
        Info,
        "Existing num outputs (%zu) != new num outputs (%zu)",
        numOutputBuffers,
        numModelOutputs);
  }
  mInputBufferInfos.resize(numModelInputs);
  for (size_t inputIdx = 0; inputIdx < numModelInputs; inputIdx++) {
    auto& sizeAllocated = mInputBufferInfos[inputIdx].nbytes;
    auto& sizeRequired = mInputBufferInfos[inputIdx].nbytesUsed;
    const auto before = sizeRequired;

    // Update
    sizeRequired = method_meta.input_tensor_meta(inputIdx)->nbytes();
    if (sizeAllocated < sizeRequired) {
      ET_LOG(
          Error,
          "Insufficient buffer size for input[%zu]. Requires %zu but only allocated %zu",
          inputIdx,
          sizeRequired,
          sizeAllocated);
    }
    if (before != sizeRequired) {
      ET_LOG(
          Debug,
          "Update input[%zu] size: %zu -> %zu",
          inputIdx,
          before,
          sizeRequired);
    }
  }
  mOutputBufferInfos.resize(numModelOutputs);
  for (size_t outputIdx = 0; outputIdx < numModelOutputs; outputIdx++) {
    auto& sizeAllocated = mOutputBufferInfos[outputIdx].nbytes;
    auto& sizeRequired = mOutputBufferInfos[outputIdx].nbytesUsed;
    const auto before = sizeRequired;

    // Update
    sizeRequired = method_meta.output_tensor_meta(outputIdx)->nbytes();
    if (sizeAllocated < sizeRequired) {
      ET_LOG(
          Error,
          "Insufficient buffer size for output[%zu]. Requires %zu but only allocated %zu",
          outputIdx,
          sizeRequired,
          sizeAllocated);
    }
    if (before != sizeRequired) {
      ET_LOG(
          Debug,
          "Update output[%zu] size: %zu -> %zu",
          outputIdx,
          before,
          sizeRequired);
    }
  }
}

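// Record that a model output should reuse the buffer of a model input
// (consumed by AllocateIoBuffers), so the output is written directly into the
// linked input's memory.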
void ModelChunk::LinkModelIO(
    const size_t inputIndex,
    const size_t outputIndex) {
  mModelOutToInIndexLinks.emplace(outputIndex, inputIndex);
}

std::optional<size_t> ModelChunk::GetLinkedInputIndex(
    const size_t outputIndex) const {
  auto hasKey = [](const auto& map, const auto& key) {
    return map.find(key) != map.end();
  };
  if (hasKey(mModelOutToInIndexLinks, outputIndex))
    return mModelOutToInIndexLinks.at(outputIndex);
  else
    return std::nullopt;
}

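// Wrap each input buffer in a Tensor matching the method's tensor metadata
// (dtype, shape, dim order) and register it with the method via set_input().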
void ModelChunk::SetBackendInputs() {
  auto& method = GetModelMethod();
  const auto method_meta = method.method_meta();
  const size_t input_size = method.inputs_size();
  for (size_t i = 0; i < input_size; i++) {
    const auto tensor_meta = method_meta.input_tensor_meta(i);
    auto scalar_type = tensor_meta->scalar_type();
    auto sizes_raw = tensor_meta->sizes();
    auto dim = sizes_raw.size();
    auto dim_order_raw = tensor_meta->dim_order();
    std::vector sizes(sizes_raw.begin(), sizes_raw.end());
    std::vector dim_order(dim_order_raw.begin(), dim_order_raw.end());
    auto buffer_data = mInputBufferInfos[i].data;

    TensorImpl impl = TensorImpl(
        scalar_type, dim, sizes.data(), buffer_data, dim_order.data());
    Tensor tensor(&impl);
    const auto error = method.set_input(tensor, i);
    ET_CHECK_MSG(
        error == Error::Ok,
        "Error: 0x%" PRIx32 " setting input %zu.",
        error,
        i);
  }
}

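// Point each model output at its preallocated buffer via set_output_data_ptr()
// so the backend writes results directly into it.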
void ModelChunk::SetBackendOutputs() {
  auto& method = GetModelMethod();
  for (size_t i = 0; i < mOutputBufferInfos.size(); i++) {
    auto data = mOutputBufferInfos[i].data;
    const auto nbytes = mOutputBufferInfos[i].nbytes;
    const auto output_err = method.set_output_data_ptr(data, nbytes, i);
    ET_CHECK_MSG(
        output_err == Error::Ok,
        "Error: 0x%" PRIx32 " setting output %zu.",
        output_err,
        i);
  }
}

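// Allocate any IO buffers that have not been assigned yet from the Neuron
// buffer allocator. Outputs linked to an input (see LinkModelIO) reuse that
// input's buffer instead of receiving a separate allocation.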
void ModelChunk::AllocateIoBuffers() {
  auto& buffer_allocator = GET_NEURON_ALLOCATOR;

  // Inputs
  for (auto& inBufInfo : mInputBufferInfos) {
    if (inBufInfo.data != nullptr) {
      continue; // Already allocated
    }
    void* ahwb_data = buffer_allocator.Allocate(inBufInfo.nbytes);
    inBufInfo.data = ahwb_data;
  }

  // Outputs
  const auto numOutputBuffers = mOutputBufferInfos.size();
  for (size_t outputIdx = 0; outputIdx < numOutputBuffers; outputIdx++) {
    auto& outBufInfo = mOutputBufferInfos[outputIdx];
    if (outBufInfo.data != nullptr) {
      continue; // Already allocated
    }
    const auto linkedInputIdx = GetLinkedInputIndex(outputIdx);
    if (linkedInputIdx) {
      const auto& linkedInBufInfo = mInputBufferInfos[*linkedInputIdx];
      // Ensure the linked IO sizes match, then reuse the linked input buffer
      ET_CHECK_MSG(
          outBufInfo.nbytes == linkedInBufInfo.nbytes,
          "Error: Mismatched sizes between linked IO. "
          "Input %zu size is %zu, but Output %zu size is %zu.",
          *linkedInputIdx,
          linkedInBufInfo.nbytes,
          outputIdx,
          outBufInfo.nbytes);
      outBufInfo = linkedInBufInfo;
      continue;
    }
    // Allocate output buffer as usual
    void* ahwb_data = buffer_allocator.Allocate(outBufInfo.nbytes);
    outBufInfo.data = ahwb_data;
  }
}

void ModelChunk::ReleaseIoBuffers() {
  auto& buffer_allocator = GET_NEURON_ALLOCATOR;

  for (size_t i = 0; i < mInputBufferInfos.size(); i++)
    buffer_allocator.RemoveBuffer(mInputBufferInfos[i].data);

  for (size_t i = 0; i < mOutputBufferInfos.size(); i++)
    buffer_allocator.RemoveBuffer(mOutputBufferInfos[i].data);
}

Method& ModelChunk::GetModelMethod() {
  auto modelInstance = reinterpret_cast<ModelInstance*>(GetModelInstance());
  return *(modelInstance->method);
}

// Override the virtual functions
void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
  auto modelInstance = new ModelInstance;

  // Create a loader to get the data of the program file. There are other
  // DataLoaders that use mmap() or point to data that's already in memory, and
  // users can create their own DataLoaders to load from arbitrary sources.
  Result<FileDataLoader> loader = FileDataLoader::from(modelPath.c_str());
  ET_CHECK_MSG(
      loader.ok(), "FileDataLoader::from() failed: 0x%" PRIx32, loader.error());

  // Parse the program file. This is immutable, and can also be reused between
  // multiple execution invocations across multiple threads.
  Result<Program> program_loaded = Program::load(&loader.get());
  if (!program_loaded.ok()) {
    ET_LOG(Error, "Failed to parse model file %s", modelPath.c_str());
    return nullptr;
  }
  ET_LOG(Debug, "Model file %s is loaded.", modelPath.c_str());

  // Move the program into persistent storage before calling any of its
  // methods.
  modelInstance->program =
      std::make_unique<Program>(std::move(program_loaded.get()));
  auto& program = modelInstance->program;

  // Use the first method in the program.
  const char* method_name = nullptr;
  {
    const auto method_name_result = program->get_method_name(0);
    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
    method_name = *method_name_result;
  }
  ET_LOG(Debug, "Using method %s", method_name);

  // MethodMeta describes the memory requirements of the method.
  Result<MethodMeta> method_meta = program->method_meta(method_name);
  ET_CHECK_MSG(
      method_meta.ok(),
      "Failed to get method_meta for %s: 0x%x",
      method_name,
      (unsigned int)method_meta.error());

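  // Method allocator: backs the runtime's non-planned allocations made while
  // loading and executing this method.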
  modelInstance->method_allocator_pool.resize(kMethodAllocatorPoolSize);
  modelInstance->method_allocator = std::make_unique<MemoryAllocator>(
      kMethodAllocatorPoolSize, modelInstance->method_allocator_pool.data());
  auto& method_allocator = modelInstance->method_allocator;
  method_allocator->enable_profiling("method allocator");

  auto& planned_buffers = modelInstance->planned_buffers; // Owns the memory
  auto& planned_spans = modelInstance->planned_spans; // Passed to the allocator

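  // Set up the memory-planned buffers: one region per planned-buffer ID, with
  // sizes determined ahead of time by the program's memory plan.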
  size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
    // .get() will always succeed because id < num_memory_planned_buffers.
    size_t buffer_size =
        static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
    ET_LOG(Debug, "Setting up planned buffer %zu, size %zu.", id, buffer_size);
    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
  }
  modelInstance->planned_memory = std::make_unique<HierarchicalAllocator>(
      Span<Span<uint8_t>>{planned_spans.data(), planned_spans.size()});
  auto& planned_memory = modelInstance->planned_memory;

  // Assemble all of the allocators into the MemoryManager that the Executor
  // will use.
  auto& neuron_allocator = GET_NEURON_ALLOCATOR;
  modelInstance->memory_manager = std::make_unique<MemoryManager>(
      method_allocator.get(),
      planned_memory.get(),
      dynamic_cast<MemoryAllocator*>(&neuron_allocator));
  auto& memory_manager = modelInstance->memory_manager;

  ET_LOG(Debug, "Begin loading method %s", method_name);
  Result<Method> method =
      program->load_method(method_name, memory_manager.get());
  ET_CHECK_MSG(
      method.ok(),
      "Loading of method %s failed with status 0x%" PRIx32,
      method_name,
      method.error());
  ET_LOG(Debug, "Method loaded.");

  modelInstance->method = std::make_unique<Method>(std::move(method.get()));
  return modelInstance;
}

void ModelChunk::ReleaseModelInstance(void* modelInstance) {
  if (modelInstance != nullptr) {
    delete reinterpret_cast<ModelInstance*>(modelInstance);
  }
}

} // namespace example