/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * Copyright (c) Qualcomm Innovation Center, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @file
 *
 * This tool can run ExecuTorch model files with Qualcomm AI Engine Direct
 * and the portable kernels.
 *
 * Users can specify arguments such as the desired input data, the number of
 * iterations, etc. Currently we assume that all outputs are fp32 tensors.
 */

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/devtools/etdump/etdump_flatcc.h>
#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/extension/runner_util/inputs.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>
#include <executorch/runtime/platform/runtime.h>

#include <gflags/gflags.h>

#include <chrono>
#include <fstream>
#include <memory>
#include <string>
#include <vector>

static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB

DEFINE_string(
    model_path,
    "model.pte",
    "Model serialized in flatbuffer format.");
DEFINE_string(
    output_folder_path,
    "outputs",
    "ExecuTorch inference data output path.");
DEFINE_string(input_list_path, "input_list.txt", "Model input list path.");
DEFINE_int32(iteration, 1, "Iterations of inference.");
DEFINE_int32(warm_up, 0, "Number of warm-up runs before timed inference.");
DEFINE_bool(
    shared_buffer,
    false,
    "Use shared buffers for the zero-copy use case between the application and the device/co-processor associated with the backend.");
DEFINE_uint32(method_index, 0, "Index of the method to execute.");

DEFINE_string(
    etdump_path,
    "etdump.etdp",
    "If etdump generation is enabled, an etdump will be written out to this path.");

DEFINE_bool(
    dump_intermediate_outputs,
    false,
    "Dump intermediate outputs to the etdump file.");

DEFINE_string(
    debug_output_path,
    "debug_output.bin",
    "Path to dump debug outputs to.");

DEFINE_int32(
    debug_buffer_size,
    20000000, // 20 MB
    "Size of the debug buffer in bytes to allocate for logging intermediate and program outputs.");

using executorch::aten::Tensor;
using executorch::aten::TensorImpl;
using executorch::etdump::ETDumpGen;
using executorch::etdump::ETDumpResult;
using executorch::extension::FileDataLoader;
using executorch::extension::prepare_input_tensors;
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::EventTracerDebugLogLevel;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Method;
using executorch::runtime::MethodMeta;
using executorch::runtime::Program;
using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::TensorInfo;

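// Wrapper around the buffers that back input/output tensors. Depending on the
// shared_buffer flag it either requests QNN shared custom memory (enabling
// zero-copy between the application and the backend) or falls back to an
// ordinary heap allocation.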
class CustomMemory {
 public:
  explicit CustomMemory(bool shared_buffer) : shared_buffer_(shared_buffer) {}

  bool Allocate(size_t bytes, size_t alignment) {
    if (shared_buffer_) {
      ptr_ = QnnExecuTorchAllocCustomMem(bytes, alignment);
    } else {
      input_data_.resize(bytes);
      ptr_ = input_data_.data();
    }
    return ptr_ != nullptr;
  }

  ~CustomMemory() {
    if (shared_buffer_) {
      if (ptr_ != nullptr) {
        QnnExecuTorchFreeCustomMem(ptr_);
      }
    }
  }

  void* GetPtr() {
    return ptr_;
  }

  CustomMemory(const CustomMemory&) = delete;
  CustomMemory(CustomMemory&&) = delete;
  CustomMemory& operator=(const CustomMemory&) = delete;
  CustomMemory& operator=(CustomMemory&&) = delete;

 private:
  bool shared_buffer_{false};
  void* ptr_{nullptr};
  std::vector<char> input_data_;
};

int main(int argc, char** argv) {
  executorch::runtime::runtime_init();

  gflags::ParseCommandLineFlags(&argc, &argv, true);
  if (argc != 1) {
    std::string msg = "Extra commandline args:";
    for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) {
      msg += std::string(" ") + argv[i];
    }
    ET_LOG(Error, "%s", msg.c_str());
    return 1;
  }

  // Create a loader to get the data of the program file. There are other
  // DataLoaders that use mmap() or point to data that's already in memory, and
  // users can create their own DataLoaders to load from arbitrary sources.
  const char* model_path = FLAGS_model_path.c_str();
  Result<FileDataLoader> loader = FileDataLoader::from(model_path);
  ET_CHECK_MSG(
      loader.ok(),
      "FileDataLoader::from() failed: 0x%" PRIx32,
      (int)loader.error());

  // Parse the program file. This is immutable, and can also be reused between
  // multiple execution invocations across multiple threads.
  Result<Program> program = Program::load(&loader.get());
  if (!program.ok()) {
    ET_LOG(Error, "Failed to parse model file %s", model_path);
    return 1;
  }
  ET_LOG(Info, "Model file %s is loaded.", model_path);

  // Use the designated method in the program; default to the first one.
  const char* method_name = nullptr;
  {
    const auto method_name_result =
        program->get_method_name(FLAGS_method_index);
    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
    method_name = *method_name_result;
  }
  ET_LOG(Info, "Using method %s", method_name);

  // MethodMeta describes the memory requirements of the method.
  Result<MethodMeta> method_meta = program->method_meta(method_name);
  ET_CHECK_MSG(
      method_meta.ok(),
      "Failed to get method_meta for %s: 0x%x",
      method_name,
      (unsigned int)method_meta.error());

  //
  // The runtime does not use malloc/new; it allocates all memory using the
  // MemoryManager provided by the client. Clients are responsible for
  // allocating the memory ahead of time, or providing MemoryAllocator
  // subclasses that can do it dynamically.
  //

  // The method allocator is used to allocate all dynamic C++ metadata/objects
  // used to represent the loaded method. This allocator is only used during
  // loading a method of the program, which will return an error if there was
  // not enough memory.
  //
  // The amount of memory required depends on the loaded method and the runtime
  // code itself. The amount of memory here is usually determined by running
  // the method and seeing how much memory is actually used, though it's
  // possible to subclass MemoryAllocator so that it calls malloc() under the
  // hood (see MallocMemoryAllocator).
  //
  // In this example we use a statically allocated memory pool.
  MemoryAllocator method_allocator{
      MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};

  // The memory-planned buffers will back the mutable tensors used by the
  // method. The sizes of these buffers were determined ahead of time during
  // the memory-planning passes.
  //
  // Each buffer typically corresponds to a different hardware memory bank.
  // Most mobile environments will only have a single buffer. Some embedded
  // environments may have more than one for, e.g., slow/large DRAM and
  // fast/small SRAM, or for memory associated with particular cores.
  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers; // Owns the memory
  std::vector<Span<uint8_t>> planned_spans; // Passed to the allocator
  size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
    // .get() will always succeed because id < num_memory_planned_buffers.
    size_t buffer_size =
        static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
    ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size);
    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
  }
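  // Wrap the planned spans in a HierarchicalAllocator that hands out the
  // memory-planned buffers to the method.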
  HierarchicalAllocator planned_memory(
      {planned_spans.data(), planned_spans.size()});

  // Assemble all of the allocators into the MemoryManager that the Executor
  // will use.
  MemoryManager memory_manager(&method_allocator, &planned_memory);

  //
  // Load the method from the program, using the provided allocators. Running
  // the method can mutate the memory-planned buffers, so the method should
  // only be used by a single thread at a time, but it can be reused.
  //
  ETDumpGen etdump_gen;
  Result<Method> method =
      program->load_method(method_name, &memory_manager, &etdump_gen);
  ET_CHECK_MSG(
      method.ok(),
      "Loading of method %s failed with status 0x%" PRIx32,
      method_name,
      (int)method.error());
  ET_LOG(Info, "Method loaded.");

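  // When intermediate-output dumping is requested, give ETDumpGen a scratch
  // buffer to record those outputs into; it is written to debug_output_path
  // at the end of the run.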
  void* debug_buffer = nullptr;
  if (FLAGS_dump_intermediate_outputs) {
    debug_buffer = malloc(FLAGS_debug_buffer_size);
    Span<uint8_t> buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size);
    etdump_gen.set_debug_buffer(buffer);
    etdump_gen.set_event_tracer_debug_level(
        EventTracerDebugLogLevel::kIntermediateOutputs);
  }

  // Prepare the inputs.
  // Allocate data memory for inputs and outputs.
  std::vector<std::unique_ptr<CustomMemory>> in_custom_mem;
  std::vector<std::unique_ptr<CustomMemory>> out_custom_mem;
  in_custom_mem.reserve(method->inputs_size());
  out_custom_mem.reserve(method->outputs_size());

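  // For every model input, allocate a CustomMemory buffer matching the tensor
  // metadata and register a tensor backed by it via set_input().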
  for (int input_index = 0; input_index < method->inputs_size();
       ++input_index) {
    MethodMeta method_meta = method->method_meta();
    Result<TensorInfo> tensor_meta = method_meta.input_tensor_meta(input_index);
    in_custom_mem.push_back(
        std::make_unique<CustomMemory>(FLAGS_shared_buffer));
    std::unique_ptr<CustomMemory>& custom_mem_ptr = in_custom_mem.back();
    ET_CHECK_MSG(
        custom_mem_ptr->Allocate(
            tensor_meta->nbytes(), MemoryAllocator::kDefaultAlignment),
        "Failed to allocate custom memory. tensor index: %d, bytes: %zu",
        input_index,
        tensor_meta->nbytes());
    TensorImpl impl = TensorImpl(
        tensor_meta->scalar_type(),
        /*dim=*/tensor_meta->sizes().size(),
        const_cast<TensorImpl::SizesType*>(tensor_meta->sizes().data()),
        custom_mem_ptr->GetPtr(),
        const_cast<TensorImpl::DimOrderType*>(tensor_meta->dim_order().data()));
    Error ret = method->set_input(Tensor(&impl), input_index);
    ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", (int)ret);
  }
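  // Likewise, point every model output at a CustomMemory buffer via
  // set_output_data_ptr() so the backend writes results there directly.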
  for (int output_index = 0; output_index < method->outputs_size();
       ++output_index) {
    const Tensor& t = method->get_output(output_index).toTensor();
    out_custom_mem.push_back(
        std::make_unique<CustomMemory>(FLAGS_shared_buffer));
    std::unique_ptr<CustomMemory>& custom_mem_ptr = out_custom_mem.back();
    ET_CHECK_MSG(
        custom_mem_ptr->Allocate(
            t.nbytes(), MemoryAllocator::kDefaultAlignment),
        "Failed to allocate custom memory. tensor index: %d, bytes: %zu",
        output_index,
        t.nbytes());
    Error ret = method->set_output_data_ptr(
        custom_mem_ptr->GetPtr(), t.nbytes(), output_index);
    if (ret != Error::Ok) {
      // This can error if the outputs are already pre-allocated. Ignore
      // this error because it doesn't affect correctness, but log it.
      ET_LOG(
          Info,
          "ignoring error from set_output_data_ptr(): 0x%" PRIx32,
          (int)ret);
    }
  }
  ET_LOG(Info, "Inputs prepared.");

  // Fill in data for the inputs.
  std::ifstream input_list(FLAGS_input_list_path);
  if (input_list.is_open()) {
    size_t num_inputs = method->inputs_size();
    ET_LOG(Info, "Number of inputs: %zu", num_inputs);

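    // Helper that splits a line from the input list into the individual
    // input file paths (delimited by spaces).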
    auto split = [](std::string s, std::string delimiter) {
      size_t pos_start = 0, pos_end, delim_len = delimiter.length();
      std::string token;
      std::vector<std::string> res;

      while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) {
        token = s.substr(pos_start, pos_end - pos_start);
        pos_start = pos_end + delim_len;
        res.push_back(token);
      }
      res.push_back(s.substr(pos_start));
      return res;
    };

    std::string file_path;
    int inference_index = 0;
    double elapsed_time = 0;
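    // Each line of the input list names one file per model input; run one
    // timed batch of inferences per line.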
    while (std::getline(input_list, file_path)) {
      auto input_files = split(file_path, " ");
      if (input_files.size() == 0) {
        break;
      }
      ET_CHECK_MSG(
          input_files.size() == num_inputs,
          "Number of inputs (%zu) does not match the number of input files (%zu)",
          num_inputs,
          input_files.size());

      for (int input_index = 0; input_index < num_inputs; ++input_index) {
        MethodMeta method_meta = method->method_meta();
        Result<TensorInfo> tensor_meta =
            method_meta.input_tensor_meta(input_index);

        std::ifstream fin(input_files[input_index], std::ios::binary);
        fin.seekg(0, fin.end);
        size_t file_size = fin.tellg();

        ET_CHECK_MSG(
            file_size == tensor_meta->nbytes(),
            "Input(%d) size mismatch. file bytes: %zu, tensor bytes: %zu",
            input_index,
            file_size,
            tensor_meta->nbytes());

        fin.seekg(0, fin.beg);
        fin.read(
            static_cast<char*>(in_custom_mem[input_index]->GetPtr()),
            file_size);
        fin.close();

        // For the pre-allocated use case, we need to call set_input
        // to copy data for the input tensors since they don't
        // share the data with in_custom_mem.
        TensorImpl impl = TensorImpl(
            tensor_meta->scalar_type(),
            /*dim=*/tensor_meta->sizes().size(),
            const_cast<TensorImpl::SizesType*>(tensor_meta->sizes().data()),
            in_custom_mem[input_index]->GetPtr(),
            const_cast<TensorImpl::DimOrderType*>(
                tensor_meta->dim_order().data()));
        Error ret = method->set_input(Tensor(&impl), input_index);
        ET_CHECK_MSG(
            ret == Error::Ok, "Failed to set input tensor: %d", (int)ret);
      }

      Error status = Error::Ok;
      // Warm up
      ET_LOG(Info, "Performing %d warm-up inference(s)", FLAGS_warm_up);
      for (int i = 0; i < FLAGS_warm_up; ++i) {
        status = method->execute();
      }

      // Inference with the designated number of iterations
      ET_LOG(Info, "Start inference (%d)", inference_index);
      auto before_exec = std::chrono::high_resolution_clock::now();
      for (int i = 0; i < FLAGS_iteration; ++i) {
        status = method->execute();
      }
      auto after_exec = std::chrono::high_resolution_clock::now();
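      // Elapsed wall-clock time for this batch of iterations, in milliseconds.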
      double interval_infs =
          std::chrono::duration_cast<std::chrono::microseconds>(
              after_exec - before_exec)
              .count() /
          1000.0;
      elapsed_time += interval_infs;

      ET_LOG(
          Info,
          "%d inference(s) took %f ms, avg %f ms",
          FLAGS_iteration,
          interval_infs,
          interval_infs / (float)FLAGS_iteration);
      ET_CHECK_MSG(
          status == Error::Ok,
          "Execution of method %s failed with status 0x%" PRIx32,
          method_name,
          (int)status);

      std::vector<EValue> outputs(method->outputs_size());
      status = method->get_outputs(outputs.data(), method->outputs_size());
      ET_CHECK(status == Error::Ok);
      // The following code assumes all output EValues are floating point
      // tensors. We need to handle other types of EValues and tensor
      // dtypes. Furthermore, we need a util to print tensors in a more
      // interpretable (e.g. size, dtype) and readable way.
      // TODO for the above at T159700776
      for (size_t output_index = 0; output_index < method->outputs_size();
           output_index++) {
        auto output_tensor = outputs[output_index].toTensor();
        auto output_file_name = FLAGS_output_folder_path + "/output_" +
            std::to_string(inference_index) + "_" +
            std::to_string(output_index) + ".raw";
        std::ofstream fout(output_file_name.c_str(), std::ios::binary);
        fout.write(
            output_tensor.const_data_ptr<char>(), output_tensor.nbytes());
        fout.close();
      }

      ++inference_index;
    }
    ET_LOG(
        Info,
        "%d inference run(s) took %f ms in total, avg %f ms",
        inference_index,
        elapsed_time,
        elapsed_time / inference_index);
  } else {
    // If no input list is provided, fill the inputs with default values.
    auto inputs = prepare_input_tensors(*method);
    ET_CHECK_MSG(
        inputs.ok(),
        "Could not prepare inputs: 0x%" PRIx32,
        (uint32_t)inputs.error());
    ET_LOG(
        Info,
        "Input list not provided. Inputs prepared with default values set.");
    Error status = method->execute();
    ET_CHECK_MSG(
        status == Error::Ok,
        "Execution of method %s failed with status 0x%" PRIx32,
        method_name,
        (int)status);
    ET_LOG(Info, "Model executed successfully.");
  }

  // Dump the etdump data containing profiling/debugging data to the specified
  // file.
  ETDumpResult result = etdump_gen.get_etdump_data();
  if (result.buf != nullptr && result.size > 0) {
    ET_LOG(
        Info,
        "Write etdump to %s, Size = %zu",
        FLAGS_etdump_path.c_str(),
        result.size);
    FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+");
    fwrite((uint8_t*)result.buf, 1, result.size, f);
    fclose(f);
    free(result.buf);
  }

  if (FLAGS_dump_intermediate_outputs) {
    ET_LOG(
        Info,
        "Write debug output binary to %s, Size = %zu",
        FLAGS_debug_output_path.c_str(),
        (size_t)FLAGS_debug_buffer_size);
    FILE* f = fopen(FLAGS_debug_output_path.c_str(), "w+");
    fwrite((uint8_t*)debug_buffer, 1, FLAGS_debug_buffer_size, f);
    fclose(f);
    free(debug_buffer);
  }

  return 0;
}