xref: /aosp_15_r20/external/executorch/examples/qualcomm/executor_runner/qnn_executor_runner.cpp (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * Copyright (c) Qualcomm Innovation Center, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @file
 *
 * This tool can run ExecuTorch model files with Qualcomm AI Engine Direct
 * and the portable kernels.
 *
 * Users can specify arguments such as the input data, the number of
 * iterations, etc. Currently we assume that the outputs are all fp32 tensors.
 */
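
/*
 * Example invocation (a minimal sketch; the binary name, file paths, and flag
 * values below are illustrative, not requirements):
 *
 *   ./qnn_executor_runner --model_path=model.pte \
 *       --input_list_path=input_list.txt \
 *       --output_folder_path=outputs \
 *       --iteration=10 --warm_up=2
 */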

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/devtools/etdump/etdump_flatcc.h>
#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/extension/runner_util/inputs.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>
#include <executorch/runtime/platform/runtime.h>

#include <gflags/gflags.h>

#include <chrono>
#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <memory>
#include <string>
#include <vector>

static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB

DEFINE_string(
    model_path,
    "model.pte",
    "Model serialized in flatbuffer format.");
DEFINE_string(
    output_folder_path,
    "outputs",
    "ExecuTorch inference output data path.");
DEFINE_string(input_list_path, "input_list.txt", "Model input list path.");
DEFINE_int32(iteration, 1, "Number of inference iterations.");
DEFINE_int32(warm_up, 0, "Number of warm-up runs before inference.");
DEFINE_bool(
    shared_buffer,
    false,
    "Use shared buffers for the zero-copy use case between the application and the device/co-processor associated with the backend.");
DEFINE_uint32(method_index, 0, "Index of the method to be executed.");

DEFINE_string(
    etdump_path,
    "etdump.etdp",
    "If etdump generation is enabled, the etdump will be written to this path.");

DEFINE_bool(
    dump_intermediate_outputs,
    false,
    "Dump intermediate outputs to the etdump file.");

DEFINE_string(
    debug_output_path,
    "debug_output.bin",
    "Path to dump debug outputs to.");

DEFINE_int32(
    debug_buffer_size,
    20000000, // 20MB
    "Size of the debug buffer in bytes to allocate for intermediate output and program output logging.");

using executorch::aten::Tensor;
using executorch::aten::TensorImpl;
using executorch::etdump::ETDumpGen;
using executorch::etdump::ETDumpResult;
using executorch::extension::FileDataLoader;
using executorch::extension::prepare_input_tensors;
using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::EventTracerDebugLogLevel;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Method;
using executorch::runtime::MethodMeta;
using executorch::runtime::Program;
using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::TensorInfo;

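// Backs the data of a single tensor. When --shared_buffer is set, memory is
// allocated from the backend's custom/shared memory pool via
// QnnExecuTorchAllocCustomMem() so it can be shared zero-copy with the
// device/co-processor; otherwise it falls back to a heap-owned
// std::vector<char>.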
class CustomMemory {
 public:
  CustomMemory(bool shared_buffer) : shared_buffer_(shared_buffer){};
  bool Allocate(size_t bytes, size_t alignment) {
    if (shared_buffer_) {
      ptr_ = QnnExecuTorchAllocCustomMem(bytes, alignment);
    } else {
      input_data_.resize(bytes);
      ptr_ = input_data_.data();
    }
    return ptr_ != nullptr;
  }

  ~CustomMemory() {
    if (shared_buffer_) {
      if (ptr_ != nullptr) {
        QnnExecuTorchFreeCustomMem(ptr_);
      }
    }
  }

  void* GetPtr() {
    return ptr_;
  }

  CustomMemory(const CustomMemory&) = delete;
  CustomMemory(CustomMemory&&) = delete;
  CustomMemory& operator=(const CustomMemory&) = delete;
  CustomMemory& operator=(CustomMemory&&) = delete;

 private:
  bool shared_buffer_{false};
  void* ptr_{nullptr};
  std::vector<char> input_data_;
};

int main(int argc, char** argv) {
  executorch::runtime::runtime_init();

  gflags::ParseCommandLineFlags(&argc, &argv, true);
  if (argc != 1) {
    std::string msg = "Extra commandline args:";
    for (int i = 1 /* skip argv[0] (program name) */; i < argc; i++) {
      msg += std::string(" ") + argv[i];
    }
    ET_LOG(Error, "%s", msg.c_str());
    return 1;
  }

  // Create a loader to get the data of the program file. There are other
  // DataLoaders that use mmap() or point to data that's already in memory, and
  // users can create their own DataLoaders to load from arbitrary sources.
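  // For instance, an mmap-based loader could be swapped in roughly like this
  // (a sketch; it assumes the MmapDataLoader extension and its header
  // executorch/extension/data_loader/mmap_data_loader.h are available in this
  // build):
  //
  //   Result<executorch::extension::MmapDataLoader> mmap_loader =
  //       executorch::extension::MmapDataLoader::from(model_path);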
  const char* model_path = FLAGS_model_path.c_str();
  Result<FileDataLoader> loader = FileDataLoader::from(model_path);
  ET_CHECK_MSG(
      loader.ok(),
      "FileDataLoader::from() failed: 0x%" PRIx32,
      (int)loader.error());

  // Parse the program file. This is immutable, and can also be reused between
  // multiple execution invocations across multiple threads.
  Result<Program> program = Program::load(&loader.get());
  if (!program.ok()) {
    ET_LOG(Error, "Failed to parse model file %s", model_path);
    return 1;
  }
  ET_LOG(Info, "Model file %s is loaded.", model_path);

  // Use the designated method in the program, default to the first one
  const char* method_name = nullptr;
  {
    const auto method_name_result =
        program->get_method_name(FLAGS_method_index);
    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
    method_name = *method_name_result;
  }
  ET_LOG(Info, "Using method %s", method_name);

  // MethodMeta describes the memory requirements of the method.
  Result<MethodMeta> method_meta = program->method_meta(method_name);
  ET_CHECK_MSG(
      method_meta.ok(),
      "Failed to get method_meta for %s: 0x%x",
      method_name,
      (unsigned int)method_meta.error());

  //
  // The runtime does not use malloc/new; it allocates all memory using the
  // MemoryManager provided by the client. Clients are responsible for
  // allocating the memory ahead of time, or providing MemoryAllocator
  // subclasses that can do it dynamically.
  //

  // The method allocator is used to allocate all dynamic C++ metadata/objects
  // used to represent the loaded method. This allocator is only used during
  // loading a method of the program, which will return an error if there was
  // not enough memory.
  //
  // The amount of memory required depends on the loaded method and the runtime
  // code itself. The amount of memory here is usually determined by running the
  // method and seeing how much memory is actually used, though it's possible to
  // subclass MemoryAllocator so that it calls malloc() under the hood (see
  // MallocMemoryAllocator).
  //
  // In this example we use a statically allocated memory pool.
  MemoryAllocator method_allocator{
      MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)};
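  // As the comment above notes, a malloc-backed allocator can be used instead
  // of a fixed pool. A rough sketch (assuming the MallocMemoryAllocator
  // extension and its header
  // executorch/extension/memory_allocator/malloc_memory_allocator.h are part
  // of this build):
  //
  //   executorch::extension::MallocMemoryAllocator method_allocator;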

  // The memory-planned buffers will back the mutable tensors used by the
  // method. The sizes of these buffers were determined ahead of time during the
  // memory-planning passes.
  //
  // Each buffer typically corresponds to a different hardware memory bank. Most
  // mobile environments will only have a single buffer. Some embedded
  // environments may have more than one for, e.g., slow/large DRAM and
  // fast/small SRAM, or for memory associated with particular cores.
  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers; // Owns the memory
  std::vector<Span<uint8_t>> planned_spans; // Passed to the allocator
  size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
    // .get() will always succeed because id < num_memory_planned_buffers.
    size_t buffer_size =
        static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
    ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size);
    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
  }
  HierarchicalAllocator planned_memory(
      {planned_spans.data(), planned_spans.size()});
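  // planned_memory resolves the memory plan's (buffer index, offset) requests
  // against the spans set up above during method load and execution, so no
  // additional tensor memory is allocated while the method runs.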

  // Assemble all of the allocators into the MemoryManager that the Executor
  // will use.
  MemoryManager memory_manager(&method_allocator, &planned_memory);

  //
  // Load the method from the program, using the provided allocators. Running
  // the method can mutate the memory-planned buffers, so the method should only
  // be used by a single thread at a time, but it can be reused.
  //
  ETDumpGen etdump_gen;
  Result<Method> method =
      program->load_method(method_name, &memory_manager, &etdump_gen);
  ET_CHECK_MSG(
      method.ok(),
      "Loading of method %s failed with status 0x%" PRIx32,
      method_name,
      (int)method.error());
  ET_LOG(Info, "Method loaded.");

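  // When --dump_intermediate_outputs is set, ETDumpGen logs intermediate
  // tensor contents into this raw buffer; the buffer is written out to
  // --debug_output_path after execution finishes (see the end of main()).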
  void* debug_buffer = nullptr;
  if (FLAGS_dump_intermediate_outputs) {
    debug_buffer = malloc(FLAGS_debug_buffer_size);
    Span<uint8_t> buffer((uint8_t*)debug_buffer, FLAGS_debug_buffer_size);
    etdump_gen.set_debug_buffer(buffer);
    etdump_gen.set_event_tracer_debug_level(
        EventTracerDebugLogLevel::kIntermediateOutputs);
  }

  // Prepare the inputs.
  // Allocate data memory for inputs and outputs
  std::vector<std::unique_ptr<CustomMemory>> in_custom_mem;
  std::vector<std::unique_ptr<CustomMemory>> out_custom_mem;
  in_custom_mem.reserve(method->inputs_size());
  out_custom_mem.reserve(method->outputs_size());

  for (int input_index = 0; input_index < method->inputs_size();
       ++input_index) {
    MethodMeta method_meta = method->method_meta();
    Result<TensorInfo> tensor_meta = method_meta.input_tensor_meta(input_index);
    in_custom_mem.push_back(
        std::make_unique<CustomMemory>(FLAGS_shared_buffer));
    std::unique_ptr<CustomMemory>& custom_mem_ptr = in_custom_mem.back();
    ET_CHECK_MSG(
        custom_mem_ptr->Allocate(
            tensor_meta->nbytes(), MemoryAllocator::kDefaultAlignment),
        "Failed to allocate custom memory. tensor index: %d, bytes: %zu",
        input_index,
        tensor_meta->nbytes());
    TensorImpl impl = TensorImpl(
        tensor_meta->scalar_type(),
        /*dim=*/tensor_meta->sizes().size(),
        const_cast<TensorImpl::SizesType*>(tensor_meta->sizes().data()),
        custom_mem_ptr->GetPtr(),
        const_cast<TensorImpl::DimOrderType*>(tensor_meta->dim_order().data()));
    Error ret = method->set_input(Tensor(&impl), input_index);
    ET_CHECK_MSG(ret == Error::Ok, "Failed to set input tensor: %d", (int)ret);
  }
  for (int output_index = 0; output_index < method->outputs_size();
       ++output_index) {
    const Tensor& t = method->get_output(output_index).toTensor();
    out_custom_mem.push_back(
        std::make_unique<CustomMemory>(FLAGS_shared_buffer));
    std::unique_ptr<CustomMemory>& custom_mem_ptr = out_custom_mem.back();
    ET_CHECK_MSG(
        custom_mem_ptr->Allocate(
            t.nbytes(), MemoryAllocator::kDefaultAlignment),
        "Failed to allocate custom memory. tensor index: %d, bytes: %zu",
        output_index,
        t.nbytes());
    Error ret = method->set_output_data_ptr(
        custom_mem_ptr->GetPtr(), t.nbytes(), output_index);
    if (ret != Error::Ok) {
      // This can error if the outputs are already pre-allocated. Ignore
      // this error because it doesn't affect correctness, but log it.
      ET_LOG(
          Info,
          "ignoring error from set_output_data_ptr(): 0x%" PRIx32,
          (int)ret);
    }
  }
  ET_LOG(Info, "Inputs prepared.");

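  // The input list is a plain text file: each line describes one inference and
  // contains space-separated paths to raw tensor files, one per model input in
  // input order. Each raw file must hold exactly nbytes of the corresponding
  // input tensor. For example, for a two-input model run twice:
  //
  //   input_0_0.raw input_0_1.raw
  //   input_1_0.raw input_1_1.raw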
  // Fill in data for input
  std::ifstream input_list(FLAGS_input_list_path);
  if (input_list.is_open()) {
    size_t num_inputs = method->inputs_size();
    ET_LOG(Info, "Number of inputs: %zu", num_inputs);

    auto split = [](std::string s, std::string delimiter) {
      size_t pos_start = 0, pos_end, delim_len = delimiter.length();
      std::string token;
      std::vector<std::string> res;

      while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) {
        token = s.substr(pos_start, pos_end - pos_start);
        pos_start = pos_end + delim_len;
        res.push_back(token);
      }
      res.push_back(s.substr(pos_start));
      return res;
    };

    std::string file_path;
    int inference_index = 0;
    double elapsed_time = 0;
    while (std::getline(input_list, file_path)) {
      auto input_files = split(file_path, " ");
      if (input_files.size() == 0) {
        break;
      }
      ET_CHECK_MSG(
          input_files.size() == num_inputs,
          "Number of inputs (%zu) mismatch with input files (%zu)",
          num_inputs,
          input_files.size());

      for (int input_index = 0; input_index < num_inputs; ++input_index) {
        MethodMeta method_meta = method->method_meta();
        Result<TensorInfo> tensor_meta =
            method_meta.input_tensor_meta(input_index);

        std::ifstream fin(input_files[input_index], std::ios::binary);
        fin.seekg(0, fin.end);
        size_t file_size = fin.tellg();

        ET_CHECK_MSG(
            file_size == tensor_meta->nbytes(),
            "Input(%d) size mismatch. file bytes: %zu, tensor bytes: %zu",
            input_index,
            file_size,
            tensor_meta->nbytes());

        fin.seekg(0, fin.beg);
        fin.read(
            static_cast<char*>(in_custom_mem[input_index]->GetPtr()),
            file_size);
        fin.close();

        // For the pre-allocated use case, we need to call set_input
        // to copy data into the input tensors since they don't
        // share the data with in_custom_mem.
        TensorImpl impl = TensorImpl(
            tensor_meta->scalar_type(),
            /*dim=*/tensor_meta->sizes().size(),
            const_cast<TensorImpl::SizesType*>(tensor_meta->sizes().data()),
            in_custom_mem[input_index]->GetPtr(),
            const_cast<TensorImpl::DimOrderType*>(
                tensor_meta->dim_order().data()));
        Error ret = method->set_input(Tensor(&impl), input_index);
        ET_CHECK_MSG(
            ret == Error::Ok, "Failed to set input tensor: %d", (int)ret);
      }

      Error status = Error::Ok;
      // Warm up
      ET_LOG(Info, "Performing %d warm-up inference(s)", FLAGS_warm_up);
      for (int i = 0; i < FLAGS_warm_up; ++i) {
        status = method->execute();
      }

      // Inference with designated iterations
      ET_LOG(Info, "Start inference (%d)", inference_index);
      auto before_exec = std::chrono::high_resolution_clock::now();
      for (int i = 0; i < FLAGS_iteration; ++i) {
        status = method->execute();
      }
      auto after_exec = std::chrono::high_resolution_clock::now();
      double interval_infs =
          std::chrono::duration_cast<std::chrono::microseconds>(
              after_exec - before_exec)
              .count() /
          1000.0;
      elapsed_time += interval_infs;

      ET_LOG(
          Info,
          "%d inference(s) took %f ms, avg %f ms",
          FLAGS_iteration,
          interval_infs,
          interval_infs / (float)FLAGS_iteration);
      ET_CHECK_MSG(
          status == Error::Ok,
          "Execution of method %s failed with status 0x%" PRIx32,
          method_name,
          (int)status);

      std::vector<EValue> outputs(method->outputs_size());
      status = method->get_outputs(outputs.data(), method->outputs_size());
      ET_CHECK(status == Error::Ok);
      // The following code assumes all output EValues are floating point
      // tensors. We need to handle other types of EValues and tensor
      // dtypes. Furthermore, we need a util to print tensors in a more
      // interpretable (e.g. size, dtype) and readable way.
      // TODO for the above at T159700776
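      // Each output tensor is dumped as its raw bytes (fp32 per the assumption
      // above) to
      // <output_folder_path>/output_<inference_index>_<output_index>.raw,
      // e.g. outputs/output_0_0.raw for the first output of the first
      // inference. The output folder is expected to exist already.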
      for (size_t output_index = 0; output_index < method->outputs_size();
           output_index++) {
        auto output_tensor = outputs[output_index].toTensor();
        auto output_file_name = FLAGS_output_folder_path + "/output_" +
            std::to_string(inference_index) + "_" +
            std::to_string(output_index) + ".raw";
        std::ofstream fout(output_file_name.c_str(), std::ios::binary);
        fout.write(
            output_tensor.const_data_ptr<char>(), output_tensor.nbytes());
        fout.close();
      }

      ++inference_index;
    }
    ET_LOG(
        Info,
        "%d inference(s) took %f ms, avg %f ms",
        inference_index,
        elapsed_time,
        elapsed_time / inference_index);
  } else {
    // If no input is provided, fill the inputs with default values.
    auto inputs = prepare_input_tensors(*method);
    ET_CHECK_MSG(
        inputs.ok(),
        "Could not prepare inputs: 0x%" PRIx32,
        (uint32_t)inputs.error());
    ET_LOG(
        Info,
        "Input list not provided. Inputs prepared with default values set.");
    Error status = method->execute();
    ET_CHECK_MSG(
        status == Error::Ok,
        "Execution of method %s failed with status 0x%" PRIx32,
        method_name,
        (int)status);
    ET_LOG(Info, "Model executed successfully.");
  }

  // Dump the etdump data containing profiling/debugging data to the specified
  // file.
  ETDumpResult result = etdump_gen.get_etdump_data();
  if (result.buf != nullptr && result.size > 0) {
    ET_LOG(
        Info,
        "Write etdump to %s, Size = %zu",
        FLAGS_etdump_path.c_str(),
        result.size);
    FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+");
    fwrite((uint8_t*)result.buf, 1, result.size, f);
    fclose(f);
    free(result.buf);
  }
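  // The generated etdump file holds the profiling (and, if enabled, debug)
  // events; it is typically inspected offline, e.g. with the ExecuTorch
  // devtools Inspector in Python (assuming a matching ExecuTorch version is
  // installed on the host).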

  if (FLAGS_dump_intermediate_outputs) {
    ET_LOG(
        Info,
        "Write debug output binary to %s, Size = %zu",
        FLAGS_debug_output_path.c_str(),
        (size_t)FLAGS_debug_buffer_size);
    FILE* f = fopen(FLAGS_debug_output_path.c_str(), "w+");
    fwrite((uint8_t*)debug_buffer, 1, FLAGS_debug_buffer_size, f);
    fclose(f);
    free(debug_buffer);
  }

  return 0;
}